Skip to content

Commit fd0f895

Browse files
committed
Release 1.10.2
2 parents 7c16b56 + ea879e1 commit fd0f895

14 files changed

+169
-45
lines changed

Makefile

+2
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ RANLIB = ranlib
3030
htslib_default_libs = -lz -lm -lbz2 -llzma -lcurl
3131

3232
CPPFLAGS =
33+
# TODO: make the 64-bit support for VCF optional via configure; for now, add -DVCF_ALLOW_INT64
34+
# to CFLAGS manually, here or in config.mk if the latter exists.
3335
# TODO: probably update cram code to make it compile cleanly with -Wc++-compat
3436
# For testing strict C99 support add -std=c99 -D_XOPEN_SOURCE=600
3537
#CFLAGS = -g -Wall -O2 -pedantic -std=c99 -D_XOPEN_SOURCE=600

NEWS

+16
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
1+
Noteworthy changes in release 1.10.2 (19th December 2019)
2+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3+
4+
This is a bug-fix release that corrects minor inconsistencies discovered in
5+
previous deliverables.
6+
7+
8+
Noteworthy changes in release 1.10.1 (17th December 2019)
9+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10+
11+
Support for 64-bit coordinates in VCF caused problems for files
12+
not conforming to the VCF/BCF specification. While previous versions would
13+
make out-of-range values silently overflow creating nonsense values
14+
but a parseable file, version 1.10 would silently create an invalid BCF.
15+
16+
117
Noteworthy changes in release 1.10 (6th December 2019)
218
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
319

README.large_positions.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ which have, or are expected to have, chromosomes longer than two gigabases.
99
Currently 64 bit positions can only be stored in SAM and VCF format files.
1010
Binary BAM, CRAM and BCF cannot be used due to limitations in the formats
1111
themselves. As SAM and VCF are text formats, they have no limit on the
12-
size of numeric values.
12+
size of numeric values. Note that while 64 bit positions are supported by
13+
default for SAM, for VCF they must be enabled explicitly at compile time
14+
by editing the Makefile and adding -DVCF_ALLOW_INT64=1 to CFLAGS.
1315

1416
# Compatibility issues to check
1517

bgzip.1

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.TH bgzip 1 "6 December 2019" "htslib-1.10" "Bioinformatics tools"
1+
.TH bgzip 1 "19 December 2019" "htslib-1.10.2" "Bioinformatics tools"
22
.SH NAME
33
.PP
44
bgzip \- Block compression/decompression utility

htsfile.1

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.TH htsfile 1 "6 December 2019" "htslib-1.10" "Bioinformatics tools"
1+
.TH htsfile 1 "19 December 2019" "htslib-1.10.2" "Bioinformatics tools"
22
.SH NAME
33
htsfile \- identify high-throughput sequencing data files
44
.\"

htslib-s3-plugin.7

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.TH htslib-s3-plugin 7 "6 December 2019" "htslib-1.10" "Bioinformatics tools"
1+
.TH htslib-s3-plugin 7 "19 December 2019" "htslib-1.10.2" "Bioinformatics tools"
22
.SH NAME
33
s3 plugin \- htslib AWS S3 plugin
44
.\"

htslib/hts.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,7 @@ const char *hts_version(void);
421421
// Immediately after release, bump ZZ to 90 to distinguish in-development
422422
// Git repository builds from the release; you may wish to increment this
423423
// further when significant features are merged.
424-
#define HTS_VERSION 101000
424+
#define HTS_VERSION 101002
425425

426426
/*!
427427
@abstract Determine format by peeking at the start of a file

htslib/vcf.h

+5-5
Original file line numberDiff line numberDiff line change
@@ -1390,10 +1390,10 @@ static inline int bcf_enc_int1(kstring_t *s, int32_t x)
13901390
@param[out] q Location to store an updated value for p
13911391
@return The integer value, or zero if @p type is not valid.
13921392
1393-
If @p type is not one of BCF_BT_INT8, BCF_BT_INT16 or BCF_BT_INT32, zero
1394-
will be returned and @p *q will not be updated. Otherwise, the integer
1395-
value will be returned and @p *q will be set to the memory location
1396-
immediately following the integer value.
1393+
If @p type is not one of BCF_BT_INT8, BCF_BT_INT16, BCF_BT_INT32 or
1394+
BCF_BT_INT64, zero will be returned and @p *q will not be updated.
1395+
Otherwise, the integer value will be returned and @p *q will be set
1396+
to the memory location immediately following the integer value.
13971397
13981398
Cautious callers can detect invalid type codes by checking that *q has
13991399
actually been updated.
@@ -1411,7 +1411,7 @@ static inline int64_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q)
14111411
*q = (uint8_t*)p + 4;
14121412
return le_to_i32(p);
14131413
} else if (type == BCF_BT_INT64) {
1414-
*q = (uint8_t*)p + 4;
1414+
*q = (uint8_t*)p + 8;
14151415
return le_to_i64(p);
14161416
} else { // Invalid type.
14171417
return 0;

tabix.1

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.TH tabix 1 "6 December 2019" "htslib-1.10" "Bioinformatics tools"
1+
.TH tabix 1 "19 December 2019" "htslib-1.10.2" "Bioinformatics tools"
22
.SH NAME
33
.PP
44
tabix \- Generic indexer for TAB-delimited genome position files

test/longrefs/index.expected2.vcf

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1 10010000110 . G <DEL> 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45
1+
1 10010000110 . G <DEL> 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000;QS=1,0 PL 0,1,45

test/longrefs/index.vcf

+1-1
Original file line numberDiff line numberDiff line change
@@ -213,4 +213,4 @@
213213
1 10010000107 . G <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,29
214214
1 10010000108 . C <*> 0 . DP=1;I16=0,1,0,0,32,1024,0,0,29,841,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,29
215215
1 10010000109 . A <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,29,841,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,29
216-
1 10010000110 . G <DEL> 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45
216+
1 10010000110 . G <DEL> 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000;QS=1,0 PL 0,1,45

test/test.pl

+13-12
Original file line numberDiff line numberDiff line change
@@ -660,18 +660,19 @@ sub test_view
660660
testv $opts, "./test_view $tv_args -M -p longrefs/longref_multi.tmp.sam longrefs/longref.tmp.sam.gz CHROMOSOME_I:10000000000-10000000003 CHROMOSOME_I:10000000100-10000000110";
661661
testv $opts, "./compare_sam.pl longrefs/longref_multi.expected.sam longrefs/longref_multi.tmp.sam";
662662

663-
# VCF round trip
664-
unlink("longrefs/index.tmp.vcf.gz.csi"); # To stop vcf_hdr_read from reading a stale index
665-
testv $opts, "./test_view $tv_args -z -p longrefs/index.tmp.vcf.gz -x longrefs/index.tmp.vcf.gz.csi.otf -m 14 longrefs/index.vcf";
666-
testv $opts, "./test_view $tv_args -p longrefs/index.tmp.vcf_ longrefs/index.tmp.vcf.gz";
667-
testv $opts, "cmp longrefs/index.vcf longrefs/index.tmp.vcf_";
668-
669-
# Build index and compare with on-the-fly one made earlier.
670-
test_compare $opts, "$$opts{path}/test_index -c longrefs/index.tmp.vcf.gz", "longrefs/index.tmp.vcf.gz.csi.otf", "longrefs/index.tmp.vcf.gz.csi", gz=>1;
671-
672-
# test_view can't do indexed look-ups on vcf, but we can use tabix
673-
test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000100-10010000105 > longrefs/index.tmp.tabix1.vcf", "longrefs/index.expected1.vcf", "longrefs/index.tmp.tabix1.vcf", fix_newlines => 1;
674-
test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000120-10010000130 > longrefs/index.tmp.tabix2.vcf", "longrefs/index.expected2.vcf", "longrefs/index.tmp.tabix2.vcf", fix_newlines => 1;
663+
# 64-bit positions are currently not compiled in by default for VCF
664+
# # VCF round trip
665+
# unlink("longrefs/index.tmp.vcf.gz.csi"); # To stop vcf_hdr_read from reading a stale index
666+
# testv $opts, "./test_view $tv_args -z -p longrefs/index.tmp.vcf.gz -x longrefs/index.tmp.vcf.gz.csi.otf -m 14 longrefs/index.vcf";
667+
# testv $opts, "./test_view $tv_args -p longrefs/index.tmp.vcf_ longrefs/index.tmp.vcf.gz";
668+
# testv $opts, "cmp longrefs/index.vcf longrefs/index.tmp.vcf_";
669+
#
670+
# # Build index and compare with on-the-fly one made earlier.
671+
# test_compare $opts, "$$opts{path}/test_index -c longrefs/index.tmp.vcf.gz", "longrefs/index.tmp.vcf.gz.csi.otf", "longrefs/index.tmp.vcf.gz.csi", gz=>1;
672+
#
673+
# # test_view can't do indexed look-ups on vcf, but we can use tabix
674+
# test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000100-10010000105 > longrefs/index.tmp.tabix1.vcf", "longrefs/index.expected1.vcf", "longrefs/index.tmp.tabix1.vcf", fix_newlines => 1;
675+
# test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000120-10010000130 > longrefs/index.tmp.tabix2.vcf", "longrefs/index.expected2.vcf", "longrefs/index.tmp.tabix2.vcf", fix_newlines => 1;
675676

676677
if ($test_view_failures == 0) {
677678
passed($opts, "large position tests");

vcf.c

+122-19
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,26 @@ HTSLIB_EXPORT
5959
uint32_t bcf_float_vector_end = 0x7F800002;
6060

6161
HTSLIB_EXPORT
62-
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
62+
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
6363

6464
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
6565

66+
/*
67+
Partial support for 64-bit POS and Number=1 INFO tags.
68+
Notes:
69+
- the support for 64-bit values is motivated by POS and INFO/END for large genomes
70+
- the use of 64-bit values does not conform to the specification
71+
- 64-bit BCF should not be output; if it ever is, the result is not compatible with anything
72+
- experimental, use at your own risk
73+
*/
74+
#ifdef VCF_ALLOW_INT64
75+
#define BCF_MAX_BT_INT64 (0x7fffffffffffffff) /* INT64_MAX, for internal use only */
76+
#define BCF_MIN_BT_INT64 -9223372036854775800LL /* INT64_MIN + 8, for internal use only */
77+
#endif
78+
79+
#define BCF_IS_64BIT (1<<30)
80+
81+
6682
static const char *dump_char(char *buffer, char c)
6783
{
6884
switch (c) {
@@ -1251,6 +1267,14 @@ static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
12511267
if (end - p < 4) return -1;
12521268
*q = p + 4;
12531269
*val = le_to_i32(p);
1270+
#ifdef VCF_ALLOW_INT64
1271+
} else if (t == BCF_BT_INT64) {
1272+
// This case should never happen because there should be no 64-bit BCFs
1273+
// at all, definitely not coming from htslib
1274+
if (end - p < 8) return -1;
1275+
*q = p + 8;
1276+
*val = le_to_i64(p);
1277+
#endif
12541278
} else {
12551279
return -1;
12561280
}
@@ -1290,6 +1314,9 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
12901314
uint32_t i, reports;
12911315
const uint32_t is_integer = ((1 << BCF_BT_INT8) |
12921316
(1 << BCF_BT_INT16) |
1317+
#ifdef VCF_ALLOW_INT64
1318+
(1 << BCF_BT_INT64) |
1319+
#endif
12931320
(1 << BCF_BT_INT32));
12941321
const uint32_t is_valid_type = (is_integer |
12951322
(1 << BCF_BT_NULL) |
@@ -1728,6 +1755,12 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
17281755
}
17291756
bcf1_sync(v); // check if the BCF record was modified
17301757

1758+
if ( v->unpacked & BCF_IS_64BIT )
1759+
{
1760+
hts_log_error("Data contains 64-bit values not representable in BCF. Please use VCF instead");
1761+
return -1;
1762+
}
1763+
17311764
BGZF *fp = hfp->fp.bgzf;
17321765
union {
17331766
uint32_t i;
@@ -2040,6 +2073,7 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
20402073
return 0; // FIXME: check for errs in this function
20412074
}
20422075

2076+
#ifdef VCF_ALLOW_INT64
20432077
static int bcf_enc_long1(kstring_t *s, int64_t x) {
20442078
uint32_t e = 0;
20452079
if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
@@ -2057,6 +2091,7 @@ static int bcf_enc_long1(kstring_t *s, int64_t x) {
20572091
}
20582092
return e == 0 ? 0 : -1;
20592093
}
2094+
#endif
20602095

20612096
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
20622097
uint8_t *p;
@@ -2169,6 +2204,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
21692204
{
21702205
if ( !bcf_hdr_nsamples(h) ) return 0;
21712206

2207+
static int extreme_int_warned = 0;
21722208
char *r, *t;
21732209
int j, l, m, g;
21742210
khint_t k;
@@ -2362,7 +2398,23 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
23622398
int32_t *x = (int32_t*)(z->buf + z->size * m);
23632399
for (l = 0;; ++t) {
23642400
if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
2365-
else x[l++] = strtol(t, &t, 10);
2401+
else
2402+
{
2403+
errno = 0;
2404+
char *te;
2405+
long int tmp_val = strtol(t, &te, 10);
2406+
if ( te==t || errno!=0 || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
2407+
{
2408+
if ( !extreme_int_warned )
2409+
{
2410+
hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,h->id[BCF_DT_ID][fmt[j-1].key].key,bcf_seqname(h,v), v->pos+1);
2411+
extreme_int_warned = 1;
2412+
}
2413+
tmp_val = bcf_int32_missing;
2414+
}
2415+
x[l++] = tmp_val;
2416+
t = te;
2417+
}
23662418
if (*t != ',') break;
23672419
}
23682420
if ( !l ) x[l++] = bcf_int32_missing;
@@ -2469,6 +2521,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
24692521

24702522
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
24712523
{
2524+
static int extreme_int_warned = 0, negative_rlen_warned = 0;
24722525
int i = 0;
24732526
char *p, *q, *r, *t;
24742527
kstring_t *str;
@@ -2526,6 +2579,8 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
25262579
} else {
25272580
v->pos -= 1;
25282581
}
2582+
if (v->pos >= INT32_MAX)
2583+
v->unpacked |= BCF_IS_64BIT;
25292584
} else if (i == 2) { // ID
25302585
if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p);
25312586
else bcf_enc_size(str, 0, BCF_BT_CHAR);
@@ -2672,31 +2727,77 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
26722727
val_a = z;
26732728
}
26742729
if ((y>>4&0xf) == BCF_HT_INT) {
2675-
// Allow first value only to be 64 bit
2676-
// (for large END value)
2677-
int64_t v64 = strtoll(val, &te, 10);
2678-
if ( te==val ) { // conversion failed
2679-
val_a[0] = bcf_int32_missing;
2680-
v64 = bcf_int64_missing;
2681-
} else {
2682-
val_a[0] = v64 >= BCF_MIN_BT_INT32 && v64 <= BCF_MAX_BT_INT32 ? v64 : bcf_int32_missing;
2730+
i = 0, t = val;
2731+
int64_t val1;
2732+
#ifdef VCF_ALLOW_INT64
2733+
int is_int64 = 0;
2734+
if ( n_val==1 )
2735+
{
2736+
errno = 0;
2737+
long long int tmp_val = strtoll(val, &te, 10);
2738+
if ( te==val ) tmp_val = bcf_int32_missing;
2739+
else if ( te==val || errno!=0 || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
2740+
{
2741+
if ( !extreme_int_warned )
2742+
{
2743+
hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname(h,v), v->pos+1);
2744+
extreme_int_warned = 1;
2745+
}
2746+
tmp_val = bcf_int32_missing;
2747+
}
2748+
else
2749+
is_int64 = 1;
2750+
val1 = tmp_val;
2751+
t = te;
2752+
i = 1; // this is just to avoid adding another nested block...
26832753
}
2684-
for (t = te; *t && *t != ','; t++);
2685-
if (*t == ',') ++t;
2686-
for (i = 1; i < n_val; ++i, ++t)
2754+
#endif
2755+
for (; i < n_val; ++i, ++t)
26872756
{
2688-
val_a[i] = strtol(t, &te, 10);
2689-
if ( te==t ) // conversion failed
2690-
val_a[i] = bcf_int32_missing;
2757+
errno = 0;
2758+
long int tmp_val = strtol(t, &te, 10);
2759+
if ( te==t ) tmp_val = bcf_int32_missing;
2760+
else if ( errno!=0 || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
2761+
{
2762+
if ( !extreme_int_warned )
2763+
{
2764+
hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname(h,v), v->pos+1);
2765+
extreme_int_warned = 1;
2766+
}
2767+
tmp_val = bcf_int32_missing;
2768+
}
2769+
val_a[i] = tmp_val;
26912770
for (t = te; *t && *t != ','; t++);
26922771
}
26932772
if (n_val == 1) {
2694-
bcf_enc_long1(str, v64);
2773+
#ifdef VCF_ALLOW_INT64
2774+
if ( is_int64 )
2775+
{
2776+
v->unpacked |= BCF_IS_64BIT;
2777+
bcf_enc_long1(str, val1);
2778+
}
2779+
else
2780+
bcf_enc_int1(str, (int32_t)val1);
2781+
#else
2782+
val1 = val_a[0];
2783+
bcf_enc_int1(str, (int32_t)val1);
2784+
#endif
26952785
} else {
26962786
bcf_enc_vint(str, n_val, val_a, -1);
26972787
}
2698-
if (strcmp(key, "END") == 0)
2699-
v->rlen = v64 - v->pos;
2788+
if (n_val==1 && strcmp(key, "END") == 0)
2789+
{
2790+
if ( val1 <= v->pos )
2791+
{
2792+
if ( !negative_rlen_warned )
2793+
{
2794+
hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname(h,v),v->pos+1);
2795+
negative_rlen_warned = 1;
2796+
}
2797+
}
2798+
else
2799+
v->rlen = val1 - v->pos;
2800+
}
27002801
} else if ((y>>4&0xf) == BCF_HT_REAL) {
27012802
float *val_f = (float *)val_a;
27022803
for (i = 0, t = val; i < n_val; ++i, ++t)
@@ -3835,6 +3936,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v
38353936
else
38363937
bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
38373938
}
3939+
#ifdef VCF_ALLOW_INT64
38383940
else if ( type==BCF_HT_LONG )
38393941
{
38403942
if (n != 1) {
@@ -3843,6 +3945,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v
38433945
}
38443946
bcf_enc_long1(&str, *(int64_t *) values);
38453947
}
3948+
#endif
38463949
else
38473950
{
38483951
hts_log_error("The type %d not implemented yet", type);

version.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
# DEALINGS IN THE SOFTWARE.
2525

2626
# Master version, for use in tarballs or non-git source copies
27-
VERSION=1.10
27+
VERSION=1.10.2
2828

2929
# If we have a git clone, then check against the current tag
3030
if [ -e .git ]

0 commit comments

Comments
 (0)