@@ -59,10 +59,26 @@ HTSLIB_EXPORT
59
59
uint32_t bcf_float_vector_end = 0x7F800002 ;
60
60
61
61
HTSLIB_EXPORT
62
- uint8_t bcf_type_shift [] = { 0 , 0 , 1 , 2 , 0 , 2 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
62
+ uint8_t bcf_type_shift [] = { 0 , 0 , 1 , 2 , 3 , 2 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
63
63
64
64
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15 , 15 , 15 }, .hrec = { NULL , NULL , NULL }, .id = -1 };
65
65
66
+ /*
67
+ Partial support for 64-bit POS and Number=1 INFO tags.
68
+ Notes:
69
+ - the support for 64-bit values is motivated by POS and INFO/END for large genomes
70
+ - the use of 64-bit values does not conform to the specification
71
+ - cannot output 64-bit BCF and if it does, it is not compatible with anything
72
+ - experimental, use at your risk
73
+ */
74
+ #ifdef VCF_ALLOW_INT64
75
+ #define BCF_MAX_BT_INT64 (0x7fffffffffffffff) /* INT64_MAX, for internal use only */
76
+ #define BCF_MIN_BT_INT64 -9223372036854775800LL /* INT64_MIN + 8, for internal use only */
77
+ #endif
78
+
79
+ #define BCF_IS_64BIT (1<<30)
80
+
81
+
66
82
static const char * dump_char (char * buffer , char c )
67
83
{
68
84
switch (c ) {
@@ -1251,6 +1267,14 @@ static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1251
1267
if (end - p < 4 ) return -1 ;
1252
1268
* q = p + 4 ;
1253
1269
* val = le_to_i32 (p );
1270
+ #ifdef VCF_ALLOW_INT64
1271
+ } else if (t == BCF_BT_INT64 ) {
1272
+ // This case should never happen because there should be no 64-bit BCFs
1273
+ // at all, definitely not coming from htslib
1274
+ if (end - p < 8 ) return -1 ;
1275
+ * q = p + 8 ;
1276
+ * val = le_to_i64 (p );
1277
+ #endif
1254
1278
} else {
1255
1279
return -1 ;
1256
1280
}
@@ -1290,6 +1314,9 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
1290
1314
uint32_t i , reports ;
1291
1315
const uint32_t is_integer = ((1 << BCF_BT_INT8 ) |
1292
1316
(1 << BCF_BT_INT16 ) |
1317
+ #ifdef VCF_ALLOW_INT64
1318
+ (1 << BCF_BT_INT64 ) |
1319
+ #endif
1293
1320
(1 << BCF_BT_INT32 ));
1294
1321
const uint32_t is_valid_type = (is_integer |
1295
1322
(1 << BCF_BT_NULL ) |
@@ -1728,6 +1755,12 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
1728
1755
}
1729
1756
bcf1_sync (v ); // check if the BCF record was modified
1730
1757
1758
+ if ( v -> unpacked & BCF_IS_64BIT )
1759
+ {
1760
+ hts_log_error ("Data contains 64-bit values not representable in BCF. Please use VCF instead" );
1761
+ return -1 ;
1762
+ }
1763
+
1731
1764
BGZF * fp = hfp -> fp .bgzf ;
1732
1765
union {
1733
1766
uint32_t i ;
@@ -2040,6 +2073,7 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
2040
2073
return 0 ; // FIXME: check for errs in this function
2041
2074
}
2042
2075
2076
+ #ifdef VCF_ALLOW_INT64
2043
2077
static int bcf_enc_long1 (kstring_t * s , int64_t x ) {
2044
2078
uint32_t e = 0 ;
2045
2079
if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32 )
@@ -2057,6 +2091,7 @@ static int bcf_enc_long1(kstring_t *s, int64_t x) {
2057
2091
}
2058
2092
return e == 0 ? 0 : -1 ;
2059
2093
}
2094
+ #endif
2060
2095
2061
2096
static inline int serialize_float_array (kstring_t * s , size_t n , const float * a ) {
2062
2097
uint8_t * p ;
@@ -2169,6 +2204,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
2169
2204
{
2170
2205
if ( !bcf_hdr_nsamples (h ) ) return 0 ;
2171
2206
2207
+ static int extreme_int_warned = 0 ;
2172
2208
char * r , * t ;
2173
2209
int j , l , m , g ;
2174
2210
khint_t k ;
@@ -2362,7 +2398,23 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
2362
2398
int32_t * x = (int32_t * )(z -> buf + z -> size * m );
2363
2399
for (l = 0 ;; ++ t ) {
2364
2400
if (* t == '.' ) x [l ++ ] = bcf_int32_missing , ++ t ; // ++t to skip "."
2365
- else x [l ++ ] = strtol (t , & t , 10 );
2401
+ else
2402
+ {
2403
+ errno = 0 ;
2404
+ char * te ;
2405
+ long int tmp_val = strtol (t , & te , 10 );
2406
+ if ( te == t || errno != 0 || tmp_val < BCF_MIN_BT_INT32 || tmp_val > BCF_MAX_BT_INT32 )
2407
+ {
2408
+ if ( !extreme_int_warned )
2409
+ {
2410
+ hts_log_warning ("Extreme FORMAT/%s value encountered and set to missing at %s:%" PRIhts_pos ,h -> id [BCF_DT_ID ][fmt [j - 1 ].key ].key ,bcf_seqname (h ,v ), v -> pos + 1 );
2411
+ extreme_int_warned = 1 ;
2412
+ }
2413
+ tmp_val = bcf_int32_missing ;
2414
+ }
2415
+ x [l ++ ] = tmp_val ;
2416
+ t = te ;
2417
+ }
2366
2418
if (* t != ',' ) break ;
2367
2419
}
2368
2420
if ( !l ) x [l ++ ] = bcf_int32_missing ;
@@ -2469,6 +2521,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
2469
2521
2470
2522
int vcf_parse (kstring_t * s , const bcf_hdr_t * h , bcf1_t * v )
2471
2523
{
2524
+ static int extreme_int_warned = 0 , negative_rlen_warned = 0 ;
2472
2525
int i = 0 ;
2473
2526
char * p , * q , * r , * t ;
2474
2527
kstring_t * str ;
@@ -2526,6 +2579,8 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
2526
2579
} else {
2527
2580
v -> pos -= 1 ;
2528
2581
}
2582
+ if (v -> pos >= INT32_MAX )
2583
+ v -> unpacked |= BCF_IS_64BIT ;
2529
2584
} else if (i == 2 ) { // ID
2530
2585
if (strcmp (p , "." )) bcf_enc_vchar (str , q - p , p );
2531
2586
else bcf_enc_size (str , 0 , BCF_BT_CHAR );
@@ -2672,31 +2727,77 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
2672
2727
val_a = z ;
2673
2728
}
2674
2729
if ((y >>4 & 0xf ) == BCF_HT_INT ) {
2675
- // Allow first value only to be 64 bit
2676
- // (for large END value)
2677
- int64_t v64 = strtoll (val , & te , 10 );
2678
- if ( te == val ) { // conversion failed
2679
- val_a [0 ] = bcf_int32_missing ;
2680
- v64 = bcf_int64_missing ;
2681
- } else {
2682
- val_a [0 ] = v64 >= BCF_MIN_BT_INT32 && v64 <= BCF_MAX_BT_INT32 ? v64 : bcf_int32_missing ;
2730
+ i = 0 , t = val ;
2731
+ int64_t val1 ;
2732
+ #ifdef VCF_ALLOW_INT64
2733
+ int is_int64 = 0 ;
2734
+ if ( n_val == 1 )
2735
+ {
2736
+ errno = 0 ;
2737
+ long long int tmp_val = strtoll (val , & te , 10 );
2738
+ if ( te == val ) tmp_val = bcf_int32_missing ;
2739
+ else if ( te == val || errno != 0 || tmp_val < BCF_MIN_BT_INT64 || tmp_val > BCF_MAX_BT_INT64 )
2740
+ {
2741
+ if ( !extreme_int_warned )
2742
+ {
2743
+ hts_log_warning ("Extreme INFO/%s value encountered and set to missing at %s:%" PRIhts_pos ,key ,bcf_seqname (h ,v ), v -> pos + 1 );
2744
+ extreme_int_warned = 1 ;
2745
+ }
2746
+ tmp_val = bcf_int32_missing ;
2747
+ }
2748
+ else
2749
+ is_int64 = 1 ;
2750
+ val1 = tmp_val ;
2751
+ t = te ;
2752
+ i = 1 ; // this is just to avoid adding another nested block...
2683
2753
}
2684
- for (t = te ; * t && * t != ',' ; t ++ );
2685
- if (* t == ',' ) ++ t ;
2686
- for (i = 1 ; i < n_val ; ++ i , ++ t )
2754
+ #endif
2755
+ for (; i < n_val ; ++ i , ++ t )
2687
2756
{
2688
- val_a [i ] = strtol (t , & te , 10 );
2689
- if ( te == t ) // conversion failed
2690
- val_a [i ] = bcf_int32_missing ;
2757
+ errno = 0 ;
2758
+ long int tmp_val = strtol (t , & te , 10 );
2759
+ if ( te == t ) tmp_val = bcf_int32_missing ;
2760
+ else if ( errno != 0 || tmp_val < BCF_MIN_BT_INT32 || tmp_val > BCF_MAX_BT_INT32 )
2761
+ {
2762
+ if ( !extreme_int_warned )
2763
+ {
2764
+ hts_log_warning ("Extreme INFO/%s value encountered and set to missing at %s:%" PRIhts_pos ,key ,bcf_seqname (h ,v ), v -> pos + 1 );
2765
+ extreme_int_warned = 1 ;
2766
+ }
2767
+ tmp_val = bcf_int32_missing ;
2768
+ }
2769
+ val_a [i ] = tmp_val ;
2691
2770
for (t = te ; * t && * t != ',' ; t ++ );
2692
2771
}
2693
2772
if (n_val == 1 ) {
2694
- bcf_enc_long1 (str , v64 );
2773
+ #ifdef VCF_ALLOW_INT64
2774
+ if ( is_int64 )
2775
+ {
2776
+ v -> unpacked |= BCF_IS_64BIT ;
2777
+ bcf_enc_long1 (str , val1 );
2778
+ }
2779
+ else
2780
+ bcf_enc_int1 (str , (int32_t )val1 );
2781
+ #else
2782
+ val1 = val_a [0 ];
2783
+ bcf_enc_int1 (str , (int32_t )val1 );
2784
+ #endif
2695
2785
} else {
2696
2786
bcf_enc_vint (str , n_val , val_a , -1 );
2697
2787
}
2698
- if (strcmp (key , "END" ) == 0 )
2699
- v -> rlen = v64 - v -> pos ;
2788
+ if (n_val == 1 && strcmp (key , "END" ) == 0 )
2789
+ {
2790
+ if ( val1 <= v -> pos )
2791
+ {
2792
+ if ( !negative_rlen_warned )
2793
+ {
2794
+ hts_log_warning ("INFO/END=%" PRIhts_pos " is smaller than POS at %s:%" PRIhts_pos ,val1 ,bcf_seqname (h ,v ),v -> pos + 1 );
2795
+ negative_rlen_warned = 1 ;
2796
+ }
2797
+ }
2798
+ else
2799
+ v -> rlen = val1 - v -> pos ;
2800
+ }
2700
2801
} else if ((y >>4 & 0xf ) == BCF_HT_REAL ) {
2701
2802
float * val_f = (float * )val_a ;
2702
2803
for (i = 0 , t = val ; i < n_val ; ++ i , ++ t )
@@ -3835,6 +3936,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v
3835
3936
else
3836
3937
bcf_enc_vchar (& str , strlen ((char * )values ), (char * )values );
3837
3938
}
3939
+ #ifdef VCF_ALLOW_INT64
3838
3940
else if ( type == BCF_HT_LONG )
3839
3941
{
3840
3942
if (n != 1 ) {
@@ -3843,6 +3945,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v
3843
3945
}
3844
3946
bcf_enc_long1 (& str , * (int64_t * ) values );
3845
3947
}
3948
+ #endif
3846
3949
else
3847
3950
{
3848
3951
hts_log_error ("The type %d not implemented yet" , type );
0 commit comments