Skip to content

Commit 209f94b

Browse files
committed
Release 1.7: long CIGAR support, multi-region iterator
2 parents 746549d + 1b2c3b4 commit 209f94b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

52 files changed

+2536
-475
lines changed

.appveyor.yml

-4
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,6 @@ version: 'vers.{build}'
55

66
# branches to build
77
branches:
8-
# Whitelist
9-
only:
10-
- develop
11-
128
# Blacklist
139
except:
1410
- gh-pages

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,12 @@ lib*.so.*
4040
/test/sam
4141
/test/tabix/*.tmp.*
4242
/test/tabix/FAIL*
43+
/test/test-bcf-sr
44+
/test/test-bcf-translate
4345
/test/test_bgzf
4446
/test/test-regidx
4547
/test/test-vcf-api
4648
/test/test-vcf-sweep
47-
/test/test-bcf-sr
4849
/test/test_view
4950
/test/thrash_threads[1-6]
5051
/test/*.tmp

.travis.yml

+1-2
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,7 @@ addons:
2222

2323
# For MacOSX systems
2424
before_install:
25-
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update ; fi
26-
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install xz; fi
25+
- if [[ "$TRAVIS_OS_NAME" == "osx" && "$USE_CONFIG" == "no" ]]; then HOMEBREW_NO_AUTO_UPDATE=1 brew install xz || ( brew update && brew install xz ); fi
2726

2827
script:
2928
- if test "$USE_CONFIG" = "yes" ; then autoreconf && ./configure ; fi && make -e && make test

Makefile

+7-4
Original file line numberDiff line numberDiff line change
@@ -175,8 +175,9 @@ cram_sam_header_h = cram/sam_header.h cram/string_alloc.h cram/pooled_alloc.h $(
175175
cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h) $(cram_sam_header_h)
176176
cram_structs_h = cram/cram_structs.h $(htslib_thread_pool_h) cram/string_alloc.h $(htslib_khash_h)
177177
cram_open_trace_file_h = cram/open_trace_file.h cram/mFILE.h
178-
hfile_internal_h = hfile_internal.h $(htslib_hfile_h)
179-
hts_internal_h = hts_internal.h $(htslib_hts_h)
178+
hfile_internal_h = hfile_internal.h $(htslib_hfile_h) $(textutils_internal_h)
179+
hts_internal_h = hts_internal.h $(htslib_hts_h) $(textutils_internal_h)
180+
textutils_internal_h = textutils_internal.h $(htslib_kstring_h)
180181
thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h)
181182

182183

@@ -195,6 +196,7 @@ config.h:
195196
echo '/* Default config.h generated by Makefile */' > $@
196197
echo '#define HAVE_LIBBZ2 1' >> $@
197198
echo '#define HAVE_LIBLZMA 1' >> $@
199+
echo '#define HAVE_LZMA_H 1' >> $@
198200
echo '#define HAVE_FSEEKO 1' >> $@
199201
echo '#define HAVE_DRAND48 1' >> $@
200202

@@ -293,8 +295,9 @@ hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(hts
293295
hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h)
294296
hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h)
295297
hfile_net.o hfile_net.pico: hfile_net.c config.h $(hfile_internal_h) $(htslib_knetfile_h)
296-
hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hts_internal_h) $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h)
298+
hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h)
297299
hts.o hts.pico: hts.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(hfile_internal_h) $(htslib_hfile_h) version.h $(hts_internal_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h)
300+
hts_os.o hts_os.pico: hts_os.c config.h os/rand.c
298301
vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h)
299302
sam.o sam.pico: sam.c config.h $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) $(htslib_hts_endian_h)
300303
tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(hts_internal_h) $(htslib_khash_h)
@@ -317,7 +320,7 @@ cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h)
317320
cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(htslib_hts_endian_h)
318321
cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h)
319322
cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h)
320-
cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h)
323+
cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h)
321324
cram/cram_samtools.o cram/cram_samtools.pico: cram/cram_samtools.c config.h $(cram_h) $(htslib_sam_h)
322325
cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h)
323326
cram/files.o cram/files.pico: cram/files.c config.h $(cram_misc_h)

NEWS

+42
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,45 @@
1+
Noteworthy changes in release 1.7 (26th January 2018)
2+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3+
4+
* BAM: HTSlib now supports BAMs which include CIGARs with more than
5+
65535 operations as per HTS-Specs 18th November (dab57f4 and 2f915a8).
6+
7+
* BCF/VCF:
8+
- Removed the need for long double in pileup calculations.
9+
- Sped up the synced reader in some situations.
10+
- Bug fixing: removed memory leak in bcf_copy.
11+
12+
* CRAM:
13+
- Added support for HTS_IDX_START in cram iterators.
14+
- Easier to build when lzma header files are absent.
15+
- Bug fixing: a region query with REQUIRED_FIELDS option to
16+
disable sequence retrieval now gives correct results.
17+
- Bug fixing: stop queries to regions starting after the last
18+
read on a chromosome from incorrectly reporting errors
19+
(#651, #653; reported by Imran Haque and @egafni via pysam).
20+
21+
* Multi-region iterator: The new structure takes a list of regions and
22+
iterates over all, deduplicating reads in the process, and producing a
23+
full list of file offset intervals. This is usually much faster than
24+
repeatedly using the old single-region iterator on a series of regions.
25+
26+
* Curl improvements:
27+
- Add Bearer token support via HTS_AUTH_LOCATION env (#600).
28+
- Use CURL_CA_BUNDLE environment variable to override the CA (#622;
29+
thanks to Garret Kelly & David Alexander).
30+
- Speed up (removal of excessive waiting) for both http(s) and ftp.
31+
- Avoid repeatedly reconnecting by removal of unnecessary seeks.
32+
- Bug fixing: double free when libcurl_open fails.
33+
34+
* BGZF block caching, if enabled, now performs far better (#629; reported
35+
by Ram Yalamanchili).
36+
37+
* Added an hFILE layer for in-memory I/O buffers (#590; thanks to Thomas
38+
Hickman).
39+
40+
* Tidied up the drand48 support (intended for systems that do not
41+
provide this function).
42+
143
Noteworthy changes in release 1.6 (28th September 2017)
244
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
345

bcf_sr_sort.c

+34-21
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,20 @@ char *grp_create_key(sr_sort_t *srt)
327327
}
328328
return ret;
329329
}
330+
int bcf_sr_sort_set_active(sr_sort_t *srt, int idx)
331+
{
332+
hts_expand(int,idx+1,srt->mactive,srt->active);
333+
srt->nactive = 1;
334+
srt->active[srt->nactive - 1] = idx;
335+
return 0;
336+
}
337+
int bcf_sr_sort_add_active(sr_sort_t *srt, int idx)
338+
{
339+
hts_expand(int,idx+1,srt->mactive,srt->active);
340+
srt->nactive++;
341+
srt->active[srt->nactive - 1] = idx;
342+
return 0;
343+
}
330344
static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
331345
{
332346
if ( !srt->grp_str2int )
@@ -357,11 +371,11 @@ static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr,
357371
memset(&grp,0,sizeof(grp_t));
358372

359373
// group VCFs into groups, each with a unique combination of variants in the duplicate lines
360-
int ireader,ivar,irec,igrp,ivset;
361-
for (ireader=0; ireader<readers->nreaders; ireader++)
374+
int ireader,ivar,irec,igrp,ivset,iact;
375+
for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
376+
for (iact=0; iact<srt->nactive; iact++)
362377
{
363-
srt->vcf_buf[ireader].nrec = 0;
364-
378+
ireader = srt->active[iact];
365379
bcf_sr_t *reader = &readers->readers[ireader];
366380
int rid = bcf_hdr_name2id(reader->header, chr);
367381
grp.nvar = 0;
@@ -546,23 +560,8 @@ static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr,
546560
int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
547561
{
548562
int i,j;
549-
if ( readers->nreaders == 1 )
550-
{
551-
srt->nsr = 1;
552-
if ( !srt->msr )
553-
{
554-
srt->vcf_buf = (vcf_buf_t*) calloc(1,sizeof(vcf_buf_t)); // first time here
555-
srt->msr = 1;
556-
}
557-
bcf_sr_t *reader = &readers->readers[0];
558-
assert( reader->buffer[1]->pos==min_pos );
559-
bcf1_t *tmp = reader->buffer[0];
560-
for (j=1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
561-
reader->buffer[ reader->nbuffer ] = tmp;
562-
reader->nbuffer--;
563-
readers->has_line[0] = 1;
564-
return 1;
565-
}
563+
assert( srt->nactive>0 );
564+
566565
if ( srt->nsr != readers->nreaders )
567566
{
568567
srt->sr = readers;
@@ -575,6 +574,19 @@ int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int mi
575574
srt->nsr = readers->nreaders;
576575
srt->chr = NULL;
577576
}
577+
if ( srt->nactive == 1 )
578+
{
579+
if ( readers->nreaders>1 )
580+
memset(readers->has_line, 0, readers->nreaders*sizeof(*readers->has_line));
581+
bcf_sr_t *reader = &readers->readers[srt->active[0]];
582+
assert( reader->buffer[1]->pos==min_pos );
583+
bcf1_t *tmp = reader->buffer[0];
584+
for (j=1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j];
585+
reader->buffer[ reader->nbuffer ] = tmp;
586+
reader->nbuffer--;
587+
readers->has_line[srt->active[0]] = 1;
588+
return 1;
589+
}
578590
if ( !srt->chr || srt->pos!=min_pos || strcmp(srt->chr,chr) ) bcf_sr_sort_set(readers, srt, chr, min_pos);
579591

580592
if ( !srt->vcf_buf[0].nrec ) return 0;
@@ -629,6 +641,7 @@ sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt)
629641
}
630642
void bcf_sr_sort_destroy(sr_sort_t *srt)
631643
{
644+
free(srt->active);
632645
if ( srt->var_str2int ) khash_str2int_destroy_free(srt->var_str2int);
633646
if ( srt->grp_str2int ) khash_str2int_destroy_free(srt->grp_str2int);
634647
int i;

bcf_sr_sort.h

+3
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,14 @@ typedef struct
9292
const char *chr;
9393
int pos, nsr, msr;
9494
int pair;
95+
int nactive, mactive, *active; // list of readers with lines at the current pos
9596
}
9697
sr_sort_t;
9798

9899
sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt);
99100
int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int pos);
101+
int bcf_sr_sort_set_active(sr_sort_t *srt, int i);
102+
int bcf_sr_sort_add_active(sr_sort_t *srt, int i);
100103
void bcf_sr_sort_destroy(sr_sort_t *srt);
101104
void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i);
102105

bgzf.c

+52-18
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,16 @@ typedef struct {
7171
uint8_t *block;
7272
int64_t end_offset;
7373
} cache_t;
74+
7475
#include "htslib/khash.h"
7576
KHASH_MAP_INIT_INT64(cache, cache_t)
7677
#endif
7778

79+
struct bgzf_cache_t {
80+
khash_t(cache) *h;
81+
khint_t last_pos;
82+
};
83+
7884
#ifdef BGZF_MT
7985

8086
typedef struct bgzf_job {
@@ -215,7 +221,16 @@ static BGZF *bgzf_read_init(hFILE *hfpr)
215221
fp->is_compressed = (n==18 && magic[0]==0x1f && magic[1]==0x8b);
216222
fp->is_gzip = ( !fp->is_compressed || ((magic[3]&4) && memcmp(&magic[12], "BC\2\0",4)==0) ) ? 0 : 1;
217223
#ifdef BGZF_CACHE
218-
fp->cache = kh_init(cache);
224+
if (!(fp->cache = malloc(sizeof(*fp->cache)))) {
225+
free(fp);
226+
return NULL;
227+
}
228+
if (!(fp->cache->h = kh_init(cache))) {
229+
free(fp->cache);
230+
free(fp);
231+
return NULL;
232+
}
233+
fp->cache->last_pos = 0;
219234
#endif
220235
return fp;
221236
}
@@ -524,27 +539,27 @@ static int check_header(const uint8_t *header)
524539
static void free_cache(BGZF *fp)
525540
{
526541
khint_t k;
527-
khash_t(cache) *h = (khash_t(cache)*)fp->cache;
528542
if (fp->is_write) return;
543+
khash_t(cache) *h = fp->cache->h;
529544
for (k = kh_begin(h); k < kh_end(h); ++k)
530545
if (kh_exist(h, k)) free(kh_val(h, k).block);
531546
kh_destroy(cache, h);
547+
free(fp->cache);
532548
}
533549

534550
static int load_block_from_cache(BGZF *fp, int64_t block_address)
535551
{
536552
khint_t k;
537553
cache_t *p;
538554

539-
khash_t(cache) *h = (khash_t(cache)*)fp->cache;
555+
khash_t(cache) *h = fp->cache->h;
540556
k = kh_get(cache, h, block_address);
541557
if (k == kh_end(h)) return 0;
542558
p = &kh_val(h, k);
543559
if (fp->block_length != 0) fp->block_offset = 0;
544560
fp->block_address = block_address;
545561
fp->block_length = p->size;
546-
// FIXME: why BGZF_MAX_BLOCK_SIZE and not p->size?
547-
memcpy(fp->uncompressed_block, p->block, BGZF_MAX_BLOCK_SIZE);
562+
memcpy(fp->uncompressed_block, p->block, p->size);
548563
if ( hseek(fp->fp, p->end_offset, SEEK_SET) < 0 )
549564
{
550565
// todo: move the error up
@@ -557,29 +572,48 @@ static int load_block_from_cache(BGZF *fp, int64_t block_address)
557572
static void cache_block(BGZF *fp, int size)
558573
{
559574
int ret;
560-
khint_t k;
575+
khint_t k, k_orig;
576+
uint8_t *block = NULL;
561577
cache_t *p;
562578
//fprintf(stderr, "Cache block at %llx\n", (int)fp->block_address);
563-
khash_t(cache) *h = (khash_t(cache)*)fp->cache;
579+
khash_t(cache) *h = fp->cache->h;
564580
if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return;
581+
if (fp->block_length < 0 || fp->block_length > BGZF_MAX_BLOCK_SIZE) return;
565582
if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > (uint32_t)fp->cache_size) {
566-
/* A better way would be to remove the oldest block in the
567-
* cache, but here we remove a random one for simplicity. This
568-
* should not have a big impact on performance. */
569-
for (k = kh_begin(h); k < kh_end(h); ++k)
570-
if (kh_exist(h, k)) break;
571-
if (k < kh_end(h)) {
572-
free(kh_val(h, k).block);
583+
/* Remove uniformly from any position in the hash by a simple
584+
* round-robin approach. An alternative strategy would be to
585+
* remove the least recently accessed block, but the round-robin
586+
* removal is simpler and is not expected to have a big impact
587+
* on performance */
588+
if (fp->cache->last_pos >= kh_end(h)) fp->cache->last_pos = kh_begin(h);
589+
k_orig = k = fp->cache->last_pos;
590+
if (++k >= kh_end(h)) k = kh_begin(h);
591+
while (k != k_orig) {
592+
if (kh_exist(h, k))
593+
break;
594+
if (++k == kh_end(h))
595+
k = kh_begin(h);
596+
}
597+
fp->cache->last_pos = k;
598+
599+
if (k != k_orig) {
600+
block = kh_val(h, k).block;
573601
kh_del(cache, h, k);
574602
}
603+
} else {
604+
block = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE);
575605
}
606+
if (!block) return;
576607
k = kh_put(cache, h, fp->block_address, &ret);
577-
if (ret == 0) return; // if this happens, a bug!
608+
if (ret <= 0) { // kh_put failed, or in there already (shouldn't happen)
609+
free(block);
610+
return;
611+
}
578612
p = &kh_val(h, k);
579613
p->size = fp->block_length;
580614
p->end_offset = fp->block_address + size;
581-
p->block = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE);
582-
memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
615+
p->block = block;
616+
memcpy(p->block, fp->uncompressed_block, p->size);
583617
}
584618
#else
585619
static void free_cache(BGZF *fp) {}
@@ -1489,7 +1523,7 @@ int bgzf_close(BGZF* fp)
14891523

14901524
void bgzf_set_cache_size(BGZF *fp, int cache_size)
14911525
{
1492-
if (fp) fp->cache_size = cache_size;
1526+
if (fp && fp->cache) fp->cache_size = cache_size;
14931527
}
14941528

14951529
int bgzf_check_EOF(BGZF *fp) {

bgzip.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* bgzip.c -- Block compression/decompression utility.
22
33
Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology
4-
Copyright (C) 2010, 2013-2017 Genome Research Ltd.
4+
Copyright (C) 2010, 2013-2018 Genome Research Ltd.
55
66
Permission is hereby granted, free of charge, to any person obtaining a copy
77
of this software and associated documentation files (the "Software"), to deal
@@ -130,7 +130,7 @@ int main(int argc, char **argv)
130130
case 1:
131131
printf(
132132
"bgzip (htslib) %s\n"
133-
"Copyright (C) 2017 Genome Research Ltd.\n", hts_version());
133+
"Copyright (C) 2018 Genome Research Ltd.\n", hts_version());
134134
return EXIT_SUCCESS;
135135
case 'h':
136136
case '?': return bgzip_main_usage();

0 commit comments

Comments
 (0)