Skip to content

Commit e76d1ae

Browse files
committed
Remove UR:http:// and UR:ftp:// from ref search path, plus the REF_PATH default to EBI.
While use of the EBI refget server was originally encouraged by the CRAM inventors, it became a self-imposed DDoS and it is now unreliable. This removes EBI as a fallback when REF_PATH has not been set. In doing this we discovered that we could still retrieve references (ironically also from EBI, due to the test being a 1000genomes CRAM) via the SQ UR: tag supporting remote URIs. This behaviour is explicitly listed as not being supported in the samtools manpage, and we believe it was an accidental ability added when switching from fopen to bgzf_open for reading the UR reference file. Note this check must be in cram_populate_ref and not load_ref_portion or bgzf_open_ref, as the user still has the ability to explicitly request an external reference, e.g. via "samtools view -T URI". open_path_mfile() now takes an extra 'int *local' argument which is filled out with non-zero if the file found in REF_PATH is local. Non-local files will be cached to REF_CACHE if set, but REF_CACHE no longer has a default value as it did when the EBI refget server was the default REF_PATH. This means it should operate much as before, except for the lack of EBI defaults.
1 parent 7b65da3 commit e76d1ae

File tree

3 files changed

+39
-61
lines changed

3 files changed

+39
-61
lines changed

cram/cram_io.c

+21-56
Original file line numberDiff line numberDiff line change
@@ -2473,6 +2473,9 @@ static refs_t *refs_create(void) {
24732473
static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
24742474
BGZF *fp;
24752475

2476+
if (strncmp(fn, "file://", 7) == 0)
2477+
fn += 7;
2478+
24762479
if (!is_md5 && !hisremote(fn)) {
24772480
char fai_file[PATH_MAX];
24782481

@@ -2934,30 +2937,6 @@ static void mkdir_prefix(char *path, int mode) {
29342937
*cp = '/';
29352938
}
29362939

2937-
/*
2938-
* Return the cache directory to use, based on the first of these
2939-
* environment variables to be set to a non-empty value.
2940-
*/
2941-
static const char *get_cache_basedir(const char **extra) {
2942-
char *base;
2943-
2944-
*extra = "";
2945-
2946-
base = getenv("XDG_CACHE_HOME");
2947-
if (base && *base) return base;
2948-
2949-
base = getenv("HOME");
2950-
if (base && *base) { *extra = "/.cache"; return base; }
2951-
2952-
base = getenv("TMPDIR");
2953-
if (base && *base) return base;
2954-
2955-
base = getenv("TEMP");
2956-
if (base && *base) return base;
2957-
2958-
return "/tmp";
2959-
}
2960-
29612940
/*
29622941
* Queries the M5 string from the header and attempts to populate the
29632942
* reference from this using the REF_PATH environment.
@@ -2971,31 +2950,12 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
29712950
sam_hrec_tag_t *tag;
29722951
char path[PATH_MAX];
29732952
kstring_t path_tmp = KS_INITIALIZE;
2974-
char cache[PATH_MAX], cache_root[PATH_MAX];
29752953
char *local_cache = getenv("REF_CACHE");
29762954
mFILE *mf;
29772955
int local_path = 0;
29782956

29792957
hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
29802958

2981-
cache_root[0] = '\0';
2982-
2983-
if (!ref_path || *ref_path == '\0') {
2984-
/*
2985-
* If we have no ref path, we use the EBI server.
2986-
* However to avoid spamming it we require a local ref cache too.
2987-
*/
2988-
ref_path = "https://www.ebi.ac.uk/ena/cram/md5/%s";
2989-
if (!local_cache || *local_cache == '\0') {
2990-
const char *extra;
2991-
const char *base = get_cache_basedir(&extra);
2992-
snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra);
2993-
snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
2994-
local_cache = cache;
2995-
hts_log_info("Populating local cache: %s", local_cache);
2996-
}
2997-
}
2998-
29992959
if (!r->name)
30002960
return -1;
30012961

@@ -3009,7 +2969,10 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
30092969

30102970
/* Use cache if available */
30112971
if (local_cache && *local_cache) {
3012-
if (expand_cache_path(path, local_cache, tag->str+3) == 0)
2972+
struct stat sb;
2973+
if (expand_cache_path(path, local_cache, tag->str+3) == 0 &&
2974+
stat(path, &sb) == 0)
2975+
// Found it in the local cache
30132976
local_path = 1;
30142977
}
30152978

@@ -3053,7 +3016,8 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
30533016

30543017

30553018
/* Otherwise search full REF_PATH; slower as loads entire file */
3056-
if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
3019+
int is_local = 0;
3020+
if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) {
30573021
size_t sz;
30583022
r->seq = mfsteal(mf, &sz);
30593023
if (r->seq) {
@@ -3069,15 +3033,22 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
30693033
} else {
30703034
refs_t *refs;
30713035
const char *fn;
3036+
sam_hrec_tag_t *UR_tag;
30723037

30733038
no_M5:
30743039
/* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
3075-
if (!(tag = sam_hrecs_find_key(ty, "UR", NULL)))
3040+
if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL)))
3041+
return -1;
3042+
3043+
if (strncmp(UR_tag->str+3, "file:", 5) != 0) {
3044+
// Documented as omitted, but accidentally supported until now
3045+
hts_log_error("UR tags pointing to remote files are not supported");
30763046
return -1;
3047+
}
30773048

3078-
fn = (strncmp(tag->str+3, "file:", 5) == 0)
3079-
? tag->str+8
3080-
: tag->str+3;
3049+
fn = (strncmp(UR_tag->str+3, "file:", 5) == 0)
3050+
? UR_tag->str+8
3051+
: UR_tag->str+3;
30813052

30823053
if (fd->refs->fp) {
30833054
if (bgzf_close(fd->refs->fp) != 0)
@@ -3108,15 +3079,9 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
31083079
}
31093080

31103081
/* Populate the local disk cache if required */
3111-
if (local_cache && *local_cache) {
3082+
if (!is_local && local_cache && *local_cache) {
31123083
hFILE *fp;
31133084

3114-
if (*cache_root && !is_directory(cache_root)) {
3115-
hts_log_warning("Creating reference cache directory %s\n"
3116-
"This may become large; see the samtools(1) manual page REF_CACHE discussion",
3117-
cache_root);
3118-
}
3119-
31203085
if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
31213086
return 0; // Not fatal - we have the data already so keep going.
31223087
}

cram/open_trace_file.c

+13-4
Original file line numberDiff line numberDiff line change
@@ -324,14 +324,21 @@ static mFILE *find_file_dir(const char *file, char *dirname) {
324324
* all of the locations listed in 'path' (which is a colon separated list).
325325
* If 'path' is NULL it uses the RAWDATA environment variable instead.
326326
*
327+
* If non-NULL *local is filled out to 1 for a local file and 0 for a remote
328+
* URL.
329+
*
327330
* Returns a mFILE pointer when found.
328331
* NULL otherwise.
329332
*/
330-
mFILE *open_path_mfile(const char *file, char *path, char *relative_to) {
333+
mFILE *open_path_mfile(const char *file, char *path, char *relative_to,
334+
int *local) {
331335
char *newsearch;
332336
char *ele;
333337
mFILE *fp;
334338

339+
if (local)
340+
*local = 1;
341+
335342
/* Use path first */
336343
if (!path)
337344
path = getenv("RAWDATA");
@@ -361,14 +368,16 @@ mFILE *open_path_mfile(const char *file, char *path, char *relative_to) {
361368

362369
if (0 == strncmp(ele2, "URL=", 4)) {
363370
if ((fp = find_file_url(file, ele2+4))) {
371+
if (local)
372+
*local = strncmp(ele2+4, "file:", 5) == 0 ? 1 : 0;
364373
free(newsearch);
365374
return fp;
366375
}
367-
} else if (!strncmp(ele2, "http:", 5) ||
368-
!strncmp(ele2, "https:", 6) ||
369-
!strncmp(ele2, "ftp:", 4)) {
376+
} else if (hisremote(ele2)) {
370377
if ((fp = find_file_url(file, ele2))) {
371378
free(newsearch);
379+
if (local)
380+
*local = 0;
372381
return fp;
373382
}
374383
} else if ((fp = find_file_dir(file, ele2))) {

cram/open_trace_file.h

+5-1
Original file line numberDiff line numberDiff line change
@@ -96,10 +96,14 @@ char *tokenise_search_path(const char *searchpath);
9696
* all of the locations listed in 'path' (which is a colon separated list).
9797
* If 'path' is NULL it uses the RAWDATA environment variable instead.
9898
*
99+
* If non-NULL *local is filled out to 1 for a local file and 0 for a remote
100+
* URL.
101+
*
99102
* Returns a mFILE pointer when found.
100103
* NULL otherwise.
101104
*/
102-
mFILE *open_path_mfile(const char *file, char *path, char *relative_to);
105+
mFILE *open_path_mfile(const char *file, char *path, char *relative_to,
106+
int *local);
103107

104108
/*
105109
* Returns a mFILE containing the entire contents of the url;

0 commit comments

Comments
 (0)