-
Notifications
You must be signed in to change notification settings - Fork 444
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adjust parser for undefined CIGARs #1182
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1110,21 +1110,21 @@ int bam_set_qname(bam1_t *b, const char *qname); | |
can be NULL | ||
@param a_cigar [out] address of the destination uint32_t buffer | ||
@param a_mem [in/out] address of the allocated number of buffer elements | ||
@return number of processed CIGAR operators; 0 if error | ||
@return number of processed CIGAR operators; -1 on error | ||
*/ | ||
HTSLIB_EXPORT | ||
size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t *a_mem); | ||
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t *a_mem); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By luck sam.h currently gets it via kstring.h, but in case that changes: need to add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the catch. |
||
|
||
/*! @function | ||
@abstract Parse a CIGAR string into a bam1_t struct | ||
@param in [in] pointer to the source string | ||
@param end [out] address of the pointer to the new end of the input string | ||
can be NULL | ||
@param b [in/out] address of the destination bam1_t struct | ||
@return number of processed CIGAR operators; 0 if error | ||
@return number of processed CIGAR operators; -1 on error | ||
*/ | ||
HTSLIB_EXPORT | ||
size_t bam_parse_cigar(const char *in, char **end, bam1_t *b); | ||
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b); | ||
|
||
/************************* | ||
*** BAM/CRAM indexing *** | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2150,8 +2150,8 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) | |
if (*p != '*') { | ||
uint32_t *cigar = NULL; | ||
int old_l_data = b->l_data; | ||
uint32_t n_cigar = bam_parse_cigar(p, &p, b); | ||
if (!n_cigar || *p++ != '\t') goto err_ret; | ||
int32_t n_cigar = bam_parse_cigar(p, &p, b); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Being nit-picky this should be ssize_t, but frankly the world will have broken long before we get to 2 billion cigar elements! Infact I'm not sure it's possible in BAM as the bam record would overflow in length. I'd just go with an unsized "int" or a specific "ssize_t" if you feel inclined to be pedantic. |
||
if (n_cigar < 1 || *p++ != '\t') goto err_ret; | ||
cigar = (uint32_t *)(b->data + old_l_data); | ||
c->n_cigar = n_cigar; | ||
|
||
|
@@ -2370,16 +2370,20 @@ static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) { | |
return q-in; | ||
} | ||
|
||
size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t *a_mem) { | ||
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t *a_mem) { | ||
size_t n_cigar = 0; | ||
int diff; | ||
|
||
if (!in || !a_cigar || !a_mem) { | ||
hts_log_error("NULL pointer arguments"); | ||
return 0; | ||
return -1; | ||
} | ||
if (end) *end = (char *)in; | ||
|
||
if (*in == '*') { | ||
if (end) (*end)++; | ||
return 0; | ||
} | ||
n_cigar = read_ncigar(in); | ||
if (!n_cigar) return 0; | ||
if (n_cigar > *a_mem) { | ||
|
@@ -2389,7 +2393,7 @@ size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t | |
*a_mem = n_cigar; | ||
} else { | ||
hts_log_error("Memory allocation error"); | ||
return 0; | ||
return -1; | ||
} | ||
} | ||
|
||
|
@@ -2399,21 +2403,25 @@ size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t | |
return n_cigar; | ||
} | ||
|
||
size_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { | ||
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { | ||
size_t n_cigar = 0; | ||
int diff; | ||
|
||
if (!in || !b) { | ||
hts_log_error("NULL pointer arguments"); | ||
return 0; | ||
return -1; | ||
} | ||
if (end) *end = (char *)in; | ||
|
||
if (*in == '*') { | ||
if (end) (*end)++; | ||
return 0; | ||
} | ||
n_cigar = read_ncigar(in); | ||
if (!n_cigar) return 0; | ||
if (possibly_expand_bam_data(b, n_cigar * sizeof(uint32_t)) < 0) { | ||
hts_log_error("Memory allocation error"); | ||
return 0; | ||
return -1; | ||
} | ||
|
||
if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return 0; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2149,6 +2149,27 @@ static void test_bam_set1_write_and_read_back() | |
ks_free(&ks); | ||
} | ||
|
||
static void test_cigar_api(void) | ||
{ | ||
uint32_t *buf = NULL; | ||
char *cig = "*"; | ||
char *end; | ||
uint32_t m = 0; | ||
int32_t n; | ||
n = sam_parse_cigar(cig, &end, &buf, &m); | ||
VERIFY(n == 0 && m == 0 && (end-cig) == 1, "failed to parse undefined CIGAR"); | ||
cig = "2M3X1I10M5D"; | ||
n = sam_parse_cigar(cig, &end, &buf, &m); | ||
VERIFY(n == 5 && m > 0 && (end-cig) == 11, "failed to parse CIGAR string: 2M3X1I10M5D"); | ||
n = sam_parse_cigar("722M15D187217376188323783284M67I", NULL, &buf, &m); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Edit: ignore me! Sorry it's validating it can't parse it. Good one. :-) I'm not sure what the point is of this test. 187217376188323783284 is 68 bits long. Cigars are held inside htslib as uint32_t with 4 bits being the type, so the maximum length of cigar op we currently handle is 268435455 (2^28-1). CRAM could handle longer lengths (if #976 got merged), but I think we rejected that idea as the in-memory htslib struct would need fixing first. I'm sure we discussed changing the cigar type when we were doing all of our sweeping ABI changes and it was rejected, so I think we're committed to having excessively long cigar lengths as a series of concatenated smaller lengths using the same op type. (I don't see a reason to support that though until we get reads long enough and accurate enough to generate such data. We're considerably far off that still.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This does raise the question though why the test isn't:
The documentation for |
||
VERIFY(n == 0, "failed to flag CIGAR string with long op length: 722M15D187217376188323783284M67I"); | ||
n = sam_parse_cigar("53I722MD8X", NULL, &buf, &m); | ||
VERIFY(n == 0, "failed to flag CIGAR string with no op length: 53I722MD8X"); | ||
|
||
cleanup: | ||
free(buf); | ||
} | ||
|
||
int main(int argc, char **argv) | ||
{ | ||
int i; | ||
|
@@ -2186,6 +2207,7 @@ int main(int argc, char **argv) | |
test_bam_set1_validate_cigar(); | ||
test_bam_set1_validate_size_limits(); | ||
test_bam_set1_write_and_read_back(); | ||
test_cigar_api(); | ||
|
||
return status; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
a_cigar is [in/out] presumably?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Initially, I thought of just being
[out]
, as the information contained by the buffer is generated inside the method, but the pointer itself is being read. So, I guess a case can be made for[in]
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's
in
because the pointed-to variable needs to already have a meaningful value before the call (this is how it differs fromend
, whose pointed-to variable may even be uninitialised). It'sout
because the pointed-to value might be different after the call.Therefore it's
[in/out]
, the same asa_mem
.