Skip to content

Commit

Permalink
Merge pull request #829 from uclahs-cds/czhu-fix-call-variant
Browse files Browse the repository at this point in the history
In-bubble reference node treated as subgraph-in mistakenly
  • Loading branch information
zhuchcn authored Dec 3, 2023
2 parents d124e3c + 6d73da7 commit 9726065
Show file tree
Hide file tree
Showing 8 changed files with 156 additions and 12 deletions.
2 changes: 1 addition & 1 deletion moPepGen/svgraph/TVGNode.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ def is_subgraph_bridge(self, out_node:TVGNode) -> bool:

def is_subgraph_end(self) -> bool:
""" check if is the end of a subgraph """
return self.get_out_nodes() \
return len(self.get_out_nodes()) > 0 \
and all(x.level < self.level for x in self.get_out_nodes())

def is_orf_bridge(self, out_node:TVGNode) -> bool:
Expand Down
9 changes: 7 additions & 2 deletions moPepGen/svgraph/ThreeFrameTVG.py
Original file line number Diff line number Diff line change
Expand Up @@ -1257,9 +1257,14 @@ def find_bridge_nodes_between(self, start:TVGNode, end:TVGNode, members:Set[TVGN
if e.in_node.get_first_rf_index() != this_id or e.in_node.was_bridge \
and e.in_node not in visited and e.in_node is not start:
bridge_in.add(e.in_node)
elif not self.is_circ_rna() and e.in_node.subgraph_id != cur.subgraph_id:
if not (e.in_node.is_inframe_subgraph(start, end) and e.in_node in members):
elif not self.is_circ_rna() \
and e.in_node.subgraph_id != cur.subgraph_id:
if e.in_node.subgraph_id != start.subgraph_id:
subgraph_in.add(e.in_node)
elif e.in_node.is_inframe_subgraph(start, end):
if not e.in_node in members:
subgraph_in.add(e.in_node)

if cur is not end:
for e in cur.out_edges:
queue.appendleft(e.out_node)
Expand Down
29 changes: 29 additions & 0 deletions test/files/fuzz/53/annotation.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
chr1 . gene 1 2005 . + . gene_id FAKEG00000372; transcript_id FAKET00000372; protein_id FAKEP00000372; tag cds_start_NF;
chr1 . transcript 1 2005 . + . gene_id FAKEG00000372; transcript_id FAKET00000372; protein_id FAKEP00000372; tag cds_start_NF; is_protein_coding true;
chr1 . selenocysteine 112 114 . + . gene_id FAKEG00000372; transcript_id FAKET00000372; protein_id FAKEP00000372; tag cds_start_NF;
chr1 . selenocysteine 621 623 . + . gene_id FAKEG00000372; transcript_id FAKET00000372; protein_id FAKEP00000372; tag cds_start_NF;
chr1 . selenocysteine 999 1001 . + . gene_id FAKEG00000372; transcript_id FAKET00000372; protein_id FAKEP00000372; tag cds_start_NF;
chr1 . CDS 1 224 . + 0 gene_id FAKEG00000372; transcript_id FAKET00000372; protein_id FAKEP00000372; tag cds_start_NF;
chr1 . exon 1 224 . + . gene_id FAKEG00000372; transcript_id FAKET00000372; protein_id FAKEP00000372; tag cds_start_NF;
chr1 . CDS 431 658 . + 1 gene_id FAKEG00000372; transcript_id FAKET00000372; protein_id FAKEP00000372; tag cds_start_NF;
chr1 . exon 431 658 . + . gene_id FAKEG00000372; transcript_id FAKET00000372; protein_id FAKEP00000372; tag cds_start_NF;
chr1 . CDS 968 1257 . + 1 gene_id FAKEG00000372; transcript_id FAKET00000372; protein_id FAKEP00000372; tag cds_start_NF;
chr1 . exon 968 1257 . + . gene_id FAKEG00000372; transcript_id FAKET00000372; protein_id FAKEP00000372; tag cds_start_NF;
chr1 . CDS 1758 1963 . + 2 gene_id FAKEG00000372; transcript_id FAKET00000372; protein_id FAKEP00000372; tag cds_start_NF;
chr1 . exon 1758 2005 . + . gene_id FAKEG00000372; transcript_id FAKET00000372; protein_id FAKEP00000372; tag cds_start_NF;
chr1 . UTR 1964 2005 . + . gene_id FAKEG00000372; transcript_id FAKET00000372; protein_id FAKEP00000372; tag cds_start_NF;
chr1 . gene 2006 3527 . - . gene_id FAKEG00000932; transcript_id FAKET00000932; protein_id FAKEP00000932;
chr1 . transcript 2006 3527 . - . gene_id FAKEG00000932; transcript_id FAKET00000932; protein_id FAKEP00000932; is_protein_coding true;
chr1 . selenocysteine 2279 2281 . - . gene_id FAKEG00000932; transcript_id FAKET00000932; protein_id FAKEP00000932;
chr1 . exon 2006 2112 . - . gene_id FAKEG00000932; transcript_id FAKET00000932; protein_id FAKEP00000932;
chr1 . CDS 2008 2112 . - 2 gene_id FAKEG00000932; transcript_id FAKET00000932; protein_id FAKEP00000932;
chr1 . CDS 2186 2301 . - 1 gene_id FAKEG00000932; transcript_id FAKET00000932; protein_id FAKEP00000932;
chr1 . exon 2186 2301 . - . gene_id FAKEG00000932; transcript_id FAKET00000932; protein_id FAKEP00000932;
chr1 . CDS 2673 2970 . - 2 gene_id FAKEG00000932; transcript_id FAKET00000932; protein_id FAKEP00000932;
chr1 . exon 2673 2970 . - . gene_id FAKEG00000932; transcript_id FAKET00000932; protein_id FAKEP00000932;
chr1 . CDS 3030 3203 . - 2 gene_id FAKEG00000932; transcript_id FAKET00000932; protein_id FAKEP00000932;
chr1 . exon 3030 3203 . - . gene_id FAKEG00000932; transcript_id FAKET00000932; protein_id FAKEP00000932;
chr1 . CDS 3259 3313 . - 1 gene_id FAKEG00000932; transcript_id FAKET00000932; protein_id FAKEP00000932;
chr1 . exon 3259 3527 . - . gene_id FAKEG00000932; transcript_id FAKET00000932; protein_id FAKEP00000932;
chr1 . UTR 2006 2007 . - . gene_id FAKEG00000932; transcript_id FAKET00000932; protein_id FAKEP00000932;
chr1 . UTR 3314 3527 . - . gene_id FAKEG00000932; transcript_id FAKET00000932; protein_id FAKEP00000932;
18 changes: 18 additions & 0 deletions test/files/fuzz/53/brute_force.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
CIVPGTDLIPCFMFPP
HLTRRCIVPGTDLIPCFMFPP
LLTLDMIIPTIPPSRPNFLAVR
LLTLDMIIPTIPPSRPNFLAVRGCR
LMKRPFSTKSSSTYAF
LSSLLVYCSWN
LYTTCVSVAASR
LYTTCVSVAASRHLTR
LYTTCVSVAASRHLTRR
MVRSSVLYTDYCRLSSLLVYCSWN
MVRSSVLYTDYCRLSSM
MVRSSVLYTDYCRLYTTCVSVAASR
RCIVPGTDLIPCFMFPP
RPFSTKSSSTYAF
SSSTYAF
SSVLYTDYCRLSSLLVYCSWN
SSVLYTDYCRLSSM
SSVLYTDYCRLYTTCVSVAASR
26 changes: 26 additions & 0 deletions test/files/fuzz/53/fake_variants.gvf
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
##fileformat=VCFv4.2
##mopepgen_version=1.2.0
##parser=parseVEP
##reference_index=
##genome_fasta=
##annotation_gtf=
##source=
##CHROM=<Description="Gene ID">
##INFO=<ID=TRANSCRIPT_ID,Number=1,Type=String,Description="Transcript ID">
##INFO=<ID=GENE_SYMBOL,Number=1,Type=String,Description="Gene Symbol">
##INFO=<ID=GENOMIC_POSITION,Number=1,Type=String,Description="Genomic Position">
##INFO=<ID=DONOR_START,Number=1,Type=Integer,Description="Donor Start Position">
##INFO=<ID=DONOR_END,Number=1,Type=Integer,Description="Donor End Position">
##INFO=<ID=START,Number=1,Type=Integer,Description="Start Position">
##INFO=<ID=END,Number=1,Type=Integer,Description="End Position">
##INFO=<ID=OFFSET,Number=+,Type=Integer,Description="Offsets of fragments (exons or introns)">
##INFO=<ID=LENGTH,Number=+,Type=Integer,Description="Lengths of fragments (exons or introns)">
##INFO=<ID=INTRON,Number=+,Type=Integer,Description="Indices of fragments that are introns">
##INFO=<ID=COORDINATE,Number=1,Type=String,Description="Coordinate for Insertion or Substitution">
##INFO=<ID=ACCEPTER_GENE_ID,Number=1,Type=String,Description="3' Accepter Transcript's Gene ID">
##INFO=<ID=ACCEPTER_TRANSCRIPT_ID,Number=1,Type=String,Description="3' Accepter Transcript's Transcript ID">
##INFO=<ID=ACCEPTER_POSITION,Number=1,Type=Integer,Description="Position of the break point of the 3' accepter transcript">
#CHROM POS ID REF ALT QUAL FILTER INFO
FAKEG00000372 477 FAKEG00000372-476-TATACCACATGCGTTTC-T TATACCACATGCGTTTC T . . TRANSCRIPT_ID=FAKET00000372;GENOMIC_POSITION=chrF-476:493;GENE_SYMBOL=
FAKEG00000372 499 FUSION-FAKET00000372:498-FAKET00000932:141 A <FUSION> . . TRANSCRIPT_ID=FAKET00000372;GENE_SYMBOL=None;GENOMIC_POSITION=498;ACCEPTER_GENE_ID=FAKEG00000932;ACCEPTER_TRANSCRIPT_ID=FAKET00000932;ACCEPTER_SYMBOL=None;ACCEPTER_POSITION=142;ACCEPTER_GENOMIC_POSITION=3418
FAKEG00000932 147 FAKEG00000932-146-A-AGCAGGCATCTTACTCGACG A AGCAGGCATCTTACTCGACG . . TRANSCRIPT_ID=FAKET00000932;GENOMIC_POSITION=chrF-3413:3414;GENE_SYMBOL=
60 changes: 60 additions & 0 deletions test/files/fuzz/53/genome.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
>chr1
ATGACTTATCCAAAAAACCCGCGCCATGGTAGGAAAAATATCACACCCACTAAGCCACTG
CAGCGGCGTTGCTCCAGAGGAGTTGCAGGTAGCGGGTGCCGCGCCCCCAAATGATATGCT
TATACAGGCTTACTTCTGATTAACGCGGTCACATATCACCTAGATATTTATTCGTGTCAA
CTGCTGACCGGGCGGTGCTCTAGTCACACATTCACCCCAGAGAATGACCTAAGCCTGTGT
TACGTATTCAAGCGAAGTGTTTTATCCTGATTTACCTAGACAGCGAGAATCGGTAGAGCA
CTAGTCTGAAATAGACACCCTCATAGTCCTATCTCATCGGCGTCCCGTTGAAGGAGTTTG
TCCACACGCCAACCCTGTATAATAAGCGAGGATAGGAACTAGAGAAGTACCCTTTAGGCA
TGATTCGGCAAAGAATGGTTCGGTCGTCAGTTCTATATACAGATTACTGCCGCCTATATA
CCACATGCGTTTCCGTCGATGTAAGTCATATAGTATACGTGGCTCGAAATCAGCTGGGAG
CCCTCGGTTACGAACGGCGCATAGTTCAGATTCTTATAGGGACAGTAGAAAGGTTGTCAT
GTTACTTAACCAATGTACACTGAGTGGCTCTCGAACTACCTGTCACTCTAGTGAGCGTCG
ATCCCGGTGCTACGATCCGCACACCACGATGAATGTACGGCAGAAAGACAAATTCGCTTT
TGGCAGCTGGGGAGATACCTGTACCCACCTGTATTGGATCGGGCCAGCCGCCGAAGACTA
GGGAGAGTAGCGCCCGTATTCAAACGTATGCAACAATAATCTAACTTGCGGGGCAAGGTG
AAGCCTTGTCGACCTTACTCGTACTTTAACGACGAGCACGCCCTGCCAGGCCTGGAACAA
AAATACGTAGGTACCCCTGGGCAGCGTGGTCGGAAGCCCTACCTGTTGCTCAGGATTAAT
GCAGGGATCGATCAGAGTGGAAAACAGCCAGCTTGACGTGAGAGTGGCAGGTCCTGATAG
AGTCATGTGATCTGCTAGAAAGGCCGGCACGAGCCGTTAGGCAGCGCGATTACTCTGAAT
CCTCAGCTGATGGGACCCGTTTCGACGTAGGCATGTTGTGGTCATTCCGCAGGCGAGTGC
GTACCTGCAGTACAATCAGCATCAAGACCATGCACAAGCACTTCCGCAGGAGCTGGCACG
AGGACGTCATTCATTGTCTTCATTTGAGGCATGACATGTATTGGACACCTCTGTTGAGAC
GAACACGATTAGCACAGTGAGGTGGACGCTTGTGTAAGTTTCTTTCCTTATCAGCAACCG
AATCATGTGGGGAGAAGTAAAAGCGAGGGGAGGCCATGTCCTGTGGTGGATGCGCTACGA
TCAAGCAGACGCCGCAGGTTCAGATAGGAACTAACGACTGGGTGGTGAGGTGTTGTCTCC
GTTATCGGTCGACCACAAACAGGTAAATCGTATAAGCTGGATGTAGGTATATTTCGAGTT
TAGCCTCTGCCTTATAGTAGTAATTAGCATTCAGTAAAAGAGTCGCTTACCATTAGCTTA
CTCCATTTTCCCTTGGGATACGTACAGCCACAACTATGGAGAGCGTACAGCTAAACGCCC
ACGGCTGTGGGAAGATTGGATCCACCATGTTCCACATTGCGGGAAGGTAGCTGCAGTGCC
CGAATGGTCCTAAATTCTTATATCGATCAGGTATTTCCACAGAAGCTGGTTACAAGAGAG
CCGGGCGGCGACCCATTACACTTGGTACCCTTACTGCGCCTTGAATGGTACAGTCGTGTC
CCAGTGTTCTTCTAAAACACGTTCAGCGACCTCCGATGCCGCAAGGGGCGCGCGATACAA
TGCGAAGCCGTTGAAACAACCATCGGCAGAAGCTGCCGAACGTTCACGTTTTAGTACTGA
GACCCTTTTGTGGCTAAATTCTCTAAGTCTATTGCTTAAATACTAGGAACACATCTTAGC
ACGCACCGGTGGGTTGTAAGCACGCTAGGACAGCCGTACCCCACTAAACGACTTATCTAC
TCCGAATGCGGCCCTAGGACTAGCAACTAGCCGGTAATTAGAAGGTAGTGAAACGCCATT
CAGTAGTGAGTGGATTTAGGACGAAAAGGGGAGTGTTACGATATATAAGATACAGGTGTT
GTTGGCCTGAGCCTTGTTTCAGCACCATAATAGCGACGCTTGTCCCCGCCCCGTTGACCG
TGCGTCGTTCGACAGATGAATAAAAGACGAAAGTCATGCGTGATTCCGGCTGCAGAATTC
AAAAGGCGTATGTAGAAGAACTATGCGGAGAAGTAAGTTCACTACTTGCGATGACAATAC
GACTAATCACGAGACCGAACTAGTGCACAACCTTGTGATGTCAGGCCCAAACGGTAGATT
TAATACTGCAGTTGTGACCGGATGACCTGTTTACGGAACTTCAGGTCGTTCATAGGCTAT
GGCTGGTTTATTTAGCAATTGGCAATACGAAAATAGTCGATCCTCGCAAGAAGCATTATT
ATTCCCAGACGCCAGAGTGGACCGCCGGCAGTCCTCGGCAGACGGGGGATAGCCAGCCAC
CTAGTAGCCAAACTTGGGGCATGCCGTCCATTACTGCCCAACCCAGATCAAACAAGGATT
CACCAGTATCGACAGTGAATACATTCATGTTGTTTTCGTAGAGAAAGGTCGTTTCATTAA
TCGAGCCAGGTCAACAGGCCCAATCAGTCGACTGGCTGGAAGGAGTCTTTGTTCTTGTGA
CGGAGTGCCCGAACAAAGCAAAAAGATGCCGTTTTGGAGGCCCAAACAATGTTCCAAACG
AGAACAGCGACGAGTCACGTACAACGCCCACCATACGGAAGAAGTTATGCAGAAACCAAG
TGTAGACATGGAGTTATGATTAGATAAAAAGTCAACATGAGACTTGACATACAGATAGGA
TTGCGCTGCTTGCCAGAAAATGGTGTGTGGACGAAGGACCCTCTCTGGATGCAGGGGAGA
GCATGCCGTACGACCACCATCGAGCTCGATACCGCATGGATCTTGAGTTCTTGGTACCTC
ATACACTTCCGCGAACCCCGACCATCTCGCCGTATTTCAAAAGGTACAGCCCTGACAATT
ATGTTCCTAAGCTTTATAGTGGGGCTCCTCGGGAAGCGGGATGCTGCCTCCTCCCACATA
CGTTGTAATATCCAATACCTGCAGATGCGTTCAGACTTTCGATGCGGTGGGCGCAGGTAC
CGCCGTAGGGAAAGCCATGCCTCGCACTGCGAGAAAGTTGGGTCTGCTAGGAGGTATCGT
AGGGATAATCATATCCAGCGTAAGAAGTCTAGGGTGGAAACATAAAGCATGGAATAAGGT
CAGTTCCAGGAACAATACACTAGCAGATTAACCCTTGTAGACACCACTGCAAAAAGAACA
AGGTACGTCGCTTGATATCGCGTACACACGGACTGAATTGACTCGGTCATGCTGGATCGA
TAAACTACCGTTTTGAAGATGAAGAGAGTCATACTGTTTGAATAAGG
13 changes: 13 additions & 0 deletions test/files/fuzz/53/proteome.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
>FAKEP00000372|FAKET00000372|FAKEG00000372|XXX
MTYPKNPRHGRKNITPTKPLQRRCSRGVAGSGCRAPKUYAYTGLLLINAVTYHLDIYSCQ
LLTGRCSSHTFTPEKRMVRSSVLYTDYCRLYTTCVSVDVSHIVYVARNQLGALGYERRIV
QILIGTVERLSCYLTNVHUVALELPVTLVSVRSEWKTASLTUEWQVLIESCDLLERPARA
VRQRDYSESSADGTRFDVGMLWSFRRRVRTCSTISIKTMHKHFRRSWHEDVIHCLHLRHD
MYWTPLLNTWYPYCALNGTVVSQCSSKTRSATSDAARGARYNAKPLKQPSAEAAERSRFS
TETLLWLNSLSLLLKY
>FAKEP00000932|FAKET00000932|FAKEG00000932|XXX
MIIPTIPPSRPNFLAVRGCRYWILQRMWEEAASRFPRSPTIKLRNIIVRAVPFEIRRDGR
GSRKCMRYQELKIHAVPHTIFWQAAQSYLYVKSHVDFLSNHNSMSTLGFCITSSVWWALY
VTRRCSRLEHCLGLQNGIFLLCSGTPSQEQRLLPASRLIGPVDLARLMKRPFSTKSSSTY
AFUILQPESRMTFVFYSSVERRTVNGAGTSVAIMHSLLNGVSLPSNYRLVASPRAAFGVD
KSFSGVRLS
11 changes: 2 additions & 9 deletions test/integration/test_call_variant_peptides.py
Original file line number Diff line number Diff line change
Expand Up @@ -1212,15 +1212,8 @@ def test_call_variant_peptide_case82(self):
self.default_test_case(gvf, reference, expected)

def test_call_variant_peptide_case83(self):
""" Issue in graph digestion. When determining the downstream nodes for
the next iteration after cleaving a bubble, the outbound node of a newly
created node (by merging or cleaving) is usually skipped if the new node
contains frameshifting variants. However, if the outbound node contains
multiple inbound nodes, and all of them are created in the current bubble,
it should still be processed and identified as a downstream node, otherwise,
it will remain as uncleaved and result in potential invalid characters
(e.g., *). """
test_dir = self.data_dir/'fuzz/52'
""" In-bubble reference node was treated as subgraph-in mistakenly. """
test_dir = self.data_dir/'fuzz/53'
gvf = [
test_dir/'fake_variants.gvf'
]
Expand Down

0 comments on commit 9726065

Please sign in to comment.