Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix PVG bug where a downstream node was mistakenly skipped after cleaving #826

Merged
merged 3 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion moPepGen/svgraph/PVGNode.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,9 @@ def _get_nth_rf_index(self, i:int) -> int:
for v in self.variants:
if not (v.variant.is_fusion() \
or v.variant.is_circ_rna() \
or (v.variant.is_alternative_splicing() and not v.variant.is_deletion())):
or (v.variant.is_alternative_splicing() and not v.variant.is_deletion()) \
or v.downstream_cleavage_altering \
or v.upstream_cleavage_altering):
locations.append(v.location)

locations.sort()
Expand Down
1 change: 1 addition & 0 deletions moPepGen/svgraph/PeptideVariantGraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,7 @@ def move_downstreams(self, nodes:Iterable[PVGNode], reading_frame_index:int
if node.get_last_rf_index() != reading_frame_index \
and len(node.get_out_nodes()) == 1 \
and not node.has_exclusive_outbond_node() \
and not all(x in nodes for x in node.get_out_nodes()[0].get_in_nodes()) \
and not len(node.get_out_nodes()[0].get_out_nodes()) == 0 \
and not node.get_out_nodes()[0].get_out_nodes()[0].seq.seq == '*':
continue
Expand Down
12 changes: 12 additions & 0 deletions test/files/fuzz/52/annotation.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
chr1 . gene 1 929 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
chr1 . transcript 1 929 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF; is_protein_coding true;
chr1 . selenocysteine 126 128 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
chr1 . selenocysteine 132 134 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
chr1 . selenocysteine 186 188 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
chr1 . exon 1 219 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
chr1 . CDS 114 219 . - 1 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
chr1 . CDS 507 545 . - 1 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
chr1 . exon 507 545 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
chr1 . CDS 917 929 . - 2 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
chr1 . exon 917 929 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
chr1 . UTR 1 113 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
61 changes: 61 additions & 0 deletions test/files/fuzz/52/brute_force.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
AYRRDVDCR
CAGPGGNATK
CAGPGGNATKS
DVDCRGIPLSYYIPRPSR
GIPLSYYIPRPSR
GSISLKVCK
GSISLKVCKMR
GSISLKVCKMSWPR
GSISLMR
GSISLMRWPR
GSISLMRWPRGK
GSISLMSWPR
GSISLMSWPRGK
GSISLMSWPRGKCY
GSMSLKVCK
GSMSLKVCKMR
GSMSLKVCKMSWPR
GSMSLMR
GSMSLMRWPR
GSMSLMRWPRGK
GSMSLMSWPR
GSMSLMSWPRGK
GSMSLMSWPRGKCY
MRWPRGK
MSWPRGK
MSWPRGKCY
RDVDCRGIPLSYYIPRPSR
RRTFPUPVTGLLSHYGCYWSTVA
RRTFPUPVTGLLSHYGCYWSTVAR
RRTFPUPVTGLLSHYGCYWSTVAUA
RRTFPUPVTGLQSHYGCYWSTVA
RRTFPUPVTGLQSHYGCYWSTVAR
RRTFPUPVTGLQSHYGCYWSTVAUA
RRTFPUPVTMVVTGLQWPVVA
RRTFPUPVTMVVTGLQWPVVAUA
RRTFPUPVTMVVTGLQWPVVAUAUR
RTFPUPVTGLLSHYGCYWSTVA
RTFPUPVTGLLSHYGCYWSTVAR
RTFPUPVTGLLSHYGCYWSTVAUA
RTFPUPVTGLQSHYGCYWSTVA
RTFPUPVTGLQSHYGCYWSTVAR
RTFPUPVTGLQSHYGCYWSTVAUA
RTFPUPVTMVVTGLQWPVVA
RTFPUPVTMVVTGLQWPVVAUA
RTFPUPVTMVVTGLQWPVVAUAUR
SAKCAGPGGNATK
SAKCAGPGGNATKS
TFPUPVTGLLSHYGCYWSTVA
TFPUPVTGLLSHYGCYWSTVAR
TFPUPVTGLLSHYGCYWSTVAUA
TFPUPVTGLLSHYGCYWSTVAUAUR
TFPUPVTGLQSHYGCYWSTVA
TFPUPVTGLQSHYGCYWSTVAR
TFPUPVTGLQSHYGCYWSTVAUA
TFPUPVTMVVTGLQWPVVA
TFPUPVTMVVTGLQWPVVAUA
TFPUPVTMVVTGLQWPVVAUAUR
VCKMRWPR
VCKMSWPR
VCKMSWPRGK
WPRGKCY
29 changes: 29 additions & 0 deletions test/files/fuzz/52/fake_variants.gvf
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
##fileformat=VCFv4.2
##mopepgen_version=1.2.1
##parser=parseVEP
##reference_index=
##genome_fasta=
##annotation_gtf=
##source=
##CHROM=<Description="Gene ID">
##INFO=<ID=TRANSCRIPT_ID,Number=1,Type=String,Description="Transcript ID">
##INFO=<ID=GENE_SYMBOL,Number=1,Type=String,Description="Gene Symbol">
##INFO=<ID=GENOMIC_POSITION,Number=1,Type=String,Description="Genomic Position">
##INFO=<ID=START,Number=1,Type=Integer,Description="Start Position">
##INFO=<ID=END,Number=1,Type=Integer,Description="End Position">
##INFO=<ID=DONOR_START,Number=1,Type=Integer,Description="Donor Start Position">
##INFO=<ID=DONOR_END,Number=1,Type=Integer,Description="Donor End Position">
##INFO=<ID=COORDINATE,Number=1,Type=String,Description="Coordinate for Insertion or Substitution">
##INFO=<ID=ACCEPTER_GENE_ID,Number=1,Type=String,Description="3' Accepter Transcript's Gene ID">
##INFO=<ID=ACCEPTER_TRANSCRIPT_ID,Number=1,Type=String,Description="3' Accepter Transcript's Transcript ID">
##INFO=<ID=ACCEPTER_POSITION,Number=1,Type=Integer,Description="Position of the break point of the 3' accepter transcript">
##INFO=<ID=OFFSET,Number=+,Type=Integer,Description="Offsets of fragments (exons or introns)">
##INFO=<ID=LENGTH,Number=+,Type=Integer,Description="Lengths of fragments (exons or introns)">
##INFO=<ID=INTRON,Number=+,Type=Integer,Description="Indices of fragments that are introns">
#CHROM POS ID REF ALT QUAL FILTER INFO
FAKEG00000896 753 FAKEG00000896-752-GGGACTACAGTCACATT-G GGGACTACAGTCACATT G . . TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-176:161;GENE_SYMBOL=
FAKEG00000896 761 FAKEG00000896-760-AG-TT AG TT . . TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-168:169;GENE_SYMBOL=
FAKEG00000896 793 FAKEG00000896-792-G-GCCCGTGGTAG G GCCCGTGGTAG . . TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-136:137;GENE_SYMBOL=
FAKEG00000896 797 FAKEG00000896-796-G-A G A . . TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-132:133;GENE_SYMBOL=
FAKEG00000896 803 FAKEG00000896-802-GAAGGTCTGCAAA-G GAAGGTCTGCAAA G . . TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-126:115;GENE_SYMBOL=
FAKEG00000896 819 FAKEG00000896-818-A-C A C . . TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-110:111;GENE_SYMBOL=
17 changes: 17 additions & 0 deletions test/files/fuzz/52/genome.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
>chr1
TTAGTCGTTATCTGGACGGCCGAGGGATGTAATATGATAAGGGGATCCCCCTACAATCTA
CATCTCGCCTGTACGCTTTAGGACTTAGTAGCATTTCCCCCTGGGCCAGCTCATTTTGCA
GACCTTCAGGCTCATGCCACTGTAGACCAGTAACAACCATAATGTGACTGTAGTCCCGTA
ACGGGTCACGGGAAAGTCCGACGCTTTTTATCCCTCCACTCCCCAGTAAAGGTAGCTTGA
TAAGAACATAGAGGAGGATCAAACCTTAGGGGAGTATTAGGCGCTTCGAGCATAGGGGTC
GAAGCCGTGTTGGGTTAAGAGCTAACGGACATCGTCATTGCGTTCGCGACCCGATGGCGC
CACGCATATTATTTGCCCGCAAGGACCTCATTGTCTACAACCGTCGCCCCCGGCCTCGGT
GTAATCTCGCTTAAGCACTACGCTATTAAGGGACGTATGGCGGGCAATTGTATCGCTGAG
TGGAGGCTCGACCAACTGTAAGTACGGTCGCGTGCGTCTGCACCGTGTCAAACGTGCCTC
CGTTCGGAAGCATTGAACAACCTTACTGATATGAGGTAATCGCCGATTAAGTGGGCTTGA
CGTACACCGGCGCGCATTCCGAAGTAAGCGGGTTAGATCAATATCCCGATCATGATCGGC
CGTTCTTTAGGACCCGGTGGAGACTGGCCTGGTTCGCAAATATGTCGTCTTCGACTCGCC
CCATACTGATGCGATGTAACAAGTGCCTATGACACTCTATAGGTGCCCACCATATGGCAC
TTCCAGTTCAGGAGGTGTGAAAGACTTCCTAGTCAGGCCAAAGCTACTCCATAATAGTGC
GTTTTAAGTAAACCATAAACTTGATGGTGTAGCTACAATTAGACTTGCGAACTCCCTTAG
TTGGGACGGTGCCTCCAAGTCCATCATAT
2 changes: 2 additions & 0 deletions test/files/fuzz/52/proteome.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>FAKEP00000896|FAKET00000896|FAKEG00000896|XXX
MMDLNGGTFDTVQTHATWRDKKRRTFPUPVTGLQSHYGCYWSTVAUAURSAK
17 changes: 17 additions & 0 deletions test/integration/test_call_variant_peptides.py
Original file line number Diff line number Diff line change
Expand Up @@ -1210,3 +1210,20 @@ def test_call_variant_peptide_case82(self):
expected = test_dir/'brute_force.txt'
reference = test_dir
self.default_test_case(gvf, reference, expected)

def test_call_variant_peptide_case83(self):
    """ Regression test for graph digestion, using the fuzz/52 fixtures.

    After a bubble is cleaved, the downstream nodes for the next iteration
    are collected. The outbound node of a node newly created by merging or
    cleaving is normally skipped when that new node carries frameshifting
    variants. The exception covered here: when the outbound node has
    several inbound nodes and every one of them was created in the current
    bubble, it must still be picked up as a downstream node. Otherwise it
    stays uncleaved and can yield invalid characters (e.g., *) in the
    resulting peptides.

    The called variant peptides are checked against the brute-force
    expectation stored alongside the fixtures. """
    case_dir = self.data_dir/'fuzz/52'
    # Arguments are positional: GVF inputs, reference directory, expected
    # peptide list.
    self.default_test_case(
        [case_dir/'fake_variants.gvf'],
        case_dir,
        case_dir/'brute_force.txt'
    )
Loading