diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f989ac6..8f8f7619 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Unreleased] +## [1.3.2] + +- `--include-coding` and `--find-ass` added to `callNoncoding` and `callVariant` to call novel ORF peptides from coding transcripts. #659 + ## [1.3.1] - 2024-03-18 ### Added: diff --git a/docs/file-format.md b/docs/file-format.md index 6a317b07..1d23da92 100644 --- a/docs/file-format.md +++ b/docs/file-format.md @@ -212,7 +212,7 @@ The ID of circRNAs consists of two components. They all start with \ List[VariantPeptideIdentifier]: +def _set_identifier_type(t, v): + """ Set identifier type """ + if t is not None: + raise ValueError(f"VariantType is not None: {t}") + return v + +def parse_variant_peptide_id(label:str, coding_txs:Set[str]) -> List[VariantPeptideIdentifier]: """ Parse variant peptide identifier from peptide name. Examples: @@ -69,51 +76,145 @@ def parse_variant_peptide_id(label:str) -> List[VariantPeptideIdentifier]: [: 'ENST0001|SNV-100-A-T|1'] """ + # FASTA header examples: + # ENST0001|SNV-50-A-T|1 + # ENST0001|SNV-50-A-T|INDEL-55-CC-C|2 + # ENST0001|SNV-50-A-T|ORF2|1 + # FASTA headers have 5 fields below separate by | : + # - (Required) Transcript backbone ID (tx ID, fusion ID, or circRNA ID) + # - (Optioanl) Gene ID, required for novel ORF peptides without additional variants. + # - (Optional) Variant IDs separated by | + # - (Optional) ORF ID, required for novel ORF peptides. + # - (Required) peptide index. variant_ids = [] for it in label.split(VARIANT_PEPTIDE_SOURCE_DELIMITER): fields = it.split('|') - if fields[-2].startswith('ORF'): - # Noncoding peptide - tx_id, gene_id, *codon_reassigns, orf_id, index = fields - variant_id = NoncodingPeptideIdentifier( - transcript_id=tx_id, gene_id=gene_id, - codon_reassigns=codon_reassigns, - orf_id=orf_id, index=int(index) - ) + gene_id = None + backbone_id = None + var_ids:Dict[int, List[str]] = {} + alt_ids = [] + orf_id = None - else: - x_id, *var_ids, index = fields - - orf_id = None - if len(var_ids) > 0 and var_ids[0].startswith('ORF'): - orf_id = var_ids.pop(0) - - if x_id.startswith('FUSION-'): - first_variants:List[str] = [] - second_variants:List[str] = [] - peptide_variants:List[str] = [] - for var_id in var_ids: - if var_id.startswith('1-') or var_id.startswith('2-'): - which_gene, var_id = var_id.split('-', 1) - if int(which_gene) == 1: - first_variants.append(var_id) - elif int(which_gene) == 2: - second_variants.append(var_id) - else: - peptide_variants.append(var_id) - variant_id = FusionVariantPeptideIdentifier(x_id, first_variants, - second_variants, peptide_variants, orf_id, index) - elif x_id.startswith('CIRC-') or x_id.startswith('CI-'): - variant_id = CircRNAVariantPeptideIdentifier(x_id, var_ids, orf_id, index) + try: + index = int(fields[-1]) + fields.pop() + except ValueError: + index = None + + IdentifierType:VariantPeptideIdentifier = None + + # Step 1. Iterate over the fields in a FASTA header separated by | and + # assign them into the 5 variables. + for i, field in enumerate(fields): + # The first field is always the backbone ID, and if its prefix is + # FUSION-, CIRC-, or CI-, this must be a Fusion or circRNA entry. + if field.startswith(str(VariantPrefix.FUSION)): + if i != 0: + raise ValueError(f"FASTA header isn't valid: {it}") + backbone_id = field + IdentifierType = _set_identifier_type( + IdentifierType, + FusionVariantPeptideIdentifier + ) + + elif field.startswith(str(VariantPrefix.CI)) \ + or field.startswith(str(VariantPrefix.CIRC)): + if i != 0: + raise ValueError(f"FASTA header isn't valid: {it}") + backbone_id = field + IdentifierType = _set_identifier_type( + IdentifierType, + CircRNAVariantPeptideIdentifier + ) + + elif field.startswith('ORF'): + orf_id = field + + elif IdentifierType is FusionVariantPeptideIdentifier: + if field.startswith('1-') or field.startswith('2-'): + which_gene, var_id = field.split('-', 1) + which_gene = int(which_gene) + else: + # SECT or W2F + var_id = field + which_gene = 0 + if which_gene in var_ids: + var_ids[which_gene].append(var_id) + else: + var_ids[which_gene] = [var_id] + + elif any(field.startswith(str(t)) for t in VariantPrefix.alt_translation()): + alt_ids.append(field) + + elif any(field.startswith(str(t)) for t in VariantPrefix.ctbv()): + if 1 in var_ids: + var_ids[1].append(field) + else: + var_ids[1] = [field] + + # Step 2. Infer the FASTA header type. If it's not already interpreted in + # step 1 (either Fusion or circRNA), it must be either a base variant or + # novel ORF peptide. + if IdentifierType is None: + # `NovelORFPeptideIdentifier` is for novel ORF peptides without any + # additional variants. While `BaseVariantPeptideIdentifer` always + # have at least one variant. + if not var_ids and orf_id is not None: + backbone_id = fields[0] + if len(fields) > 1: + gene_id = fields[1] + IdentifierType = _set_identifier_type( + IdentifierType, + NovelORFPeptideIdentifier + ) else: - variant_id = BaseVariantPeptideIdentifier(x_id, var_ids, orf_id, index) + backbone_id = fields[0] + IdentifierType = _set_identifier_type( + IdentifierType, + BaseVariantPeptideIdentifier + ) + + # Step 3. Construct the object based on the FASTA header type. + if IdentifierType is NovelORFPeptideIdentifier: + variant_id = NovelORFPeptideIdentifier( + transcript_id=backbone_id, + gene_id=gene_id, + codon_reassigns=alt_ids, + orf_id=orf_id, + index=index, + is_protein_coding=(backbone_id in coding_txs) + ) + elif IdentifierType is FusionVariantPeptideIdentifier: + variant_id = FusionVariantPeptideIdentifier( + fusion_id=backbone_id, + first_variants=var_ids.get(1, []), + second_variants=var_ids.get(2, []), + peptide_variants=var_ids.get(0, []), + orf_id=orf_id, + index=index + ) + elif IdentifierType is CircRNAVariantPeptideIdentifier: + variant_id = CircRNAVariantPeptideIdentifier( + circ_rna_id=backbone_id, + variant_ids=sum(var_ids.values(), []) + alt_ids, + orf_id=orf_id, + index=index + ) + elif IdentifierType is BaseVariantPeptideIdentifier: + variant_id = BaseVariantPeptideIdentifier( + transcript_id=backbone_id, + variant_ids=sum(var_ids.values(), []) + alt_ids, + orf_id=orf_id, + index=index, + gene_id=gene_id + ) variant_ids.append(variant_id) return variant_ids class VariantPeptideIdentifier(ABC): - """ variant peptide identifer virtual class """ + """ variant peptide identifier virtual class """ @abstractmethod def __str__(self) -> str: """ str """ @@ -205,16 +306,17 @@ def second_tx_id(self): _,_,second = self.fusion_id.split('-') return second.split(':')[0] -class NoncodingPeptideIdentifier(VariantPeptideIdentifier): +class NovelORFPeptideIdentifier(VariantPeptideIdentifier): """ Noncoding peptide identifier """ def __init__(self, transcript_id:str, gene_id:str, codon_reassigns:List[str], - orf_id:str=None, index:int=None): + is_protein_coding:bool, orf_id:str=None, index:int=None): """ constructor """ self.transcript_id = transcript_id self.gene_id = gene_id self.codon_reassigns = codon_reassigns self.orf_id = orf_id self.index = index + self.is_protein_coding = is_protein_coding def __str__(self) -> str: """ str """ diff --git a/moPepGen/aa/VariantPeptideLabel.py b/moPepGen/aa/VariantPeptideLabel.py index c053fd4c..884e8c41 100644 --- a/moPepGen/aa/VariantPeptideLabel.py +++ b/moPepGen/aa/VariantPeptideLabel.py @@ -129,9 +129,9 @@ def from_variant_peptide_minimal(peptide:AminoAcidSeqRecord ) -> List[VariantPeptideInfo]: """ Create list of VariantPeptideInfo with minimal information. """ info_list:List[VariantPeptideInfo] = [] - variant_ids = pi.parse_variant_peptide_id(peptide.description) + variant_ids = pi.parse_variant_peptide_id(peptide.description, set()) for variant_id in variant_ids: - if isinstance(variant_id, pi.NoncodingPeptideIdentifier): + if isinstance(variant_id, pi.NovelORFPeptideIdentifier): gene_ids = [variant_id.gene_id] var_ids = {} @@ -167,9 +167,9 @@ def from_variant_peptide(peptide:AminoAcidSeqRecord, if group_map is None: group_map = {} info_list = [] - variant_ids = pi.parse_variant_peptide_id(peptide.description) + variant_ids = pi.parse_variant_peptide_id(peptide.description, coding_tx) for variant_id in variant_ids: - if isinstance(variant_id, pi.NoncodingPeptideIdentifier): + if isinstance(variant_id, pi.NovelORFPeptideIdentifier): tx_id = variant_id.transcript_id gene_ids = [variant_id.gene_id] var_ids = {variant_id.gene_id: variant_id.codon_reassigns} @@ -227,17 +227,17 @@ def from_variant_peptide(peptide:AminoAcidSeqRecord, def is_fusion(self) -> bool: """ Check if this is a fusion """ - _id = pi.parse_variant_peptide_id(self.original_label)[0] + _id = pi.parse_variant_peptide_id(self.original_label, set())[0] return isinstance(_id, pi.FusionVariantPeptideIdentifier) def is_circ_rna(self) -> bool: """ Check if this is a circRNA """ - _id = pi.parse_variant_peptide_id(self.original_label)[0] + _id = pi.parse_variant_peptide_id(self.original_label, set())[0] return isinstance(_id, pi.CircRNAVariantPeptideIdentifier) def is_splice_altering(self) -> bool: """ Check if the variant paptide label is alternative splicing """ - _id = pi.parse_variant_peptide_id(self.original_label)[0] + _id = pi.parse_variant_peptide_id(self.original_label, set())[0] return isinstance(_id, pi.BaseVariantPeptideIdentifier) and \ _id.is_alternative_splicing() @@ -273,14 +273,14 @@ def any_coding(self, anno:GenomicAnnotation) -> bool: def get_transcript_ids(self) -> List[str]: """ get transcript IDs """ - variant_id = pi.parse_variant_peptide_id(self.original_label)[0] + variant_id = pi.parse_variant_peptide_id(self.original_label, set())[0] if isinstance(variant_id, pi.CircRNAVariantPeptideIdentifier): return [variant_id.circ_rna_id.split('-', 2)[1]] if isinstance(variant_id, pi.FusionVariantPeptideIdentifier): return [variant_id.first_tx_id, variant_id.second_tx_id] if isinstance(variant_id, pi.BaseVariantPeptideIdentifier): return [variant_id.transcript_id] - if isinstance(variant_id, pi.NoncodingPeptideIdentifier): + if isinstance(variant_id, pi.NovelORFPeptideIdentifier): return [variant_id.transcript_id] raise ValueError('Variant ID unrecognized') diff --git a/moPepGen/cli/call_noncoding_peptide.py b/moPepGen/cli/call_noncoding_peptide.py index 8be2741d..00366049 100644 --- a/moPepGen/cli/call_noncoding_peptide.py +++ b/moPepGen/cli/call_noncoding_peptide.py @@ -7,7 +7,7 @@ from Bio.SeqIO import FastaIO from moPepGen import params, svgraph, aa, get_logger, VARIANT_PEPTIDE_SOURCE_DELIMITER from moPepGen.dna.DNASeqRecord import DNASeqRecordWithCoordinates -from moPepGen.err import ReferenceSeqnameNotFoundError, warning +from moPepGen.err import ReferenceSeqnameNotFoundError from moPepGen.cli import common @@ -60,6 +60,11 @@ def add_subparser_call_noncoding(subparsers:argparse._SubParsersAction): default='max', metavar='' ) + p.add_argument( + '--coding-novel-orf', + action='store_true', + help='Include coding transcripts to find alternative ORFs.' + ) p.add_argument( '--w2f-reassignment', action='store_true', @@ -117,22 +122,26 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None: inclusion_biotypes, exclusion_biotypes = common.load_inclusion_exclusion_biotypes(args) - noncanonical_pool = aa.VariantPeptidePool() + novel_orf_peptide_pool = aa.VariantPeptidePool() orf_pool = [] i = 0 for tx_id in anno.transcripts: tx_model = anno.transcripts[tx_id] - if inclusion_biotypes and \ - tx_model.transcript.biotype not in inclusion_biotypes: - continue - if exclusion_biotypes and \ - tx_model.transcript.biotype in exclusion_biotypes: - continue - if tx_id in proteome: - continue - if tx_model.transcript_len() < args.min_tx_length: - continue + if tx_model.is_protein_coding: + if not args.coding_novel_orf: + pass + else: + if inclusion_biotypes and \ + tx_model.transcript.biotype not in inclusion_biotypes: + continue + if exclusion_biotypes and \ + tx_model.transcript.biotype in exclusion_biotypes: + continue + if tx_id in proteome: + continue + if tx_model.transcript_len() < args.min_tx_length: + continue try: peptides, orfs = call_noncoding_peptide_main( @@ -149,11 +158,11 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None: orf_pool.extend(orfs) for peptide in peptides: - noncanonical_pool.add_peptide(peptide, canonical_peptides, + novel_orf_peptide_pool.add_peptide(peptide, canonical_peptides, cleavage_params) except ReferenceSeqnameNotFoundError as e: if not ReferenceSeqnameNotFoundError.raised: - warning(e.args[0] + ' Make sure your GTF and FASTA files match.') + logger.warning('%s: Make sure your GTF and FASTA files match.', e.args[0]) ReferenceSeqnameNotFoundError.mute() except: logger.error('Exception raised from %s', tx_id) @@ -163,7 +172,7 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None: if i % 5000 == 0: logger.info('%i transcripts processed.', i) - noncanonical_pool.write(args.output_path) + novel_orf_peptide_pool.write(args.output_path) if args.output_orf: with open(args.output_orf, 'w') as handle: write_orf(orf_pool, handle) @@ -223,16 +232,25 @@ def call_noncoding_peptide_main(tx_id:str, tx_model:TranscriptAnnotationModel, name=label ) ) - orfs = get_orf_sequences(pgraph, tx_id, tx_model.gene_id, tx_seq) + orfs = get_orf_sequences( + pgraph=pgraph, + tx_id=tx_id, + gene_id=tx_model.gene_id, + tx_seq=tx_seq, + exclude_canonical_orf=False + ) return peptides, orfs def get_orf_sequences(pgraph:svgraph.PeptideVariantGraph, tx_id:str, gene_id:str, - tx_seq:DNASeqRecordWithCoordinates) -> List[aa.AminoAcidSeqRecord]: + tx_seq:DNASeqRecordWithCoordinates, exclude_canonical_orf:bool + ) -> List[aa.AminoAcidSeqRecord]: """ Get the full ORF sequences """ seqs = [] translate_seqs = [tx_seq[i:].translate() for i in range(3)] for orf, orf_id in pgraph.orf_id_map.items(): orf_start = orf[0] + if exclude_canonical_orf and tx_seq.orf and tx_seq.orf.start == orf_start: + continue seq_start = int(orf_start / 3) reading_frame_index = orf_start % 3 #seq_start = int((orf_start - node.orf[0]) / 3) diff --git a/moPepGen/cli/call_variant_peptide.py b/moPepGen/cli/call_variant_peptide.py index fc0456d9..b95fe9cb 100644 --- a/moPepGen/cli/call_variant_peptide.py +++ b/moPepGen/cli/call_variant_peptide.py @@ -83,6 +83,11 @@ def add_subparser_call_variant(subparsers:argparse._SubParsersAction): action='store_true', help='For circRNA, only keep noncanonical peptides spaning the backsplicing site.' ) + p.add_argument( + '--coding-novel-orf', + action='store_true', + help='Find alternative start site for coding transcripts.' + ) p.add_argument( '--max-variants-per-node', type=int, @@ -320,6 +325,7 @@ def call_variant_peptides_wrapper(tx_id:str, w2f_reassignment:bool, backsplicing_only:bool, save_graph:bool, + coding_novel_orf:bool, **kwargs ) -> Tuple[Dict[Seq, List[AnnotatedPeptideLabel]], str, TypeDGraphs, TypePGraphs]: """ wrapper function to call variant peptides """ @@ -354,7 +360,8 @@ def add_peptide_anno(x:Dict[Seq, List[AnnotatedPeptideLabel]]): truncate_sec=truncate_sec, w2f=w2f_reassignment, denylist=denylist, - save_graph=save_graph + save_graph=save_graph, + coding_novel_orf=coding_novel_orf ) main_peptides = set(peptide_map.keys()) dgraphs = (dgraph, dgraphs[1], dgraphs[2]) @@ -385,7 +392,7 @@ def add_peptide_anno(x:Dict[Seq, List[AnnotatedPeptideLabel]]): variant=variant, variant_pool=variant_pool, ref=reference_data, tx_seqs=tx_seqs, gene_seqs=gene_seqs, cleavage_params=cleavage_params, max_adjacent_as_mnv=max_adjacent_as_mnv, w2f_reassignment=w2f_reassignment, - denylist=denylist, save_graph=save_graph + denylist=denylist, save_graph=save_graph, coding_novel_orf=coding_novel_orf ) dgraphs[1][variant.id] = dgraph pgraphs[1][variant.id] = pgraph @@ -559,7 +566,8 @@ def call_variant_peptide(args:argparse.Namespace) -> None: 'timeout': args.timeout_seconds, 'max_variants_per_node': tuple(args.max_variants_per_node), 'additional_variants_per_misc': tuple(args.additional_variants_per_misc), - 'backsplicing_only': args.backsplicing_only + 'backsplicing_only': args.backsplicing_only, + 'coding_novel_orf': args.coding_novel_orf } dispatches.append(dispatch) @@ -644,7 +652,7 @@ def call_peptide_main(tx_id:str, tx_variants:List[seqvar.VariantRecord], gene_seqs:Dict[str, dna.DNASeqRecordWithCoordinates], cleavage_params:params.CleavageParams, max_adjacent_as_mnv:bool, truncate_sec:bool, w2f:bool, - denylist:Set[str], save_graph:bool + denylist:Set[str], save_graph:bool, coding_novel_orf:bool ) -> TypeCallPeptideReturnData: """ Call variant peptides for main variants (except cirRNA). """ tx_model = ref.anno.transcripts[tx_id] @@ -671,12 +679,30 @@ def call_peptide_main(tx_id:str, tx_variants:List[seqvar.VariantRecord], pgraph = dgraph.translate() pgraph.create_cleavage_graph() - peptide_map = pgraph.call_variant_peptides( - denylist=denylist, - truncate_sec=truncate_sec, - w2f=w2f, - check_external_variants=True - ) + + if tx_model.is_protein_coding: + peptide_map = pgraph.call_variant_peptides( + denylist=denylist, + truncate_sec=truncate_sec, + w2f=w2f, + check_external_variants=True, + check_orf=False + ) + else: + peptide_map = {} + + if not tx_model.is_protein_coding or coding_novel_orf: + peptide_novel_orf = pgraph.call_variant_peptides( + denylist=denylist, + truncate_sec=truncate_sec, + w2f=w2f, + check_external_variants=True, + check_orf=True + ) + for peptide, labels in peptide_novel_orf.items(): + if peptide not in peptide_map: + peptide_map[peptide] = labels + if not save_graph: dgraph, pgraph = None, None return peptide_map, dgraph, pgraph @@ -687,7 +713,8 @@ def call_peptide_fusion(variant:seqvar.VariantRecord, tx_seqs:Dict[str, dna.DNASeqRecordWithCoordinates], gene_seqs:Dict[str, dna.DNASeqRecordWithCoordinates], cleavage_params:params.CleavageParams, max_adjacent_as_mnv:bool, - w2f_reassignment:bool, denylist:Set[str], save_graph:bool + w2f_reassignment:bool, denylist:Set[str], save_graph:bool, + coding_novel_orf:bool ) -> TypeCallPeptideReturnData: """ Call variant peptides for fusion """ tx_id = variant.location.seqname @@ -730,11 +757,28 @@ def call_peptide_fusion(variant:seqvar.VariantRecord, dgraph.fit_into_codons() pgraph = dgraph.translate() pgraph.create_cleavage_graph() - peptide_map = pgraph.call_variant_peptides( - denylist=denylist, - w2f=w2f_reassignment, - check_external_variants=True - ) + + if tx_model.is_protein_coding: + peptide_map = pgraph.call_variant_peptides( + denylist=denylist, + w2f=w2f_reassignment, + check_external_variants=True, + check_orf=False + ) + else: + peptide_map = {} + + if not tx_model.is_protein_coding or coding_novel_orf: + peptide_novel_orf = pgraph.call_variant_peptides( + denylist=denylist, + w2f=w2f_reassignment, + check_external_variants=True, + check_orf=True + ) + for peptide, labels in peptide_novel_orf.items(): + if peptide not in peptide_map: + peptide_map[peptide] = labels + if not save_graph: dgraph, pgraph = None, None return peptide_map, dgraph, pgraph diff --git a/moPepGen/constant.py b/moPepGen/constant.py index 83231415..387f2899 100644 --- a/moPepGen/constant.py +++ b/moPepGen/constant.py @@ -1,4 +1,62 @@ """ moPepGen constants """ +from enum import Enum + + +class VariantPrefix(Enum): + """ Varinat Prefix""" + SNV = 'SNV' + INDEL = 'INDEL' + MNV = 'MNV' + RES = 'RES' + SE = 'SE' + RI = 'RI' + A3SS = 'A3SS' + A5SS = 'A5SS' + MXE = 'MXE' + W2F = 'W2F' + SECT = 'SECT' + CIRC = 'CIRC' + CI = 'CI' + FUSION = 'FUSION' + + def __str__(self): + """ str """ + return self.value + + @classmethod + def ctbv(cls): + """ Conserved Transcript Backbone Variants """ + return [ + cls.SNV, + cls.INDEL, + cls.MNV, + cls.RES, + cls.SE, + cls.RI, + cls.A3SS, + cls.A5SS, + cls.MXE, + cls.W2F, + cls.SECT + ] + + @classmethod + def ntbv(cls): + """ Novel Transcript Backbone Variants """ + return [ + cls.CIRC, + cls.CI, + cls.FUSION + ] + + @classmethod + def alt_translation(cls): + """ Alternative translation variants """ + return [ + cls.W2F, + cls.SECT + ] + # Variant related constants SINGLE_NUCLEOTIDE_SUBSTITUTION = ['SNV', 'SNP', 'INDEL', 'MNV', 'RNAEditingSite'] ATTRS_POSITION = ['START', 'DONOR_START', 'ACCEPTER_START', 'ACCEPTER_POSITION'] diff --git a/moPepGen/data/gencode_hs_exclusion_list.txt b/moPepGen/data/gencode_hs_exclusion_list.txt index 1eb2deb4..a1487748 100644 --- a/moPepGen/data/gencode_hs_exclusion_list.txt +++ b/moPepGen/data/gencode_hs_exclusion_list.txt @@ -19,4 +19,4 @@ rRNA_pseudogene misc_RNA_pseudogene miRNA_pseudogene artifact -vaultRNA/vault_RNA \ No newline at end of file +vaultRNA/vault_RNA diff --git a/moPepGen/svgraph/PeptideVariantGraph.py b/moPepGen/svgraph/PeptideVariantGraph.py index 4c05d8d4..216f7b64 100644 --- a/moPepGen/svgraph/PeptideVariantGraph.py +++ b/moPepGen/svgraph/PeptideVariantGraph.py @@ -795,7 +795,7 @@ def call_variant_peptides(self, check_variants:bool=True, check_orf:bool=False, keep_all_occurrence:bool=True, denylist:Set[str]=None, circ_rna:CircRNAModel=None, orf_assignment:str='max', backsplicing_only:bool=False, truncate_sec:bool=False, w2f:bool=False, - check_external_variants:bool=True + check_external_variants:bool=True, find_ass:bool=False ) -> Dict[Seq, List[AnnotatedPeptideLabel]]: """ Walk through the graph and find all noncanonical peptides. @@ -823,6 +823,7 @@ def call_variant_peptides(self, check_variants:bool=True, harbouring any external variants will also be outputted. This can be used when calling noncanonical peptides from noncoding transcripts. + - find_ass (bool): Whether to find alternative start sites. """ if self.is_circ_rna() and not circ_rna: raise ValueError('`circ_rna` must be given.') @@ -844,7 +845,8 @@ def call_variant_peptides(self, check_variants:bool=True, check_variants=check_external_variants, check_orf=check_orf, queue=queue, pool=peptide_pool, circ_rna=circ_rna, orf_assignment=orf_assignment, - backsplicing_only=backsplicing_only + backsplicing_only=backsplicing_only, + find_ass=find_ass ) if self.has_known_orf(): @@ -870,7 +872,7 @@ def call_variant_peptides(self, check_variants:bool=True, self.call_and_stage_silently( cursor=cur, traversal=traversal ) - elif self.has_known_orf(): + elif self.has_known_orf() and not traversal.find_ass: # add out nodes to the queue self.call_and_stage_known_orf( cursor=cur, @@ -888,7 +890,8 @@ def call_variant_peptides(self, check_variants:bool=True, peptide_pool.translational_modification(w2f, self.denylist) return peptide_pool.get_peptide_sequences( - keep_all_occurrence=keep_all_occurrence, orf_id_map=self.orf_id_map, + keep_all_occurrence=keep_all_occurrence, + orf_id_map=self.orf_id_map, check_variants=check_variants ) @@ -1207,6 +1210,8 @@ def call_and_stage_unknown_orf(self, cursor:PVGCursor, traversal.stage(target_node, out_node, cursor) for node, orfs, is_start_codon, additional_variants in node_list: + if traversal.find_ass and any(o == traversal.known_orf_tx[0] for o in orfs): + continue traversal.pool.add_miscleaved_sequences( node=node, orfs=orfs, @@ -1318,7 +1323,6 @@ def __init__(self, in_node:PVGNode, out_node:PVGNode, in_cds:bool, self.finding_start_site = finding_start_site - class PVGTraversal(): """ PVG Traversal. The purpose of this class is to facilitate the graph traversal to call variant peptides. @@ -1328,7 +1332,8 @@ def __init__(self, check_variants:bool, check_orf:bool, known_orf_aa:Tuple[int,int]=None, circ_rna:CircRNAModel=None, queue:Deque[PVGCursor]=None, stack:Dict[PVGNode, Dict[PVGNode, PVGCursor]]=None, - orf_assignment:str='max', backsplicing_only:bool=False): + orf_assignment:str='max', backsplicing_only:bool=False, + find_ass:bool=False): """ constructor """ self.check_variants = check_variants self.check_orf = check_orf @@ -1340,6 +1345,7 @@ def __init__(self, check_variants:bool, check_orf:bool, self.stack = stack or {} self.orf_assignment = orf_assignment self.backsplicing_only = backsplicing_only + self.find_ass = find_ass def is_done(self) -> bool: """ Check if the traversal is done """ @@ -1424,6 +1430,12 @@ def cmp_unknown_orf(x:PVGCursor, y:PVGCursor) -> bool: def cmp_unknown_orf_check_orf(self, x:PVGCursor, y:PVGCursor) -> bool: """ comparison for unknown ORF and check ORFs. """ # pylint: disable=R0911 + if self.find_ass: + if any(o.orf[0] == self.known_orf_tx[0] for o in x.orfs): + return -1 + if any(o.orf[0] == self.known_orf_tx[0] for o in y.orfs): + return 1 + if x.in_cds and not y.in_cds: return -1 if not x.in_cds and y.in_cds: @@ -1431,8 +1443,8 @@ def cmp_unknown_orf_check_orf(self, x:PVGCursor, y:PVGCursor) -> bool: if not x.in_cds and not y.in_cds: return -1 - x_orf = x.orfs[0] - y_orf = y.orfs[0] + x_orf = x.orfs[0].orf + y_orf = y.orfs[0].orf if x_orf[0] is not None and (y_orf[0] is None or y_orf[0] == -1): return -1 if (x_orf[0] is None or x_orf[0] == -1) and y_orf[0] is not None: @@ -1454,9 +1466,14 @@ def cmp_unknown_orf_check_orf(self, x:PVGCursor, y:PVGCursor) -> bool: return -1 if sorted(x_start_gain)[0] > sorted(y_start_gain)[0] else 1 return -1 - @staticmethod - def comp_unknown_orf_keep_all_orfs(x:PVGCursor, y:PVGCursor) -> bool: + def comp_unknown_orf_keep_all_orfs(self, x:PVGCursor, y:PVGCursor) -> bool: """ comparison when all ORFs want to be kept (for circRNA) """ + if self.find_ass: + if any(o.orf[0] == self.known_orf_tx[0] for o in x.orfs): + return -1 + if any(o.orf[0] == self.known_orf_tx[0] for o in y.orfs): + return 1 + if x.in_cds and not y.in_cds: return -1 if not x.in_cds and y.in_cds: diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py index a5fbb106..94b41de0 100644 --- a/moPepGen/svgraph/VariantPeptideDict.py +++ b/moPepGen/svgraph/VariantPeptideDict.py @@ -48,6 +48,12 @@ def __init__(self, query:FeatureLocation, ref:FeatureLocation, feature_type:str, self.feature_id = feature_id self.variant_id = variant_id + def __repr__(self): + """ str """ + return f"" + def merge(self, other:PeptideSegment) -> PeptideSegment: """ merge """ query = FeatureLocation( diff --git a/moPepGen/util/validate_noncoding_calling.py b/moPepGen/util/validate_noncoding_calling.py index 667eedd6..2e1808fe 100644 --- a/moPepGen/util/validate_noncoding_calling.py +++ b/moPepGen/util/validate_noncoding_calling.py @@ -82,6 +82,7 @@ def call_noncoding(ref_dir:Path, output_fasta:Path): args.output_orf = None args.output_path = output_fasta args.min_tx_length = 21 + args.coding_novel_orf = False args.inclusion_biotypes = None args.exclusion_biotypes = None args.quiet = True diff --git a/moPepGen/util/validate_variant_calling.py b/moPepGen/util/validate_variant_calling.py index dac6b8eb..e292beee 100644 --- a/moPepGen/util/validate_variant_calling.py +++ b/moPepGen/util/validate_variant_calling.py @@ -106,6 +106,7 @@ def call_variant(gvf_files:Path, ref_dir:Path, output_fasta:Path, graph_output_d args.reference_source = None args.max_adjacent_as_mnv = 2 args.backsplicing_only = False + args.coding_novel_orf = False args.selenocysteine_termination = True args.w2f_reassignment = True args.invalid_protein_as_noncoding = False diff --git a/test/integration/test_call_alt_translation.py b/test/integration/test_call_alt_translation.py index 58c34bf3..86dd62ca 100644 --- a/test/integration/test_call_alt_translation.py +++ b/test/integration/test_call_alt_translation.py @@ -68,7 +68,7 @@ def test_call_alt_translation_case1(self): var_labels = functools.reduce( lambda x,y: x + y, - [vpi.parse_variant_peptide_id(x) for x in headers] + [vpi.parse_variant_peptide_id(x, set()) for x in headers] ) self.assertTrue(all(isinstance(x, vpi.BaseVariantPeptideIdentifier) for x in var_labels)) diff --git a/test/integration/test_call_noncoding_peptides.py b/test/integration/test_call_noncoding_peptides.py index 2f036f0e..93e89384 100644 --- a/test/integration/test_call_noncoding_peptides.py +++ b/test/integration/test_call_noncoding_peptides.py @@ -5,7 +5,7 @@ import sys from test.integration import TestCaseIntegration from Bio import SeqIO -from moPepGen import cli +from moPepGen import cli, gtf, dna, aa def create_base_args() -> argparse.Namespace: @@ -19,8 +19,10 @@ def create_base_args() -> argparse.Namespace: args.reference_source = None args.output_path = None args.output_orf = None + args.coding_novel_orf = False args.inclusion_biotypes = None args.exclusion_biotypes = None + args.coing_novel_orf = False args.min_tx_length = 21 args.orf_assignment = 'max' args.w2f_reassignment = False @@ -108,3 +110,47 @@ def test_call_noncoding_peptides_w2f(self): self.assertEqual(len(ids),len(set(ids))) self.assertTrue(peptides[0].id.split('|')[0] != 'None') self.assertTrue(any('W2F' in x for x in ids)) + + def test_call_noncoding_peptides_coding(self): + """ Test calling for novel ORF peptides from coding TXs """ + args = create_base_args() + args.w2f_reassignment = True + args.genome_fasta = self.data_dir/'genome.fasta' + args.annotation_gtf = self.data_dir/'annotation.gtf' + args.proteome_fasta = self.data_dir/'translate.fasta' + args.output_path = self.work_dir/'noncoding_peptide.fasta' + args.output_orf = self.work_dir/'noncoding_orf.fasta' + args.include_coding = True + cli.call_noncoding_peptide(args) + files = {str(file.name) for file in self.work_dir.glob('*')} + expected = {args.output_path.name, args.output_orf.name} + self.assertEqual(files, expected) + + anno = gtf.GenomicAnnotation() + anno.dump_gtf(args.annotation_gtf) + + genome = dna.DNASeqDict() + genome.dump_fasta(args.genome_fasta) + + proteome = aa.AminoAcidSeqDict() + proteome.dump_fasta(args.proteome_fasta) + canonical_peptides = proteome.create_unique_peptide_pool( + anno=anno, rule='trypsin', exception='trypsin_exception' + ) + canonical_peptides = set(canonical_peptides) + + with open(args.output_path, 'rt') as handle: + novel_peptides = list(SeqIO.parse(handle, 'fasta')) + + # assert that the novel orf peptides don't overlap with the canonical + # peptide + self.assertEqual( + len({str(x.seq) for x in novel_peptides}.intersection(canonical_peptides)), + 0 + ) + + # assert that novel peptides are called from coding transcripts + novel_orf_txs = set().union( + *[{y.split('|')[0] for y in x.description.split(' ')} for x in novel_peptides] + ) + self.assertTrue(any(x in proteome for x in novel_orf_txs)) diff --git a/test/integration/test_call_variant_peptides.py b/test/integration/test_call_variant_peptides.py index 4e9ed430..8081e297 100644 --- a/test/integration/test_call_variant_peptides.py +++ b/test/integration/test_call_variant_peptides.py @@ -26,6 +26,7 @@ def create_base_args() -> argparse.Namespace: args.graph_output_dir = None args.max_adjacent_as_mnv = 0 args.backsplicing_only = False + args.coding_novel_orf = False args.selenocysteine_termination = False args.w2f_reassignment = False args.max_variants_per_node = [7] @@ -55,7 +56,7 @@ def assert_no_canonical_peptide_with_circ(self, seqs): """ Assert that no canonical peptide with circRNA """ no_canonical_peptides_with_circ = False for seq in seqs: - labels = vpi.parse_variant_peptide_id(seq.description) + labels = vpi.parse_variant_peptide_id(seq.description, set()) for label in labels: if isinstance(label, vpi.CircRNAVariantPeptideIdentifier): for other_label in labels: @@ -641,7 +642,7 @@ def test_call_variant_peptide_case31(self): self.default_test_case(gvf, reference, None) has_incorrect_fasta_header = False for peptide in SeqIO.parse(self.work_dir/'vep_moPepGen.fasta', 'fasta'): - labels = vpi.parse_variant_peptide_id(peptide.description) + labels = vpi.parse_variant_peptide_id(peptide.description, set()) for label in labels: if not isinstance(label, vpi.CircRNAVariantPeptideIdentifier): continue diff --git a/test/unit/test_variant_peptide_id.py b/test/unit/test_variant_peptide_id.py index 2a834903..1cb49569 100644 --- a/test/unit/test_variant_peptide_id.py +++ b/test/unit/test_variant_peptide_id.py @@ -115,9 +115,10 @@ def test_create_variant_id_orf(self): def test_parse_variant_id_single_snv(self): """ parse variant from single snv """ + coding_txs = {} label = 'ENST0001|SNV-101-A-T|1' peptide_ids:List[pi.BaseVariantPeptideIdentifier] - peptide_ids = aa.parse_variant_peptide_id(label) + peptide_ids = aa.parse_variant_peptide_id(label, coding_txs) self.assertEqual(len(peptide_ids), 1) self.assertIsInstance(peptide_ids[0], pi.BaseVariantPeptideIdentifier) self.assertEqual(peptide_ids[0].transcript_id, 'ENST0001') @@ -126,9 +127,10 @@ def test_parse_variant_id_single_snv(self): def test_parse_variant_id_multiple_snv(self): """ parse variant from multiple snv """ + coding_txs = {} label = 'ENST0001|SNV-101-A-T|SNV-111-A-T|1' peptide_ids:List[pi.BaseVariantPeptideIdentifier] - peptide_ids = aa.parse_variant_peptide_id(label) + peptide_ids = aa.parse_variant_peptide_id(label, coding_txs) self.assertEqual(len(peptide_ids), 1) self.assertIsInstance(peptide_ids[0], pi.BaseVariantPeptideIdentifier) self.assertEqual(peptide_ids[0].transcript_id, 'ENST0001') @@ -137,9 +139,10 @@ def test_parse_variant_id_multiple_snv(self): def test_parse_variant_id_single_fusion(self): """ parse variant from single fusion """ + coding_txs = {} label = 'FUSION-ENSG0001:101-ENSG0002:201|1' peptide_ids:List[pi.FusionVariantPeptideIdentifier] - peptide_ids = aa.parse_variant_peptide_id(label) + peptide_ids = aa.parse_variant_peptide_id(label, coding_txs) self.assertEqual(len(peptide_ids), 1) self.assertIsInstance(peptide_ids[0], pi.FusionVariantPeptideIdentifier) self.assertEqual(peptide_ids[0].fusion_id, 'FUSION-ENSG0001:101-ENSG0002:201') @@ -149,9 +152,10 @@ def test_parse_variant_id_single_fusion(self): def test_parse_variant_id_fusion_and_first_variant(self): """ parse variant from single fusion """ + coding_txs = {} label = 'FUSION-ENSG0001:101-ENSG0002:201|1-SNV-98-A-T|1' peptide_ids:List[pi.FusionVariantPeptideIdentifier] - peptide_ids = aa.parse_variant_peptide_id(label) + peptide_ids = aa.parse_variant_peptide_id(label, coding_txs) self.assertEqual(len(peptide_ids), 1) self.assertIsInstance(peptide_ids[0], pi.FusionVariantPeptideIdentifier) self.assertEqual(peptide_ids[0].fusion_id, 'FUSION-ENSG0001:101-ENSG0002:201') @@ -161,9 +165,10 @@ def test_parse_variant_id_fusion_and_first_variant(self): def test_parse_variant_id_fusion_and_second_variant(self): """ parse variant from single fusion """ + coding_txs = {} label = 'FUSION-ENSG0001:101-ENSG0002:201|2-SNV-210-A-T|1' peptide_ids:List[pi.FusionVariantPeptideIdentifier] - peptide_ids = aa.parse_variant_peptide_id(label) + peptide_ids = aa.parse_variant_peptide_id(label, coding_txs) self.assertEqual(len(peptide_ids), 1) self.assertIsInstance(peptide_ids[0], pi.FusionVariantPeptideIdentifier) self.assertEqual(peptide_ids[0].fusion_id, 'FUSION-ENSG0001:101-ENSG0002:201') @@ -173,10 +178,11 @@ def test_parse_variant_id_fusion_and_second_variant(self): def test_parse_variant_id_fusion_and_first_and_second_variant(self): """ parse variant from single fusion """ + coding_txs = {} label = 'FUSION-ENSG0001:101-ENSG0002:201|1-SNV-98-A-T|' +\ '2-SNV-210-A-T|2-SNV-215-A-T|1' peptide_ids:List[pi.FusionVariantPeptideIdentifier] - peptide_ids = aa.parse_variant_peptide_id(label) + peptide_ids = aa.parse_variant_peptide_id(label, coding_txs) self.assertEqual(len(peptide_ids), 1) self.assertIsInstance(peptide_ids[0], pi.FusionVariantPeptideIdentifier) self.assertEqual(peptide_ids[0].fusion_id, 'FUSION-ENSG0001:101-ENSG0002:201') @@ -186,8 +192,9 @@ def test_parse_variant_id_fusion_and_first_and_second_variant(self): def test_parse_variant_id_single_circ_rna(self): """ parse variant from single fusion """ + coding_txs = {} label = 'CIRC-ENSG0001-E2-E3|1' - peptide_ids = aa.parse_variant_peptide_id(label) + peptide_ids = aa.parse_variant_peptide_id(label, coding_txs) self.assertEqual(len(peptide_ids), 1) self.assertIsInstance(peptide_ids[0], pi.CircRNAVariantPeptideIdentifier) peptide_ids:List[pi.CircRNAVariantPeptideIdentifier] @@ -197,8 +204,9 @@ def test_parse_variant_id_single_circ_rna(self): def test_parse_variant_id_circ_rna_and_snv(self): """ parse variant from single fusion """ + coding_txs = {} label = 'CIRC-ENSG0001-E2-E3|SNV-111-A-T|2' - peptide_ids = aa.parse_variant_peptide_id(label) + peptide_ids = aa.parse_variant_peptide_id(label, coding_txs) self.assertEqual(len(peptide_ids), 1) self.assertIsInstance(peptide_ids[0], pi.CircRNAVariantPeptideIdentifier) peptide_ids:List[pi.CircRNAVariantPeptideIdentifier] @@ -208,11 +216,12 @@ def test_parse_variant_id_circ_rna_and_snv(self): def test_parse_variant_id_orf(self): """ parse variant with orf """ + coding_txs = {} label = 'ENST0001|ENSG0001|ORF1|1' - peptide_ids = aa.parse_variant_peptide_id(label) + peptide_ids = aa.parse_variant_peptide_id(label, coding_txs) self.assertEqual(len(peptide_ids), 1) - self.assertIsInstance(peptide_ids[0], pi.NoncodingPeptideIdentifier) - peptide_ids:List[pi.NoncodingPeptideIdentifier] + self.assertIsInstance(peptide_ids[0], pi.NovelORFPeptideIdentifier) + peptide_ids:List[pi.NovelORFPeptideIdentifier] self.assertEqual(peptide_ids[0].transcript_id, 'ENST0001') self.assertEqual(peptide_ids[0].orf_id, 'ORF1') self.assertEqual(str(peptide_ids[0]), label)