Merge pull request #862 from uclahs-cds/czhu-ass

Novel ORF from coding transcripts
uclahs-cds · Mar 22, 2024 · 6079e4f · 6079e4f
2 parents 3d6df24 + e8a2cf5
commit 6079e4f
Show file tree

Hide file tree

Showing 18 changed files with 415 additions and 109 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,10 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [Unreleased]
 
+## [1.3.2]
+
+- `--include-coding` and `--find-ass` added to `callNoncoding` and `callVariant` to call novel ORF peptides from coding transcripts. #659
+
 ## [1.3.1] - 2024-03-18
 
 ### Added:

diff --git a/docs/file-format.md b/docs/file-format.md
@@ -212,7 +212,7 @@ The ID of circRNAs consists of two components. They all start with \<transcript_
 
 ## 2 Variant Peptide FASTA
 
-In moPepGen, the headers of the final output variant peptide FASTA contain the transcript IDs and variants associated with this variant peptide. The header of a peptide record starts with the transcript backbone ID and is followed by the variant IDs that it is associated with, separated by '|'. The variant IDs are defined in the GVF files. In some cases, several non-canonical peptides from the same transcript may share the same variants. This is most common in cases of peptide miscleavages. In addition, a frameshifting variant may cause multiple non-canonical peptides. An integer index is thus always added to the end of each header entry to resolve redundancies.
+In moPepGen, the headers of the final output variant peptide FASTA contain the transcript IDs and variants associated with this variant peptide. The header of a peptide record starts with the transcript backbone ID and is followed by the variant IDs that it is associated with, separated by '|'. The variant IDs are defined in the GVF files. An ORF ID is added if the peptide is found from a novel ORF of the transcript. In some cases, several non-canonical peptides from the same transcript may share the same variants. This is most common in cases of peptide miscleavages. In addition, a frameshifting variant may cause multiple non-canonical peptides. An integer index is thus always added to the end of each header entry to resolve redundancies.
 
 If the same peptide is found in multiple transcripts, all are documented as separate entries in the fasta header, separated by space.
 

diff --git a/moPepGen/__init__.py b/moPepGen/__init__.py
@@ -8,7 +8,7 @@
 from . import constant
 
 
-__version__ = '1.3.1'
+__version__ = '1.3.2'
 
 ## Error messages
 ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'

diff --git a/moPepGen/aa/PeptidePoolSplitter.py b/moPepGen/aa/PeptidePoolSplitter.py
@@ -11,7 +11,6 @@
 
 if TYPE_CHECKING:
     from .AminoAcidSeqRecord import AminoAcidSeqRecord
-    from moPepGen.gtf import GenomicAnnotation
 
 Databases = Dict[str,VariantPeptidePool]
 

diff --git a/moPepGen/aa/VariantPeptideIdentifier.py b/moPepGen/aa/VariantPeptideIdentifier.py
@@ -1,8 +1,9 @@
 """ Module for VariantPeptideIdentifier """
 from __future__ import annotations
 from abc import ABC, abstractmethod
-from typing import Dict, List, TYPE_CHECKING
+from typing import Dict, List, TYPE_CHECKING, Set
 from moPepGen import VARIANT_PEPTIDE_SOURCE_DELIMITER
+from moPepGen.constant import VariantPrefix
 
 if TYPE_CHECKING:
     from moPepGen.seqvar import VariantRecord
@@ -60,7 +61,13 @@ def create_variant_peptide_id(transcript_id:str, variants:List[VariantRecord],
         label = BaseVariantPeptideIdentifier(transcript_id, variant_ids, orf_id, index, gene_id)
     return str(label)
 
-def parse_variant_peptide_id(label:str) -> List[VariantPeptideIdentifier]:
+def _set_identifier_type(t, v):
+    """ Set identifier type """
+    if t is not None:
+        raise ValueError(f"VariantType is not None: {t}")
+    return v
+
+def parse_variant_peptide_id(label:str, coding_txs:Set[str]) -> List[VariantPeptideIdentifier]:
     """ Parse variant peptide identifier from peptide name.
 
     Examples:
@@ -69,51 +76,145 @@ def parse_variant_peptide_id(label:str) -> List[VariantPeptideIdentifier]:
         [<BaseVariantPeptideIdentifier>: 'ENST0001|SNV-100-A-T|1']
 
     """
+    # FASTA header examples:
+    # ENST0001|SNV-50-A-T|1
+    # ENST0001|SNV-50-A-T|INDEL-55-CC-C|2
+    # ENST0001|SNV-50-A-T|ORF2|1
+    # FASTA headers have 5 fields below separate by | :
+    # - (Required) Transcript backbone ID (tx ID, fusion ID, or circRNA ID)
+    # - (Optioanl) Gene ID, required for novel ORF peptides without additional variants.
+    # - (Optional) Variant IDs separated by |
+    # - (Optional) ORF ID, required for novel ORF peptides.
+    # - (Required) peptide index.
     variant_ids = []
     for it in label.split(VARIANT_PEPTIDE_SOURCE_DELIMITER):
         fields = it.split('|')
-        if fields[-2].startswith('ORF'):
-            # Noncoding peptide
-            tx_id, gene_id, *codon_reassigns, orf_id, index = fields
-            variant_id = NoncodingPeptideIdentifier(
-                transcript_id=tx_id, gene_id=gene_id,
-                codon_reassigns=codon_reassigns,
-                orf_id=orf_id, index=int(index)
-            )
+        gene_id = None
+        backbone_id = None
+        var_ids:Dict[int, List[str]] = {}
+        alt_ids = []
+        orf_id = None
 
-        else:
-            x_id, *var_ids, index = fields
-
-            orf_id = None
-            if len(var_ids) > 0 and var_ids[0].startswith('ORF'):
-                orf_id = var_ids.pop(0)
-
-            if x_id.startswith('FUSION-'):
-                first_variants:List[str] = []
-                second_variants:List[str] = []
-                peptide_variants:List[str] = []
-                for var_id in var_ids:
-                    if var_id.startswith('1-') or var_id.startswith('2-'):
-                        which_gene, var_id = var_id.split('-', 1)
-                        if int(which_gene) == 1:
-                            first_variants.append(var_id)
-                        elif int(which_gene) == 2:
-                            second_variants.append(var_id)
-                    else:
-                        peptide_variants.append(var_id)
-                variant_id = FusionVariantPeptideIdentifier(x_id, first_variants,
-                    second_variants, peptide_variants, orf_id, index)
-            elif x_id.startswith('CIRC-') or x_id.startswith('CI-'):
-                variant_id = CircRNAVariantPeptideIdentifier(x_id, var_ids, orf_id, index)
+        try:
+            index = int(fields[-1])
+            fields.pop()
+        except ValueError:
+            index = None
+
+        IdentifierType:VariantPeptideIdentifier = None
+
+        # Step 1. Iterate over the fields in a FASTA header separated by | and
+        # assign them into the 5 variables.
+        for i, field in enumerate(fields):
+            # The first field is always the backbone ID, and if its prefix is
+            # FUSION-, CIRC-, or CI-, this must be a Fusion or circRNA entry.
+            if field.startswith(str(VariantPrefix.FUSION)):
+                if i != 0:
+                    raise ValueError(f"FASTA header isn't valid: {it}")
+                backbone_id = field
+                IdentifierType = _set_identifier_type(
+                    IdentifierType,
+                    FusionVariantPeptideIdentifier
+                )
+
+            elif field.startswith(str(VariantPrefix.CI)) \
+                    or field.startswith(str(VariantPrefix.CIRC)):
+                if i != 0:
+                    raise ValueError(f"FASTA header isn't valid: {it}")
+                backbone_id = field
+                IdentifierType = _set_identifier_type(
+                    IdentifierType,
+                    CircRNAVariantPeptideIdentifier
+                )
+
+            elif field.startswith('ORF'):
+                orf_id = field
+
+            elif IdentifierType is FusionVariantPeptideIdentifier:
+                if field.startswith('1-') or field.startswith('2-'):
+                    which_gene, var_id = field.split('-', 1)
+                    which_gene = int(which_gene)
+                else:
+                    # SECT or W2F
+                    var_id = field
+                    which_gene = 0
+                if which_gene in var_ids:
+                    var_ids[which_gene].append(var_id)
+                else:
+                    var_ids[which_gene] = [var_id]
+
+            elif any(field.startswith(str(t)) for t in VariantPrefix.alt_translation()):
+                alt_ids.append(field)
+
+            elif any(field.startswith(str(t)) for t in VariantPrefix.ctbv()):
+                if 1 in var_ids:
+                    var_ids[1].append(field)
+                else:
+                    var_ids[1] = [field]
+
+        # Step 2. Infer the FASTA header type. If it's not already interpreted in
+        # step 1 (either Fusion or circRNA), it must be either a base variant or
+        # novel ORF peptide.
+        if IdentifierType is None:
+            # `NovelORFPeptideIdentifier` is for novel ORF peptides without any
+            # additional variants. While `BaseVariantPeptideIdentifer` always
+            # have at least one variant.
+            if not var_ids and orf_id is not None:
+                backbone_id = fields[0]
+                if len(fields) > 1:
+                    gene_id = fields[1]
+                IdentifierType = _set_identifier_type(
+                    IdentifierType,
+                    NovelORFPeptideIdentifier
+                )
             else:
-                variant_id = BaseVariantPeptideIdentifier(x_id, var_ids, orf_id, index)
+                backbone_id = fields[0]
+                IdentifierType = _set_identifier_type(
+                    IdentifierType,
+                    BaseVariantPeptideIdentifier
+                )
+
+        # Step 3. Construct the object based on the FASTA header type.
+        if IdentifierType is NovelORFPeptideIdentifier:
+            variant_id = NovelORFPeptideIdentifier(
+                transcript_id=backbone_id,
+                gene_id=gene_id,
+                codon_reassigns=alt_ids,
+                orf_id=orf_id,
+                index=index,
+                is_protein_coding=(backbone_id in coding_txs)
+            )
+        elif IdentifierType is FusionVariantPeptideIdentifier:
+            variant_id = FusionVariantPeptideIdentifier(
+                fusion_id=backbone_id,
+                first_variants=var_ids.get(1, []),
+                second_variants=var_ids.get(2, []),
+                peptide_variants=var_ids.get(0, []),
+                orf_id=orf_id,
+                index=index
+            )
+        elif IdentifierType is CircRNAVariantPeptideIdentifier:
+            variant_id = CircRNAVariantPeptideIdentifier(
+                circ_rna_id=backbone_id,
+                variant_ids=sum(var_ids.values(), []) + alt_ids,
+                orf_id=orf_id,
+                index=index
+            )
+        elif IdentifierType is BaseVariantPeptideIdentifier:
+            variant_id = BaseVariantPeptideIdentifier(
+                transcript_id=backbone_id,
+                variant_ids=sum(var_ids.values(), []) + alt_ids,
+                orf_id=orf_id,
+                index=index,
+                gene_id=gene_id
+            )
 
         variant_ids.append(variant_id)
     return variant_ids
 
 
 class VariantPeptideIdentifier(ABC):
-    """ variant peptide identifer virtual class """
+    """ variant peptide identifier virtual class """
     @abstractmethod
     def __str__(self) -> str:
         """ str """
@@ -205,16 +306,17 @@ def second_tx_id(self):
         _,_,second = self.fusion_id.split('-')
         return second.split(':')[0]
 
-class NoncodingPeptideIdentifier(VariantPeptideIdentifier):
+class NovelORFPeptideIdentifier(VariantPeptideIdentifier):
     """ Noncoding peptide identifier """
     def __init__(self, transcript_id:str, gene_id:str, codon_reassigns:List[str],
-             orf_id:str=None, index:int=None):
+            is_protein_coding:bool, orf_id:str=None, index:int=None):
         """ constructor """
         self.transcript_id = transcript_id
         self.gene_id = gene_id
         self.codon_reassigns = codon_reassigns
         self.orf_id = orf_id
         self.index = index
+        self.is_protein_coding = is_protein_coding
 
     def __str__(self) -> str:
         """ str """

diff --git a/moPepGen/aa/VariantPeptideLabel.py b/moPepGen/aa/VariantPeptideLabel.py
@@ -129,9 +129,9 @@ def from_variant_peptide_minimal(peptide:AminoAcidSeqRecord
             ) -> List[VariantPeptideInfo]:
         """ Create list of VariantPeptideInfo with minimal information. """
         info_list:List[VariantPeptideInfo] = []
-        variant_ids = pi.parse_variant_peptide_id(peptide.description)
+        variant_ids = pi.parse_variant_peptide_id(peptide.description, set())
         for variant_id in variant_ids:
-            if isinstance(variant_id, pi.NoncodingPeptideIdentifier):
+            if isinstance(variant_id, pi.NovelORFPeptideIdentifier):
                 gene_ids = [variant_id.gene_id]
                 var_ids = {}
 
@@ -167,9 +167,9 @@ def from_variant_peptide(peptide:AminoAcidSeqRecord,
         if group_map is None:
             group_map = {}
         info_list = []
-        variant_ids = pi.parse_variant_peptide_id(peptide.description)
+        variant_ids = pi.parse_variant_peptide_id(peptide.description, coding_tx)
         for variant_id in variant_ids:
-            if isinstance(variant_id, pi.NoncodingPeptideIdentifier):
+            if isinstance(variant_id, pi.NovelORFPeptideIdentifier):
                 tx_id = variant_id.transcript_id
                 gene_ids = [variant_id.gene_id]
                 var_ids = {variant_id.gene_id: variant_id.codon_reassigns}
@@ -227,17 +227,17 @@ def from_variant_peptide(peptide:AminoAcidSeqRecord,
 
     def is_fusion(self) -> bool:
         """ Check if this is a fusion """
-        _id = pi.parse_variant_peptide_id(self.original_label)[0]
+        _id = pi.parse_variant_peptide_id(self.original_label, set())[0]
         return isinstance(_id, pi.FusionVariantPeptideIdentifier)
 
     def is_circ_rna(self) -> bool:
         """ Check if this is a circRNA """
-        _id = pi.parse_variant_peptide_id(self.original_label)[0]
+        _id = pi.parse_variant_peptide_id(self.original_label, set())[0]
         return isinstance(_id, pi.CircRNAVariantPeptideIdentifier)
 
     def is_splice_altering(self) -> bool:
         """ Check if the variant paptide label is alternative splicing """
-        _id = pi.parse_variant_peptide_id(self.original_label)[0]
+        _id = pi.parse_variant_peptide_id(self.original_label, set())[0]
         return isinstance(_id, pi.BaseVariantPeptideIdentifier) and \
             _id.is_alternative_splicing()
 
@@ -273,14 +273,14 @@ def any_coding(self, anno:GenomicAnnotation) -> bool:
 
     def get_transcript_ids(self) -> List[str]:
         """ get transcript IDs """
-        variant_id = pi.parse_variant_peptide_id(self.original_label)[0]
+        variant_id = pi.parse_variant_peptide_id(self.original_label, set())[0]
         if isinstance(variant_id, pi.CircRNAVariantPeptideIdentifier):
             return [variant_id.circ_rna_id.split('-', 2)[1]]
         if isinstance(variant_id, pi.FusionVariantPeptideIdentifier):
             return [variant_id.first_tx_id, variant_id.second_tx_id]
         if isinstance(variant_id, pi.BaseVariantPeptideIdentifier):
             return [variant_id.transcript_id]
-        if isinstance(variant_id, pi.NoncodingPeptideIdentifier):
+        if isinstance(variant_id, pi.NovelORFPeptideIdentifier):
             return [variant_id.transcript_id]
         raise ValueError('Variant ID unrecognized')