Skip to content

Commit

Permalink
Merge pull request #862 from uclahs-cds/czhu-ass
Browse files Browse the repository at this point in the history
Novel ORF from coding transcripts
  • Loading branch information
zhuchcn authored Mar 22, 2024
2 parents 3d6df24 + e8a2cf5 commit 6079e4f
Show file tree
Hide file tree
Showing 18 changed files with 415 additions and 109 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## [Unreleased]

## [1.3.2]

- `--include-coding` and `--find-ass` added to `callNoncoding` and `callVariant` to call novel ORF peptides from coding transcripts. #659

## [1.3.1] - 2024-03-18

### Added:
Expand Down
2 changes: 1 addition & 1 deletion docs/file-format.md
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ The ID of circRNAs consists of two components. They all start with \<transcript_

## 2 Variant Peptide FASTA

In moPepGen, the headers of the final output variant peptide FASTA contain the transcript IDs and variants associated with this variant peptide. The header of a peptide record starts with the transcript backbone ID and is followed by the variant IDs that it is associated with, separated by '|'. The variant IDs are defined in the GVF files. In some cases, several non-canonical peptides from the same transcript may share the same variants. This is most common in cases of peptide miscleavages. In addition, a frameshifting variant may cause multiple non-canonical peptides. An integer index is thus always added to the end of each header entry to resolve redundancies.
In moPepGen, the headers of the final output variant peptide FASTA contain the transcript IDs and variants associated with this variant peptide. The header of a peptide record starts with the transcript backbone ID and is followed by the variant IDs that it is associated with, separated by '|'. The variant IDs are defined in the GVF files. An ORF ID is added if the peptide is found from a novel ORF of the transcript. In some cases, several non-canonical peptides from the same transcript may share the same variants. This is most common in cases of peptide miscleavages. In addition, a frameshifting variant may cause multiple non-canonical peptides. An integer index is thus always added to the end of each header entry to resolve redundancies.

If the same peptide is found in multiple transcripts, all are documented as separate entries in the fasta header, separated by space.

Expand Down
2 changes: 1 addition & 1 deletion moPepGen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from . import constant


__version__ = '1.3.1'
__version__ = '1.3.2'

## Error messages
ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'
Expand Down
1 change: 0 additions & 1 deletion moPepGen/aa/PeptidePoolSplitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

if TYPE_CHECKING:
from .AminoAcidSeqRecord import AminoAcidSeqRecord
from moPepGen.gtf import GenomicAnnotation

Databases = Dict[str,VariantPeptidePool]

Expand Down
178 changes: 140 additions & 38 deletions moPepGen/aa/VariantPeptideIdentifier.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
""" Module for VariantPeptideIdentifier """
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Dict, List, TYPE_CHECKING
from typing import Dict, List, TYPE_CHECKING, Set
from moPepGen import VARIANT_PEPTIDE_SOURCE_DELIMITER
from moPepGen.constant import VariantPrefix

if TYPE_CHECKING:
from moPepGen.seqvar import VariantRecord
Expand Down Expand Up @@ -60,7 +61,13 @@ def create_variant_peptide_id(transcript_id:str, variants:List[VariantRecord],
label = BaseVariantPeptideIdentifier(transcript_id, variant_ids, orf_id, index, gene_id)
return str(label)

def parse_variant_peptide_id(label:str) -> List[VariantPeptideIdentifier]:
def _set_identifier_type(t, v):
""" Set identifier type """
if t is not None:
raise ValueError(f"VariantType is not None: {t}")
return v

def parse_variant_peptide_id(label:str, coding_txs:Set[str]) -> List[VariantPeptideIdentifier]:
""" Parse variant peptide identifier from peptide name.
Examples:
Expand All @@ -69,51 +76,145 @@ def parse_variant_peptide_id(label:str) -> List[VariantPeptideIdentifier]:
[<BaseVariantPeptideIdentifier>: 'ENST0001|SNV-100-A-T|1']
"""
# FASTA header examples:
# ENST0001|SNV-50-A-T|1
# ENST0001|SNV-50-A-T|INDEL-55-CC-C|2
# ENST0001|SNV-50-A-T|ORF2|1
# FASTA headers have 5 fields below separate by | :
# - (Required) Transcript backbone ID (tx ID, fusion ID, or circRNA ID)
# - (Optioanl) Gene ID, required for novel ORF peptides without additional variants.
# - (Optional) Variant IDs separated by |
# - (Optional) ORF ID, required for novel ORF peptides.
# - (Required) peptide index.
variant_ids = []
for it in label.split(VARIANT_PEPTIDE_SOURCE_DELIMITER):
fields = it.split('|')
if fields[-2].startswith('ORF'):
# Noncoding peptide
tx_id, gene_id, *codon_reassigns, orf_id, index = fields
variant_id = NoncodingPeptideIdentifier(
transcript_id=tx_id, gene_id=gene_id,
codon_reassigns=codon_reassigns,
orf_id=orf_id, index=int(index)
)
gene_id = None
backbone_id = None
var_ids:Dict[int, List[str]] = {}
alt_ids = []
orf_id = None

else:
x_id, *var_ids, index = fields

orf_id = None
if len(var_ids) > 0 and var_ids[0].startswith('ORF'):
orf_id = var_ids.pop(0)

if x_id.startswith('FUSION-'):
first_variants:List[str] = []
second_variants:List[str] = []
peptide_variants:List[str] = []
for var_id in var_ids:
if var_id.startswith('1-') or var_id.startswith('2-'):
which_gene, var_id = var_id.split('-', 1)
if int(which_gene) == 1:
first_variants.append(var_id)
elif int(which_gene) == 2:
second_variants.append(var_id)
else:
peptide_variants.append(var_id)
variant_id = FusionVariantPeptideIdentifier(x_id, first_variants,
second_variants, peptide_variants, orf_id, index)
elif x_id.startswith('CIRC-') or x_id.startswith('CI-'):
variant_id = CircRNAVariantPeptideIdentifier(x_id, var_ids, orf_id, index)
try:
index = int(fields[-1])
fields.pop()
except ValueError:
index = None

IdentifierType:VariantPeptideIdentifier = None

# Step 1. Iterate over the fields in a FASTA header separated by | and
# assign them into the 5 variables.
for i, field in enumerate(fields):
# The first field is always the backbone ID, and if its prefix is
# FUSION-, CIRC-, or CI-, this must be a Fusion or circRNA entry.
if field.startswith(str(VariantPrefix.FUSION)):
if i != 0:
raise ValueError(f"FASTA header isn't valid: {it}")
backbone_id = field
IdentifierType = _set_identifier_type(
IdentifierType,
FusionVariantPeptideIdentifier
)

elif field.startswith(str(VariantPrefix.CI)) \
or field.startswith(str(VariantPrefix.CIRC)):
if i != 0:
raise ValueError(f"FASTA header isn't valid: {it}")
backbone_id = field
IdentifierType = _set_identifier_type(
IdentifierType,
CircRNAVariantPeptideIdentifier
)

elif field.startswith('ORF'):
orf_id = field

elif IdentifierType is FusionVariantPeptideIdentifier:
if field.startswith('1-') or field.startswith('2-'):
which_gene, var_id = field.split('-', 1)
which_gene = int(which_gene)
else:
# SECT or W2F
var_id = field
which_gene = 0
if which_gene in var_ids:
var_ids[which_gene].append(var_id)
else:
var_ids[which_gene] = [var_id]

elif any(field.startswith(str(t)) for t in VariantPrefix.alt_translation()):
alt_ids.append(field)

elif any(field.startswith(str(t)) for t in VariantPrefix.ctbv()):
if 1 in var_ids:
var_ids[1].append(field)
else:
var_ids[1] = [field]

# Step 2. Infer the FASTA header type. If it's not already interpreted in
# step 1 (either Fusion or circRNA), it must be either a base variant or
# novel ORF peptide.
if IdentifierType is None:
# `NovelORFPeptideIdentifier` is for novel ORF peptides without any
# additional variants. While `BaseVariantPeptideIdentifer` always
# have at least one variant.
if not var_ids and orf_id is not None:
backbone_id = fields[0]
if len(fields) > 1:
gene_id = fields[1]
IdentifierType = _set_identifier_type(
IdentifierType,
NovelORFPeptideIdentifier
)
else:
variant_id = BaseVariantPeptideIdentifier(x_id, var_ids, orf_id, index)
backbone_id = fields[0]
IdentifierType = _set_identifier_type(
IdentifierType,
BaseVariantPeptideIdentifier
)

# Step 3. Construct the object based on the FASTA header type.
if IdentifierType is NovelORFPeptideIdentifier:
variant_id = NovelORFPeptideIdentifier(
transcript_id=backbone_id,
gene_id=gene_id,
codon_reassigns=alt_ids,
orf_id=orf_id,
index=index,
is_protein_coding=(backbone_id in coding_txs)
)
elif IdentifierType is FusionVariantPeptideIdentifier:
variant_id = FusionVariantPeptideIdentifier(
fusion_id=backbone_id,
first_variants=var_ids.get(1, []),
second_variants=var_ids.get(2, []),
peptide_variants=var_ids.get(0, []),
orf_id=orf_id,
index=index
)
elif IdentifierType is CircRNAVariantPeptideIdentifier:
variant_id = CircRNAVariantPeptideIdentifier(
circ_rna_id=backbone_id,
variant_ids=sum(var_ids.values(), []) + alt_ids,
orf_id=orf_id,
index=index
)
elif IdentifierType is BaseVariantPeptideIdentifier:
variant_id = BaseVariantPeptideIdentifier(
transcript_id=backbone_id,
variant_ids=sum(var_ids.values(), []) + alt_ids,
orf_id=orf_id,
index=index,
gene_id=gene_id
)

variant_ids.append(variant_id)
return variant_ids


class VariantPeptideIdentifier(ABC):
""" variant peptide identifer virtual class """
""" variant peptide identifier virtual class """
@abstractmethod
def __str__(self) -> str:
""" str """
Expand Down Expand Up @@ -205,16 +306,17 @@ def second_tx_id(self):
_,_,second = self.fusion_id.split('-')
return second.split(':')[0]

class NoncodingPeptideIdentifier(VariantPeptideIdentifier):
class NovelORFPeptideIdentifier(VariantPeptideIdentifier):
""" Noncoding peptide identifier """
def __init__(self, transcript_id:str, gene_id:str, codon_reassigns:List[str],
orf_id:str=None, index:int=None):
is_protein_coding:bool, orf_id:str=None, index:int=None):
""" constructor """
self.transcript_id = transcript_id
self.gene_id = gene_id
self.codon_reassigns = codon_reassigns
self.orf_id = orf_id
self.index = index
self.is_protein_coding = is_protein_coding

def __str__(self) -> str:
""" str """
Expand Down
18 changes: 9 additions & 9 deletions moPepGen/aa/VariantPeptideLabel.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,9 @@ def from_variant_peptide_minimal(peptide:AminoAcidSeqRecord
) -> List[VariantPeptideInfo]:
""" Create list of VariantPeptideInfo with minimal information. """
info_list:List[VariantPeptideInfo] = []
variant_ids = pi.parse_variant_peptide_id(peptide.description)
variant_ids = pi.parse_variant_peptide_id(peptide.description, set())
for variant_id in variant_ids:
if isinstance(variant_id, pi.NoncodingPeptideIdentifier):
if isinstance(variant_id, pi.NovelORFPeptideIdentifier):
gene_ids = [variant_id.gene_id]
var_ids = {}

Expand Down Expand Up @@ -167,9 +167,9 @@ def from_variant_peptide(peptide:AminoAcidSeqRecord,
if group_map is None:
group_map = {}
info_list = []
variant_ids = pi.parse_variant_peptide_id(peptide.description)
variant_ids = pi.parse_variant_peptide_id(peptide.description, coding_tx)
for variant_id in variant_ids:
if isinstance(variant_id, pi.NoncodingPeptideIdentifier):
if isinstance(variant_id, pi.NovelORFPeptideIdentifier):
tx_id = variant_id.transcript_id
gene_ids = [variant_id.gene_id]
var_ids = {variant_id.gene_id: variant_id.codon_reassigns}
Expand Down Expand Up @@ -227,17 +227,17 @@ def from_variant_peptide(peptide:AminoAcidSeqRecord,

def is_fusion(self) -> bool:
""" Check if this is a fusion """
_id = pi.parse_variant_peptide_id(self.original_label)[0]
_id = pi.parse_variant_peptide_id(self.original_label, set())[0]
return isinstance(_id, pi.FusionVariantPeptideIdentifier)

def is_circ_rna(self) -> bool:
""" Check if this is a circRNA """
_id = pi.parse_variant_peptide_id(self.original_label)[0]
_id = pi.parse_variant_peptide_id(self.original_label, set())[0]
return isinstance(_id, pi.CircRNAVariantPeptideIdentifier)

def is_splice_altering(self) -> bool:
""" Check if the variant paptide label is alternative splicing """
_id = pi.parse_variant_peptide_id(self.original_label)[0]
_id = pi.parse_variant_peptide_id(self.original_label, set())[0]
return isinstance(_id, pi.BaseVariantPeptideIdentifier) and \
_id.is_alternative_splicing()

Expand Down Expand Up @@ -273,14 +273,14 @@ def any_coding(self, anno:GenomicAnnotation) -> bool:

def get_transcript_ids(self) -> List[str]:
""" get transcript IDs """
variant_id = pi.parse_variant_peptide_id(self.original_label)[0]
variant_id = pi.parse_variant_peptide_id(self.original_label, set())[0]
if isinstance(variant_id, pi.CircRNAVariantPeptideIdentifier):
return [variant_id.circ_rna_id.split('-', 2)[1]]
if isinstance(variant_id, pi.FusionVariantPeptideIdentifier):
return [variant_id.first_tx_id, variant_id.second_tx_id]
if isinstance(variant_id, pi.BaseVariantPeptideIdentifier):
return [variant_id.transcript_id]
if isinstance(variant_id, pi.NoncodingPeptideIdentifier):
if isinstance(variant_id, pi.NovelORFPeptideIdentifier):
return [variant_id.transcript_id]
raise ValueError('Variant ID unrecognized')

Expand Down
Loading

0 comments on commit 6079e4f

Please sign in to comment.