Skip to content

Commit

Permalink
Merge pull request #823 from uclahs-cds/czhu-fix-split-fasta
Browse files Browse the repository at this point in the history
Update splitFasta and summarizeFasta to accept source combinations in --order-source
  • Loading branch information
zhuchcn authored Nov 9, 2023
2 parents bb90867 + 83d4ac5 commit 93be448
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 6 deletions.
9 changes: 8 additions & 1 deletion moPepGen/aa/VariantPeptideLabel.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ def __gt__(self, other:VariantSourceSet) -> bool:
return False
this = self.to_int()
that = other.to_int()
if len(this) > len(that):
return True
if len(this) < len(that):
return False
for i, j in zip(this, that):
if i > j:
return True
Expand All @@ -96,7 +100,10 @@ def __le__(self, other:VariantSourceSet) -> bool:

def to_int(self, sort=True) -> Iterable[int]:
""" to int """
source_int = {self.levels_map[x] for x in self}
try:
source_int = {self.levels_map[frozenset(self)]}
except KeyError:
source_int = {self.levels_map[x] for x in self}
if sort:
source_int = list(source_int)
source_int.sort()
Expand Down
10 changes: 7 additions & 3 deletions moPepGen/cli/split_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,11 +133,15 @@ def split_fasta(args:argparse.Namespace) -> None:
if args.order_source:
source_order = {}
for i,val in enumerate(args.order_source.split(',')):
if val in source_order:
if '-' in val:
source = frozenset(val.split('-'))
else:
source = val
if source in source_order:
raise ValueError(
f"Non-unique value found from `--group-source`: {val}"
f"Non-unique value found from `--order-source`: {source}"
)
source_order[val] = i
source_order[source] = i
else:
source_order = None

Expand Down
16 changes: 14 additions & 2 deletions moPepGen/cli/summarize_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,20 @@ def summarize_fasta(args:argparse.Namespace) -> None:
coding_tx.add(tx_id)
del anno

source_order = {val:i for i,val in enumerate(args.order_source.split(','))}\
if args.order_source else None
if args.order_source:
source_order = {}
for i,val in enumerate(args.order_source.split(',')):
if '-' in val:
source = frozenset(val.split('-'))
else:
source = val
if source in source_order:
raise ValueError(
f"Non-unique value found from `--order-source`: {source}"
)
source_order[source] = i
else:
source_order = None

group_map = None
if args.group_source:
Expand Down
34 changes: 34 additions & 0 deletions test/integration/test_split_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,37 @@ def test_split_fasta_case5(self):
'test_Noncoding.fasta', 'test_ALT.fasta'
}
self.assertEqual(files, expected)

def test_split_fasta_source_order_comb(self):
    """ splitFasta with '-'-joined source combinations in order_source """
    args = self.create_base_args()
    data_dir = self.data_dir

    gvf_paths = (
        'vep/vep_gSNP.gvf',
        'vep/vep_gINDEL.gvf',
        'reditools/reditools.gvf',
        'fusion/star_fusion.gvf',
        'circRNA/circ_rna.gvf',
    )
    args.gvf = [data_dir/rel_path for rel_path in gvf_paths]

    args.variant_peptides = data_dir/'peptides/variant.fasta'
    args.noncoding_peptides = data_dir/'peptides/noncoding.fasta'
    args.alt_translation_peptides = data_dir/'peptides/alt_translation.fasta'
    args.annotation_gtf = data_dir/'annotation.gtf'
    args.proteome_fasta = data_dir/'translate.fasta'

    # Two custom groups; the names before ':' are what order_source refers to.
    args.group_source = [
        'ALT:SECT,CodonReassign',
        'NotCirc:gSNP,gINDEL,sSNV,sINDEL,Fusion,altSplice,RNAEditingSite'
    ]

    # Mix of single sources and '-'-joined pairs, highest priority first.
    # NOTE(review): 'NotCir' does not match the 'NotCirc' group declared
    # above; it may be a typo -- confirm before changing.
    ordered_sources = (
        'NotCir',
        'ALT',
        'NotCirc-ALT',
        'Noncoding',
        'Noncoding-NotCirc',
        'Noncoding-ALT',
        'circRNA',
        'circRNA-ALT',
        'circRNA-NotCirc',
        'Noncoding-circRNA',
    )
    args.order_source = ','.join(ordered_sources)
    args.max_source_groups = 2

    # No assertion follows: the test passes as long as the call does not raise.
    cli.split_fasta(args)
38 changes: 38 additions & 0 deletions test/integration/test_summarize_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,41 @@ def test_summarize_fasta_cli(self):
print(cmd)
print(res.stderr.decode('utf-8'))
raise

def test_summarize_fasta_order_source_comb(self):
    """ summarizeFasta with '-'-joined source combinations in order_source """
    args = self.create_base_args()
    data_dir = self.data_dir

    gvf_paths = (
        'vep/vep_gSNP.gvf',
        'vep/vep_gINDEL.gvf',
        'alternative_splicing/alternative_splicing.gvf',
        'reditools/reditools.gvf',
        'fusion/star_fusion.gvf',
        'circRNA/circ_rna.gvf',
    )
    args.gvf = [data_dir/rel_path for rel_path in gvf_paths]

    args.variant_peptides = data_dir/'peptides/variant.fasta'
    args.noncoding_peptides = data_dir/'peptides/noncoding.fasta'
    args.alt_translation_peptides = None
    args.annotation_gtf = data_dir/"annotation.gtf"
    args.proteome_fasta = data_dir/"translate.fasta"

    # Two custom groups; the names before ':' are what order_source refers to.
    args.group_source = [
        'ALT:SECT,CodonReassign',
        'NotCirc:gSNP,gINDEL,Fusion,AlternativeSplicing,RNAEditingSite'
    ]

    # Mix of single sources and '-'-joined pairs, highest priority first.
    # NOTE(review): 'NotCir' does not match the 'NotCirc' group declared
    # above; it may be a typo -- confirm before changing.
    ordered_sources = (
        'NotCir',
        'ALT',
        'NotCirc-ALT',
        'Noncoding',
        'Noncoding-NotCirc',
        'Noncoding-ALT',
        'circRNA',
        'circRNA-ALT',
        'circRNA-NotCirc',
        'Noncoding-circRNA',
    )
    args.order_source = ','.join(ordered_sources)
    args.ignore_missing_source = True

    cli.summarize_fasta(args)

    # The work directory must contain the output file and nothing else.
    produced = set(str(path.name) for path in self.work_dir.glob('*'))
    self.assertEqual(produced, {args.output_path.name})
34 changes: 34 additions & 0 deletions test/unit/test_peptide_pool_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,12 @@ def test_comparison(self):
set2 = VariantSourceSet(['gINDEL', 'sINDEL'])
self.assertTrue(set1 < set2)

levels[frozenset({'sINDEL', 'circRNA'})] = 7
VariantSourceSet.set_levels(levels)
set1 = VariantSourceSet(['gSNP', 'circRNA'])
set2 = VariantSourceSet(['sINDEL', 'circRNA'])
self.assertTrue(set1 > set2)

class TestVariantPeptideInfo(unittest.TestCase):
""" Test VariantPeptideInfo """
def test_from_variant_peptide(self):
Expand Down Expand Up @@ -515,3 +521,31 @@ def test_split_database_fusion(self):
received = {str(x.seq) for x in splitter.databases['Fusion'].peptides}
expected = {'SSSSSSSK'}
self.assertEqual(expected, received)

def test_split_database_source_comb_order(self):
    """ Splitting honours an order entry keyed by a source combination. """
    tx2gene, coding_tx = get_tx2gene_and_coding_tx(
        create_genomic_annotation(ANNOTATION_DATA)
    )

    # One peptide whose header carries two labels, each with an SNV and
    # an INDEL variant.
    sequence = 'SSSSSSSR'
    header = (
        'ENST0001|SNV-1001-T-A|INDEL-1101-TTTT-T|1'
        + ' ENST0001|SNV-1003-T-A|INDEL-1104-TTTT-T|1'
    )
    pool = VariantPeptidePool({create_aa_record(sequence, header)})

    # Copy the shared order and append the sSNV + sINDEL pair as a
    # frozenset key ranked after every existing entry.
    source_order = copy.copy(SOURCE_ORDER)
    lowest_rank = max(source_order.values()) + 1
    source_order[frozenset(['sSNV', 'sINDEL'])] = lowest_rank

    splitter = PeptidePoolSplitter(
        peptides=pool,
        order=source_order,
        label_map=LabelSourceMapping(copy.copy(LABEL_MAP1))
    )
    splitter.split(2, [], tx2gene, coding_tx)

    database_name = 'sSNV-sINDEL'
    self.assertEqual({database_name}, set(splitter.databases.keys()))

    observed = set()
    for record in splitter.databases[database_name].peptides:
        observed.add(str(record.seq))
    self.assertEqual({'SSSSSSSR'}, observed)

0 comments on commit 93be448

Please sign in to comment.