diff --git a/moPepGen/aa/VariantPeptideLabel.py b/moPepGen/aa/VariantPeptideLabel.py index cdd10465..b80afc9c 100644 --- a/moPepGen/aa/VariantPeptideLabel.py +++ b/moPepGen/aa/VariantPeptideLabel.py @@ -75,6 +75,10 @@ def __gt__(self, other:VariantSourceSet) -> bool: return False this = self.to_int() that = other.to_int() + if len(this) > len(that): + return True + if len(this) < len(that): + return False for i, j in zip(this, that): if i > j: return True @@ -96,7 +100,10 @@ def __le__(self, other:VariantSourceSet) -> bool: def to_int(self, sort=True) -> Iterable[int]: """ to int """ - source_int = {self.levels_map[x] for x in self} + try: + source_int = {self.levels_map[frozenset(self)]} + except KeyError: + source_int = {self.levels_map[x] for x in self} if sort: source_int = list(source_int) source_int.sort() diff --git a/moPepGen/cli/split_fasta.py b/moPepGen/cli/split_fasta.py index a222455c..352dfda1 100644 --- a/moPepGen/cli/split_fasta.py +++ b/moPepGen/cli/split_fasta.py @@ -133,11 +133,15 @@ def split_fasta(args:argparse.Namespace) -> None: if args.order_source: source_order = {} for i,val in enumerate(args.order_source.split(',')): - if val in source_order: + if '-' in val: + source = frozenset(val.split('-')) + else: + source = val + if source in source_order: raise ValueError( - f"Non-unique value found from `--group-source`: {val}" + f"Non-unique value found from `--group-source`: {source}" ) - source_order[val] = i + source_order[source] = i else: source_order = None diff --git a/moPepGen/cli/summarize_fasta.py b/moPepGen/cli/summarize_fasta.py index 9fb45603..692ef7ef 100644 --- a/moPepGen/cli/summarize_fasta.py +++ b/moPepGen/cli/summarize_fasta.py @@ -158,8 +158,20 @@ def summarize_fasta(args:argparse.Namespace) -> None: coding_tx.add(tx_id) del anno - source_order = {val:i for i,val in enumerate(args.order_source.split(','))}\ - if args.order_source else None + if args.order_source: + source_order = {} + for i,val in enumerate(args.order_source.split(',')): + if '-' in val: + source = frozenset(val.split('-')) + else: + source = val + if source in source_order: + raise ValueError( + f"Non-unique value found from `--group-source`: {source}" + ) + source_order[source] = i + else: + source_order = None group_map = None if args.group_source: diff --git a/test/integration/test_split_fasta.py b/test/integration/test_split_fasta.py index a6e58a70..708d808c 100644 --- a/test/integration/test_split_fasta.py +++ b/test/integration/test_split_fasta.py @@ -176,3 +176,37 @@ def test_split_fasta_case5(self): 'test_Noncoding.fasta', 'test_ALT.fasta' } self.assertEqual(files, expected) + + def test_split_fasta_source_order_comb(self): + """ test splitFasta with source order of combinations """ + args = self.create_base_args() + args.gvf = [ + self.data_dir/'vep/vep_gSNP.gvf', + self.data_dir/'vep/vep_gINDEL.gvf', + self.data_dir/'reditools/reditools.gvf', + self.data_dir/'fusion/star_fusion.gvf', + self.data_dir/'circRNA/circ_rna.gvf' + ] + args.variant_peptides = self.data_dir/'peptides/variant.fasta' + args.noncoding_peptides = self.data_dir/'peptides/noncoding.fasta' + args.alt_translation_peptides = self.data_dir/'peptides/alt_translation.fasta' + args.annotation_gtf = self.data_dir/'annotation.gtf' + args.proteome_fasta = self.data_dir/'translate.fasta' + args.group_source = [ + 'ALT:SECT,CodonReassign', + 'NotCirc:gSNP,gINDEL,sSNV,sINDEL,Fusion,altSplice,RNAEditingSite' + ] + args.order_source = ','.join([ + 'NotCir', + 'ALT', + 'NotCirc-ALT', + 'Noncoding', + 'Noncoding-NotCirc', + 'Noncoding-ALT', + 'circRNA', + 'circRNA-ALT', + 'circRNA-NotCirc', + 'Noncoding-circRNA' + ]) + args.max_source_groups = 2 + cli.split_fasta(args) diff --git a/test/integration/test_summarize_fasta.py b/test/integration/test_summarize_fasta.py index 27c77bc1..badff88d 100644 --- a/test/integration/test_summarize_fasta.py +++ b/test/integration/test_summarize_fasta.py @@ -87,3 +87,41 @@ def test_summarize_fasta_cli(self): print(cmd) print(res.stderr.decode('utf-8')) raise + + def test_summarize_fasta_order_source_comb(self): + """ summarize fasta case2 with order source of combinations """ + args = self.create_base_args() + args.gvf = [ + self.data_dir/'vep/vep_gSNP.gvf', + self.data_dir/'vep/vep_gINDEL.gvf', + self.data_dir/'alternative_splicing/alternative_splicing.gvf', + self.data_dir/'reditools/reditools.gvf', + self.data_dir/'fusion/star_fusion.gvf', + self.data_dir/'circRNA/circ_rna.gvf' + ] + args.variant_peptides = self.data_dir/'peptides/variant.fasta' + args.noncoding_peptides = self.data_dir/'peptides/noncoding.fasta' + args.alt_translation_peptides = None + args.annotation_gtf = self.data_dir/"annotation.gtf" + args.proteome_fasta = self.data_dir/"translate.fasta" + args.group_source = [ + 'ALT:SECT,CodonReassign', + 'NotCirc:gSNP,gINDEL,Fusion,AlternativeSplicing,RNAEditingSite' + ] + args.order_source = ','.join([ + 'NotCir', + 'ALT', + 'NotCirc-ALT', + 'Noncoding', + 'Noncoding-NotCirc', + 'Noncoding-ALT', + 'circRNA', + 'circRNA-ALT', + 'circRNA-NotCirc', + 'Noncoding-circRNA' + ]) + args.ignore_missing_source = True + cli.summarize_fasta(args) + files = {str(file.name) for file in self.work_dir.glob('*')} + expected = {args.output_path.name} + self.assertEqual(files, expected) diff --git a/test/unit/test_peptide_pool_splitter.py b/test/unit/test_peptide_pool_splitter.py index a6064362..84edb8e4 100644 --- a/test/unit/test_peptide_pool_splitter.py +++ b/test/unit/test_peptide_pool_splitter.py @@ -202,6 +202,12 @@ def test_comparison(self): set2 = VariantSourceSet(['gINDEL', 'sINDEL']) self.assertTrue(set1 < set2) + levels[frozenset({'sINDEL', 'circRNA'})] = 7 + VariantSourceSet.set_levels(levels) + set1 = VariantSourceSet(['gSNP', 'circRNA']) + set2 = VariantSourceSet(['sINDEL', 'circRNA']) + self.assertTrue(set1 > set2) + class TestVariantPeptideInfo(unittest.TestCase): """ Test VariantPeptideInfo """ def test_from_variant_peptide(self): @@ -515,3 +521,31 @@ def test_split_database_fusion(self): received = {str(x.seq) for x in splitter.databases['Fusion'].peptides} expected = {'SSSSSSSK'} self.assertEqual(expected, received) + + def test_split_database_source_comb_order(self): + """ Test split database with source order of combinations. """ + anno = create_genomic_annotation(ANNOTATION_DATA) + tx2gene, coding_tx = get_tx2gene_and_coding_tx(anno) + peptides_data = [ + [ + 'SSSSSSSR', + 'ENST0001|SNV-1001-T-A|INDEL-1101-TTTT-T|1' + + ' ENST0001|SNV-1003-T-A|INDEL-1104-TTTT-T|1' + ] + ] + peptides = VariantPeptidePool({create_aa_record(*x) for x in peptides_data}) + label_map = LabelSourceMapping(copy.copy(LABEL_MAP1)) + order = copy.copy(SOURCE_ORDER) + order[frozenset(['sSNV', 'sINDEL'])] = max(order.values()) + 1 + splitter = PeptidePoolSplitter( + peptides=peptides, + order=order, + label_map=label_map + ) + splitter.split(2, [], tx2gene, coding_tx) + + self.assertEqual({'sSNV-sINDEL'}, set(splitter.databases.keys())) + + received = {str(x.seq) for x in splitter.databases['sSNV-sINDEL'].peptides} + expected = {'SSSSSSSR'} + self.assertEqual(expected, received)