Merge pull request #823 from uclahs-cds/czhu-fix-split-fasta

Update splitFasta and summarizeFasta to accept source combinations in --order-source
uclahs-cds · Nov 9, 2023 · 93be448
2 parents bb90867 + 83d4ac5
commit 93be448
Show file tree

Hide file tree

Showing 6 changed files with 135 additions and 6 deletions.
diff --git a/moPepGen/aa/VariantPeptideLabel.py b/moPepGen/aa/VariantPeptideLabel.py
@@ -75,6 +75,10 @@ def __gt__(self, other:VariantSourceSet) -> bool:
             return False
         this = self.to_int()
         that = other.to_int()
+        if len(this) > len(that):
+            return True
+        if len(this) < len(that):
+            return False
         for i, j in zip(this, that):
             if i > j:
                 return True
@@ -96,7 +100,10 @@ def __le__(self, other:VariantSourceSet) -> bool:
 
     def to_int(self, sort=True) -> Iterable[int]:
         """ to int """
-        source_int = {self.levels_map[x] for x in self}
+        try:
+            source_int = {self.levels_map[frozenset(self)]}
+        except KeyError:
+            source_int = {self.levels_map[x] for x in self}
         if sort:
             source_int = list(source_int)
             source_int.sort()

diff --git a/moPepGen/cli/split_fasta.py b/moPepGen/cli/split_fasta.py
@@ -133,11 +133,15 @@ def split_fasta(args:argparse.Namespace) -> None:
     if args.order_source:
         source_order = {}
         for i,val in enumerate(args.order_source.split(',')):
-            if val in source_order:
+            if '-' in val:  # hyphen-joined names denote a source combination
+                source = frozenset(val.split('-'))  # unordered, so 'A-B' == 'B-A'
+            else:
+                source = val
+            if source in source_order:
                 raise ValueError(
-                    f"Non-unique value found from `--group-source`: {val}"
+                    f"Non-unique value found from `--order-source`: {val}"
                 )
-            source_order[val] = i
+            source_order[source] = i  # rank is the position in --order-source
     else:
         source_order = None
 

diff --git a/moPepGen/cli/summarize_fasta.py b/moPepGen/cli/summarize_fasta.py
@@ -158,8 +158,20 @@ def summarize_fasta(args:argparse.Namespace) -> None:
             coding_tx.add(tx_id)
     del anno
 
-    source_order = {val:i for i,val in enumerate(args.order_source.split(','))}\
-        if args.order_source else None
+    if args.order_source:
+        source_order = {}
+        for i,val in enumerate(args.order_source.split(',')):
+            if '-' in val:  # hyphen-joined names denote a source combination
+                source = frozenset(val.split('-'))  # unordered, so 'A-B' == 'B-A'
+            else:
+                source = val
+            if source in source_order:
+                raise ValueError(
+                    f"Non-unique value found from `--order-source`: {val}"
+                )
+            source_order[source] = i  # rank is the position in --order-source
+    else:
+        source_order = None
 
     group_map = None
     if args.group_source:

diff --git a/test/integration/test_split_fasta.py b/test/integration/test_split_fasta.py
@@ -176,3 +176,37 @@ def test_split_fasta_case5(self):
             'test_Noncoding.fasta', 'test_ALT.fasta'
         }
         self.assertEqual(files, expected)
+
+    def test_split_fasta_source_order_comb(self):
+        """ test splitFasta with source order of combinations """
+        args = self.create_base_args()
+        args.gvf = [
+            self.data_dir/'vep/vep_gSNP.gvf',
+            self.data_dir/'vep/vep_gINDEL.gvf',
+            self.data_dir/'reditools/reditools.gvf',
+            self.data_dir/'fusion/star_fusion.gvf',
+            self.data_dir/'circRNA/circ_rna.gvf'
+        ]
+        args.variant_peptides = self.data_dir/'peptides/variant.fasta'
+        args.noncoding_peptides = self.data_dir/'peptides/noncoding.fasta'
+        args.alt_translation_peptides = self.data_dir/'peptides/alt_translation.fasta'
+        args.annotation_gtf = self.data_dir/'annotation.gtf'
+        args.proteome_fasta = self.data_dir/'translate.fasta'
+        args.group_source = [
+            'ALT:SECT,CodonReassign',
+            'NotCirc:gSNP,gINDEL,sSNV,sINDEL,Fusion,altSplice,RNAEditingSite'
+        ]
+        args.order_source = ','.join([
+            'NotCirc',  # same spelling as the 'NotCirc' group in args.group_source
+            'ALT',
+            'NotCirc-ALT',
+            'Noncoding',
+            'Noncoding-NotCirc',
+            'Noncoding-ALT',
+            'circRNA',
+            'circRNA-ALT',
+            'circRNA-NotCirc',
+            'Noncoding-circRNA'
+        ])
+        args.max_source_groups = 2
+        cli.split_fasta(args)  # smoke test: passes if the call does not raise
diff --git a/test/integration/test_summarize_fasta.py b/test/integration/test_summarize_fasta.py
@@ -87,3 +87,41 @@ def test_summarize_fasta_cli(self):
             print(cmd)
             print(res.stderr.decode('utf-8'))
             raise
+
+    def test_summarize_fasta_order_source_comb(self):
+        """ summarize fasta case2 with order source of combinations """
+        args = self.create_base_args()
+        args.gvf = [
+            self.data_dir/'vep/vep_gSNP.gvf',
+            self.data_dir/'vep/vep_gINDEL.gvf',
+            self.data_dir/'alternative_splicing/alternative_splicing.gvf',
+            self.data_dir/'reditools/reditools.gvf',
+            self.data_dir/'fusion/star_fusion.gvf',
+            self.data_dir/'circRNA/circ_rna.gvf'
+        ]
+        args.variant_peptides = self.data_dir/'peptides/variant.fasta'
+        args.noncoding_peptides = self.data_dir/'peptides/noncoding.fasta'
+        args.alt_translation_peptides = None
+        args.annotation_gtf = self.data_dir/"annotation.gtf"
+        args.proteome_fasta = self.data_dir/"translate.fasta"
+        args.group_source = [
+            'ALT:SECT,CodonReassign',
+            'NotCirc:gSNP,gINDEL,Fusion,AlternativeSplicing,RNAEditingSite'
+        ]
+        args.order_source = ','.join([
+            'NotCirc',  # same spelling as the 'NotCirc' group in args.group_source
+            'ALT',
+            'NotCirc-ALT',
+            'Noncoding',
+            'Noncoding-NotCirc',
+            'Noncoding-ALT',
+            'circRNA',
+            'circRNA-ALT',
+            'circRNA-NotCirc',
+            'Noncoding-circRNA'
+        ])
+        args.ignore_missing_source = True
+        cli.summarize_fasta(args)
+        files = {str(file.name) for file in self.work_dir.glob('*')}
+        expected = {args.output_path.name}  # the output file is the only artifact expected
+        self.assertEqual(files, expected)
diff --git a/test/unit/test_peptide_pool_splitter.py b/test/unit/test_peptide_pool_splitter.py
@@ -202,6 +202,12 @@ def test_comparison(self):
         set2 = VariantSourceSet(['gINDEL', 'sINDEL'])
         self.assertTrue(set1 < set2)
 
+        levels[frozenset({'sINDEL', 'circRNA'})] = 7
+        VariantSourceSet.set_levels(levels)
+        set1 = VariantSourceSet(['gSNP', 'circRNA'])
+        set2 = VariantSourceSet(['sINDEL', 'circRNA'])
+        self.assertTrue(set1 > set2)
+
 class TestVariantPeptideInfo(unittest.TestCase):
     """ Test VariantPeptideInfo """
     def test_from_variant_peptide(self):
@@ -515,3 +521,31 @@ def test_split_database_fusion(self):
         received = {str(x.seq) for x in splitter.databases['Fusion'].peptides}
         expected = {'SSSSSSSK'}
         self.assertEqual(expected, received)
+
+    def test_split_database_source_comb_order(self):
+        """ Test split database with source order of combinations. """
+        anno = create_genomic_annotation(ANNOTATION_DATA)
+        tx2gene, coding_tx = get_tx2gene_and_coding_tx(anno)
+        peptides_data = [
+            [
+                'SSSSSSSR',
+                'ENST0001|SNV-1001-T-A|INDEL-1101-TTTT-T|1' +
+                ' ENST0001|SNV-1003-T-A|INDEL-1104-TTTT-T|1'
+            ]
+        ]
+        peptides = VariantPeptidePool({create_aa_record(*x) for x in peptides_data})
+        label_map = LabelSourceMapping(copy.copy(LABEL_MAP1))
+        order = copy.copy(SOURCE_ORDER)  # copy so the shared constant is not rebound
+        order[frozenset(['sSNV', 'sINDEL'])] = max(order.values()) + 1  # rank the combination after every existing entry
+        splitter = PeptidePoolSplitter(
+            peptides=peptides,
+            order=order,
+            label_map=label_map
+        )
+        splitter.split(2, [], tx2gene, coding_tx)  # NOTE(review): 2 is presumably the max source-group count; confirm
+
+        self.assertEqual({'sSNV-sINDEL'}, set(splitter.databases.keys()))  # only the combined database is created
+
+        received = {str(x.seq) for x in splitter.databases['sSNV-sINDEL'].peptides}
+        expected = {'SSSSSSSR'}
+        self.assertEqual(expected, received)