Merge pull request #817 from uclahs-cds/czhu-fix-seqvar

Fix MNV and generateIndex
uclahs-cds · Nov 8, 2023 · bb90867 · bb90867
2 parents 988fda9 + bd01560
commit bb90867
Show file tree

Hide file tree

Showing 10 changed files with 104 additions and 21 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,14 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [Unreleased]
 
+## [1.2.2] - 2023-10-23
+
+### Fixed:
+
+- Adjacent variants were not merged as MNVs successfully. The function always exited with nothing.
+
+- Because of the updating to on-disk GTF, the coding transcripts were not generated and saved successfully. `filterFasta` is the only command affected.
+
 ## [1.2.1] - 2023-10-05
 
 ### Add

diff --git a/docs/files/fuzz_test_history.tsv b/docs/files/fuzz_test_history.tsv
@@ -23,3 +23,6 @@ v1.2.0	84afd2b	2023-09-16	comprehensive	95039	1	0	0:00:00.366488	1.1783745697633
 v1.2.0	aca093e	2023-09-21	snv	96909	0	0	0:00:00.139143	0.3491648515251173	0:00:56.182404	117.08552498241643
 v1.2.0	aca093e	2023-09-21	indel	96810	1	0	0:00:00.184495	0.3863134944517933	0:00:42.898299	101.18628104172282
 v1.2.0	aca093e	2023-09-21	comprehensive	184432	1	0	0:00:00.348405	1.4592854473001466	0:00:41.582605	178.84322477289754
+v1.2.0	8bb50b1	2023-10-03	snv	298799	0	0	0:00:00.148556	0.3663789187817885	0:00:56.622936	118.54456105047444
+v1.2.0	8bb50b1	2023-10-03	indel	295046	0	0	0:00:00.200385	0.41735107691824685	0:00:42.615877	100.6975160415037
+v1.2.0	8bb50b1	2023-10-03	comprehensive	562642	0	0	0:00:00.360436	1.536579515902589	0:00:41.751076	182.56285717474975
diff --git a/moPepGen/__init__.py b/moPepGen/__init__.py
@@ -5,7 +5,7 @@
 from typing import Iterable, IO
 
 
-__version__ = '1.2.1'
+__version__ = '1.2.2'
 
 ## Error messages
 ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'

diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py
@@ -56,9 +56,7 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
         invalid_protein_as_noncoding:bool=True):
     """ Index a GTF file. """
     anno = gtf.GenomicAnnotationOnDisk()
-
-    with open(file, 'rb') as handle:
-        anno.generate_index(handle, source)
+    anno.generate_index(file, source)
 
     if proteome:
         anno.check_protein_coding(proteome, invalid_protein_as_noncoding)
@@ -78,6 +76,8 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
             tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx)
             handle.write(tx_pointer.to_line() + '\n')
 
+    return anno
+
 def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True) -> Path:
     """ Create copy of GTF """
     if file.suffix.lower() == '.gz':
@@ -143,7 +143,12 @@ def generate_index(args:argparse.Namespace):
         logger('Proteome FASTA loaded.')
 
     output_gtf = create_gtf_copy(path_gtf, output_dir, symlink)
-    index_gtf(output_gtf, args.reference_source, proteome, invalid_protein_as_noncoding)
+    anno = index_gtf(
+        file=output_gtf,
+        source=args.reference_source,
+        proteome=proteome,
+        invalid_protein_as_noncoding=invalid_protein_as_noncoding
+    )
     if not quiet:
         logger('Genome annotation GTF saved to disk.')
 
@@ -152,9 +157,6 @@ def generate_index(args:argparse.Namespace):
     if not quiet:
         logger('Proteome FASTA saved to disk.')
 
-    anno = gtf.GenomicAnnotationOnDisk()
-    anno.generate_index(output_gtf)
-
     # canoincal peptide pool
     canonical_peptides = proteome.create_unique_peptide_pool(
         anno=anno, rule=rule, exception=exception, miscleavage=miscleavage,

diff --git a/moPepGen/gtf/GenomicAnnotation.py b/moPepGen/gtf/GenomicAnnotation.py
@@ -255,10 +255,10 @@ def variant_coordinates_to_gene(self, variant:seqvar.VariantRecord,
         ref = variant.ref
         if end_gene - start_gene != end - start:
             if not (variant.type == 'INDEL' and variant.is_deletion()):
-                raise ValueError(f'''
-                    The variant is spanning over at least one entire intron with
-                    the type of '{variant.type}'. Don't know how to handle it.
-                ''')
+                raise ValueError(
+                    'The variant is spanning over at least one entire intron with'
+                    f"the type of '{variant.type}'. Don't know how to handle it."
+                )
             ref = variant.id.split('-')[2]
 
         attrs = copy.copy(variant.attrs)

diff --git a/moPepGen/seqvar/VariantRecord.py b/moPepGen/seqvar/VariantRecord.py
@@ -90,7 +90,7 @@ def create_mnv_from_adjacent(variants:Iterable[VariantRecord]) -> VariantRecord:
         ref=ref,
         alt=alt,
         _type='MNV',
-        _id=f"MNV-{start}-{ref}-{end}",
+        _id=f"MNV-{start}-{ref}-{alt}",
         attrs=attrs
     )
 
@@ -107,12 +107,12 @@ def find_mnvs_from_adjacent_variants(variants:List[VariantRecord],
         if v_0.type not in compatible_type_map:
             continue
         type0 = compatible_type_map[v_0.type]
-        adjacent_combs:Dict[int, List[VariantRecord]] = {
+        adjacent_combs:Dict[int, List[int]] = {
             0: [[i]]
         }
         for k in range(1, max_adjacent_as_mnv):
-            if k not in adjacent_combs:
-                break
+            # if k not in adjacent_combs:
+            #     break
             for comb in adjacent_combs[k - 1]:
                 i_t = comb[-1]
                 if i_t >= len(variants) - 1:
@@ -127,13 +127,16 @@ def find_mnvs_from_adjacent_variants(variants:List[VariantRecord],
                         break
                     if compatible_type_map[v_j.type] == type0:
                         new_comb = comb + [j]
-                        adjacent_combs[k].append(new_comb)
+                        if k in adjacent_combs:
+                            adjacent_combs[k].append(new_comb)
+                        else:
+                            adjacent_combs[k] = [new_comb]
 
         for k, combs in adjacent_combs.items():
             if k == 0:
                 continue
             for comb in combs:
-                mnv = create_mnv_from_adjacent(comb)
+                mnv = create_mnv_from_adjacent([variants[x] for x in comb])
                 mnvs.append(mnv)
     return mnvs
 

diff --git a/moPepGen/seqvar/VariantRecordPool.py b/moPepGen/seqvar/VariantRecordPool.py
@@ -204,7 +204,12 @@ def _filter(x):
                 for record in self[tx_id].transcriptional:
                     if record.type in exclude_type:
                         continue
-                    record_gene = self.anno.variant_coordinates_to_gene(record, gene_id)
+                    try:
+                        record_gene = self.anno.variant_coordinates_to_gene(record, gene_id)
+                    except ValueError as e:
+                        if record.is_merged_mnv():
+                            continue
+                        raise e
                     if _filter(record_gene):
                         if return_coord == 'gene':
                             records.add(record_gene)

diff --git a/moPepGen/svgraph/PeptideVariantGraph.py b/moPepGen/svgraph/PeptideVariantGraph.py
@@ -1226,7 +1226,7 @@ def jsonfy(self):
             variants = set()
             for v in cur.variants:
                 if v.variant.attrs.get('MERGED_MNV'):
-                    variants.update([v.variant.attrs['INDIVIDUAL_VARIANT_IDS']])
+                    variants.update(v.variant.attrs['INDIVIDUAL_VARIANT_IDS'])
                 else:
                     variants.add(v.variant.id)
             node['variants'] = list(variants)

diff --git a/moPepGen/svgraph/ThreeFrameTVG.py b/moPepGen/svgraph/ThreeFrameTVG.py
@@ -1981,7 +1981,7 @@ def jsonfy(self):
             variants = set()
             for v in cur.variants:
                 if v.variant.attrs.get('MERGED_MNV'):
-                    variants.update([v.variant.attrs['INDIVIDUAL_VARIANT_IDS']])
+                    variants.update(v.variant.attrs['INDIVIDUAL_VARIANT_IDS'])
                 else:
                     variants.add(v.variant.id)
             node['variants'] = list(variants)

diff --git a/test/unit/test_variant_record.py b/test/unit/test_variant_record.py
@@ -194,3 +194,65 @@ def test_highest_hypermutated_region_complexity(self):
         series.sort()
         x = series.get_highest_hypermutated_region_complexity()
         self.assertEqual(x, 3)
+
+class TestFindMNVsFromAdjacentVariants(unittest.TestCase):
+    """ Test cases for finding MNVs from adjacent variants """
+    def test_find_mvs_from_adjacent_variants(self):
+        """ Find MNVs from adjacent variants. """
+        test_cases = [
+            (
+                (
+                    [
+                        (
+                            3, 4, 'T', 'A', 'SNV', 'SNV-3-T-A',
+                            {'GENE_ID': 'ENSG0001.1'}, 'ENST0001.1'
+                        ),
+                        (
+                            4, 5, 'G', 'A', 'SNV', 'SNV-4-G-A',
+                            {'GENE_ID': 'ENSG0001.1'}, 'ENST0001.1'
+                        )
+                    ],
+                    2
+                ),
+                ['MNV-3-TG-AA']
+            ), (
+                (
+                    [
+                        (
+                            3, 4, 'T', 'A', 'SNV', 'SNV-3-T-A',
+                            {'GENE_ID': 'ENSG0001.1'}, 'ENST0001.1'
+                        ),
+                        (
+                            5, 6, 'G', 'A', 'SNV', 'SNV-5-G-A',
+                            {'GENE_ID': 'ENSG0001.1'}, 'ENST0001.1'
+                        )
+                    ],
+                    2
+                ),
+                []
+            ), (
+                (
+                    [
+                        (
+                            3, 4, 'T', 'A', 'SNV', 'SNV-3-T-A',
+                            {'GENE_ID': 'ENSG0001.1'}, 'ENST0001.1'
+                        ),
+                        (
+                            4, 5, 'G', 'A', 'SNV', 'SNV-4-G-A',
+                            {'GENE_ID': 'ENSG0001.1'}, 'ENST0001.1'
+                        ),
+                        (
+                            5, 6, 'G', 'C', 'SNV', 'SNV-5-G-C',
+                            {'GENE_ID': 'ENSG0001.1'}, 'ENST0001.1'
+                        )
+                    ],
+                    2
+                ),
+                ['MNV-3-TG-AA', 'MNV-4-GG-AC']
+            )
+        ]
+        for test_data, mnv_ids in test_cases:
+            variant_data, max_adjacent_as_mnv = test_data
+            variants = create_variants(variant_data)
+            mnvs = seqvar.find_mnvs_from_adjacent_variants(variants, max_adjacent_as_mnv)
+            self.assertEqual({v.id for v in mnvs}, set(mnv_ids))