Skip to content

Commit

Permalink
Merge pull request #817 from uclahs-cds/czhu-fix-seqvar
Browse files Browse the repository at this point in the history
Fix MNV and generateIndex
  • Loading branch information
zhuchcn authored Nov 8, 2023
2 parents 988fda9 + bd01560 commit bb90867
Show file tree
Hide file tree
Showing 10 changed files with 104 additions and 21 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## [Unreleased]

## [1.2.2] - 2023-10-23

### Fixed:

- Adjacent variants were not merged as MNVs successfully. The function always exited with nothing.

- Because of the updating to on-disk GTF, the coding transcripts were not generated and saved successfully. `filterFasta` is the only command affected.

## [1.2.1] - 2023-10-05

### Add
Expand Down
3 changes: 3 additions & 0 deletions docs/files/fuzz_test_history.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@ v1.2.0 84afd2b 2023-09-16 comprehensive 95039 1 0 0:00:00.366488 1.1783745697633
v1.2.0 aca093e 2023-09-21 snv 96909 0 0 0:00:00.139143 0.3491648515251173 0:00:56.182404 117.08552498241643
v1.2.0 aca093e 2023-09-21 indel 96810 1 0 0:00:00.184495 0.3863134944517933 0:00:42.898299 101.18628104172282
v1.2.0 aca093e 2023-09-21 comprehensive 184432 1 0 0:00:00.348405 1.4592854473001466 0:00:41.582605 178.84322477289754
v1.2.0 8bb50b1 2023-10-03 snv 298799 0 0 0:00:00.148556 0.3663789187817885 0:00:56.622936 118.54456105047444
v1.2.0 8bb50b1 2023-10-03 indel 295046 0 0 0:00:00.200385 0.41735107691824685 0:00:42.615877 100.6975160415037
v1.2.0 8bb50b1 2023-10-03 comprehensive 562642 0 0 0:00:00.360436 1.536579515902589 0:00:41.751076 182.56285717474975
2 changes: 1 addition & 1 deletion moPepGen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Iterable, IO


__version__ = '1.2.1'
__version__ = '1.2.2'

## Error messages
ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'
Expand Down
16 changes: 9 additions & 7 deletions moPepGen/cli/generate_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,7 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
invalid_protein_as_noncoding:bool=True):
""" Index a GTF file. """
anno = gtf.GenomicAnnotationOnDisk()

with open(file, 'rb') as handle:
anno.generate_index(handle, source)
anno.generate_index(file, source)

if proteome:
anno.check_protein_coding(proteome, invalid_protein_as_noncoding)
Expand All @@ -78,6 +76,8 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx)
handle.write(tx_pointer.to_line() + '\n')

return anno

def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True) -> Path:
""" Create copy of GTF """
if file.suffix.lower() == '.gz':
Expand Down Expand Up @@ -143,7 +143,12 @@ def generate_index(args:argparse.Namespace):
logger('Proteome FASTA loaded.')

output_gtf = create_gtf_copy(path_gtf, output_dir, symlink)
index_gtf(output_gtf, args.reference_source, proteome, invalid_protein_as_noncoding)
anno = index_gtf(
file=output_gtf,
source=args.reference_source,
proteome=proteome,
invalid_protein_as_noncoding=invalid_protein_as_noncoding
)
if not quiet:
logger('Genome annotation GTF saved to disk.')

Expand All @@ -152,9 +157,6 @@ def generate_index(args:argparse.Namespace):
if not quiet:
logger('Proteome FASTA saved to disk.')

anno = gtf.GenomicAnnotationOnDisk()
anno.generate_index(output_gtf)

# canoincal peptide pool
canonical_peptides = proteome.create_unique_peptide_pool(
anno=anno, rule=rule, exception=exception, miscleavage=miscleavage,
Expand Down
8 changes: 4 additions & 4 deletions moPepGen/gtf/GenomicAnnotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,10 +255,10 @@ def variant_coordinates_to_gene(self, variant:seqvar.VariantRecord,
ref = variant.ref
if end_gene - start_gene != end - start:
if not (variant.type == 'INDEL' and variant.is_deletion()):
raise ValueError(f'''
The variant is spanning over at least one entire intron with
the type of '{variant.type}'. Don't know how to handle it.
''')
raise ValueError(
'The variant is spanning over at least one entire intron with'
f"the type of '{variant.type}'. Don't know how to handle it."
)
ref = variant.id.split('-')[2]

attrs = copy.copy(variant.attrs)
Expand Down
15 changes: 9 additions & 6 deletions moPepGen/seqvar/VariantRecord.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def create_mnv_from_adjacent(variants:Iterable[VariantRecord]) -> VariantRecord:
ref=ref,
alt=alt,
_type='MNV',
_id=f"MNV-{start}-{ref}-{end}",
_id=f"MNV-{start}-{ref}-{alt}",
attrs=attrs
)

Expand All @@ -107,12 +107,12 @@ def find_mnvs_from_adjacent_variants(variants:List[VariantRecord],
if v_0.type not in compatible_type_map:
continue
type0 = compatible_type_map[v_0.type]
adjacent_combs:Dict[int, List[VariantRecord]] = {
adjacent_combs:Dict[int, List[int]] = {
0: [[i]]
}
for k in range(1, max_adjacent_as_mnv):
if k not in adjacent_combs:
break
# if k not in adjacent_combs:
# break
for comb in adjacent_combs[k - 1]:
i_t = comb[-1]
if i_t >= len(variants) - 1:
Expand All @@ -127,13 +127,16 @@ def find_mnvs_from_adjacent_variants(variants:List[VariantRecord],
break
if compatible_type_map[v_j.type] == type0:
new_comb = comb + [j]
adjacent_combs[k].append(new_comb)
if k in adjacent_combs:
adjacent_combs[k].append(new_comb)
else:
adjacent_combs[k] = [new_comb]

for k, combs in adjacent_combs.items():
if k == 0:
continue
for comb in combs:
mnv = create_mnv_from_adjacent(comb)
mnv = create_mnv_from_adjacent([variants[x] for x in comb])
mnvs.append(mnv)
return mnvs

Expand Down
7 changes: 6 additions & 1 deletion moPepGen/seqvar/VariantRecordPool.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,12 @@ def _filter(x):
for record in self[tx_id].transcriptional:
if record.type in exclude_type:
continue
record_gene = self.anno.variant_coordinates_to_gene(record, gene_id)
try:
record_gene = self.anno.variant_coordinates_to_gene(record, gene_id)
except ValueError as e:
if record.is_merged_mnv():
continue
raise e
if _filter(record_gene):
if return_coord == 'gene':
records.add(record_gene)
Expand Down
2 changes: 1 addition & 1 deletion moPepGen/svgraph/PeptideVariantGraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -1226,7 +1226,7 @@ def jsonfy(self):
variants = set()
for v in cur.variants:
if v.variant.attrs.get('MERGED_MNV'):
variants.update([v.variant.attrs['INDIVIDUAL_VARIANT_IDS']])
variants.update(v.variant.attrs['INDIVIDUAL_VARIANT_IDS'])
else:
variants.add(v.variant.id)
node['variants'] = list(variants)
Expand Down
2 changes: 1 addition & 1 deletion moPepGen/svgraph/ThreeFrameTVG.py
Original file line number Diff line number Diff line change
Expand Up @@ -1981,7 +1981,7 @@ def jsonfy(self):
variants = set()
for v in cur.variants:
if v.variant.attrs.get('MERGED_MNV'):
variants.update([v.variant.attrs['INDIVIDUAL_VARIANT_IDS']])
variants.update(v.variant.attrs['INDIVIDUAL_VARIANT_IDS'])
else:
variants.add(v.variant.id)
node['variants'] = list(variants)
Expand Down
62 changes: 62 additions & 0 deletions test/unit/test_variant_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,3 +194,65 @@ def test_highest_hypermutated_region_complexity(self):
series.sort()
x = series.get_highest_hypermutated_region_complexity()
self.assertEqual(x, 3)

class TestFindMNVsFromAdjacentVariants(unittest.TestCase):
""" Test cases for finding MNVs from adjacent variants """
def test_find_mvs_from_adjacent_variants(self):
""" Find MNVs from adjacent variants. """
test_cases = [
(
(
[
(
3, 4, 'T', 'A', 'SNV', 'SNV-3-T-A',
{'GENE_ID': 'ENSG0001.1'}, 'ENST0001.1'
),
(
4, 5, 'G', 'A', 'SNV', 'SNV-4-G-A',
{'GENE_ID': 'ENSG0001.1'}, 'ENST0001.1'
)
],
2
),
['MNV-3-TG-AA']
), (
(
[
(
3, 4, 'T', 'A', 'SNV', 'SNV-3-T-A',
{'GENE_ID': 'ENSG0001.1'}, 'ENST0001.1'
),
(
5, 6, 'G', 'A', 'SNV', 'SNV-5-G-A',
{'GENE_ID': 'ENSG0001.1'}, 'ENST0001.1'
)
],
2
),
[]
), (
(
[
(
3, 4, 'T', 'A', 'SNV', 'SNV-3-T-A',
{'GENE_ID': 'ENSG0001.1'}, 'ENST0001.1'
),
(
4, 5, 'G', 'A', 'SNV', 'SNV-4-G-A',
{'GENE_ID': 'ENSG0001.1'}, 'ENST0001.1'
),
(
5, 6, 'G', 'C', 'SNV', 'SNV-5-G-C',
{'GENE_ID': 'ENSG0001.1'}, 'ENST0001.1'
)
],
2
),
['MNV-3-TG-AA', 'MNV-4-GG-AC']
)
]
for test_data, mnv_ids in test_cases:
variant_data, max_adjacent_as_mnv = test_data
variants = create_variants(variant_data)
mnvs = seqvar.find_mnvs_from_adjacent_variants(variants, max_adjacent_as_mnv)
self.assertEqual({v.id for v in mnvs}, set(mnv_ids))

0 comments on commit bb90867

Please sign in to comment.