Skip to content

Commit

Permalink
Merge pull request #850 from uclahs-cds/czhu-fix-index
Browse files Browse the repository at this point in the history
Refactor code for index directory
  • Loading branch information
zhuchcn authored Mar 1, 2024
2 parents edf1aea + 4c425d5 commit c4ee3a9
Show file tree
Hide file tree
Showing 18 changed files with 350 additions and 216 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## [Unreleased]

## [1.2.2] - 2024-1-16
## [1.3.0] - 2024-2-29

### Fixed:

Expand All @@ -36,6 +36,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

- Added `--timeout-seconds` to callVariant.

- Added `metadata.json` to the index directory for all essential parameters of how a moPepGen index directory is created, including cleavage parameters. #850

## Fixed

- Fixed `summarizeFasta` that SEC and W2F on fusion peptides are ignored. #789
Expand Down
2 changes: 1 addition & 1 deletion moPepGen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Iterable, IO


__version__ = '1.2.2'
__version__ = '1.3.0'

## Error messages
ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'
Expand Down
2 changes: 1 addition & 1 deletion moPepGen/cli/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
""" Module for command line interface """
from .generate_index import add_subparser_generate_index, generate_index, index_gtf
from .generate_index import add_subparser_generate_index, generate_index
from .call_variant_peptide import add_subparser_call_variant, call_variant_peptide
from .call_noncoding_peptide import add_subparser_call_noncoding, call_noncoding_peptide
from .call_alt_translation import add_subparser_call_alt_translation, call_alt_translation
Expand Down
2 changes: 1 addition & 1 deletion moPepGen/cli/call_alt_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def call_alt_translation(args:argparse.Namespace) -> None:
common.print_start_message(args)

genome, anno, _, canonical_peptides = common.load_references(
args=args, load_proteome=True
args=args, load_proteome=True, cleavage_params=cleavage_params
)

peptide_pool = aa.VariantPeptidePool()
Expand Down
2 changes: 1 addition & 1 deletion moPepGen/cli/call_noncoding_peptide.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:
common.print_start_message(args)

genome, anno, proteome, canonical_peptides = common.load_references(
args=args, load_proteome=True
args=args, load_proteome=True, cleavage_params=cleavage_params
)

inclusion_biotypes, exclusion_biotypes = common.load_inclusion_exclusion_biotypes(args)
Expand Down
3 changes: 2 additions & 1 deletion moPepGen/cli/call_variant_peptide.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,8 @@ def load_reference(self):
""" load reference genome, annotation, and canonical peptides """
genome, anno, _, canonical_peptides = common.load_references(
args=self.args,
invalid_protein_as_noncoding=self.invalid_protein_as_noncoding
invalid_protein_as_noncoding=self.invalid_protein_as_noncoding,
cleavage_params=self.cleavage_params
)
self.reference_data = params.ReferenceData(
genome=genome,
Expand Down
54 changes: 23 additions & 31 deletions moPepGen/cli/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,20 @@
import argparse
import os
import sys
from typing import Tuple, Set, List
from typing import Tuple, Set, List, TYPE_CHECKING
from pathlib import Path
import errno
import signal
import functools
import pickle
import pkg_resources
from moPepGen import aa, dna, gtf, logger, seqvar, err
from moPepGen import aa, dna, gtf, logger, seqvar
from moPepGen.aa.expasy_rules import EXPASY_RULES
from moPepGen.dna.DNASeqDict import DNASeqDict
from moPepGen.version import MetaVersion
from moPepGen.index import IndexDir


if TYPE_CHECKING:
from moPepGen.params import CleavageParams

def print_help_if_missing_args(parser:argparse.ArgumentParser):
""" If no args are provided, print help and exit """
if len(sys.argv) == 2 and sys.argv[1] == parser.prog.split(' ')[-1]:
Expand Down Expand Up @@ -212,53 +213,44 @@ def add_args_source(parser:argparse.ArgumentParser):

def load_references(args:argparse.Namespace, load_genome:bool=True,
load_canonical_peptides:bool=True, load_proteome:bool=False,
invalid_protein_as_noncoding:bool=False, check_protein_coding:bool=False
invalid_protein_as_noncoding:bool=False, check_protein_coding:bool=False,
cleavage_params:CleavageParams=None
) -> Tuple[dna.DNASeqDict, gtf.GenomicAnnotationOnDisk, Set[str]]:
""" Load reference files. If index_dir is specified, data will be loaded
from pickles, otherwise, will read from FASTA and GTF. """
index_dir:Path = args.index_dir
quiet:bool = args.quiet

genome = None
annotation = None
anno = None
canonical_peptides = None
if invalid_protein_as_noncoding:
load_proteome = True

version = MetaVersion()

if index_dir:
if args.index_dir:
index_dir = IndexDir(args.index_dir)
index_dir.validate_metadata(cleavage_params)
if load_genome:
with open(f'{index_dir}/genome.pkl', 'rb') as handle:
genome:DNASeqDict = pickle.load(handle)
if not version.is_valid(genome.version):
raise err.IndexVersionNotMatchError(version, genome.version)
genome = index_dir.load_genome()

gtf_file = list(index_dir.glob('annotation.gtf*'))[0]
annotation = gtf.GenomicAnnotationOnDisk()
annotation.init_handle(gtf_file)
annotation.load_index(gtf_file)
anno = index_dir.load_annotation()

if load_proteome:
with open(f'{index_dir}/proteome.pkl', 'rb') as handle:
proteome:aa.AminoAcidSeqDict = pickle.load(handle)
if not version.is_valid(proteome.version):
raise err.IndexVersionNotMatchError(version, genome.version)
proteome = index_dir.load_proteome()

if invalid_protein_as_noncoding:
annotation.check_protein_coding(proteome, True)
anno.check_protein_coding(proteome, True)

if load_canonical_peptides:
with open(f"{index_dir}/canonical_peptides.pkl", 'rb') as handle:
canonical_peptides = pickle.load(handle)
canonical_peptides = index_dir.load_canonical_peptides()

if not quiet:
logger('Reference indices loaded.')
else:
if (check_protein_coding is True or load_canonical_peptides is True) and \
args.proteome_fasta is None:
raise ValueError('--proteome-fasta was not specified.')
annotation = gtf.GenomicAnnotationOnDisk()
annotation.generate_index(args.annotation_gtf, source=args.reference_source)
anno = gtf.GenomicAnnotationOnDisk()
anno.generate_index(args.annotation_gtf, source=args.reference_source)

if not quiet:
logger('Annotation GTF loaded.')
Expand All @@ -268,7 +260,7 @@ def load_references(args:argparse.Namespace, load_genome:bool=True,
proteome.dump_fasta(args.proteome_fasta, source=args.reference_source)
if not quiet:
logger('Proteome FASTA loaded.')
annotation.check_protein_coding(proteome, invalid_protein_as_noncoding)
anno.check_protein_coding(proteome, invalid_protein_as_noncoding)

if load_genome:
genome = dna.DNASeqDict()
Expand All @@ -284,7 +276,7 @@ def load_references(args:argparse.Namespace, load_genome:bool=True,
min_length:int = args.min_length
max_length:int = args.max_length
canonical_peptides = proteome.create_unique_peptide_pool(
anno=annotation, rule=rule, exception=exception,
anno=anno, rule=rule, exception=exception,
miscleavage=miscleavage, min_mw=min_mw, min_length=min_length,
max_length=max_length
)
Expand All @@ -294,7 +286,7 @@ def load_references(args:argparse.Namespace, load_genome:bool=True,
if not load_proteome:
proteome = None

return genome, annotation, proteome, canonical_peptides
return genome, anno, proteome, canonical_peptides

def generate_metadata(args:argparse.Namespace) -> seqvar.GVFMetadata:
""" Generate metadata """
Expand Down
113 changes: 32 additions & 81 deletions moPepGen/cli/generate_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,14 @@
moPepGen to avoid processing the reference files repeatedly and save massive
time. """
from __future__ import annotations
from typing import TYPE_CHECKING
import argparse
import os
import shutil
import gzip
from pathlib import Path
import pickle
from moPepGen import dna, aa, gtf, logger
from moPepGen import dna, aa, logger, params
from moPepGen.index import IndexDir, IndexMetadata
from moPepGen.cli import common
from moPepGen.gtf.GTFIndexMetadata import GTFIndexMetadata
from moPepGen.version import MetaVersion


if TYPE_CHECKING:
from moPepGen.gtf.GTFPointer import GenePointer, TranscriptPointer

# pylint: disable=W0212
def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
""" CLI for moPepGen generateIndex """
Expand Down Expand Up @@ -51,56 +44,6 @@ def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
common.print_help_if_missing_args(p)
return p


def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
invalid_protein_as_noncoding:bool=True):
""" Index a GTF file. """
anno = gtf.GenomicAnnotationOnDisk()
anno.generate_index(file, source)

if proteome:
anno.check_protein_coding(proteome, invalid_protein_as_noncoding)

gene_idx_file, tx_idx_file = anno.get_index_files(file)
metadata = GTFIndexMetadata(source=anno.source.upper())

with open(gene_idx_file, 'wt') as handle:
metadata.write(handle)
for gene in anno.genes.keys():
gene_pointer:GenePointer = anno.genes.get_pointer(gene)
handle.write(gene_pointer.to_line() + '\n')

with open(tx_idx_file, 'wt') as handle:
metadata.write(handle)
for tx in anno.transcripts.keys():
tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx)
handle.write(tx_pointer.to_line() + '\n')

return anno

def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True) -> Path:
""" Create copy of GTF """
if file.suffix.lower() == '.gz':
if symlink:
symlink = False
logger(
"--gtf-symlink was suppressed because compressed GTF file was received. "
)
elif file.suffix.lower() != '.gtf':
raise ValueError(f"Cannot handle gtf file {file}")

output_file = output_dir/'annotation.gtf'

if symlink:
os.symlink(file.absolute(), output_file)
elif file.suffix.lower() == '.gtf':
shutil.copy2(file, output_file)
else:
with gzip.open(file, 'rt') as ihandle, open(output_file, 'wt') as ohandle:
for line in ihandle:
ohandle.write(line)
return output_file

def generate_index(args:argparse.Namespace):
""" Generate index """
path_genome:Path = args.genome_fasta
Expand All @@ -115,62 +58,70 @@ def generate_index(args:argparse.Namespace):
exception = args.cleavage_exception
invalid_protein_as_noncoding:bool = args.invalid_protein_as_noncoding
quiet:bool = args.quiet
symlink:bool = args.gtf_symlink

output_dir:Path = args.output_dir
output_genome = output_dir/"genome.pkl"
output_proteome = output_dir/"proteome.pkl"
output_peptides = output_dir/"canonical_peptides.pkl"
output_coding_tx = output_dir/"coding_transcripts.pkl"

common.print_start_message(args)

output_dir.mkdir(exist_ok=True)
index_dir = IndexDir(output_dir)

# genome fasta
genome = dna.DNASeqDict()
genome.dump_fasta(path_genome)
if not quiet:
logger('Genome FASTA loaded')
with open(output_genome, 'wb') as handle:
pickle.dump(genome, handle)
index_dir.save_genome(genome)
if not quiet:
logger('Genome FASTA saved to disk.')
del genome

# proteome
proteome = aa.AminoAcidSeqDict()
proteome.dump_fasta(parth_proteome, source=args.reference_source)
if not quiet:
logger('Proteome FASTA loaded.')
index_dir.save_proteome(proteome)
if not quiet:
logger('Proteome FASTA saved to disk.')

output_gtf = create_gtf_copy(path_gtf, output_dir, symlink)
anno = index_gtf(
file=output_gtf,
# annotation GTF
anno = index_dir.save_annotation(
file=path_gtf,
source=args.reference_source,
proteome=proteome,
invalid_protein_as_noncoding=invalid_protein_as_noncoding
invalid_protein_as_noncoding=invalid_protein_as_noncoding,
symlink=args.gtf_symlink
)
if not quiet:
logger('Genome annotation GTF saved to disk.')

with open(output_proteome, 'wb') as handle:
pickle.dump(proteome, handle)
if not quiet:
logger('Proteome FASTA saved to disk.')

# canoincal peptide pool
canonical_peptides = proteome.create_unique_peptide_pool(
anno=anno, rule=rule, exception=exception, miscleavage=miscleavage,
min_mw=min_mw, min_length = min_length, max_length = max_length
)
if not quiet:
logger('canonical peptide pool generated.')
with open(output_peptides, 'wb') as handle:
pickle.dump(canonical_peptides, handle)
index_dir.save_canonical_peptides(canonical_peptides)
if not quiet:
logger('canonical peptide pool saved to disk.')

# create list of coding transcripts
coding_tx = {tx_id for tx_id, tx_model in anno.transcripts.items()
if tx_model.is_protein_coding}
with open(output_coding_tx, 'wb') as handle:
pickle.dump(coding_tx, handle)
index_dir.save_coding_tx(coding_tx)

# save metadata
version = MetaVersion()
cleavage_params = params.CleavageParams(
enzyme=rule, exception=exception, miscleavage=miscleavage,
min_mw=min_mw, min_length=min_length, max_length=max_length
)
metadata = IndexMetadata(
version=version,
cleavage_params=cleavage_params,
source=anno.source
)
index_dir.save_metadata(metadata)
if not quiet:
logger('metadata saved.')
2 changes: 0 additions & 2 deletions moPepGen/dna/DNASeqDict.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
"""
from Bio import SeqIO
from moPepGen.dna import DNASeqRecord
from moPepGen.version import MetaVersion


class DNASeqDict(dict):
Expand All @@ -13,7 +12,6 @@ def __init__(self, *args, **kwargs) -> None:
for val in kwargs.values():
self._validate(val)
super().__init__(*args, **kwargs)
self.version = MetaVersion()

@staticmethod
def _validate(val):
Expand Down
Loading

0 comments on commit c4ee3a9

Please sign in to comment.