diff --git a/.env.example b/.env.example index 41fba3f..3e69e4e 100644 --- a/.env.example +++ b/.env.example @@ -1,2 +1,2 @@ -ENA_WEBIN="" -ENA_WEBIN_PASSWORD="" \ No newline at end of file +ENA_WEBIN="ENA_WEBIN" +ENA_WEBIN_PASSWORD="ENA_WEBIN_PASSWORD" diff --git a/.github/workflows/pytest_workflow.yml b/.github/workflows/pytest_workflow.yml index c0b190f..4effa1e 100644 --- a/.github/workflows/pytest_workflow.yml +++ b/.github/workflows/pytest_workflow.yml @@ -4,7 +4,9 @@ on: push: branches: [main] pull_request: - branches: [main] + branches: + - main + - dev jobs: pytest_38: @@ -27,7 +29,7 @@ jobs: emoji: false verbose: true job-summary: true - + pytest_39: name: Run pytest 3.9 needs: diff --git a/README.md b/README.md index 05a3b8c..05354d6 100755 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Public bins and MAGs uploader -Python script to upload bins and MAGs in fasta format to ENA (European Nucleotide Archive). This script generates xmls and manifests necessary for submission with webin-cli. +Python script to upload bins and MAGs in fasta format to ENA (European Nucleotide Archive). This script generates xmls and manifests necessary for submission with webin-cli. It takes as input one tsv (tab-separated values) table in the following format: @@ -26,20 +26,20 @@ With columns indicating: * _local_environment_: `string` (explanation following) * _environmental_medium_: `string` (explanation following) -According to ENA checklist's guidelines, 'broad_environment' describes the broad ecological context of a sample - desert, taiga, coral reef, ... 'local_environment' is more local - lake, harbour, cliff, ... 'environmental_medium' is either the material displaced by the sample, or the one in which the sample was embedded prior to the sampling event - air, soil, water, ... +According to ENA checklist's guidelines, 'broad_environment' describes the broad ecological context of a sample - desert, taiga, coral reef, ... 
'local_environment' is more local - lake, harbour, cliff, ... 'environmental_medium' is either the material displaced by the sample, or the one in which the sample was embedded prior to the sampling event - air, soil, water, ... For host-associated metagenomic samples, the three variables can be defined similarly to the following example for the chicken gut metagenome: "chicken digestive system", "digestive tube", "caecum". More information can be found at [ERC000050]() for bins and [ERC000047]() for MAGs under field names "broad-scale environmental context", "local environmental context", "environmental medium" Another example can be found [here](examples/input_example.tsv) ### Warnings -Raw-read runs from which genomes were generated should already be available on the INSDC (ENA by EBI, GenBank by NCBI, or DDBJ), hence at least one DRR|ERR|SRR accession should be available for every genome to be uploaded. Assembly accessions (ERZ|SRZ|DRZ) are also supported. +Raw-read runs from which genomes were generated should already be available on the INSDC (ENA by EBI, GenBank by NCBI, or DDBJ), hence at least one DRR|ERR|SRR accession should be available for every genome to be uploaded. Assembly accessions (ERZ|SRZ|DRZ) are also supported. -If uploading TPA (Third PArty) genomes, you will need to contact [ENA support]() before using the script. They will provide instructions on how to correctly register a TPA project where to submit your genomes. If both TPA and non-TPA genomes need to be uploaded, please divide them in two batches and use the `--tpa` flag only with TPA genomes. +If uploading TPA (Third PArty) genomes, you will need to contact [ENA support]() before using the script. They will provide instructions on how to correctly register a TPA project where to submit your genomes. If both TPA and non-TPA genomes need to be uploaded, please divide them in two batches and use the `--tpa` flag only with TPA genomes. -Files to be uploaded will need to be compressed (e.g. 
already in .gz format). +Files to be uploaded will need to be compressed (e.g. already in .gz format). -No more than 5000 genomes can be submitted at the same time. +No more than 5000 genomes can be submitted at the same time. ## Register samples and generate pre-upload files The script needs `python`, `pandas`, `requests`, and `ena-webin-cli` to run. We provide a yaml file for the generation of a conda environment: @@ -68,9 +68,9 @@ where * `--centre_name CENTRE_NAME`: name of the centre generating and uploading genomes * `--tpa`: if uploading TPA (Third PArty) generated genomes -It is recommended to validate your genomes in test mode (i.e. without `--live` in the registration step and with `-test` during the upload) before attempting the final upload. Launching the registration in test mode will add a timestamp to the genome name to allow multiple executions of the test process. +It is recommended to validate your genomes in test mode (i.e. without `--live` in the registration step and with `-test` during the upload) before attempting the final upload. Launching the registration in test mode will add a timestamp to the genome name to allow multiple executions of the test process. -Sample xmls won't be regenerated automatically if a previous xml already exists. If any metadata or value in the tsv table changes, `--force` will allow xml regeneration. +Sample xmls won't be regenerated automatically if a previous xml already exists. If any metadata or value in the tsv table changes, `--force` will allow xml regeneration. 
### Produced files: The script produces the following files and folders: diff --git a/genomeuploader/__init__.py b/genomeuploader/__init__.py index 55fa725..58039f5 100644 --- a/genomeuploader/__init__.py +++ b/genomeuploader/__init__.py @@ -1 +1 @@ -__version__ = '2.1.1' +__version__ = "2.1.1" diff --git a/genomeuploader/constants.py b/genomeuploader/constants.py index 11d9a5e..7c74036 100644 --- a/genomeuploader/constants.py +++ b/genomeuploader/constants.py @@ -14,14 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -HQ = ( - "Multiple fragments where gaps span repetitive regions. Presence of the " - "23S, 16S, and 5S rRNA genes and at least 18 tRNAs." -) -MQ = ( - "Many fragments with little to no review of assembly other than reporting " - "of standard assembly statistics." -) +HQ = "Multiple fragments where gaps span repetitive regions. Presence of the " "23S, 16S, and 5S rRNA genes and at least 18 tRNAs." +MQ = "Many fragments with little to no review of assembly other than reporting " "of standard assembly statistics." METAGENOMES = [ "activated carbon metagenome", @@ -635,3 +629,24 @@ "Zambia", "Zimbabwe", ] + +BIN_MANDATORY_FIELDS = [ + "genome_name", + "accessions", + "assembly_software", + "binning_software", + "binning_parameters", + "stats_generation_software", + "NCBI_lineage", + "broad_environment", + "local_environment", + "environmental_medium", + "metagenome", + "co-assembly", + "genome_coverage", + "genome_path", +] + +MAG_MANDATORY_FIELDS = ["rRNA_presence", "completeness", "contamination"] + +GEOGRAPHY_DIGIT_COORDS = 8 diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py index 51ab45c..0f3c5b6 100644 --- a/genomeuploader/ena.py +++ b/genomeuploader/ena.py @@ -1,4 +1,3 @@ - #!/usr/bin/env python # -*- coding: utf-8 -*- @@ -16,12 +15,17 @@ # limitations under the License. 
-import requests import json import logging +import os +import sys +import xml.dom.minidom as minidom +from pathlib import Path from time import sleep -import xml.dom.minidom as minidom +import requests +from dotenv import load_dotenv +from requests.exceptions import ConnectionError, HTTPError, RequestException, Timeout logging.basicConfig(level=logging.INFO) @@ -32,265 +36,396 @@ class NoDataException(ValueError): pass -RUN_DEFAULT_FIELDS = ','.join([ - 'study_accession', - 'secondary_study_accession', - 'instrument_model', - 'run_accession', - 'sample_accession' -]) - -ASSEMBLY_DEFAULT_FIELDS = 'sample_accession' - -SAMPLE_DEFAULT_FIELDS = ','.join([ - 'sample_accession', - 'secondary_sample_accession', - 'collection_date', - 'country', - 'location' -]) - -STUDY_DEFAULT_FIELDS = ','.join([ - 'study_accession', - 'secondary_study_accession', - 'study_description', - 'study_title' -]) - -RETRY_COUNT = 5 - - -class ENA(): - def get_default_params(self): - return { - 'format': 'json', - 'includeMetagenomes': True, - 'dataPortal': 'ena' - } +RUN_DEFAULT_FIELDS = ",".join(["secondary_study_accession", "sample_accession"]) + +STUDY_RUN_DEFAULT_FIELDS = ",".join(["sample_accession", "run_accession", "instrument_model"]) + +ASSEMBLY_DEFAULT_FIELDS = "sample_accession" - def post_request(self, data, webin, password): - url = "https://www.ebi.ac.uk/ena/portal/api/search" - auth = (webin, password) - default_connection_headers = { + +SAMPLE_DEFAULT_FIELDS = ",".join(["sample_accession", "collection_date", "country", "location"]) + +STUDY_DEFAULT_FIELDS = "study_accession,study_description" + +RETRY_COUNT = 3 + + +def get_default_connection_headers(): + return { + "headers": { "Content-Type": "application/x-www-form-urlencoded", - "Accept": "*/*" + "Accept": "*/*", } - response = requests.post(url, data=data, auth=auth, headers=default_connection_headers) - - return response + } + + +def get_default_params(): + return {"format": "json", "includeMetagenomes": True, 
"dataPortal": "ena"} + + +def parse_accession(accession): + if accession.startswith("PRJ"): + return "study_accession" + elif "RP" in accession: + return "secondary_study_accession" + elif "RR" in accession: + return "run_accession" + elif "SAM" in accession: + return "sample_accession" + elif "RS" in accession: + return "secondary_sample_accession" + elif "RZ" in accession: + return "analysis_accession" + else: + logging.error(f"{accession} is not a valid accession") + sys.exit() + + +def configure_credentials(): + # Config file + user_config = Path.home() / ".genome_uploader.config.env" + if user_config.exists(): + logger.debug("Loading the env variables from ".format()) + load_dotenv(str(user_config)) + else: + cwd_config = Path.cwd() / ".genome_uploader.config.env" + if cwd_config.exists(): + logger.debug("Loading the variables from the current directory.") + load_dotenv(str(cwd_config)) + else: + logger.debug("Trying to load env variables from the .env file") + # from a local .env file + load_dotenv() - def get_run(self, run_accession, webin, password, attempt=0, search_params=None): - data = self.get_default_params() - data['result'] = 'read_run' - data['fields'] = RUN_DEFAULT_FIELDS - data['query'] = 'run_accession=\"{}\"'.format(run_accession) - - if search_params: - data.update(search_params) - - response = self.post_request(data, webin, password) - - if not response.ok and attempt > 2: - raise ValueError("Could not retrieve run with accession {}, returned " - "message: {}".format(run_accession, response.text)) - elif response.status_code == 204: - if attempt < 2: - attempt += 1 - sleep(1) - return self.get_run(run_accession, webin, password, attempt) - else: - raise ValueError("Could not find run {} in ENA after {}" - " attempts".format(run_accession, RETRY_COUNT)) - try: - run = json.loads(response.text)[0] - except (IndexError, TypeError, ValueError): - raise ValueError("Could not find run {} in ENA.".format(run_accession)) - except: - raise 
Exception("Could not query ENA API: {}".format(response.text)) + username = os.getenv("ENA_WEBIN") + password = os.getenv("ENA_WEBIN_PASSWORD") - return run + return username, password - def get_run_from_assembly(self, assembly_name): - manifestXml = minidom.parseString(requests.get("https://www.ebi.ac.uk" + - "/ena/browser/api/xml/" + assembly_name).text) - run_ref = manifestXml.getElementsByTagName("RUN_REF") - run = run_ref[0].attributes["accession"].value - - return run +def query_taxid(taxid): + url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid) + response = requests.get(url) - def get_study(self, webin, password, study_accession): - data = self.get_default_params() - data['result'] = "study" - data['fields'] = STUDY_DEFAULT_FIELDS - data['query'] = 'study_accession="{}" OR secondary_study_accession="{}"' \ - .format(study_accession, study_accession) + try: + # Will raise exception if response status code is non-200 + response.raise_for_status() + except requests.exceptions.HTTPError as e: + print("Request failed {} with error {}".format(url, e)) + return False - data['dataPortal'] = "ena" + res = response.json() + + return res.get("scientificName", "") - try: - response = self.post_request(data, webin, password) - if response.status_code == 204: - raise NoDataException() - try: - studyList = response.json() - assert len(studyList) == 1 - study = studyList[0] - except (IndexError, TypeError, ValueError, KeyError) as e: - raise e - return study - except NoDataException: - print("No info found to fetch study {}".format(study_accession)) - except (IndexError, TypeError, ValueError, KeyError): - print("Failed to fetch study {}, returned error: {}".format(study_accession, response.text)) - raise ValueError('Could not find study {} in ENA.'.format(study_accession)) +def query_scientific_name(scientific_name, search_rank=False): + url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{}".format(scientific_name) + response = 
requests.get(url) - def get_study_runs(self, study_acc, webin, password, fields=None, search_params=None): - data = self.get_default_params() - data['result'] = 'read_run' - data['fields'] = fields or RUN_DEFAULT_FIELDS - data['query'] = '(study_accession=\"{}\" OR secondary_study_accession=\"{}\")'.format(study_acc, study_acc) + try: + # Will raise exception if response status code is non-200 + response.raise_for_status() + except requests.exceptions.HTTPError: + if search_rank: + return False, "", "" + else: + return False, "" - if search_params: - data.update(search_params) + try: + res = response.json()[0] + except IndexError: + if search_rank: + return False, "", "" + else: + return False, "" + + submittable = res.get("submittable", "").lower() == "true" + taxid = res.get("taxId", "") + rank = res.get("rank", "") + + if search_rank: + return submittable, taxid, rank + else: + return submittable, taxid + + +class EnaQuery: + def __init__(self, accession, query_type, private=False): + self.private_url = "https://www.ebi.ac.uk/ena/submit/report" + self.public_url = "https://www.ebi.ac.uk/ena/portal/api/search" + self.browser_url = "https://www.ebi.ac.uk/ena/browser/api/xml" + self.accession = accession + self.acc_type = parse_accession(accession) + username, password = configure_credentials() + if username is None or password is None: + logging.error("ENA_WEBIN and ENA_WEBIN_PASSWORD are not set") + if username and password: + self.auth = (username, password) + else: + self.auth = None + self.private = private + self.query_type = query_type + + def post_request(self, data): + response = requests.post(self.public_url, data=data, **get_default_connection_headers()) + return response - response = self.post_request(data, webin, password) - - if not response.ok: - raise ValueError("Could not retrieve runs for study %s.", study_acc) - - if response.status_code == 204: - return [] + def get_request(self, url): + if self.private: + response = requests.get(url, 
auth=self.auth)
+        else:
+            response = requests.get(url)
+        return response
+
+    def get_data_or_handle_error(self, response, xml=False, full_json=False):
         try:
-            runs = response.json()
-        except:
-            raise ValueError("Query against ENA API did not work. Returned "
-                             "message: {}".format(response.text))
+            data_txt = response.text
+            if data_txt is None:
+                if self.private:
+                    logging.error(f"{self.accession} private data is not present in the specified Webin account")
+                else:
+                    logging.error(f"{self.accession} public data does not exist")
+                return None
+            else:
+                if xml:
+                    return minidom.parseString(data_txt)
+                elif full_json:
+                    # return the full json when more than one entry expected
+                    return response.json()
+                else:
+                    # only return the first element in the list as the default. In these cases there should only be one entry returned
+                    return json.loads(response.text)[0]
+        except (IndexError, TypeError, ValueError, KeyError):
+            logging.error(f"Failed to fetch {self.accession}, returned error: {response.text}")
+
+    def retry_or_handle_request_error(self, request, *args, **kwargs):
+        attempt = 0
+        while attempt < RETRY_COUNT:
+            try:
+                response = request(*args, **kwargs)
+                response.raise_for_status()
+                return response
+            # all other RequestExceptions are raised below
+            except (Timeout, ConnectionError) as retry_err:
+                attempt += 1
+                if attempt >= RETRY_COUNT:
+                    raise ValueError(f"Could not find {self.accession} in ENA after {RETRY_COUNT} attempts. Error: {retry_err}")
+                sleep(1)
+            except HTTPError as http_err:
+                print(f"HTTP response has an error status: {http_err}")
+                raise
+            except RequestException as req_err:
+                print(f"Network-related error status: {req_err}")
+                raise
+            # should hopefully encompass all other issues... 
+ except Exception as e: + print(f"An unexpected error occurred: {e}") + raise + + def _get_private_run(self): + url = f"{self.private_url}/runs/{self.accession}" + response = self.retry_or_handle_request_error(self.get_request, url) + run = self.get_data_or_handle_error(response) + run_data = run["report"] + reformatted_data = { + "run_accession": run_data["id"], + "secondary_study_accession": run_data["studyId"], + "sample_accession": run_data["sampleId"], + } + logging.info(f"{self.accession} private run returned from ENA") + return reformatted_data + + def _get_public_run(self): + data = get_default_params() + data.update( + { + "result": "read_run", + "query": f'run_accession="{self.accession}"', + "fields": "secondary_study_accession,sample_accession", + } + ) + response = self.retry_or_handle_request_error(self.post_request, data) + run = self.get_data_or_handle_error(response) + logging.info(f"{self.accession} public run returned from ENA") + return run + + def _get_private_study(self): + url = f"{self.private_url}/studies/xml/{self.accession}" + response = self.retry_or_handle_request_error(self.get_request, url) + study = self.get_data_or_handle_error(response, xml=True) + study_desc = study.getElementsByTagName("STUDY_DESCRIPTION")[0].firstChild.nodeValue + reformatted_data = {"study_accession": self.accession, "study_description": study_desc} + logging.info(f"{self.accession} private study returned from ENA") + return reformatted_data + + def _get_public_study(self): + data = get_default_params() + data.update( + { + "result": "study", + "query": f'{self.acc_type}="{self.accession}"', + "fields": STUDY_DEFAULT_FIELDS, + } + ) + response = self.retry_or_handle_request_error(self.post_request, data) + study = self.get_data_or_handle_error(response) + logging.info(f"{self.accession} public study returned from ENA") + return study + + def _get_private_run_from_assembly(self): + url = f"{self.private_url}/analyses/xml/{self.accession}" + reponse = 
self.retry_or_handle_request_error(self.get_request, url)
+        parsed_xml = self.get_data_or_handle_error(reponse, xml=True)
+        run_ref = parsed_xml.getElementsByTagName("RUN_REF")
+        run = run_ref[0].attributes["accession"].value
+        logging.info(f"private run from the assembly {self.accession} returned from ENA")
+        return run
+
+    def _get_public_run_from_assembly(self):
+        url = f"{self.browser_url}/{self.accession}"
+        reponse = self.retry_or_handle_request_error(self.get_request, url)
+        parsed_xml = self.get_data_or_handle_error(reponse, xml=True)
+        run_ref = parsed_xml.getElementsByTagName("RUN_REF")
+        run = run_ref[0].attributes["accession"].value
+        logging.info(f"public run from the assembly {self.accession} returned from ENA")
+        return run
+
+    def _get_private_study_runs(self):
+        # Pagination documentation unclear - offset not working. Using hardcoded 2000 default. TO MODIFY
+        url = f"{self.private_url}/runs/{self.accession}?format=json&max-results=2000"
+        response = self.retry_or_handle_request_error(self.get_request, url)
+        runs = self.get_data_or_handle_error(response, full_json=True)
+        reformatted_data = []
+        for run in runs:
+            run_data = run["report"]
+            if "sampleId" in run_data:
+                run_data["sample_accession"] = run_data.pop("sampleId")
+            if "id" in run_data:
+                run_data["run_accession"] = run_data.pop("id")
+            if "instrumentModel" in run_data:
+                run_data["instrument_model"] = run_data.pop("instrumentModel")
+            reformatted_data.append(run_data)
+        logging.info(f"found {len(reformatted_data)} runs for study {self.accession}")
+        logging.info(f"private runs from study {self.accession}, returned from ENA")
+        return reformatted_data
+
+    def _get_public_study_runs(self):
+        data = get_default_params()
+        data.update({"result": "read_run", "fields": STUDY_RUN_DEFAULT_FIELDS, "query": f"{self.acc_type}={self.accession}"})
+        response = self.retry_or_handle_request_error(self.post_request, data)
+        runs = self.get_data_or_handle_error(response, full_json=True)
+        
logging.info(f"public runs from study {self.accession}, returned from ENA") return runs - def get_sample(self, sample_accession, webin, password, fields=None, search_params=None, attempt=0): - data = self.get_default_params() - data['result'] = 'sample' - data['fields'] = fields or SAMPLE_DEFAULT_FIELDS - data['query'] = ('(sample_accession=\"{acc}\" OR secondary_sample_accession' - '=\"{acc}\") ').format(acc=sample_accession) - - if search_params: - data.update(search_params) - - response = self.post_request(data, webin, password) - - if response.status_code == 200: - sample = response.json() - assert len(sample) == 1 - return sample[0] - - if response.status_code == 204: - if attempt < 2: - new_params = {'dataPortal': 'metagenome' if data['dataPortal'] == 'ena' else 'ena'} - attempt += 1 - return self.get_sample(sample_accession, webin, password, fields=fields, - search_params=new_params, attempt=attempt) - else: - raise ValueError("Could not find sample {} in ENA after " - "{} attempts.".format(sample_accession, RETRY_COUNT)) - else: - raise ValueError("Could not retrieve sample with accession {}. 
" - "Returned message: {}".format(sample_accession, response.text)) + def _get_private_sample(self): + url = f"{self.private_url}/samples/xml/{self.accession}" + reponse = self.retry_or_handle_request_error(self.get_request, url) + reformatted_data = {} + reformatted_data["sample_accession"] = self.accession + parsed_xml = self.get_data_or_handle_error(reponse, xml=True) + sample_attributes = parsed_xml.getElementsByTagName("SAMPLE_ATTRIBUTE") + for attribute in sample_attributes: + tag = attribute.getElementsByTagName("TAG")[0].firstChild.nodeValue + if tag == "geographic location (country and/or sea)": + reformatted_data["country"] = attribute.getElementsByTagName("VALUE")[0].firstChild.nodeValue + if tag == "geographic location (latitude)": + reformatted_data["latitude"] = attribute.getElementsByTagName("VALUE")[0].firstChild.nodeValue + if tag == "geographic location (longitude)": + reformatted_data["longitude"] = attribute.getElementsByTagName("VALUE")[0].firstChild.nodeValue + if tag == "collection date": + reformatted_data["collection_date"] = attribute.getElementsByTagName("VALUE")[0].firstChild.nodeValue + logging.info(f"{self.accession} private sample returned from ENA") + return reformatted_data + + def _get_public_sample(self): + data = get_default_params() + data.update({"result": "sample", "fields": SAMPLE_DEFAULT_FIELDS, "query": f"{self.acc_type}={self.accession}"}) + response = self.retry_or_handle_request_error(self.post_request, data) + sample = self.get_data_or_handle_error(response) + logging.info(f"{self.accession} public sample returned from ENA") + return sample + + def build_query(self): + """If the private flag is given, assume private data and try private APIs. + ENA also has cases where a run may be private but the sample may be public etc. 
Hence always try + public if private fails""" + api_map = { + "study": (self._get_private_study, self._get_public_study), + "run": (self._get_private_run, self._get_public_run), + "run_assembly": (self._get_private_run_from_assembly, self._get_public_run_from_assembly), + "study_runs": (self._get_private_study_runs, self._get_public_study_runs), + "sample": (self._get_private_sample, self._get_public_sample), + } - def query_taxid(self, taxid): - url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid) - response = requests.get(url) + private_api, public_api = api_map.get(self.query_type, (None, None)) - try: - # Will raise exception if response status code is non-200 - response.raise_for_status() - except requests.exceptions.HTTPError as e: - print("Request failed {} with error {}".format(url, e)) - return False - - res = response.json() - - return res.get("scientificName", "") - - def query_scientific_name(self, scientificName, searchRank=False): - url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{}".format(scientificName) - response = requests.get(url) - - try: - # Will raise exception if response status code is non-200 - response.raise_for_status() - except requests.exceptions.HTTPError as e: - if searchRank: - return False, "", "" - else: - return False, "" - - try: - res = response.json()[0] - except IndexError: - if searchRank: - return False, "", "" - else: - return False, "" + if self.private: + try: + ena_response = private_api() + except Exception: + logging.info(f"Private API for {self.query_type} failed, trying public.") + ena_response = public_api() + else: + ena_response = public_api() + + return ena_response - submittable = res.get("submittable", "").lower() == "true" - taxid = res.get("taxId", "") - rank = res.get("rank", "") - if searchRank: - return submittable, taxid, rank +class EnaSubmit: + def __init__(self, sample_xml, submission_xml, live=False): + self.sample_xml = sample_xml + self.submission_xml = 
submission_xml + self.live = live + username, password = configure_credentials() + if username is None or password is None: + logging.error("ENA_WEBIN and ENA_WEBIN_PASSWORD are not set") + if username and password: + self.auth = (username, password) else: - return submittable, taxid + self.auth = None - def handle_genomes_registration(self, sample_xml, submission_xml, webin, password, live=False): - liveSub, mode = "", "live" + def handle_genomes_registration(self): + live_sub, mode = "", "live" - if not live: - liveSub = "dev" + if not self.live: + live_sub = "dev" mode = "test" - url = "https://www{}.ebi.ac.uk/ena/submit/drop-box/submit/".format(liveSub) + url = "https://www{}.ebi.ac.uk/ena/submit/drop-box/submit/".format(live_sub) - logger.info('Registering sample xml in {} mode.'.format(mode)) + logger.info("Registering sample xml in {} mode.".format(mode)) - f = { - 'SUBMISSION': open(submission_xml, 'r'), - 'SAMPLE': open(sample_xml, 'r') - } + f = {"SUBMISSION": open(self.submission_xml, "r"), "SAMPLE": open(self.sample_xml, "r")} - submissionResponse = requests.post(url, files = f, auth = (webin, password)) + submission_response = requests.post(url, files=f, auth=self.auth) - if submissionResponse.status_code != 200: - if str(submissionResponse.status_code).startswith('5'): - raise Exception("Genomes could not be submitted to ENA as the server " + - "does not respond. Please again try later.") + if submission_response.status_code != 200: + if str(submission_response.status_code).startswith("5"): + raise Exception("Genomes could not be submitted to ENA as the server " + "does not respond. Please again try later.") else: - raise Exception("Genomes could not be submitted to ENA. HTTP response: " + - submissionResponse.reason) + raise Exception("Genomes could not be submitted to ENA. 
HTTP response: " + submission_response.reason) - receiptXml = minidom.parseString((submissionResponse.content).decode("utf-8")) - receipt = receiptXml.getElementsByTagName("RECEIPT") + receipt_xml = minidom.parseString((submission_response.content).decode("utf-8")) + receipt = receipt_xml.getElementsByTagName("RECEIPT") success = receipt[0].attributes["success"].value if success == "true": - aliasDict = {} - samples = receiptXml.getElementsByTagName("SAMPLE") + alias_dict = {} + samples = receipt_xml.getElementsByTagName("SAMPLE") for s in samples: - sraAcc = s.attributes["accession"].value + sra_acc = s.attributes["accession"].value alias = s.attributes["alias"].value - aliasDict[alias] = sraAcc + alias_dict[alias] = sra_acc elif success == "false": - errors = receiptXml.getElementsByTagName("ERROR") - finalError = "\tSome genomes could not be submitted to ENA. Please, check the errors below." + errors = receipt_xml.getElementsByTagName("ERROR") + final_error = "\tSome genomes could not be submitted to ENA. Please, check the errors below." for error in errors: - finalError += "\n\t" + error.firstChild.data - finalError += "\n\tIf you wish to validate again your data and metadata, " - finalError += "please use the --force option." - raise Exception(finalError) - - logger.info('{} genome samples successfully registered.'.format(str(len(aliasDict)))) - - return aliasDict + final_error += "\n\t" + error.firstChild.data + final_error += "\n\tIf you wish to validate again your data and metadata, " + final_error += "please use the --force option." 
+ raise Exception(final_error) + + logger.info("{} genome samples successfully registered.".format(str(len(alias_dict)))) + + return alias_dict diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 99cf3fb..4436195 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -14,34 +14,72 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import sys -import logging import argparse -import re import json -import pandas as pd -from datetime import date, datetime as dt -from dotenv import load_dotenv -from pathlib import Path - -import xml.etree.ElementTree as ET +import logging +import os +import re +import sys import xml.dom.minidom as minidom -import requests - -from ena import ENA +import xml.etree.ElementTree as ET +from datetime import date +from datetime import datetime as dt -from constants import METAGENOMES, GEOGRAPHIC_LOCATIONS, MQ, HQ +import ena +import pandas as pd +from constants import ( + BIN_MANDATORY_FIELDS, + GEOGRAPHIC_LOCATIONS, + GEOGRAPHY_DIGIT_COORDS, + HQ, + MAG_MANDATORY_FIELDS, + METAGENOMES, + MQ, +) +from ena import EnaQuery, EnaSubmit logging.basicConfig(level=logging.DEBUG) - logger = logging.getLogger(__name__) -ena = ENA() -GEOGRAPHY_DIGIT_COORDS = 8 +def parse_args(argv): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Create xmls and manifest files for genome upload and upload to ENA", + ) + + parser.add_argument("-u", "--upload_study", type=str, required=True, help="Study accession for genomes upload") + parser.add_argument("--genome_info", type=str, required=True, help="Genomes metadata file") + + genomeType = parser.add_mutually_exclusive_group(required=True) + genomeType.add_argument("-m", "--mags", action="store_true", help="Select for MAG upload") + genomeType.add_argument("-b", "--bins", action="store_true", help="Select for bin upload") + + 
parser.add_argument("--out", type=str, help="Output folder. Default: working directory") + parser.add_argument("--force", action="store_true", required=False, default=False, help="Forces reset of sample xml's backups") + parser.add_argument( + "--live", + action="store_true", + required=False, + default=False, + help="Uploads on ENA. Omitting this " + "option allows to validate samples beforehand", + ) + parser.add_argument("--tpa", action="store_true", required=False, default=False, help="Select if uploading TPA-generated genomes") + + # Users can provide their credentials in environment variables or using a config file + parser.add_argument("--centre_name", required=True, help="Name of the centre uploading genomes") + parser.add_argument( + "--private", + required=False, + action="store_true", + default=False, + help="if data is private", + ) + + return parser.parse_args(argv) + -''' +""" Input table: expects the following parameters: genome_name: genome file name accessions: run(s) or assembly(ies) the genome was generated from @@ -60,190 +98,95 @@ co-assembly: True/False, whether the genome was generated from a co-assembly genome_coverage : genome coverage genome_path: path to genome to upload -''' -def read_and_cleanse_metadata_tsv(inputFile, genomeType, live): - logger.info('Retrieving info for genomes to submit...') - - binMandatoryFields = ["genome_name", "accessions", - "assembly_software", "binning_software", - "binning_parameters", "stats_generation_software", "NCBI_lineage", - "broad_environment", "local_environment", "environmental_medium", "metagenome", - "co-assembly", "genome_coverage", "genome_path"] - MAGMandatoryFields = ["rRNA_presence", "completeness", "contamination"] - - allFields = MAGMandatoryFields + binMandatoryFields - metadata = pd.read_csv(inputFile, sep='\t', usecols=allFields) - - # make sure there are no empty cells - cleanColumns = list(metadata.dropna(axis=1)) - if genomeType == "MAGs": - missingValues = [item for item in 
allFields if item not in cleanColumns] - else: - missingValues = [item for item in binMandatoryFields if item not in cleanColumns] - - if missingValues: - raise ValueError("The following mandatory fields have missing values in " + - "the input file: {}".format(", ".join(missingValues))) - - # check amount of genomes to register at the same time - if len(metadata) >= 5000: - raise ValueError("Genomes need to be registered in batches of 5000 genomes or smaller.") - - # check whether accessions follow the right format - accessions_regExp = re.compile(r"([E|S|D]R[R|Z]\d{6,})") - - accessionComparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", - "correct", "mismatching", "co-assembly"]) - accessionComparison["genome_name"] = metadata["genome_name"] - - accessionComparison["attemptive_accessions"] = metadata["accessions"].map( - lambda a: len(a.split(','))) - - accessionComparison["correct"] = metadata["accessions"].map( - lambda a: len(accessions_regExp.findall(a))) - - accessionComparison["mismatching"] = accessionComparison.apply(lambda row: - True if row["attemptive_accessions"] == row["correct"] - else None, axis=1).isna() - - mismatchingAccessions = accessionComparison[accessionComparison["mismatching"]]["genome_name"] - if not mismatchingAccessions.empty: - raise ValueError("Run accessions are not correctly formatted for the following " + - "genomes: " + ','.join(mismatchingAccessions.values)) - - # check whether completeness and contamination are floats - try: - pd.to_numeric(metadata["completeness"]) - pd.to_numeric(metadata["contamination"]) - pd.to_numeric(metadata["genome_coverage"]) - except: - raise ValueError("Completeness, contamination or coverage values should be formatted as floats") - - # check whether all co-assemblies have more than one run associated and viceversa - accessionComparison["co-assembly"] = metadata["co-assembly"] - coassemblyDiscrepancy = metadata[( - (accessionComparison["correct"] < 2) & 
(accessionComparison["co-assembly"])) | - ((accessionComparison["correct"] > 1) & (~accessionComparison["co-assembly"]) - )]["genome_name"] - if not coassemblyDiscrepancy.empty: - raise ValueError("The following genomes show discrepancy between number of runs " - "involved and co-assembly status: " + ','.join(coassemblyDiscrepancy.values)) - - # are provided metagenomes part of the accepted metagenome list? - if False in metadata.apply(lambda row: - True if row["metagenome"] in METAGENOMES - else False, axis=1).unique(): - raise ValueError("Metagenomes associated with each genome need to belong to ENA's " + - "approved metagenomes list.") - - # do provided file paths exist? - if False in metadata.apply(lambda row: - True if os.path.exists(row["genome_path"]) - else False, axis =1).unique(): - raise FileNotFoundError("Some genome paths do not exist.") - - # check genome name lengths - #if not (metadata["genome_name"].map(lambda a: len(a) < 20).all()): - # raise ValueError("Genome names must be shorter than 20 characters.") - - # create dictionary while checking genome name uniqueness - uniqueness = metadata["genome_name"].nunique() == metadata["genome_name"].size - if uniqueness: - if not live: - timestamp = str(int(dt.timestamp(dt.now()))) - timestamp_names = [row["genome_name"] + '_' + timestamp for index, row in metadata.iterrows()] - metadata["unique_genome_name"] = timestamp_names - genomeInfo = metadata.set_index("unique_genome_name").transpose().to_dict() - else: - genomeInfo = metadata.set_index("genome_name").transpose().to_dict() - else: - raise ValueError("Duplicate names found in genome names") +""" - return genomeInfo def round_stats(stats): - newStat = round(float(stats), 2) - if newStat == 100.0: - newStat = 100 - if newStat == 0: - newStat = 0.0 + new_stat = round(float(stats), 2) + if new_stat == 100.0: + new_stat = 100 + if new_stat == 0: + new_stat = 0.0 - return newStat + return new_stat -def compute_MAG_quality(completeness, contamination, 
RNApresence): - RNApresent = str(RNApresence).lower() in ["true", "yes", "y"] + +def compute_mag_quality(completeness, contamination, rna_presence): + rna_presence = str(rna_presence).lower() in ["true", "yes", "y"] quality = MQ - if float(completeness) >= 90 and float(contamination) <= 5 and RNApresent: + if float(completeness) >= 90 and float(contamination) <= 5 and rna_presence: quality = HQ - + return quality, completeness, contamination -def extract_tax_info(taxInfo): + +def extract_tax_info(tax_info): # if unclassified, block the execution - lineage, kingdomPositionInLineage, digitAnnotation = taxInfo.split(';'), 0, False - lineageFirst = lineage[0] - if "Unclassified " in lineageFirst: - if "Archaea" in lineageFirst: - scientificName = "uncultured archaeon" - elif "Bacteria" in lineageFirst: - scientificName = "uncultured bacterium" - elif "Eukaryota" in lineageFirst: - scientificName = "uncultured eukaryote" - submittable, taxid, rank = ena.query_scientific_name(scientificName, searchRank=True) - return taxid, scientificName + lineage, kingdom_position_lineage, digit_annotation = tax_info.split(";"), 0, False + lineage_first = lineage[0] + if "Unclassified " in lineage_first: + if "Archaea" in lineage_first: + scientific_name = "uncultured archaeon" + elif "Bacteria" in lineage_first: + scientific_name = "uncultured bacterium" + elif "Eukaryota" in lineage_first: + scientific_name = "uncultured eukaryote" + submittable, taxid, rank = ena.query_scientific_name(scientific_name, search_rank=True) + return taxid, scientific_name kingdoms = ["Archaea", "Bacteria", "Eukaryota"] - kingdomTaxa = ["2157", "2", "2759"] + kingdom_taxa = ["2157", "2", "2759"] - selectedKingdom, finalKingdom = kingdoms, "" + selected_kingdom, final_kingdom = kingdoms, "" if lineage[1].isdigit(): - selectedKingdom = kingdomTaxa - kingdomPositionInLineage = 2 - digitAnnotation = True - for index, k in enumerate(selectedKingdom): - if digitAnnotation: - if k == 
lineage[kingdomPositionInLineage]: - finalKingdom = selectedKingdom[index] + selected_kingdom = kingdom_taxa + kingdom_position_lineage = 2 + digit_annotation = True + for index, k in enumerate(selected_kingdom): + if digit_annotation: + if k == lineage[kingdom_position_lineage]: + final_kingdom = selected_kingdom[index] break else: - if k in lineage[kingdomPositionInLineage]: - finalKingdom = selectedKingdom[index] + if k in lineage[kingdom_position_lineage]: + final_kingdom = selected_kingdom[index] break - iterator = len(lineage)-1 + iterator = len(lineage) - 1 submittable = False rank = "" while iterator != -1 and not submittable: - scientificName = lineage[iterator].strip() - if digitAnnotation: - if not '*' in scientificName: - scientificName = ena.query_taxid(scientificName) + scientific_name = lineage[iterator].strip() + if digit_annotation: + if "*" not in scientific_name: + scientific_name = ena.query_taxid(scientific_name) else: iterator -= 1 continue - elif "__" in scientificName: - scientificName = scientificName.split("__")[1] + elif "__" in scientific_name: + scientific_name = scientific_name.split("__")[1] else: - raise ValueError("Unrecognised taxonomy format: " + scientificName) - submittable, taxid, rank = ena.query_scientific_name(scientificName, searchRank=True) + raise ValueError("Unrecognised taxonomy format: " + scientific_name) + submittable, taxid, rank = ena.query_scientific_name(scientific_name, search_rank=True) if not submittable: - if finalKingdom == "Archaea" or finalKingdom == "2157": - submittable, scientificName, taxid = extract_Archaea_info(scientificName, rank) - elif finalKingdom == "Bacteria" or finalKingdom == "2": - submittable, scientificName, taxid = extract_Bacteria_info(scientificName, rank) - elif finalKingdom == "Eukaryota" or finalKingdom == "2759": - submittable, scientificName, taxid = extract_Eukaryota_info(scientificName, rank) + if final_kingdom == "Archaea" or final_kingdom == "2157": + submittable, 
scientific_name, taxid = extract_archaea_info(scientific_name, rank) + elif final_kingdom == "Bacteria" or final_kingdom == "2": + submittable, scientific_name, taxid = extract_bacteria_info(scientific_name, rank) + elif final_kingdom == "Eukaryota" or final_kingdom == "2759": + submittable, scientific_name, taxid = extract_eukaryota_info(scientific_name, rank) iterator -= 1 - return taxid, scientificName + return taxid, scientific_name -def extract_Eukaryota_info(name, rank): - nonSubmittable = (False, "", 0) + +def extract_eukaryota_info(name, rank): + non_submittable = (False, "", 0) # Asterisks in given taxonomy suggest the classification might be not confident enough. - if '*' in name: - return nonSubmittable + if "*" in name: + return non_submittable if rank == "super kingdom": name = "uncultured eukaryote" @@ -260,24 +203,25 @@ def extract_Eukaryota_info(name, rank): if submittable: return submittable, name, taxid else: - name = name.replace(" sp.", '') + name = name.replace(" sp.", "") submittable, taxid = ena.query_scientific_name(name) if submittable: return submittable, name, taxid else: - return nonSubmittable + return non_submittable + -def extract_Bacteria_info(name, rank): +def extract_bacteria_info(name, rank): if rank == "species": name = name elif rank == "superkingdom": - name = "uncultured bacterium".format(name) + name = "uncultured bacterium".format() elif rank in ["family", "order", "class", "phylum"]: name = "uncultured {} bacterium".format(name) elif rank == "genus": name = "uncultured {} sp.".format(name) - - submittable, taxid, rank = ena.query_scientific_name(name, searchRank=True) + + submittable, taxid, rank = ena.query_scientific_name(name, search_rank=True) if not submittable: if rank in ["species", "genus"] and name.lower().endswith("bacteria"): name = "uncultured {}".format(name.lower().replace("bacteria", "bacterium")) @@ -288,7 +232,8 @@ def extract_Bacteria_info(name, rank): return submittable, name, taxid -def 
extract_Archaea_info(name, rank): + +def extract_archaea_info(name, rank): if rank == "species": name = name elif rank == "superkingdom": @@ -304,326 +249,204 @@ def extract_Archaea_info(name, rank): name = "uncultured {} archaeon".format(name) elif rank == "genus": name = "uncultured {} sp.".format(name) - - submittable, taxid, rank = ena.query_scientific_name(name, searchRank=True) + + submittable, taxid, rank = ena.query_scientific_name(name, search_rank=True) if not submittable: if "Candidatus" in name: if rank == "phylum": - name = name.replace("Candidatus ", '') + name = name.replace("Candidatus ", "") elif rank == "family": - name = name.replace("uncultured ", '') + name = name.replace("uncultured ", "") submittable, taxid = ena.query_scientific_name(name) - + return submittable, name, taxid -def extract_genomes_info(inputFile, genomeType, live): - genomeInfo = read_and_cleanse_metadata_tsv(inputFile, genomeType, live) - for gen in genomeInfo: - genomeInfo[gen]["accessions"] = genomeInfo[gen]["accessions"].split(',') - accessionType = "run" - assembly_regExp = re.compile(r"([E|S|D]RZ\d{6,})") - if assembly_regExp.findall(genomeInfo[gen]["accessions"][0]): - accessionType = "assembly" - genomeInfo[gen]["accessionType"] = accessionType - - genomeInfo[gen]["isolationSource"] = genomeInfo[gen]["metagenome"] - - try: - (genomeInfo[gen]["MAG_quality"], - genomeInfo[gen]["completeness"], - genomeInfo[gen]["contamination"]) = compute_MAG_quality( - str(round_stats(genomeInfo[gen]["completeness"])), - str(round_stats(genomeInfo[gen]["contamination"])), - genomeInfo[gen]["rRNA_presence"]) - except IndexError: - pass - - if str(genomeInfo[gen]["co-assembly"]).lower() in ["yes", "y", "true"]: - genomeInfo[gen]["co-assembly"] = True - else: - genomeInfo[gen]["co-assembly"] = False - - genomeInfo[gen]["alias"] = gen - - taxID, scientificName = extract_tax_info(genomeInfo[gen]["NCBI_lineage"]) - genomeInfo[gen]["taxID"] = taxID - genomeInfo[gen]["scientific_name"] = 
scientificName - - return genomeInfo - -def extract_ENA_info(genomeInfo, uploadDir, webin, password): - logger.info('Retrieving project and run info from ENA (this might take a while)...') - - # retrieving metadata from runs (and runs from assembly accessions if provided) - allRuns = [] - for g in genomeInfo: - if genomeInfo[g]["accessionType"] == "assembly": - derivedRuns = [] - for acc in genomeInfo[g]["accessions"]: - derivedRuns.append(ena.get_run_from_assembly(acc)) - genomeInfo[g]["accessions"] = derivedRuns - allRuns.extend(genomeInfo[g]["accessions"]) - - runsSet, studySet, samplesDict, tempDict = set(allRuns), set(), {}, {} - for r in runsSet: - run_info = ena.get_run(r, webin, password) - studySet.add(run_info["secondary_study_accession"]) - samplesDict[r] = run_info["sample_accession"] - - if not studySet: - raise ValueError("No study corresponding to runs found.") - - backupFile = os.path.join(uploadDir, "ENA_backup.json") - counter = 0 - if not os.path.exists(backupFile): - with open(backupFile, 'w') as file: - pass - with open(backupFile, "r+") as file: - try: - backupDict = json.load(file) - tempDict = dict(backupDict) - logger.info(f"A backup file {backupFile} for ENA sample metadata has been found.") - except json.decoder.JSONDecodeError: - backupDict = {} - for s in studySet: - studyInfo = ena.get_study(webin, password, s) - projectDescription = studyInfo["study_description"] - - ENA_info = ena.get_study_runs(s, webin, password) - if ENA_info == []: - raise IOError("No runs found on ENA for project {}.".format(s)) - - for run, item in enumerate(ENA_info): - runAccession = ENA_info[run]["run_accession"] - if runAccession not in backupDict: - if runAccession in runsSet: - sampleAccession = ENA_info[run]["sample_accession"] - sampleInfo = ena.get_sample(sampleAccession, webin, password) - - location = sampleInfo["location"] - latitude, longitude = None, None - if 'N' in location: - latitude = location.split('N')[0].strip() - longitude = 
location.split('N')[1].strip() - elif 'S' in location: - latitude = '-' + location.split('S')[0].strip() - longitude = location.split('S')[1].strip() - - if 'W' in longitude: - longitude = '-' + longitude.split('W')[0].strip() - elif longitude.endswith('E'): - longitude = longitude.split('E')[0].strip() - - if latitude: - latitude = "{:.{}f}".format(round(float(latitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) - else: - latitude = "not provided" - - if longitude: - longitude = "{:.{}f}".format(round(float(longitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) - else: - longitude = "not provided" - - country = sampleInfo["country"].split(':')[0] - if not country in GEOGRAPHIC_LOCATIONS: - country = "not provided" - - collectionDate = sampleInfo["collection_date"] - if collectionDate == "" or collectionDate == "missing": - collectionDate = "not provided" - - tempDict[runAccession] = { - "instrumentModel" : ENA_info[run]["instrument_model"], - "collectionDate" : collectionDate, - "country" : country, - "latitude" : latitude, - "longitude" : longitude, - "projectDescription" : projectDescription, - "study" : s, - "sampleAccession" : samplesDict[runAccession] - } - counter += 1 - - if (counter%10 == 0) or (len(runsSet) - len(backupDict) == counter): - file.seek(0) - file.write(json.dumps(tempDict)) - file.truncate() - tempDict = {**tempDict, **backupDict} - combine_ENA_info(genomeInfo, tempDict) - -def multipleElementSet(metadataList): - return len(set(metadataList))>1 - -def combine_ENA_info(genomeInfo, ENADict): - for g in genomeInfo: + +def multiple_element_set(metadata_list): + return len(set(metadata_list)) > 1 + + +def combine_ena_info(genome_info, ena_dict): + for g in genome_info: # TODO: optimise all the part below - if genomeInfo[g]["co-assembly"]: - instrumentList, collectionList, countryList = [], [], [] - studyList, descriptionList, samplesList = [], [], [] - longList, latitList = [], [] - for run in genomeInfo[g]["accessions"]: - 
instrumentList.append(ENADict[run]["instrumentModel"]) - collectionList.append(ENADict[run]["collectionDate"]) - countryList.append(ENADict[run]["country"]) - studyList.append(ENADict[run]["study"]) - descriptionList.append(ENADict[run]["projectDescription"]) - samplesList.append(ENADict[run]["sampleAccession"]) - longList.append(ENADict[run]["longitude"]) - latitList.append(ENADict[run]["latitude"]) - - genomeInfo[g]["study"] = studyList[0] - genomeInfo[g]["description"] = descriptionList[0] - - instrument = instrumentList[0] - if multipleElementSet(instrumentList): - instrument = ','.join(instrumentList) - genomeInfo[g]["sequencingMethod"] = instrument - - collectionDate = collectionList[0] - if multipleElementSet(collectionList): - collectionDate = "not provided" - genomeInfo[g]["collectionDate"] = collectionDate - - country = countryList[0] - if multipleElementSet(countryList): + if genome_info[g]["co-assembly"]: + instrument_list, collection_list, country_list = [], [], [] + study_list, description_list, samples_list = [], [], [] + long_list, latit_list = [], [] + for run in genome_info[g]["accessions"]: + instrument_list.append(ena_dict[run]["instrumentModel"]) + collection_list.append(ena_dict[run]["collectionDate"]) + country_list.append(ena_dict[run]["country"]) + study_list.append(ena_dict[run]["study"]) + description_list.append(ena_dict[run]["projectDescription"]) + samples_list.append(ena_dict[run]["sampleAccession"]) + long_list.append(ena_dict[run]["longitude"]) + latit_list.append(ena_dict[run]["latitude"]) + + genome_info[g]["study"] = study_list[0] + genome_info[g]["description"] = description_list[0] + + instrument = instrument_list[0] + if multiple_element_set(instrument_list): + instrument = ",".join(instrument_list) + genome_info[g]["sequencingMethod"] = instrument + + collection_date = collection_list[0] + if multiple_element_set(collection_list): + collection_date = "not provided" + genome_info[g]["collectionDate"] = collection_date + + 
country = country_list[0] + if multiple_element_set(country_list): country = "not applicable" - genomeInfo[g]["country"] = country + genome_info[g]["country"] = country - latitude = latitList[0] - if multipleElementSet(latitList): + latitude = latit_list[0] + if multiple_element_set(latit_list): latitude = "not provided" - genomeInfo[g]["latitude"] = str(round(float(latitude), GEOGRAPHY_DIGIT_COORDS)) + genome_info[g]["latitude"] = str(round(float(latitude), GEOGRAPHY_DIGIT_COORDS)) - longitude = longList[0] - if multipleElementSet(longList): + longitude = long_list[0] + if multiple_element_set(long_list): longitude = "not provided" - genomeInfo[g]["longitude"] = str(round(float(longitude), GEOGRAPHY_DIGIT_COORDS)) + genome_info[g]["longitude"] = str(round(float(longitude), GEOGRAPHY_DIGIT_COORDS)) - samples = samplesList[0] - if multipleElementSet(samplesList): - samples = ','.join(samplesList) - genomeInfo[g]["sample_accessions"] = samples + samples = samples_list[0] + if multiple_element_set(samples_list): + samples = ",".join(samples_list) + genome_info[g]["sample_accessions"] = samples else: - run = genomeInfo[g]["accessions"][0] - genomeInfo[g]["sequencingMethod"] = ENADict[run]["instrumentModel"] - genomeInfo[g]["collectionDate"] = ENADict[run]["collectionDate"] - genomeInfo[g]["study"] = ENADict[run]["study"] - genomeInfo[g]["description"] = ENADict[run]["projectDescription"] - genomeInfo[g]["sample_accessions"] = ENADict[run]["sampleAccession"] - genomeInfo[g]["country"] = ENADict[run]["country"] - genomeInfo[g]["longitude"] = ENADict[run]["longitude"] - genomeInfo[g]["latitude"] = ENADict[run]["latitude"] - - genomeInfo[g]["accessions"] = ','.join(genomeInfo[g]["accessions"]) - - - -def getAccessions(accessionsFile): - accessionDict = {} - with open(accessionsFile, 'r') as f: + run = genome_info[g]["accessions"][0] + genome_info[g]["sequencingMethod"] = ena_dict[run]["instrumentModel"] + genome_info[g]["collectionDate"] = ena_dict[run]["collectionDate"] + 
genome_info[g]["study"] = ena_dict[run]["study"] + genome_info[g]["description"] = ena_dict[run]["projectDescription"] + genome_info[g]["sample_accessions"] = ena_dict[run]["sampleAccession"] + genome_info[g]["country"] = ena_dict[run]["country"] + genome_info[g]["longitude"] = ena_dict[run]["longitude"] + genome_info[g]["latitude"] = ena_dict[run]["latitude"] + + genome_info[g]["accessions"] = ",".join(genome_info[g]["accessions"]) + + +def get_accessions(accessions_file): + accession_dict = {} + with open(accessions_file, "r") as f: for line in f: - line = line.split('\t') + line = line.split("\t") alias = line[0] - accession = line[1].rstrip('\n') - accessionDict[alias] = accession - - return accessionDict - -def saveAccessions(aliasAccessionDict, accessionsFile, writeMode): - with open(accessionsFile, writeMode) as f: - for elem in aliasAccessionDict: - f.write("{}\t{}\n".format(elem, aliasAccessionDict[elem])) - -def create_manifest_dictionary(run, alias, assemblySoftware, sequencingMethod, - MAGpath, gen, study, coverage, isCoassembly): - manifestDict = { - "accessions" : run, - "alias" : alias, - "assembler" : assemblySoftware, - "sequencingMethod" : sequencingMethod, - "genome_path" : MAGpath, - "genome_name" : gen, - "study" : study, - "coverageDepth" : coverage, - "co-assembly" : isCoassembly + accession = line[1].rstrip("\n") + accession_dict[alias] = accession + + return accession_dict + + +def save_accessions(alias_accession_dict, accessions_file, write_mode): + with open(accessions_file, write_mode) as f: + for elem in alias_accession_dict: + f.write("{}\t{}\n".format(elem, alias_accession_dict[elem])) + + +def create_manifest_dictionary(run, alias, assembly_software, sequencing_method, mag_path, gen, study, coverage, is_coassembly): + manifest_dict = { + "accessions": run, + "alias": alias, + "assembler": assembly_software, + "sequencingMethod": sequencing_method, + "genome_path": mag_path, + "genome_name": gen, + "study": study, + "coverageDepth": 
coverage, + "co-assembly": is_coassembly, } - return manifestDict + return manifest_dict + def compute_manifests(genomes): - manifestInfo = {} + manifest_info = {} for g in genomes: - manifestInfo[g] = create_manifest_dictionary(genomes[g]["accessions"], - genomes[g]["alias"], genomes[g]["assembly_software"], - genomes[g]["sequencingMethod"], genomes[g]["genome_path"], g, - genomes[g]["study"], genomes[g]["genome_coverage"], genomes[g]["co-assembly"]) - - return manifestInfo + manifest_info[g] = create_manifest_dictionary( + genomes[g]["accessions"], + genomes[g]["alias"], + genomes[g]["assembly_software"], + genomes[g]["sequencingMethod"], + genomes[g]["genome_path"], + g, + genomes[g]["study"], + genomes[g]["genome_coverage"], + genomes[g]["co-assembly"], + ) + + return manifest_info + def get_study_from_xml(sample): description = sample.childNodes[5].childNodes[0].data - study = description.split(' ')[-1][:-1] + study = description.split(" ")[-1][:-1] return study -def recover_info_from_xml(genomeDict, sample_xml, live_mode): + +def recover_info_from_xml(genome_dict, sample_xml, live_mode): logger.info("Retrieving data for genome submission...") # extract list of genomes (samples) to be registered xml_structure = minidom.parse(sample_xml) samples = xml_structure.getElementsByTagName("SAMPLE") - + for s in samples: study = get_study_from_xml(s) # extract alias from xml and find a match with genomes the user is uploading - XMLalias = s.attributes["alias"].value - if not live_mode: # remove time stamp if test mode is selected - aliasSplit = XMLalias.split("_") - XMLalias = '_'.join(aliasSplit[:-1]) - for gen in genomeDict: + xml_alias = s.attributes["alias"].value + if not live_mode: # remove time stamp if test mode is selected + alias_split = xml_alias.split("_") + xml_alias = "_".join(alias_split[:-1]) + for gen in genome_dict: # if match is found, associate attributes listed in the xml file # with genomes to upload - if XMLalias == gen: + if xml_alias == gen: if 
not live_mode: - currentTimestamp = str(int(dt.timestamp(dt.now()))) - XMLalias = gen + '_' + currentTimestamp - s.attributes["alias"].value = XMLalias - sampleTitle = s.getElementsByTagName("TITLE")[0] - sampleTitleValue = sampleTitle.firstChild.nodeValue.split("_") - sampleTitleValue[-1] = currentTimestamp - newSampleTitle = '_'.join(sampleTitleValue) - s.getElementsByTagName("TITLE")[0].firstChild.replaceWholeText(newSampleTitle) + current_time_stamp = str(int(dt.timestamp(dt.now()))) + xml_alias = gen + "_" + current_time_stamp + s.attributes["alias"].value = xml_alias + sample_title = s.getElementsByTagName("TITLE")[0] + sample_title_value = sample_title.firstChild.nodeValue.split("_") + sample_title_value[-1] = current_time_stamp + new_sample_title = "_".join(sample_title_value) + s.getElementsByTagName("TITLE")[0].firstChild.replaceWholeText(new_sample_title) attributes = s.childNodes[7].getElementsByTagName("SAMPLE_ATTRIBUTE") - seqMethod, assSoftware = "", "" + seq_method, ass_software = "", "" for a in attributes: - tagElem = a.getElementsByTagName("TAG") - tag = tagElem[0].childNodes[0].nodeValue + tag_elem = a.getElementsByTagName("TAG") + tag = tag_elem[0].childNodes[0].nodeValue if tag == "sequencing method": - seqMethodElem = a.getElementsByTagName("VALUE") - seqMethod = seqMethodElem[0].childNodes[0].nodeValue + seq_method_elem = a.getElementsByTagName("VALUE") + seq_method = seq_method_elem[0].childNodes[0].nodeValue elif tag == "assembly software": - assSoftwareElem = a.getElementsByTagName("VALUE") - assSoftware = assSoftwareElem[0].childNodes[0].nodeValue - if not seqMethod == "" and not assSoftware == "": + ass_software_elem = a.getElementsByTagName("VALUE") + ass_software = ass_software_elem[0].childNodes[0].nodeValue + if not seq_method == "" and not ass_software == "": break - genomeDict[gen]["accessions"] = ','.join(genomeDict[gen]["accessions"]) - genomeDict[gen]["alias"] = XMLalias - genomeDict[gen]["assembly_software"] = assSoftware - 
genomeDict[gen]["sequencingMethod"] = seqMethod - genomeDict[gen]["study"] = study + genome_dict[gen]["accessions"] = ",".join(genome_dict[gen]["accessions"]) + genome_dict[gen]["alias"] = xml_alias + genome_dict[gen]["assembly_software"] = ass_software + genome_dict[gen]["sequencingMethod"] = seq_method + genome_dict[gen]["study"] = study break if not live_mode: for s in samples: xml_structure.firstChild.appendChild(s) - - with open(sample_xml, 'wb') as f: + + with open(sample_xml, "wb") as f: dom_string = xml_structure.toprettyxml().encode("utf-8") - dom_string = b'\n'.join([s for s in dom_string.splitlines() if s.strip()]) + dom_string = b"\n".join([s for s in dom_string.splitlines() if s.strip()]) f.write(dom_string) + def create_sample_attribute(sample_attributes, data_list, mag_data=None): tag = data_list[0] value = data_list[1] @@ -632,15 +455,16 @@ def create_sample_attribute(sample_attributes, data_list, mag_data=None): units = None if len(data_list) == 3: units = data_list[2] - + new_sample_attr = ET.SubElement(sample_attributes, "SAMPLE_ATTRIBUTE") - ET.SubElement(new_sample_attr, 'TAG').text = tag - ET.SubElement(new_sample_attr, 'VALUE').text = value + ET.SubElement(new_sample_attr, "TAG").text = tag + ET.SubElement(new_sample_attr, "VALUE").text = value if units: - ET.SubElement(new_sample_attr, 'UNITS').text = units + ET.SubElement(new_sample_attr, "UNITS").text = units -def write_genomes_xml(genomes, xml_path, genomeType, centreName, tpa): + +def write_genomes_xml(genomes, xml_path, genome_type, centre_name, tpa): map_sample_attributes = [ # tag - value - unit (optional) ["project name", "description"], @@ -664,10 +488,10 @@ def write_genomes_xml(genomes, xml_path, genomeType, centreName, tpa): ["metagenomic source", "metagenome"], ] - checklist, assemblyType = "ERC000047", "Metagenome-assembled genome" - if genomeType == "bins": + checklist, assembly_type = "ERC000047", "Metagenome-assembled genome" + if genome_type == "bins": checklist = 
"ERC000050" - assemblyType = "binned metagenome" + assembly_type = "binned metagenome" constant_sample_attributes = [ # tag - value @@ -676,27 +500,25 @@ def write_genomes_xml(genomes, xml_path, genomeType, centreName, tpa): ["ENA-CHECKLIST", checklist], ] - tpaDescription = "" + tpa_description = "" if tpa: - tpaDescription = "Third Party Annotation (TPA) " + tpa_description = "Third Party Annotation (TPA) " sample_set = ET.Element("SAMPLE_SET") for g in genomes: plural = "" if genomes[g]["co-assembly"]: - plural = 's' - description = ("This sample represents a {}{} assembled from the " - "metagenomic run{} {} of study {}.".format(tpaDescription, - assemblyType, plural, genomes[g]["accessions"], - genomes[g]["study"])) - + plural = "s" + description = "This sample represents a {}{} assembled from the " "metagenomic run{} {} of study {}.".format( + tpa_description, assembly_type, plural, genomes[g]["accessions"], genomes[g]["study"] + ) + sample = ET.SubElement(sample_set, "SAMPLE") sample.set("alias", genomes[g]["alias"]) - sample.set("center_name", centreName) + sample.set("center_name", centre_name) - ET.SubElement(sample, 'TITLE').text = ("{}: {}".format(assemblyType, - genomes[g]["alias"])) + ET.SubElement(sample, "TITLE").text = "{}: {}".format(assembly_type, genomes[g]["alias"]) sample_name = ET.SubElement(sample, "SAMPLE_NAME") ET.SubElement(sample_name, "TAXON_ID").text = genomes[g]["taxID"] ET.SubElement(sample_name, "SCIENTIFIC_NAME").text = genomes[g]["scientific_name"] @@ -712,230 +534,453 @@ def write_genomes_xml(genomes, xml_path, genomeType, centreName, tpa): for constant in constant_sample_attributes: create_sample_attribute(sample_attributes, constant) - with open(xml_path, 'wb') as f: - dom = minidom.parseString( - ET.tostring(sample_set, encoding="utf-8") - ) + with open(xml_path, "wb") as f: + dom = minidom.parseString(ET.tostring(sample_set, encoding="utf-8")) f.write(dom.toprettyxml().encode("utf-8")) + def write_submission_xml(upload_dir, 
centre_name, study=True): today = str(date.today()) - sub_xml = os.path.join(upload_dir, 'submission.xml') + sub_xml = os.path.join(upload_dir, "submission.xml") - submission = ET.Element('SUBMISSION') - submission.set('center_name', centre_name) + submission = ET.Element("SUBMISSION") + submission.set("center_name", centre_name) # template - actions = ET.SubElement(submission, 'ACTIONS') - action_sub = ET.SubElement(actions, 'ACTION') - ET.SubElement(action_sub, 'ADD') + actions = ET.SubElement(submission, "ACTIONS") + action_sub = ET.SubElement(actions, "ACTION") + ET.SubElement(action_sub, "ADD") # attributes: function and hold date if study: - action_hold = ET.SubElement(actions, 'ACTION') - hold = ET.SubElement(action_hold, 'HOLD') - hold.set('HoldUntilDate', today) + action_hold = ET.SubElement(actions, "ACTION") + hold = ET.SubElement(action_hold, "HOLD") + hold.set("HoldUntilDate", today) - with open(sub_xml, 'wb') as submission_file: - dom = minidom.parseString( - ET.tostring(submission, encoding="utf-8") - ) + with open(sub_xml, "wb") as submission_file: + dom = minidom.parseString(ET.tostring(submission, encoding="utf-8")) submission_file.write(dom.toprettyxml().encode("utf-8")) return sub_xml -def generate_genome_manifest(genomeInfo, study, manifestsRoot, aliasToSample, genomeType, tpa): - manifest_path = os.path.join(manifestsRoot, f'{genomeInfo["genome_name"]}.manifest') - - tpaAddition, multipleRuns = "", "" + +def generate_genome_manifest(genome_info, study, manifests_root, alias_to_sample, genome_type, tpa): + manifest_path = os.path.join(manifests_root, f'{genome_info["genome_name"]}.manifest') + + tpa_addition, multiple_runs = "", "" if tpa: - tpaAddition = "Third Party Annotation (TPA) " - if genomeInfo["co-assembly"]: - multipleRuns = "s" - assemblyType = "Metagenome-Assembled Genome (MAG)" - if genomeType == "bins": - assemblyType = "binned metagenome" + tpa_addition = "Third Party Annotation (TPA) " + if genome_info["co-assembly"]: + 
multiple_runs = "s" + assembly_type = "Metagenome-Assembled Genome (MAG)" + if genome_type == "bins": + assembly_type = "binned metagenome" values = ( - ('STUDY', study), - ('SAMPLE', aliasToSample[genomeInfo["alias"]]), - ('ASSEMBLYNAME', genomeInfo["alias"]), - ('ASSEMBLY_TYPE', assemblyType), - ('COVERAGE', genomeInfo["coverageDepth"]), - ('PROGRAM', genomeInfo["assembler"]), - ('PLATFORM', genomeInfo["sequencingMethod"]), - ('MOLECULETYPE', "genomic DNA"), - ('DESCRIPTION', ("This is a {}bin derived from the primary whole genome " - "shotgun (WGS) data set {}. This sample represents a {} from the " - "metagenomic run{} {}.".format(tpaAddition, genomeInfo["study"], - assemblyType, multipleRuns, genomeInfo["accessions"]))), - ('RUN_REF', genomeInfo["accessions"]), - ('FASTA', os.path.abspath(genomeInfo["genome_path"])) + ("STUDY", study), + ("SAMPLE", alias_to_sample[genome_info["alias"]]), + ("ASSEMBLYNAME", genome_info["alias"]), + ("ASSEMBLY_TYPE", assembly_type), + ("COVERAGE", genome_info["coverageDepth"]), + ("PROGRAM", genome_info["assembler"]), + ("PLATFORM", genome_info["sequencingMethod"]), + ("MOLECULETYPE", "genomic DNA"), + ( + "DESCRIPTION", + ( + "This is a {}bin derived from the primary whole genome " + "shotgun (WGS) data set {}. 
This sample represents a {} from the " + "metagenomic run{} {}.".format(tpa_addition, genome_info["study"], assembly_type, multiple_runs, genome_info["accessions"]) + ), + ), + ("RUN_REF", genome_info["accessions"]), + ("FASTA", os.path.abspath(genome_info["genome_path"])), ) - logger.info("Writing manifest file (.manifest) for {}.".format(genomeInfo["alias"])) + logger.info("Writing manifest file (.manifest) for {}.".format(genome_info["alias"])) with open(manifest_path, "w") as outfile: - for (k, v) in values: - manifest = f'{k}\t{v}\n' + for k, v in values: + manifest = f"{k}\t{v}\n" outfile.write(manifest) if tpa: outfile.write("TPA\ttrue\n") -def main(): - ENA_uploader = GenomeUpload() - - if not ENA_uploader.live: - logger.warning("Warning: genome submission is not in live mode, " + - "files will be validated, but not uploaded.") - - xmlGenomeFile, xmlSubFile = "genome_samples.xml", "submission.xml" - samples_xml = os.path.join(ENA_uploader.upload_dir, xmlGenomeFile) - submissionXmlPath = os.path.join(ENA_uploader.upload_dir, xmlSubFile) - submission_xml = submissionXmlPath - genomes, manifestInfo = {}, {} - - # submission xml existence - if not os.path.exists(submissionXmlPath): - submission_xml = write_submission_xml(ENA_uploader.upload_dir, ENA_uploader.centre_name, False) - - # sample xml generation or recovery - genomes = ENA_uploader.create_genome_dictionary(samples_xml) - - # manifests creation - manifestDir = os.path.join(ENA_uploader.upload_dir, "manifests") - os.makedirs(manifestDir, exist_ok=True) - - accessionsgen = "registered_MAGs.tsv" - if ENA_uploader.genomeType == "bins": - accessionsgen = accessionsgen.replace("MAG", "bin") - if not ENA_uploader.live: - accessionsgen = accessionsgen.replace(".tsv", "_test.tsv") - - accessionsFile = os.path.join(ENA_uploader.upload_dir, accessionsgen) - save = False - writeMode = 'a' - if os.path.exists(accessionsFile): - if not ENA_uploader.live: - save = True - if ENA_uploader.force: - writeMode = 'w' - if 
not save: - logger.info("Genome samples already registered, reading ERS accessions...") - aliasToNewSampleAccession = getAccessions(accessionsFile) - else: - save = True - - if save: - logger.info("Registering genome samples XMLs...") - aliasToNewSampleAccession = ena.handle_genomes_registration(samples_xml, - submission_xml, ENA_uploader.username, ENA_uploader.password, ENA_uploader.live) - saveAccessions(aliasToNewSampleAccession, accessionsFile, writeMode) - - logger.info("Generating manifest files...") - - manifestInfo = compute_manifests(genomes) - - for m in manifestInfo: - generate_genome_manifest(manifestInfo[m], ENA_uploader.upStudy, - manifestDir, aliasToNewSampleAccession, ENA_uploader.genomeType, ENA_uploader.tpa) class GenomeUpload: - def __init__(self, argv=sys.argv[1:]): - self.args = self.parse_args(argv) - self.upStudy = self.args.upload_study - self.genomeMetadata = self.args.genome_info - self.genomeType = "bins" if self.args.bins else "MAGs" - self.live = True if self.args.live else False - - if self.args.webin and self.args.password: - self.username = self.args.webin - self.password = self.args.password - else: - # Config file - user_config = Path.home() / ".genome_uploader.config.env" - if user_config.exists(): - logger.debug("Loading the env variables from ".format(user_config)) - load_dotenv(str(user_config)) - else: - cwd_config = Path.cwd() / ".genome_uploader.config.env" - if cwd_config.exists(): - logger.debug("Loading the variables from the current directory.") - load_dotenv(str(cwd_config)) - else: - logger.debug("Trying to load env variables from the .env file") - # from a local .env file - load_dotenv() - - self.username = os.getenv("ENA_WEBIN") - self.password = os.getenv("ENA_WEBIN_PASSWORD") - - if not self.username or not self.password: - logger.error("ENA Webin username or password are empty") - sys.exit(1) - - self.tpa = True if self.args.tpa else False - self.centre_name = self.args.centre_name - self.force = True if 
self.args.force else False - - workDir = self.args.out if self.args.out else os.getcwd() - self.upload_dir = self.generate_genomes_upload_dir(workDir, self.genomeType) - - def parse_args(self, argv): - parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, - description="Create xmls and manifest files for genome upload to ENA") - - parser.add_argument('-u', '--upload_study', type=str, help="Study accession for genomes upload") - parser.add_argument('--genome_info', type=str, required=True, help="Genomes metadata file") - - genomeType = parser.add_mutually_exclusive_group(required=True) - genomeType.add_argument('-m', '--mags', action='store_true', help="Select for MAG upload") - genomeType.add_argument('-b', '--bins', action='store_true', help="Select for bin upload") - - parser.add_argument('--out', type=str, help="Output folder. Default: working directory") - parser.add_argument('--force', action='store_true', help="Forces reset of sample xml's backups") - parser.add_argument('--live', action='store_true', help="Uploads on ENA. Omitting this " + - "option allows to validate samples beforehand") - parser.add_argument('--tpa', action='store_true', help="Select if uploading TPA-generated genomes") - - # Users can provide their credentials and centre name manually or using a config file - parser.add_argument('--webin', required=False, help="Webin id") - parser.add_argument('--password', required=False, help="Webin password") - parser.add_argument('--centre_name', required=False, help="Name of the centre uploading genomes") - - args = parser.parse_args(argv) - - if not args.upload_study: + def __init__( + self, + upload_study: str, + centre_name: str, + genome_info: str, + bins: bool = False, + live: bool = False, + private: bool = False, + tpa: bool = False, + force: bool = False, + out: str = None, + ): + """ + Submission of genomes. + + :param upload_study: Study accession for genomes upload. 
+ :param centre_name: Name of the centre uploading genomes. + :param genome_info: Genomes metadata file. + :param bins: Performs bin upload. + :params live: Live upload to ENA. + :param private: Is this a private study? + :param tpa: Is this a third-party assembly? + :param force: Resets sample XML backups. + :param out: Output folder. + """ + + self.genome_type = "bins" if bins else "MAGs" + self.live = live + self.private = private + + self.tpa = tpa + self.centre_name = centre_name + self.force = force + + self.work_dir = out if out is not None else os.getcwd() + self.upload_dir = self.generate_genomes_upload_dir() + + if not upload_study: raise ValueError("No project selected for genome upload [-u, --upload_study].") - - if not os.path.exists(args.genome_info): - raise FileNotFoundError('Genome metadata file "{}" does not exist'.format(args.genome_info)) + self.upload_study = upload_study + + if not os.path.exists(genome_info): + raise FileNotFoundError(f"Genome metadata file {genome_info} does not exist") + self.genome_metadata = genome_info + + def validate_metadata_tsv(self): + logger.info("Retrieving info for genomes to submit...") + + all_fields = MAG_MANDATORY_FIELDS + BIN_MANDATORY_FIELDS + metadata = pd.read_csv(self.genome_metadata, sep="\t", usecols=all_fields) - return args + # make sure there are no empty cells + clean_columns = list(metadata.dropna(axis=1)) + if self.genome_type == "MAGs": + missing_values = [item for item in all_fields if item not in clean_columns] + else: + missing_values = [item for item in BIN_MANDATORY_FIELDS if item not in clean_columns] + + if missing_values: + raise ValueError( + "The following mandatory fields have missing values in " + "the input file: {}".format(", ".join(missing_values)) + ) + + # check amount of genomes to register at the same time + if len(metadata) >= 5000: + raise ValueError("Genomes need to be registered in batches of 5000 genomes or smaller.") + + # check whether accessions follow the right format 
+ accessions_reg_exp = re.compile(r"([E|S|D]R[R|Z]\d{6,})") + + accession_comparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", "correct", "mismatching", "co-assembly"]) + accession_comparison["genome_name"] = metadata["genome_name"] + + accession_comparison["attemptive_accessions"] = metadata["accessions"].map(lambda a: len(a.split(","))) - def generate_genomes_upload_dir(self, dir, genomeType): - uploadName = "MAG_upload" - if genomeType == "bins": - uploadName = uploadName.replace("MAG", "bin") - upload_dir = os.path.join(dir, uploadName) + accession_comparison["correct"] = metadata["accessions"].map(lambda a: len(accessions_reg_exp.findall(a))) + + accession_comparison["mismatching"] = accession_comparison.apply( + lambda row: True if row["attemptive_accessions"] == row["correct"] else None, axis=1 + ).isna() + + mismatching_accessions = accession_comparison[accession_comparison["mismatching"]]["genome_name"] + if not mismatching_accessions.empty: + raise ValueError( + "Run accessions are not correctly formatted for the following " + "genomes: " + ",".join(mismatching_accessions.values) + ) + + # check whether completeness and contamination are floats + try: + pd.to_numeric(metadata["completeness"]) + pd.to_numeric(metadata["contamination"]) + pd.to_numeric(metadata["genome_coverage"]) + except ValueError: + raise ValueError("Completeness, contamination or coverage values should be formatted as floats") + + # check whether all co-assemblies have more than one run associated and viceversa + accession_comparison["co-assembly"] = metadata["co-assembly"] + coassembly_discrepancy = metadata[ + ((accession_comparison["correct"] < 2) & (accession_comparison["co-assembly"])) + | ((accession_comparison["correct"] > 1) & (~accession_comparison["co-assembly"])) + ]["genome_name"] + if not coassembly_discrepancy.empty: + raise ValueError( + "The following genomes show discrepancy between number of runs " + "involved and co-assembly status: " + 
",".join(coassembly_discrepancy.values) + ) + + # are provided metagenomes part of the accepted metagenome list? + if False in metadata.apply(lambda row: True if row["metagenome"] in METAGENOMES else False, axis=1).unique(): + raise ValueError("Metagenomes associated with each genome need to belong to ENA's " + "approved metagenomes list.") + + # do provided file paths exist? + if False in metadata.apply(lambda row: True if os.path.exists(row["genome_path"]) else False, axis=1).unique(): + raise FileNotFoundError("Some genome paths do not exist.") + + # check genome name lengths + # if not (metadata["genome_name"].map(lambda a: len(a) < 20).all()): + # raise ValueError("Genome names must be shorter than 20 characters.") + + # create dictionary while checking genome name uniqueness + uniqueness = metadata["genome_name"].nunique() == metadata["genome_name"].size + if uniqueness: + if not self.live: + timestamp = str(int(dt.timestamp(dt.now()))) + timestamp_names = [row["genome_name"] + "_" + timestamp for index, row in metadata.iterrows()] + metadata["unique_genome_name"] = timestamp_names + genome_info = metadata.set_index("unique_genome_name").transpose().to_dict() + else: + genome_info = metadata.set_index("genome_name").transpose().to_dict() + else: + raise ValueError("Duplicate names found in genome names") + + return genome_info + + def extract_genomes_info(self): + genome_info = self.validate_metadata_tsv() + for gen in genome_info: + genome_info[gen]["accessions"] = genome_info[gen]["accessions"].split(",") + accession_type = "run" + assembly_reg_exp = re.compile(r"([E|S|D]RZ\d{6,})") + if assembly_reg_exp.findall(genome_info[gen]["accessions"][0]): + accession_type = "assembly" + genome_info[gen]["accessionType"] = accession_type + + genome_info[gen]["isolationSource"] = genome_info[gen]["metagenome"] + + try: + ( + genome_info[gen]["MAG_quality"], + genome_info[gen]["completeness"], + genome_info[gen]["contamination"], + ) = compute_mag_quality( + 
str(round_stats(genome_info[gen]["completeness"])), + str(round_stats(genome_info[gen]["contamination"])), + genome_info[gen]["rRNA_presence"], + ) + except IndexError: + pass + + if str(genome_info[gen]["co-assembly"]).lower() in ["yes", "y", "true"]: + genome_info[gen]["co-assembly"] = True + else: + genome_info[gen]["co-assembly"] = False + + genome_info[gen]["alias"] = gen + + tax_id, scientific_name = extract_tax_info(genome_info[gen]["NCBI_lineage"]) + genome_info[gen]["taxID"] = tax_id + genome_info[gen]["scientific_name"] = scientific_name + + return genome_info + + def extract_ena_info(self, genome_info): + logger.info("Retrieving project and run info from ENA (this might take a while)...") + + # retrieving metadata from runs (and runs from assembly accessions if provided) + all_runs = [] + for g in genome_info: + if genome_info[g]["accessionType"] == "assembly": + derived_runs = [] + for acc in genome_info[g]["accessions"]: + ena_query = EnaQuery(acc, "run_assembly", self.private) + derived_runs.append(ena_query.build_query()) + genome_info[g]["accessions"] = derived_runs + all_runs.extend(genome_info[g]["accessions"]) + + runs_set, study_set, samples_dict, temp_dict = set(all_runs), set(), {}, {} + for r in runs_set: + ena_query = EnaQuery(r, "run", self.private) + run_info = ena_query.build_query() + study_set.add(run_info["secondary_study_accession"]) + samples_dict[r] = run_info["sample_accession"] + + if not study_set: + raise ValueError("No study corresponding to runs found.") + + backup_file = os.path.join(self.upload_dir, "ENA_backup.json") + counter = 0 + if not os.path.exists(backup_file): + with open(backup_file, "w") as file: + pass + with open(backup_file, "r+") as file: + try: + backup_dict = json.load(file) + temp_dict = dict(backup_dict) + logger.info(f"A backup file {backup_file} for ENA sample metadata has been found.") + except json.decoder.JSONDecodeError: + backup_dict = {} + for s in study_set: + ena_query = EnaQuery(s, "study", 
self.private) + study_info = ena_query.build_query() + project_description = study_info["study_description"] + + ena_query = EnaQuery(s, "study_runs", self.private) + ena_info = ena_query.build_query() + if ena_info == []: + raise IOError("No runs found on ENA for project {}.".format(s)) + + for run, item in enumerate(ena_info): + run_accession = ena_info[run]["run_accession"] + if run_accession not in backup_dict: + if run_accession in runs_set: + sample_accession = ena_info[run]["sample_accession"] + ena_query = EnaQuery(sample_accession, "sample", self.private) + sample_info = ena_query.build_query() + + if "latitude" in sample_info and "longitude" in sample_info: + latitude = sample_info["latitude"] + longitude = sample_info["longitude"] + else: + location = sample_info["location"] + latitude, longitude = None, None + if "N" in location: + latitude = location.split("N")[0].strip() + longitude = location.split("N")[1].strip() + elif "S" in location: + latitude = "-" + location.split("S")[0].strip() + longitude = location.split("S")[1].strip() + + if "W" in longitude: + longitude = "-" + longitude.split("W")[0].strip() + elif longitude.endswith("E"): + longitude = longitude.split("E")[0].strip() + + if latitude: + latitude = "{:.{}f}".format(round(float(latitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) + else: + latitude = "not provided" + + if longitude: + longitude = "{:.{}f}".format(round(float(longitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) + else: + longitude = "not provided" + + country = sample_info["country"].split(":")[0] + if country not in GEOGRAPHIC_LOCATIONS: + country = "not provided" + + collectionDate = sample_info["collection_date"] + if collectionDate == "" or collectionDate == "missing": + collectionDate = "not provided" + + temp_dict[run_accession] = { + "instrumentModel": ena_info[run]["instrument_model"], + "collectionDate": collectionDate, + "country": country, + "latitude": latitude, + "longitude": longitude, + 
"projectDescription": project_description, + "study": s, + "sampleAccession": samples_dict[run_accession], + } + counter += 1 + + if (counter % 10 == 0) or (len(runs_set) - len(backup_dict) == counter): + file.seek(0) + file.write(json.dumps(temp_dict)) + file.truncate() + temp_dict = {**temp_dict, **backup_dict} + combine_ena_info(genome_info, temp_dict) + + def generate_genomes_upload_dir(self): + upload_name = "MAG_upload" + if self.genome_type == "bins": + upload_name = upload_name.replace("MAG", "bin") + upload_dir = os.path.join(self.work_dir, upload_name) os.makedirs(upload_dir, exist_ok=True) return upload_dir def create_genome_dictionary(self, samples_xml): - logger.info('Retrieving data for MAG submission...') + logger.info("Retrieving data for MAG submission...") - genomeInfo = extract_genomes_info(self.genomeMetadata, self.genomeType, self.live) + genome_info = self.extract_genomes_info() if not os.path.exists(samples_xml) or self.force: - extract_ENA_info(genomeInfo, self.upload_dir, self.username, self.password) + self.extract_ena_info(genome_info) logger.info("Writing genome registration XML...") - write_genomes_xml(genomeInfo, samples_xml, self.genomeType, - self.centre_name, self.tpa) + write_genomes_xml(genome_info, samples_xml, self.genome_type, self.centre_name, self.tpa) logger.info("All files have been written to " + self.upload_dir) else: - recover_info_from_xml(genomeInfo, samples_xml, self.live) + recover_info_from_xml(genome_info, samples_xml, self.live) + + return genome_info + + def genome_upload(self): + if not self.live: + logger.warning("Warning: genome submission is not in live mode, " + "files will be validated, but not uploaded.") + samples_xml = os.path.join(self.upload_dir, "genome_samples.xml") + submission_xml = os.path.join(self.upload_dir, "submission.xml") + genomes, manifest_info = {}, {} + + # submission xml existence + if not os.path.exists(submission_xml): + submission_xml = write_submission_xml(self.upload_dir, 
self.centre_name, False) + + # sample xml generation or recovery + genomes = self.create_genome_dictionary(samples_xml) + + # manifests creation + manifest_dir = os.path.join(self.upload_dir, "manifests") + os.makedirs(manifest_dir, exist_ok=True) + + accessionsgen = "registered_MAGs.tsv" + if self.genome_type == "bins": + accessionsgen = accessionsgen.replace("MAG", "bin") + if not self.live: + accessionsgen = accessionsgen.replace(".tsv", "_test.tsv") + + accessions_file = os.path.join(self.upload_dir, accessionsgen) + save = False + write_mode = "a" + if os.path.exists(accessions_file): + if not self.live: + save = True + if self.force: + write_mode = "w" + if not save: + logger.info("Genome samples already registered, reading ERS accessions...") + alias_to_new_sample_accession = get_accessions(accessions_file) + else: + save = True + + if save: + logger.info("Registering genome samples XMLs...") + ena_submit = EnaSubmit(samples_xml, submission_xml, self.live) + alias_to_new_sample_accession = ena_submit.handle_genomes_registration() + save_accessions(alias_to_new_sample_accession, accessions_file, write_mode) + + logger.info("Generating manifest files...") + + manifest_info = compute_manifests(genomes) + + for m in manifest_info: + generate_genome_manifest( + manifest_info[m], self.upload_study, manifest_dir, alias_to_new_sample_accession, self.genome_type, self.tpa + ) + + +def main(): + args = parse_args(sys.argv[1:]) + ena_upload = GenomeUpload( + upload_study=args.upload_study, + centre_name=args.centre_name, + genome_info=args.genome_info, + bins=args.bins, + live=args.live, + private=args.private, + tpa=args.tpa, + force=args.force, + out=args.out, + ) + ena_upload.genome_upload() - return genomeInfo if __name__ == "__main__": main() - logger.info('Completed') + logger.info("Completed") diff --git a/pyproject.toml b/pyproject.toml index 82527f3..968ffa2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ test = [ "pytest-md==0.2.0", 
"pytest-workflow==2.0.1", "pytest-cov==3.0.0", + "pytest-responses==0.5.1", ] [build-system] diff --git a/pytest.ini b/pytest.ini index 8e78522..654f24a 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,4 @@ [pytest] python_files = tests/*.py testpaths = tests +required_plugins = pytest-workflow diff --git a/requirements-test.yml b/requirements-test.yml index 333f06c..914434e 100644 --- a/requirements-test.yml +++ b/requirements-test.yml @@ -1 +1 @@ --r requirements.yml \ No newline at end of file +-r requirements.yml diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..032779d --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,103 @@ +from pathlib import Path + +import pytest + + +@pytest.fixture(scope="module") +def test_file_dir(): + return Path(__file__).resolve().parent / Path("fixtures/responses") + + +@pytest.fixture(scope="module") +def public_study_json(test_file_dir): + return test_file_dir / Path("public_study.json") + + +@pytest.fixture(scope="module") +def private_study_xml(test_file_dir): + return test_file_dir / Path("private_study.xml") + + +@pytest.fixture(scope="module") +def public_run_json(test_file_dir): + return test_file_dir / Path("public_run.json") + + +@pytest.fixture(scope="module") +def private_run_json(test_file_dir): + return test_file_dir / Path("private_run.json") + + +@pytest.fixture(scope="module") +def public_run_from_assembly_xml(test_file_dir): + return test_file_dir / Path("public_run_from_assembly.xml") + + +@pytest.fixture(scope="module") +def private_run_from_assembly_xml(test_file_dir): + return test_file_dir / Path("private_run_from_assembly.xml") + + +@pytest.fixture(scope="module") +def public_study_runs_json(test_file_dir): + return test_file_dir / Path("public_study_runs.json") + + +@pytest.fixture(scope="module") +def private_study_runs_json(test_file_dir): + return test_file_dir / Path("private_study_runs.json") + + +@pytest.fixture(scope="module") +def 
public_sample_json(test_file_dir): + return test_file_dir / Path("public_sample.json") + + +@pytest.fixture(scope="module") +def private_sample_xml(test_file_dir): + return test_file_dir / Path("private_sample.xml") + + +@pytest.fixture +def public_study_data(): + return { + "study_accession": "PRJEB41657", + "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265]).", + } + + +# private XML has no PRJ study ID +@pytest.fixture +def private_study_data(): + return { + "study_accession": "ERP125469", + "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265]).", + } + + +@pytest.fixture +def public_run_data(): + return {"run_accession": "ERR4918394", "sample_accession": "SAMEA7687881", "secondary_study_accession": "ERP125469"} + + +# private json only has ERS sample IDs +@pytest.fixture +def private_run_data(): + return {"run_accession": "ERR4918394", "sample_accession": "ERS5444411", "secondary_study_accession": "ERP125469"} + + +@pytest.fixture +def public_sample_data(): + return {"sample_accession": "SAMEA7687881", "collection_date": "2019-06-11", "country": "Norway", "location": "66.079905 N 12.587848 E"} + + +# private xml can only query with RS accession and lat and long are split +@pytest.fixture +def private_sample_data(): + return { + "sample_accession": "ERS5444411", + "collection_date": "2019-06-11", + "country": "Norway", + "latitude": "66.079905", + "longitude": "12.587848", + } diff --git a/tests/fixtures/responses/private_run.json b/tests/fixtures/responses/private_run.json new file mode 100644 index 0000000..09c0a03 --- /dev/null +++ b/tests/fixtures/responses/private_run.json @@ -0,0 +1,17 @@ +[ + { + "report": { + 
"id": "ERR4918394", + "alias": "webin-reads-SA01_02C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:14:11", + "firstPublic": "2022-08-02T17:24:21", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783303", + "sampleId": "ERS5444411" + }, + "links": [] + } +] diff --git a/tests/fixtures/responses/private_run_from_assembly.xml b/tests/fixtures/responses/private_run_from_assembly.xml new file mode 100644 index 0000000..a387933 --- /dev/null +++ b/tests/fixtures/responses/private_run_from_assembly.xml @@ -0,0 +1,40 @@ + + + + + ERZ2626953 + webin-genome-ERR4918394_62952e961c26f0859b4ebdd83d93e4e7 + + Genome assembly: ERR4918394_62952e961c26f0859b4ebdd83d93e4e7 + + + ERP125469 + PRJEB41657 + + + + + ERS5444411 + + + + + ERR4918394 + + + + + ERR4918394_62952e961c26f0859b4ebdd83d93e4e7 + primary metagenome + false + 17.0 + metaSPAdes v3.14.1 + DNBSEQ-G400 + genomic DNA + + + + + + + diff --git a/tests/fixtures/responses/private_sample.xml b/tests/fixtures/responses/private_sample.xml new file mode 100644 index 0000000..e0a50bd --- /dev/null +++ b/tests/fixtures/responses/private_sample.xml @@ -0,0 +1,159 @@ + + + + + ERS5444411 + SA01_02C1a_metaG + + SA01.02C1a + + 1602388 + fish gut metagenome + + Salmon metagenomic DNA data from Distal gut content; animal SA01.02 from tank SA01 with treatment Tiger + + + adapters + AAGTCGGAGGCCAAGCGGTCTTAGGAAGACAA;GAACGACATGGCTACGATCCGACTT + + + collection date + 2019-06-11 + + + geographic location (country and/or sea) + Norway + + + geographic location (latitude) + 66.079905 + DD + + + geographic location (longitude) + 12.587848 + DD + + + geographic location (region and locality) + Dønna; Nordland county + + + host body site + Distal gut content + + + host common name + Atlantic salmon + + + host diet + Fermented algae meal (added in in oil coating) + + + host diet treatment + Tiger: SA Control + + + host disease status + Wounded:- + 
+ + host scientific name + Salmo salar + + + host subject id + SA01.02 + + + host taxid + 8030 + + + host total mass + 290.4 + g + + + nucleic acid extraction + D-rex protocol + + + pcr primers + N/A + + + project name + HoloFood Salmon - Metagenomic DNA + + + reference host genome for decontamination + GCA_000000000.0 + + + sample storage buffer + Shield + + + sample storage container + 2ml E-matrix + + + sample storage location + UCPH + + + sample storage temperature + -20 + °C + + + sample volume or weight for DNA extraction + 0.2 + mL + + + sequencing method + MGISEQ-2000 + + + trial timepoint + 0 + days + + + host storage container temperature + 12.76 + °C + + + host storage container pH + 7.05 + + + host gutted mass + 246.6 + g + + + host length + 29.5 + cm + + + host diet treatment concentration + 0 + % mass + + + host storage container + LetSea Tank: Tiger + + + ENA-CHECKLIST + ERC000052 + + + + diff --git a/tests/fixtures/responses/private_study.xml b/tests/fixtures/responses/private_study.xml new file mode 100644 index 0000000..32ef2c1 --- /dev/null +++ b/tests/fixtures/responses/private_study.xml @@ -0,0 +1,16 @@ + + + + + ERP125469 + ena-STUDY-UNIVERSITY OF COPENHAGEN-02-12-2020-08:09:52:762-111 + + + HoloFood Salmon Trial A+B Gut Metagenome + Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265]). + Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265]). 
+ HoloFood Salmon Trial A+B Gut Metagenome + + + + \ No newline at end of file diff --git a/tests/fixtures/responses/private_study_runs.json b/tests/fixtures/responses/private_study_runs.json new file mode 100644 index 0000000..b3f7149 --- /dev/null +++ b/tests/fixtures/responses/private_study_runs.json @@ -0,0 +1,152 @@ +[ + { + "report": { + "id": "ERR4918752", + "alias": "webin-reads-SB06_18C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:32:19", + "firstPublic": "2022-08-02T17:24:21", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783661", + "sampleId": "ERS5444665" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918751", + "alias": "webin-reads-SB06_13C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:31:46", + "firstPublic": "2022-08-02T17:24:21", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783660", + "sampleId": "ERS5444660" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918750", + "alias": "webin-reads-SB10_16C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:31:17", + "firstPublic": "2022-08-02T17:25:05", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783659", + "sampleId": "ERS5444722" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918749", + "alias": "webin-reads-SB10_30C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:30:42", + "firstPublic": "2022-08-02T17:24:20", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783658", + "sampleId": "ERS5444736" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918748", + "alias": "webin-reads-SB11_25C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:30:20", + 
"firstPublic": "2022-08-02T17:25:03", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783657", + "sampleId": "ERS5444756" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918747", + "alias": "webin-reads-SB11_13C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:30:12", + "firstPublic": "2022-08-02T17:25:04", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783656", + "sampleId": "ERS5444744" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918746", + "alias": "webin-reads-SB11_29C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:29:52", + "firstPublic": "2022-08-02T17:24:21", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783655", + "sampleId": "ERS5444760" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918745", + "alias": "webin-reads-SA09_02C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:29:29", + "firstPublic": "2022-08-02T17:24:20", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783654", + "sampleId": "ERS5444529" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918744", + "alias": "webin-reads-SB08_05C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:29:27", + "firstPublic": "2022-08-02T17:24:21", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783653", + "sampleId": "ERS5444707" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918743", + "alias": "webin-reads-SB11_28C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:29:25", + "firstPublic": "2022-08-02T17:25:05", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + 
"studyId": "ERP125469", + "experimentId": "ERX4783652", + "sampleId": "ERS5444759" + }, + "links": [] + } +] diff --git a/tests/fixtures/responses/public_run.json b/tests/fixtures/responses/public_run.json new file mode 100644 index 0000000..cc790e8 --- /dev/null +++ b/tests/fixtures/responses/public_run.json @@ -0,0 +1,7 @@ +[ + { + "run_accession": "ERR4918394", + "sample_accession": "SAMEA7687881", + "secondary_study_accession": "ERP125469" + } +] diff --git a/tests/fixtures/responses/public_run_from_assembly.xml b/tests/fixtures/responses/public_run_from_assembly.xml new file mode 100644 index 0000000..7d5b620 --- /dev/null +++ b/tests/fixtures/responses/public_run_from_assembly.xml @@ -0,0 +1,25 @@ + + + + ERZ2626953 + webin-genome-ERR4918394_62952e961c26f0859b4ebdd83d93e4e7 + + Genome assembly: ERR4918394_62952e961c26f0859b4ebdd83d93e4e7 + + + ERP125469 + PRJEB41657 + + + + + ERS5444411 + SAMEA7687881 + + + + + ERR4918394 + + + diff --git a/tests/fixtures/responses/public_sample.json b/tests/fixtures/responses/public_sample.json new file mode 100644 index 0000000..d59db96 --- /dev/null +++ b/tests/fixtures/responses/public_sample.json @@ -0,0 +1,8 @@ +[ + { + "sample_accession": "SAMEA7687881", + "collection_date": "2019-06-11", + "country": "Norway", + "location": "66.079905 N 12.587848 E" + } +] diff --git a/tests/fixtures/responses/public_study.json b/tests/fixtures/responses/public_study.json new file mode 100644 index 0000000..eb360c6 --- /dev/null +++ b/tests/fixtures/responses/public_study.json @@ -0,0 +1,6 @@ +[ + { + "study_accession": "PRJEB41657", + "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265])." 
+ } + ] diff --git a/tests/fixtures/responses/public_study_runs.json b/tests/fixtures/responses/public_study_runs.json new file mode 100644 index 0000000..ae8c790 --- /dev/null +++ b/tests/fixtures/responses/public_study_runs.json @@ -0,0 +1,52 @@ +[ + { + "run_accession": "ERR4918394", + "sample_accession": "SAMEA7687881", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918397", + "sample_accession": "SAMEA7687882", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918398", + "sample_accession": "SAMEA7678309", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918404", + "sample_accession": "SAMEA7687910", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918405", + "sample_accession": "SAMEA7687902", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918406", + "sample_accession": "SAMEA7687895", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918408", + "sample_accession": "SAMEA7687899", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918413", + "sample_accession": "SAMEA7687903", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918414", + "sample_accession": "SAMEA7687912", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918416", + "sample_accession": "SAMEA7687892", + "instrument_model": "DNBSEQ-G400" + } +] diff --git a/tests/test_dummy.py b/tests/test_dummy.py deleted file mode 100644 index 9095527..0000000 --- a/tests/test_dummy.py +++ /dev/null @@ -1,5 +0,0 @@ - -class TestDummy: - - def test_dummy(self): - assert 1 == 1 \ No newline at end of file diff --git a/tests/unit/test_ena.py b/tests/unit/test_ena.py new file mode 100644 index 0000000..902cf13 --- /dev/null +++ b/tests/unit/test_ena.py @@ -0,0 +1,156 @@ +import json + +import pytest +import responses +from requests.exceptions import ConnectionError, HTTPError + +from genomeuploader.ena import EnaQuery + +ENA_WEBIN = 
"ENA_WEBIN" +ENA_WEBIN_PASSWORD = "ENA_WEBIN_PASSWORD" + + +def read_json(path): + with open(path, "r") as jpath: + return json.load(jpath) + + +def read_xml(path): + with open(path, "r") as xpath: + return xpath.read() + + +def test_ena_study(public_study_data, private_study_data, public_study_json, private_study_xml): + responses.add(responses.POST, "https://www.ebi.ac.uk/ena/portal/api/search", json=read_json(public_study_json)) + + responses.add( + responses.GET, + "https://www.ebi.ac.uk/ena/submit/report/studies/xml/ERP125469", + body=read_xml(private_study_xml), + content_type="application/xml", + ) + + ena_study_public = EnaQuery(accession="ERP125469", query_type="study", private=False) + + ena_study_private = EnaQuery( + accession="ERP125469", + query_type="study", + private=True, + ) + assert ena_study_public.build_query() == public_study_data + assert ena_study_private.build_query() == private_study_data + + +def test_ena_run(public_run_data, private_run_data, public_run_json, private_run_json): + responses.add(responses.POST, "https://www.ebi.ac.uk/ena/portal/api/search", json=read_json(public_run_json)) + + responses.add(responses.GET, "https://www.ebi.ac.uk/ena/submit/report/runs/ERR4918394", json=read_json(private_run_json)) + + ena_run_public = EnaQuery(accession="ERR4918394", query_type="run", private=False) + + ena_run_private = EnaQuery(accession="ERR4918394", query_type="run", private=True) + assert ena_run_public.build_query() == public_run_data + assert ena_run_private.build_query() == private_run_data + + +def test_ena_run_from_assembly(public_run_from_assembly_xml, private_run_from_assembly_xml): + responses.add( + responses.GET, + "https://www.ebi.ac.uk/ena/browser/api/xml/ERZ2626953", + body=read_xml(public_run_from_assembly_xml), + content_type="application/xml", + ) + + responses.add( + responses.GET, + "https://www.ebi.ac.uk/ena/submit/report/analyses/xml/ERZ2626953", + body=read_xml(private_run_from_assembly_xml), + 
content_type="application/xml", + ) + + ena_run_from_assembly_public = EnaQuery(accession="ERZ2626953", query_type="run_assembly", private=False) + + ena_run_from_assembly_private = EnaQuery(accession="ERZ2626953", query_type="run_assembly", private=True) + + assert ena_run_from_assembly_public.build_query() == "ERR4918394" and ena_run_from_assembly_private.build_query() == "ERR4918394" + + + def test_ena_study_runs(public_study_runs_json, private_study_runs_json): + """response mock limited to 10 runs for the study, need to check if there is page limit""" + responses.add(responses.POST, "https://www.ebi.ac.uk/ena/portal/api/search", json=read_json(public_study_runs_json)) + + responses.add(responses.GET, "https://www.ebi.ac.uk/ena/submit/report/runs/ERP125469", json=read_json(private_study_runs_json)) + + ena_study_runs_public = EnaQuery(accession="ERP125469", query_type="study_runs", private=False) + + ena_study_runs_private = EnaQuery(accession="ERP125469", query_type="study_runs", private=True) + + assert len(ena_study_runs_public.build_query()) == 10 and len(ena_study_runs_private.build_query()) == 10 + + + def test_ena_sample(public_sample_data, private_sample_data, public_sample_json, private_sample_xml): + responses.add(responses.POST, "https://www.ebi.ac.uk/ena/portal/api/search", json=read_json(public_sample_json)) + + responses.add(responses.GET, "https://www.ebi.ac.uk/ena/submit/report/samples/xml/ERS5444411", body=read_xml(private_sample_xml)) + + ena_sample_public = EnaQuery(accession="SAMEA7687881", query_type="sample", private=False) + + ena_sample_private = EnaQuery(accession="ERS5444411", query_type="sample", private=True) + + assert ena_sample_public.build_query() == public_sample_data + assert ena_sample_private.build_query() == private_sample_data + + + def test_ena_exceptions(monkeypatch): + monkeypatch.setenv(ENA_WEBIN, "fake-webin-999") + monkeypatch.setenv(ENA_WEBIN_PASSWORD, "fakewebinpw") + + ena_query = EnaQuery( + accession="ERP125469", + query_type="study", + 
private=True, + ) + + public_api_data = { + "format": "json", + "includeMetagenomes": True, + "dataPortal": "ena", + "result": "study", + "query": "secondary_study_accession=ERP125469", + "fields": "study_accession,study_description", + } + + responses.add( + responses.GET, + "https://www.ebi.ac.uk/ena/submit/report/studies/ERP125469", + body=ConnectionError("Test retry private error"), + ) + + responses.add(responses.POST, "https://www.ebi.ac.uk/ena/portal/api/search", body=ConnectionError("Test retry public error")) + + with pytest.raises( + ValueError, + match="Could not find ERP125469 in ENA after 3 attempts. Error: Test retry private error", + ): + ena_query.retry_or_handle_request_error( + request=ena_query.get_request, + url="https://www.ebi.ac.uk/ena/submit/report/studies/ERP125469", + ) + assert responses.assert_call_count("https://www.ebi.ac.uk/ena/submit/report/studies/ERP125469", 3) + + with pytest.raises(ValueError, match="Could not find ERP125469 in ENA after 3 attempts. Error: Test retry public error"): + ena_query.retry_or_handle_request_error(request=ena_query.post_request, data=public_api_data) + assert responses.assert_call_count("https://www.ebi.ac.uk/ena/portal/api/search", 3) + + responses.add( + responses.GET, + "https://www.ebi.ac.uk/ena/submit/report/studies/ERPXYZ", + body=HTTPError("Test failure error"), + ) + + with pytest.raises(HTTPError): + ena_query.retry_or_handle_request_error( + request=ena_query.get_request, + url="https://www.ebi.ac.uk/ena/submit/report/studies/ERPXYZ", + ) + assert responses.assert_call_count("https://www.ebi.ac.uk/ena/submit/report/studies/ERPXYZ", 1)