From e5c1e4626845e63a9112ea253c2a5630b7a8c1d4 Mon Sep 17 00:00:00 2001 From: Varsha Date: Sun, 13 Oct 2024 16:58:43 +0100 Subject: [PATCH 01/20] change endpoints for private data --- genomeuploader/ena.py | 224 ++++++++++++++++++++------------ genomeuploader/genome_upload.py | 48 ++++--- 2 files changed, 169 insertions(+), 103 deletions(-) diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py index 51ab45c..5230aee 100644 --- a/genomeuploader/ena.py +++ b/genomeuploader/ena.py @@ -59,6 +59,9 @@ class NoDataException(ValueError): RETRY_COUNT = 5 +PRIVATE_DATA_URL = "https://www.ebi.ac.uk/ena/submit/report/" +PUBLIC_DATA_URL = "https://www.ebi.ac.uk/ena/portal/api/search" + class ENA(): def get_default_params(self): @@ -68,28 +71,36 @@ def get_default_params(self): 'dataPortal': 'ena' } - def post_request(self, data, webin, password): - url = "https://www.ebi.ac.uk/ena/portal/api/search" - auth = (webin, password) + def post_request(self, data): + url = PUBLIC_DATA_URL default_connection_headers = { "Content-Type": "application/x-www-form-urlencoded", "Accept": "*/*" } - response = requests.post(url, data=data, auth=auth, headers=default_connection_headers) + response = requests.post(url, data=data, headers=default_connection_headers) return response - def get_run(self, run_accession, webin, password, attempt=0, search_params=None): - data = self.get_default_params() - data['result'] = 'read_run' - data['fields'] = RUN_DEFAULT_FIELDS - data['query'] = 'run_accession=\"{}\"'.format(run_accession) - - if search_params: - data.update(search_params) - - response = self.post_request(data, webin, password) + def get_request(self, url, webin, password): + auth = (webin, password) + response = requests.get(url, auth=auth) + return response + + def get_run(self, run_accession, webin, password, private=False, attempt=0, search_params=None): + if not private: + data = self.get_default_params() + data['result'] = 'read_run' + data['fields'] = RUN_DEFAULT_FIELDS + data['query'] = 'run_accession=\"{}\"'.format(run_accession) + + if search_params: + data.update(search_params) + response = self.post_request(data) + else: + url = f'{PRIVATE_DATA_URL}runs/{run_accession}' + response = self.get_request(url, webin, password) + if not response.ok and attempt > 2: raise ValueError("Could not retrieve run with accession {}, returned " "message: {}".format(run_accession, response.text)) @@ -107,99 +118,147 @@ def get_run(self, run_accession, webin, password, attempt=0, search_params=None) raise ValueError("Could not find run {} in ENA.".format(run_accession)) except: raise Exception("Could not query ENA API: {}".format(response.text)) - - return run - - def get_run_from_assembly(self, assembly_name): - manifestXml = minidom.parseString(requests.get("https://www.ebi.ac.uk" + - "/ena/browser/api/xml/" + assembly_name).text) + + if private: + run_data = run['report'] + final_data = {'secondary_study_accession': run_data['studyId'], 'sample_accession': run_data['sampleId']} + return final_data + else: + return run + + # will not work for private, needs modification after checking best endpoint with ENA for run ref + def get_run_from_assembly(self, assembly_name, webin, password, private=False): + if not private: + manifestXml = minidom.parseString(requests.get("https://www.ebi.ac.uk" + + "/ena/browser/api/xml/" + assembly_name).text) + else: + url = f"https://www.ebi.ac.uk/ena/submit/report/analyses/xml/{assembly_name}" + manifestXml = minidom.parseString(requests.get(url, auth=(webin, password)).text) run_ref = manifestXml.getElementsByTagName("RUN_REF") run = run_ref[0].attributes["accession"].value return run - def get_study(self, webin, password, study_accession): - data = self.get_default_params() - data['result'] = "study" - data['fields'] = STUDY_DEFAULT_FIELDS - data['query'] = 'study_accession="{}" OR secondary_study_accession="{}"' \ - .format(study_accession, study_accession) - - data['dataPortal'] = "ena" + def get_study(self, study_accession, webin, password, private=False): + if not private: + data = self.get_default_params() + data['result'] = "study" + data['fields'] = STUDY_DEFAULT_FIELDS + data['query'] = 'study_accession="{}" OR secondary_study_accession="{}"' \ + .format(study_accession, study_accession) - try: - response = self.post_request(data, webin, password) - if response.status_code == 204: - raise NoDataException() + data['dataPortal'] = "ena" try: - studyList = response.json() - assert len(studyList) == 1 - study = studyList[0] - except (IndexError, TypeError, ValueError, KeyError) as e: - raise e - return study - except NoDataException: - print("No info found to fetch study {}".format(study_accession)) - except (IndexError, TypeError, ValueError, KeyError): - print("Failed to fetch study {}, returned error: {}".format(study_accession, response.text)) - - raise ValueError('Could not find study {} in ENA.'.format(study_accession)) - - def get_study_runs(self, study_acc, webin, password, fields=None, search_params=None): - data = self.get_default_params() - data['result'] = 'read_run' - data['fields'] = fields or RUN_DEFAULT_FIELDS - data['query'] = '(study_accession=\"{}\" OR secondary_study_accession=\"{}\")'.format(study_acc, study_acc) - - if search_params: - data.update(search_params) - - response = self.post_request(data, webin, password) + response = self.post_request(data) + if response.status_code == 204: + raise NoDataException() + try: + study = json.loads(response.text)[0] + except (IndexError, TypeError, ValueError, KeyError) as e: + raise e + return study + + except NoDataException: + print("No info found to fetch study {}".format(study_accession)) + except (IndexError, TypeError, ValueError, KeyError): + print("Failed to fetch study {}, returned error: {}".format(study_accession, response.text)) + + raise ValueError('Could not find study {} in ENA.'.format(study_accession)) + else: + url = f"https://www.ebi.ac.uk/ena/submit/report/studies/xml/{study_accession}" + manifestXml = minidom.parseString(requests.get(url, auth=(webin, password)).text) + study_desc= manifestXml.getElementsByTagName("STUDY_DESCRIPTION")[0].firstChild.nodeValue + final_data = {'study_description': study_desc} + return final_data + + + def get_study_runs(self, study_accession, webin, password, private= False, fields=None, search_params=None): + if not private: + data = self.get_default_params() + data['result'] = 'read_run' + data['fields'] = fields or RUN_DEFAULT_FIELDS + data['query'] = '(study_accession=\"{}\" OR secondary_study_accession=\"{}\")'.format(study_accession, study_accession) + + if search_params: + data.update(search_params) + + response = self.post_request(data) + else: + url = f'{PRIVATE_DATA_URL}runs/{study_accession}' + response = self.get_request(url, webin, password) if not response.ok: - raise ValueError("Could not retrieve runs for study %s.", study_acc) + raise ValueError("Could not retrieve runs for study %s.", study_accession) if response.status_code == 204: return [] try: - runs = response.json() + runs = response.json()[0:2] except: raise ValueError("Query against ENA API did not work. Returned " "message: {}".format(response.text)) + + if private: + final_data = [] + for run in runs: + run_data = run['report'] + if 'sampleId' in run_data: + run_data['sample_accession'] = run_data.pop('sampleId') + if 'id' in run_data: + run_data['run_accession'] = run_data.pop('id') + final_data.append(run_data) + return(final_data) + else: + return runs - return runs - - def get_sample(self, sample_accession, webin, password, fields=None, search_params=None, attempt=0): - data = self.get_default_params() - data['result'] = 'sample' - data['fields'] = fields or SAMPLE_DEFAULT_FIELDS - data['query'] = ('(sample_accession=\"{acc}\" OR secondary_sample_accession' - '=\"{acc}\") ').format(acc=sample_accession) + def get_sample(self, sample_accession, webin, password, private=False, fields=None, search_params=None, attempt=0): + if not private: + data = self.get_default_params() + data['result'] = 'sample' + data['fields'] = fields or SAMPLE_DEFAULT_FIELDS + data['query'] = ('(sample_accession=\"{acc}\" OR secondary_sample_accession' + '=\"{acc}\") ').format(acc=sample_accession) - if search_params: - data.update(search_params) + if search_params: + data.update(search_params) - response = self.post_request(data, webin, password) + response = self.post_request(data) - if response.status_code == 200: - sample = response.json() - assert len(sample) == 1 - return sample[0] + if response.status_code == 200: + sample = response.json() + assert len(sample) == 1 + return sample[0] - if response.status_code == 204: - if attempt < 2: - new_params = {'dataPortal': 'metagenome' if data['dataPortal'] == 'ena' else 'ena'} - attempt += 1 - return self.get_sample(sample_accession, webin, password, fields=fields, - search_params=new_params, attempt=attempt) + if response.status_code == 204: + if attempt < 2: + new_params = {'dataPortal': 'metagenome' if data['dataPortal'] == 'ena' else 'ena'} + attempt += 1 + return self.get_sample(sample_accession, webin, password, fields=fields, + search_params=new_params, attempt=attempt) + else: + raise ValueError("Could not find sample {} in ENA after " + "{} attempts.".format(sample_accession, RETRY_COUNT)) else: - raise ValueError("Could not find sample {} in ENA after " - "{} attempts.".format(sample_accession, RETRY_COUNT)) + raise ValueError("Could not retrieve sample with accession {}. " + "Returned message: {}".format(sample_accession, response.text)) else: - raise ValueError("Could not retrieve sample with accession {}. " - "Returned message: {}".format(sample_accession, response.text)) + url = f"https://www.ebi.ac.uk/ena/submit/report/samples/xml/{sample_accession}" + final_data = {} + manifestXml = minidom.parseString(requests.get(url, auth=(webin, password)).text) + sample_attributes = manifestXml.getElementsByTagName('SAMPLE_ATTRIBUTE') + for attribute in sample_attributes: + tag = attribute.getElementsByTagName('TAG')[0].firstChild.nodeValue + if tag == "geographic location (country and/or sea)": + final_data['country'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue + if tag == "geographic location (latitude)": + final_data['latitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue + if tag == "geographic location (longitude)": + final_data['longitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue + if tag == "collection date": + final_data['collection_date'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue + return final_data def query_taxid(self, taxid): url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid) @@ -294,3 +353,4 @@ def handle_genomes_registration(self, sample_xml, submission_xml, webin, passwor logger.info('{} genome samples successfully registered.'.format(str(len(aliasDict)))) return aliasDict + \ No newline at end of file diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 99cf3fb..6ec1f65 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -351,7 +351,7 @@ def extract_genomes_info(inputFile, genomeType, live): return genomeInfo -def extract_ENA_info(genomeInfo, uploadDir, webin, password): +def extract_ENA_info(genomeInfo, uploadDir, webin, password, private=False): logger.info('Retrieving project and run info from ENA (this might take a while)...') # retrieving metadata from runs (and runs from assembly accessions if provided) @@ -360,13 +360,13 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): if genomeInfo[g]["accessionType"] == "assembly": derivedRuns = [] for acc in genomeInfo[g]["accessions"]: - derivedRuns.append(ena.get_run_from_assembly(acc)) + derivedRuns.append(ena.get_run_from_assembly(acc, private)) genomeInfo[g]["accessions"] = derivedRuns allRuns.extend(genomeInfo[g]["accessions"]) runsSet, studySet, samplesDict, tempDict = set(allRuns), set(), {}, {} for r in runsSet: - run_info = ena.get_run(r, webin, password) + run_info = ena.get_run(r, webin, password, private) studySet.add(run_info["secondary_study_accession"]) samplesDict[r] = run_info["sample_accession"] @@ -386,10 +386,10 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): except json.decoder.JSONDecodeError: backupDict = {} for s in studySet: - studyInfo = ena.get_study(webin, password, s) + studyInfo = ena.get_study(s, webin, password, private=False) projectDescription = studyInfo["study_description"] - ENA_info = ena.get_study_runs(s, webin, password) + ENA_info = ena.get_study_runs(s, webin, password, private=False) if ENA_info == []: raise IOError("No runs found on ENA for project {}.".format(s)) @@ -398,21 +398,25 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password): if runAccession not in backupDict: if runAccession in runsSet: sampleAccession = ENA_info[run]["sample_accession"] - sampleInfo = ena.get_sample(sampleAccession, webin, password) - - location = sampleInfo["location"] - latitude, longitude = None, None - if 'N' in location: - latitude = location.split('N')[0].strip() - longitude = location.split('N')[1].strip() - elif 'S' in location: - latitude = '-' + location.split('S')[0].strip() - longitude = location.split('S')[1].strip() - - if 'W' in longitude: - longitude = '-' + longitude.split('W')[0].strip() - elif longitude.endswith('E'): - longitude = longitude.split('E')[0].strip() + sampleInfo = ena.get_sample(sampleAccession, webin, password, private=False) + + if sampleInfo['latitude'] and sampleInfo['longitude']: + latitude = sampleInfo['latitude'] + longitude = ['longitude'] + else: + location = sampleInfo["location"] + latitude, longitude = None, None + if 'N' in location: + latitude = location.split('N')[0].strip() + longitude = location.split('N')[1].strip() + elif 'S' in location: + latitude = '-' + location.split('S')[0].strip() + longitude = location.split('S')[1].strip() + + if 'W' in longitude: + longitude = '-' + longitude.split('W')[0].strip() + elif longitude.endswith('E'): + longitude = longitude.split('E')[0].strip() if latitude: latitude = "{:.{}f}".format(round(float(latitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) @@ -845,6 +849,7 @@ def __init__(self, argv=sys.argv[1:]): self.genomeMetadata = self.args.genome_info self.genomeType = "bins" if self.args.bins else "MAGs" self.live = True if self.args.live else False + self.private = self.args.private if self.args.webin and self.args.password: self.username = self.args.webin @@ -900,6 +905,7 @@ def parse_args(self, argv): parser.add_argument('--webin', required=False, help="Webin id") parser.add_argument('--password', required=False, help="Webin password") parser.add_argument('--centre_name', required=False, help="Name of the centre uploading genomes") + parser.add_argument('--private', required=False, help="if data is private", action='store_true', default=False) args = parser.parse_args(argv) @@ -925,7 +931,7 @@ def create_genome_dictionary(self, samples_xml): genomeInfo = extract_genomes_info(self.genomeMetadata, self.genomeType, self.live) if not os.path.exists(samples_xml) or self.force: - extract_ENA_info(genomeInfo, self.upload_dir, self.username, self.password) + extract_ENA_info(genomeInfo, self.upload_dir, self.username, self.password, self.private) logger.info("Writing genome registration XML...") write_genomes_xml(genomeInfo, samples_xml, self.genomeType, From 2e4b8ac89f6d8ba08838013c76ccbd57f18f97bf Mon Sep 17 00:00:00 2001 From: Varsha Date: Sun, 13 Oct 2024 17:00:09 +0100 Subject: [PATCH 02/20] remove comment --- genomeuploader/ena.py | 1 - 1 file changed, 1 deletion(-) diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py index 5230aee..9b60095 100644 --- a/genomeuploader/ena.py +++ b/genomeuploader/ena.py @@ -126,7 +126,6 @@ def get_run(self, run_accession, webin, password, private=False, attempt=0, sea else: return run - # will not work for private, needs modification after checking best endpoint with ENA for run ref def get_run_from_assembly(self, assembly_name, webin, password, private=False): if not private: manifestXml = minidom.parseString(requests.get("https://www.ebi.ac.uk" + From c816a0c1aecf3f0c5a8de2a251227639a280d19e Mon Sep 17 00:00:00 2001 From: Varsha Date: Sun, 13 Oct 2024 17:01:20 +0100 Subject: [PATCH 03/20] remove subset for test --- genomeuploader/ena.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py index 9b60095..ef0082b 100644 --- a/genomeuploader/ena.py +++ b/genomeuploader/ena.py @@ -194,7 +194,7 @@ def get_study_runs(self, study_accession, webin, password, private= False, field return [] try: - runs = response.json()[0:2] + runs = response.json() except: raise ValueError("Query against ENA API did not work. Returned " "message: {}".format(response.text)) From 7c6e7c952ecc3c6c37ee133da857c18beedaf9cf Mon Sep 17 00:00:00 2001 From: Varsha Date: Sun, 13 Oct 2024 17:06:34 +0100 Subject: [PATCH 04/20] code corrections --- genomeuploader/ena.py | 2 +- genomeuploader/genome_upload.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py index ef0082b..534b2ea 100644 --- a/genomeuploader/ena.py +++ b/genomeuploader/ena.py @@ -167,7 +167,7 @@ def get_study(self, study_accession, webin, password, private=False): else: url = f"https://www.ebi.ac.uk/ena/submit/report/studies/xml/{study_accession}" manifestXml = minidom.parseString(requests.get(url, auth=(webin, password)).text) - study_desc= manifestXml.getElementsByTagName("STUDY_DESCRIPTION")[0].firstChild.nodeValue + study_desc = manifestXml.getElementsByTagName("STUDY_DESCRIPTION")[0].firstChild.nodeValue final_data = {'study_description': study_desc} return final_data diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 6ec1f65..5b9a3c8 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -386,10 +386,10 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password, private=False): except json.decoder.JSONDecodeError: backupDict = {} for s in studySet: - studyInfo = ena.get_study(s, webin, password, private=False) + studyInfo = ena.get_study(s, webin, password, private) projectDescription = studyInfo["study_description"] - ENA_info = ena.get_study_runs(s, webin, password, private=False) + ENA_info = ena.get_study_runs(s, webin, password, private) if ENA_info == []: raise IOError("No runs found on ENA for project {}.".format(s)) @@ -398,11 +398,11 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password, private=False): if runAccession not in backupDict: if runAccession in runsSet: sampleAccession = ENA_info[run]["sample_accession"] - sampleInfo = ena.get_sample(sampleAccession, webin, password, private=False) + sampleInfo = ena.get_sample(sampleAccession, webin, password, private) if sampleInfo['latitude'] and sampleInfo['longitude']: latitude = sampleInfo['latitude'] - longitude = ['longitude'] + longitude = sampleInfo['longitude'] else: location = sampleInfo["location"] latitude, longitude = None, None From 2cbae287955ff7f93c02cfff0442b0b9c267e440 Mon Sep 17 00:00:00 2001 From: Varsha Date: Tue, 15 Oct 2024 16:21:10 +0100 Subject: [PATCH 05/20] add instrument model to run data --- genomeuploader/ena.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py index 534b2ea..c62a168 100644 --- a/genomeuploader/ena.py +++ b/genomeuploader/ena.py @@ -128,12 +128,12 @@ def get_run(self, run_accession, webin, password, private=False, attempt=0, sea def get_run_from_assembly(self, assembly_name, webin, password, private=False): if not private: - manifestXml = minidom.parseString(requests.get("https://www.ebi.ac.uk" + - "/ena/browser/api/xml/" + assembly_name).text) + response = requests.get("https://www.ebi.ac.uk/ena/browser/api/xml/" + assembly_name) + else: - url = f"https://www.ebi.ac.uk/ena/submit/report/analyses/xml/{assembly_name}" - manifestXml = minidom.parseString(requests.get(url, auth=(webin, password)).text) + response = requests.get("https://www.ebi.ac.uk/ena/submit/report/analyses/xml/" + assembly_name, auth=(webin, password)) + manifestXml = minidom.parseString(response.text) run_ref = manifestXml.getElementsByTagName("RUN_REF") run = run_ref[0].attributes["accession"].value @@ -207,6 +207,8 @@ def get_study_runs(self, study_accession, webin, password, private= False, field run_data['sample_accession'] = run_data.pop('sampleId') if 'id' in run_data: run_data['run_accession'] = run_data.pop('id') + if 'instrumentModel' in run_data: + run_data['instrument_model'] = run_data.pop('instrumentModel') final_data.append(run_data) return(final_data) else: From 3b09a7e8f1add94fde8a1472724e5694b15307bd Mon Sep 17 00:00:00 2001 From: Varsha Date: Wed, 16 Oct 2024 09:44:29 +0100 Subject: [PATCH 06/20] fix code bug --- genomeuploader/genome_upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 5b9a3c8..e62c89f 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -400,7 +400,7 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password, private=False): sampleAccession = ENA_info[run]["sample_accession"] sampleInfo = ena.get_sample(sampleAccession, webin, password, private) - if sampleInfo['latitude'] and sampleInfo['longitude']: + if 'latitude' in sampleInfo and 'longitude' in sampleInfo: latitude = sampleInfo['latitude'] longitude = sampleInfo['longitude'] else: From 5ee563597dab1efa01b16aa7e6bb27b7fc6436e1 Mon Sep 17 00:00:00 2001 From: Varsha Date: Tue, 17 Dec 2024 15:18:54 +0000 Subject: [PATCH 07/20] separate public and private API calls and streamline exceptions --- genomeuploader/ena.py | 444 +++++++++++++++++++++++------------------- 1 file changed, 247 insertions(+), 197 deletions(-) diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py index c62a168..110f979 100644 --- a/genomeuploader/ena.py +++ b/genomeuploader/ena.py @@ -19,9 +19,12 @@ import requests import json import logging +import os from time import sleep +import sys import xml.dom.minidom as minidom +from requests.exceptions import ConnectionError, HTTPError, RequestException, Timeout logging.basicConfig(level=logging.INFO) @@ -33,15 +36,19 @@ class NoDataException(ValueError): RUN_DEFAULT_FIELDS = ','.join([ - 'study_accession', 'secondary_study_accession', - 'instrument_model', - 'run_accession', 'sample_accession' ]) +STUDY_RUN_DEFAULT_FIELDS = ','.join([ + 'sample_accession', + 'run_accession', + 'instrument_model' +]) + ASSEMBLY_DEFAULT_FIELDS = 'sample_accession' + SAMPLE_DEFAULT_FIELDS = ','.join([ 'sample_accession', 'secondary_sample_accession', @@ -50,216 +57,259 @@ class NoDataException(ValueError): 'location' ]) -STUDY_DEFAULT_FIELDS = ','.join([ - 'study_accession', - 'secondary_study_accession', - 'study_description', - 'study_title' -]) +STUDY_DEFAULT_FIELDS = 'study_description' -RETRY_COUNT = 5 +RETRY_COUNT = 3 -PRIVATE_DATA_URL = "https://www.ebi.ac.uk/ena/submit/report/" -PUBLIC_DATA_URL = "https://www.ebi.ac.uk/ena/portal/api/search" +def get_default_connection_headers(): + return { + "headers": { + "Content-Type": "application/x-www-form-urlencoded", + "Accept": "*/*", + } + } + +def get_default_params(): + return { + 'format': 'json', + 'includeMetagenomes': True, + 'dataPortal': 'ena' + } + +def parse_accession(accession): + if accession.startswith("PRJ"): + return "study_accession" + elif "RP" in accession: + return "secondary_study_accession" + elif "RR" in accession: + return "run_accession" + elif "SAM" in accession: + return "secondary_sample_accession" + elif "RS" in accession: + return "sample_accession" + else: + logging.error(f"{accession} is not a valid accession") + sys.exit() class ENA(): - def get_default_params(self): - return { - 'format': 'json', - 'includeMetagenomes': True, - 'dataPortal': 'ena' - } + def __init__(self, accession, private=False): + self.private_url = "https://www.ebi.ac.uk/ena/submit/report" + self.public_url = "https://www.ebi.ac.uk/ena/portal/api/search" + self.browser_url = "https://www.ebi.ac.uk/ena/browser/api/xml" + self.accession = accession + self.acc_type = parse_accession(accession) + username = os.getenv("ENA_WEBIN") + password = os.getenv("ENA_WEBIN_PASSWORD") + if username is None or password is None: + logging.error("ENA_WEBIN and ENA_WEBIN_PASSWORD are not set") + if username and password: + self.auth = (username, password) + else: + self.auth = None + self.private = private + def post_request(self, data): - url = PUBLIC_DATA_URL - default_connection_headers = { - "Content-Type": "application/x-www-form-urlencoded", - "Accept": "*/*" - } - response = requests.post(url, data=data, headers=default_connection_headers) - + response = requests.post( + self.public_url, data=data, **get_default_connection_headers() + ) return response - def get_request(self, url, webin, password): - auth = (webin, password) - response = requests.get(url, auth=auth) - - return response - - def get_run(self, run_accession, webin, password, private=False, attempt=0, search_params=None): - if not private: - data = self.get_default_params() - data['result'] = 'read_run' - data['fields'] = RUN_DEFAULT_FIELDS - data['query'] = 'run_accession=\"{}\"'.format(run_accession) - - if search_params: - data.update(search_params) - response = self.post_request(data) + def get_request(self, url): + if self.private: + response = requests.get(url, auth=self.auth) else: - url = f'{PRIVATE_DATA_URL}runs/{run_accession}' - response = self.get_request(url, webin, password) - - if not response.ok and attempt > 2: - raise ValueError("Could not retrieve run with accession {}, returned " - "message: {}".format(run_accession, response.text)) - elif response.status_code == 204: - if attempt < 2: + response = requests.get(url) + return response + + def get_data_or_handle_error(self, response, xml=False, full_json=False): + try: + data_txt = response.text + if data_txt is None: + if self.private: + logging.error( + f"{self.accession} private data is not present in the specified Webin account" + ) + else: + logging.error(f"{self.accession} public data does not exist") + else: + if xml: + return minidom.parseString(data_txt) + elif full_json: + # return the full json when more than one entry expected + return response.json() + else: + # only return the fist element in the list is the default. In these cases there should only be one entry returned + return json.loads(response.txt)[0] + except (IndexError, TypeError, ValueError, KeyError): + logging.error( + f"Failed to fetch {self.accession}, returned error: {response.text}" + ) + + def retry_or_handle_request_error(self, request, *args, **kwargs): + attempt = 0 + while attempt < RETRY_COUNT: + try: + response = request(*args, **kwargs) + response.raise_for_status() + return response + # all other RequestExceptions are raised below + except (Timeout, ConnectionError) as retry_err: attempt += 1 + if attempt >= RETRY_COUNT: + raise ValueError( + f"Could not find {self.accession} in ENA after {RETRY_COUNT} attempts. Error: {retry_err}" + ) sleep(1) - return self.get_run(run_accession, webin, password, attempt) - else: - raise ValueError("Could not find run {} in ENA after {}" - " attempts".format(run_accession, RETRY_COUNT)) - try: - run = json.loads(response.text)[0] - except (IndexError, TypeError, ValueError): - raise ValueError("Could not find run {} in ENA.".format(run_accession)) - except: - raise Exception("Could not query ENA API: {}".format(response.text)) - - if private: - run_data = run['report'] - final_data = {'secondary_study_accession': run_data['studyId'], 'sample_accession': run_data['sampleId']} - return final_data - else: - return run - - def get_run_from_assembly(self, assembly_name, webin, password, private=False): - if not private: - response = requests.get("https://www.ebi.ac.uk/ena/browser/api/xml/" + assembly_name) - - else: - response = requests.get("https://www.ebi.ac.uk/ena/submit/report/analyses/xml/" + assembly_name, auth=(webin, password)) - - manifestXml = minidom.parseString(response.text) - run_ref = manifestXml.getElementsByTagName("RUN_REF") + except HTTPError as http_err: + print(f"HTTP response has an error status: {http_err}") + raise + except RequestException as req_err: + print(f"Network-related error status: {req_err}") + raise + # should hopefully encompass all other issues... + except Exception as e: + print(f"An unexpected error occurred: {e}") + raise + + def _get_private_run(self): + url = f"{self.private_url}/runs/{self.accession}" + response = self.retry_or_handle_request_error(self.get_request, url) + run = self.get_data_or_handle_error(response) + run_data = run["report"] + reformatted_data = { + "secondary_study_accession": run_data['studyID'], + "sample_accession": run_data["sampleId"], + } + logging.info(f"{self.accession} private run returned from ENA") + return reformatted_data + + def _get_public_run(self): + data = get_default_params() + data.update( + { + "result": "read_run", + "query": f'run_accession="{self.accession}"', + "fields": "secondary_study_accession,sample_accession", + } + ) + response = self.retry_or_handle_request_error(self.post_request, data) + run = self.get_data_or_handle_error(response) + logging.info(f"{self.accession} public run returned from ENA") + return run + + + def _get_private_study(self): + url = f"{self.private_url}/studies/xml/{self.accession}" + response = self.retry_or_handle_request_error(self.get_request, url) + study = self.get_data_or_handle_error(response, xml=True) + study_desc = study.getElementsByTagName("STUDY_DESCRIPTION")[0].firstChild.nodeValue + reformatted_data = { + "study_description" : study_desc + } + logging.info(f"{self.accession} private study returned from ENA") + return reformatted_data + + def _get_public_study(self): + data = get_default_params() + data.update( + { + "result": "study", + "query": f'{self.acc_type}="{self.accession}"', + "fields": STUDY_DEFAULT_FIELDS, + } + ) + response = self.retry_or_handle_request_error(self.post_request, data) + study = self.get_data_or_handle_error(response) + logging.info(f"{self.accession} public study returned from ENA") + return study + + def _get_private_run_from_assembly(self): + url = f"{self.private_url}/analyses/xml/{self.accession}" + reponse = self.retry_or_handle_request_error(self.get_request, url) + parsed_xml = self.get_data_or_handle_error(reponse, xml=True) + run_ref = parsed_xml.getElementsByTagName("RUN_REF") run = run_ref[0].attributes["accession"].value - + logging.info(f"private run from the assembly {self.accession} returned from ENA") return run - def get_study(self, study_accession, webin, password, private=False): - if not private: - data = self.get_default_params() - data['result'] = "study" - data['fields'] = STUDY_DEFAULT_FIELDS - data['query'] = 'study_accession="{}" OR secondary_study_accession="{}"' \ - .format(study_accession, study_accession) - - data['dataPortal'] = "ena" - try: - response = self.post_request(data) - if response.status_code == 204: - raise NoDataException() - try: - study = json.loads(response.text)[0] - except (IndexError, TypeError, ValueError, KeyError) as e: - raise e - return study - - except NoDataException: - print("No info found to fetch study {}".format(study_accession)) - except (IndexError, TypeError, ValueError, KeyError): - print("Failed to fetch study {}, returned error: {}".format(study_accession, response.text)) - - raise ValueError('Could not find study {} in ENA.'.format(study_accession)) - else: - url = f"https://www.ebi.ac.uk/ena/submit/report/studies/xml/{study_accession}" - manifestXml = minidom.parseString(requests.get(url, auth=(webin, password)).text) - study_desc = manifestXml.getElementsByTagName("STUDY_DESCRIPTION")[0].firstChild.nodeValue - final_data = {'study_description': study_desc} - return final_data - - - def get_study_runs(self, study_accession, webin, password, private= False, fields=None, search_params=None): - if not private: - data = self.get_default_params() - data['result'] = 'read_run' - data['fields'] = fields or RUN_DEFAULT_FIELDS - data['query'] = '(study_accession=\"{}\" OR secondary_study_accession=\"{}\")'.format(study_accession, study_accession) - - if search_params: - data.update(search_params) - - response = self.post_request(data) - else: - url = f'{PRIVATE_DATA_URL}runs/{study_accession}' - response = self.get_request(url, webin, password) - - if not response.ok: - raise ValueError("Could not retrieve runs for study %s.", study_accession) - - if response.status_code == 204: - return [] - - try: - runs = response.json() - except: - raise ValueError("Query against ENA API did not work. Returned " - "message: {}".format(response.text)) - - if private: - final_data = [] - for run in runs: - run_data = run['report'] - if 'sampleId' in run_data: - run_data['sample_accession'] = run_data.pop('sampleId') - if 'id' in run_data: - run_data['run_accession'] = run_data.pop('id') - if 'instrumentModel' in run_data: - run_data['instrument_model'] = run_data.pop('instrumentModel') - final_data.append(run_data) - return(final_data) - else: - return runs + def _get_public_run_from_assembly(self): + url = f"{self.browser_url}/{self.accession}" + reponse = self.retry_or_handle_request_error(self.get_request, url) + parsed_xml = self.get_data_or_handle_error(reponse, xml=True) + run_ref = parsed_xml.getElementsByTagName("RUN_REF") + run = run_ref[0].attributes["accession"].value + logging.info(f"public run from the assembly {self.accession} returned from ENA") - def get_sample(self, sample_accession, webin, password, private=False, fields=None, search_params=None, attempt=0): - if not private: - data = self.get_default_params() - data['result'] = 'sample' - data['fields'] = fields or SAMPLE_DEFAULT_FIELDS - data['query'] = ('(sample_accession=\"{acc}\" OR secondary_sample_accession' - '=\"{acc}\") ').format(acc=sample_accession) + return run - if search_params: - data.update(search_params) + def _get_private_study_runs(self): + url = f"{self.private_url}/runs/{self.accession}" + response = self.retry_or_handle_request_error(self.get_request, url) + runs = self.get_data_or_handle_error(response, full_json=True) + reformatted_data = [] + for run in runs: + run_data = run['report'] + if 'sampleId' in run_data: + run_data['sample_accession'] = run_data.pop('sampleId') + if 'id' in run_data: + run_data['run_accession'] = run_data.pop('id') + if 'instrumentModel' in run_data: + run_data['instrument_model'] = run_data.pop('instrumentModel') + reformatted_data.append(run_data) + logging.info(f"private runs from study {self.accession}, returned from ENA") + return reformatted_data + + + def _get_public_study_runs(self): + data = get_default_params() + data.update( + { + "result": "read_run", + "fields": STUDY_RUN_DEFAULT_FIELDS, + "query": f"{self.acc_type}={self.accession}" + } + ) + response = self.retry_or_handle_request_error(self.post_request, data) + runs = self.get_data_or_handle_error(response, full_json=True) + logging.info(f"public runs from study {self.accession}, returned from ENA") + return runs + + def _get_private_sample(self): + url = f"{self.private_url}/samples/xml/{self.accession}" + reponse = self.retry_or_handle_request_error(self.get_request, url) + reformatted_data = {} + parsed_xml = self.get_data_or_handle_error(reponse, xml=True) + sample_attributes = parsed_xml.getElementsByTagName('SAMPLE_ATTRIBUTE') + for attribute in sample_attributes: + tag = attribute.getElementsByTagName('TAG')[0].firstChild.nodeValue + if tag == "geographic location (country and/or sea)": + reformatted_data['country'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue + if tag == "geographic location (latitude)": + reformatted_data['latitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue + if tag == "geographic location (longitude)": + reformatted_data['longitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue + if tag == "collection date": + reformatted_data['collection_date'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue + logging.info(f"{self.accession} private sample returned from ENA") + return reformatted_data + + + def _get_public_sample(self): + data = get_default_params() + data.update( + { + "result": "sample", + "fields": SAMPLE_DEFAULT_FIELDS, + "query": f"{self.acc_type}={self.accession}" + } + ) + response = self.retry_or_handle_request_error(self.post_request, data) + sample = self.get_data_or_handle_error(response) + logging.info(f"{self.accession} public sample returned from ENA") + return sample - response = self.post_request(data) - - if response.status_code == 200: - sample = response.json() - assert len(sample) == 1 - return sample[0] - - if response.status_code == 204: - if attempt < 2: - new_params = {'dataPortal': 'metagenome' if data['dataPortal'] == 'ena' else 'ena'} - attempt += 1 - return self.get_sample(sample_accession, webin, password, fields=fields, - search_params=new_params, attempt=attempt) - else: - raise ValueError("Could not find sample {} in ENA after " - "{} attempts.".format(sample_accession, RETRY_COUNT)) - else: - raise ValueError("Could not retrieve sample with accession {}. " - "Returned message: {}".format(sample_accession, response.text)) - else: - url = f"https://www.ebi.ac.uk/ena/submit/report/samples/xml/{sample_accession}" - final_data = {} - manifestXml = minidom.parseString(requests.get(url, auth=(webin, password)).text) - sample_attributes = manifestXml.getElementsByTagName('SAMPLE_ATTRIBUTE') - for attribute in sample_attributes: - tag = attribute.getElementsByTagName('TAG')[0].firstChild.nodeValue - if tag == "geographic location (country and/or sea)": - final_data['country'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue - if tag == "geographic location (latitude)": - final_data['latitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue - if tag == "geographic location (longitude)": - final_data['longitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue - if tag == "collection date": - final_data['collection_date'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue - return final_data def query_taxid(self, taxid): url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid) @@ -306,7 +356,7 @@ def query_scientific_name(self, scientificName, searchRank=False): else: return submittable, taxid - def handle_genomes_registration(self, sample_xml, submission_xml, webin, password, live=False): + def handle_genomes_registration(self, sample_xml, submission_xml, live=False): liveSub, mode = "", "live" if not live: @@ -322,7 +372,7 @@ def handle_genomes_registration(self, sample_xml, submission_xml, webin, passwor 'SAMPLE': open(sample_xml, 'r') } - submissionResponse = requests.post(url, files = f, auth = (webin, password)) + submissionResponse = requests.post(url, files = f, auth = self.auth) if submissionResponse.status_code != 200: if str(submissionResponse.status_code).startswith('5'): From 0e5d8d055a68281517df5e4e300b0ff64b724512 Mon Sep 17 00:00:00 2001 From: Varsha Date: Tue, 17 Dec 2024 17:15:58 +0000 Subject: [PATCH 08/20] remove webin credential parameter, remove webin credentials from genome upload script, redfine ena functions --- genomeuploader/ena.py | 171 ++++++++++++++------- genomeuploader/genome_upload.py | 255 +++++++++++++++----------------- 2 files changed, 236 insertions(+), 190 deletions(-) diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py index 110f979..7682799 100644 --- a/genomeuploader/ena.py +++ b/genomeuploader/ena.py @@ -19,9 +19,11 @@ import requests import json import logging -import os +import os +import sys from time import sleep -import sys +from pathlib import Path +from dotenv import load_dotenv import xml.dom.minidom as minidom from requests.exceptions import ConnectionError, HTTPError, RequestException, Timeout @@ -91,16 +93,82 @@ def parse_accession(accession): logging.error(f"{accession} is not a valid accession") sys.exit() +def configure_credentials(): + # Config file + user_config = Path.home() / ".genome_uploader.config.env" + if user_config.exists(): + logger.debug("Loading the env variables from ".format(user_config)) + load_dotenv(str(user_config)) + else: + cwd_config = Path.cwd() / ".genome_uploader.config.env" + if cwd_config.exists(): + logger.debug("Loading the variables from the current directory.") + load_dotenv(str(cwd_config)) + else: + logger.debug("Trying to load env variables from the .env file") + # from a local .env file + load_dotenv() + + username = os.getenv("ENA_WEBIN") + password = os.getenv("ENA_WEBIN_PASSWORD") + + return username, password + +def query_taxid(taxid): + url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid) + response = requests.get(url) + + try: + # Will raise exception if response status code is non-200 + response.raise_for_status() + except requests.exceptions.HTTPError as e: + print("Request failed {} with error {}".format(url, e)) + return False + + res = response.json() + + return res.get("scientificName", "") + +def query_scientific_name(scientificName, searchRank=False): + url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{}".format(scientificName) + response = requests.get(url) + + try: + # Will raise exception if response status code is non-200 + response.raise_for_status() + except requests.exceptions.HTTPError as e: + if searchRank: + return False, "", "" + else: + return False, "" + + try: + res = response.json()[0] + except IndexError: + if searchRank: + return False, "", "" + else: + return False, "" + + submittable = res.get("submittable", "").lower() == "true" + taxid = res.get("taxId", "") + rank = res.get("rank", "") -class ENA(): - def __init__(self, accession, private=False): + if searchRank: + return submittable, taxid, rank + else: + return submittable, taxid + + + +class EnaQuery(): + def __init__(self, accession, query_type, private=False): self.private_url = "https://www.ebi.ac.uk/ena/submit/report" self.public_url = "https://www.ebi.ac.uk/ena/portal/api/search" self.browser_url = "https://www.ebi.ac.uk/ena/browser/api/xml" self.accession = accession self.acc_type = parse_accession(accession) - username = os.getenv("ENA_WEBIN") - password = os.getenv("ENA_WEBIN_PASSWORD") + username, password = configure_credentials() if username is None or password is None: logging.error("ENA_WEBIN and ENA_WEBIN_PASSWORD are not set") if username and password: @@ -108,7 +176,7 @@ def __init__(self, accession, private=False): else: self.auth = None self.private = private - + self.query_type = query_type def post_request(self, data): response = requests.post( @@ -311,55 +379,53 @@ def _get_public_sample(self): return sample - def query_taxid(self, taxid): - url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid) - response = requests.get(url) - - try: - # Will raise exception if response status code is non-200 - response.raise_for_status() - except requests.exceptions.HTTPError as e: - print("Request failed {} with error {}".format(url, e)) - return False - - res = response.json() - - return res.get("scientificName", "") - - def query_scientific_name(self, scientificName, searchRank=False): - url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{}".format(scientificName) - response = requests.get(url) - - try: - # Will raise exception if response status code is non-200 - response.raise_for_status() - except requests.exceptions.HTTPError as e: - if searchRank: - return False, "", "" + def build_query(self): + if self.query_type == "study": + if self.private: + ena_response = self._get_private_study() else: - return False, "" - - try: - res = response.json()[0] - except IndexError: - if searchRank: - return False, "", "" + ena_response = self._get_public_study() + elif self.query_type == "run": + if self.private: + ena_response = self._get_private_run() + else: + ena_response = self._get_public_run() + elif self.query_type == "run_assembly": + if self.private: + ena_response = self._get_private_run_from_assembly() + else: + ena_response = self._get_public_run_from_assembly() + elif self.query_type == "study_runs": + if self.private: + ena_response = self._get_private_study_runs() + else: + ena_response = self._get_public_study_runs() + elif self.query_type == "sample": + if self.private: + ena_response = self._get_private_sample() else: - return False, "" + ena_response = self._get_public_sample() + return ena_response - submittable = res.get("submittable", "").lower() == "true" - taxid = res.get("taxId", "") - rank = res.get("rank", "") - if searchRank: - return submittable, taxid, rank +class EnaSubmit(): + def __init__(self, sample_xml, submission_xml, live=False): + self.sample_xml = sample_xml + self.submission_xml = submission_xml + self.live = live + username, password = configure_credentials() + if username is None or password is None: + logging.error("ENA_WEBIN and ENA_WEBIN_PASSWORD are not set") + if username and password: + self.auth = (username, password) else: - return submittable, taxid + self.auth = None + - def handle_genomes_registration(self, sample_xml, submission_xml, live=False): + def handle_genomes_registration(self): liveSub, mode = "", "live" - if not live: + if not self.live: liveSub = "dev" mode = "test" @@ -368,8 +434,8 @@ def handle_genomes_registration(self, sample_xml, submission_xml, live=False): logger.info('Registering sample xml in {} mode.'.format(mode)) f = { - 'SUBMISSION': open(submission_xml, 'r'), - 'SAMPLE': open(sample_xml, 'r') + 'SUBMISSION': open(self.submission_xml, 'r'), + 'SAMPLE': open(self.sample_xml, 'r') } submissionResponse = requests.post(url, files = f, auth = self.auth) @@ -404,4 +470,7 @@ def handle_genomes_registration(self, sample_xml, submission_xml, live=False): logger.info('{} genome samples successfully registered.'.format(str(len(aliasDict)))) return aliasDict - \ No newline at end of file + + + + diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index e62c89f..9a54f5e 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -22,14 +22,13 @@ import json import pandas as pd from datetime import date, datetime as dt -from dotenv import load_dotenv -from pathlib import Path import xml.etree.ElementTree as ET import xml.dom.minidom as minidom import requests -from ena import ENA +import ena +from ena import EnaQuery, EnaSubmit from constants import METAGENOMES, GEOGRAPHIC_LOCATIONS, MQ, HQ @@ -37,8 +36,6 @@ logger = logging.getLogger(__name__) -ena = ENA() - GEOGRAPHY_DIGIT_COORDS = 8 ''' @@ -351,110 +348,6 @@ def extract_genomes_info(inputFile, genomeType, live): return genomeInfo -def extract_ENA_info(genomeInfo, uploadDir, webin, password, private=False): - logger.info('Retrieving project and run info from ENA (this might take a while)...') - - # retrieving metadata from runs (and runs from assembly accessions if provided) - allRuns = [] - for g in genomeInfo: - if genomeInfo[g]["accessionType"] == "assembly": - derivedRuns = [] - for acc in genomeInfo[g]["accessions"]: - derivedRuns.append(ena.get_run_from_assembly(acc, private)) - genomeInfo[g]["accessions"] = derivedRuns - allRuns.extend(genomeInfo[g]["accessions"]) - - runsSet, studySet, samplesDict, tempDict = set(allRuns), set(), {}, {} - for r in runsSet: - run_info = ena.get_run(r, webin, password, private) - studySet.add(run_info["secondary_study_accession"]) - samplesDict[r] = run_info["sample_accession"] - - if not studySet: - raise ValueError("No study corresponding to runs found.") - - backupFile = os.path.join(uploadDir, "ENA_backup.json") - counter = 0 - if not os.path.exists(backupFile): - with open(backupFile, 'w') as file: - pass - with open(backupFile, "r+") as file: - try: - backupDict = json.load(file) - tempDict = dict(backupDict) - logger.info(f"A backup file {backupFile} for ENA sample metadata has been found.") - except json.decoder.JSONDecodeError: - backupDict = {} - for s in studySet: - studyInfo = ena.get_study(s, webin, password, private) - projectDescription = studyInfo["study_description"] - - ENA_info = ena.get_study_runs(s, webin, password, private) - if ENA_info == []: - raise IOError("No runs found on ENA for project {}.".format(s)) - - for run, item in enumerate(ENA_info): - runAccession = ENA_info[run]["run_accession"] - if runAccession not in backupDict: - if runAccession in runsSet: - sampleAccession = ENA_info[run]["sample_accession"] - sampleInfo = ena.get_sample(sampleAccession, webin, password, private) - - if 'latitude' in sampleInfo and 'longitude' in sampleInfo: - latitude = sampleInfo['latitude'] - longitude = sampleInfo['longitude'] - else: - location = sampleInfo["location"] - latitude, longitude = None, None - if 'N' in location: - latitude = location.split('N')[0].strip() - longitude = location.split('N')[1].strip() - elif 'S' in location: - latitude = '-' + location.split('S')[0].strip() - longitude = location.split('S')[1].strip() - - if 'W' in longitude: - longitude = '-' + longitude.split('W')[0].strip() - elif longitude.endswith('E'): - longitude = longitude.split('E')[0].strip() - - if latitude: - latitude = "{:.{}f}".format(round(float(latitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) - else: - latitude = "not provided" - - if longitude: - longitude = "{:.{}f}".format(round(float(longitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) - else: - longitude = "not provided" - - country = sampleInfo["country"].split(':')[0] - if not country in GEOGRAPHIC_LOCATIONS: - country = "not provided" - - collectionDate = sampleInfo["collection_date"] - if collectionDate == "" or collectionDate == "missing": - collectionDate = "not provided" - - tempDict[runAccession] = { - "instrumentModel" : ENA_info[run]["instrument_model"], - "collectionDate" : collectionDate, - "country" : country, - "latitude" : latitude, - "longitude" : longitude, - "projectDescription" : projectDescription, - "study" : s, - "sampleAccession" : samplesDict[runAccession] - } - counter += 1 - - if (counter%10 == 0) or (len(runsSet) - len(backupDict) == counter): - file.seek(0) - file.write(json.dumps(tempDict)) - file.truncate() - tempDict = {**tempDict, **backupDict} - combine_ENA_info(genomeInfo, tempDict) - def multipleElementSet(metadataList): return len(set(metadataList))>1 @@ -830,8 +723,8 @@ def main(): if save: logger.info("Registering genome samples XMLs...") - aliasToNewSampleAccession = ena.handle_genomes_registration(samples_xml, - submission_xml, ENA_uploader.username, ENA_uploader.password, ENA_uploader.live) + ena_submit = EnaSubmit(samples_xml, submission_xml, ENA_uploader.live) + aliasToNewSampleAccession = ena_submit.handle_genomes_registration() saveAccessions(aliasToNewSampleAccession, accessionsFile, writeMode) logger.info("Generating manifest files...") @@ -850,32 +743,6 @@ def __init__(self, argv=sys.argv[1:]): self.genomeType = "bins" if self.args.bins else "MAGs" self.live = True if self.args.live else False self.private = self.args.private - - if self.args.webin and self.args.password: - self.username = self.args.webin - self.password = self.args.password - else: - # Config file - user_config = Path.home() / ".genome_uploader.config.env" - if user_config.exists(): - logger.debug("Loading the env variables from ".format(user_config)) - load_dotenv(str(user_config)) - else: - cwd_config = Path.cwd() / ".genome_uploader.config.env" - if cwd_config.exists(): - logger.debug("Loading the variables from the current directory.") - load_dotenv(str(cwd_config)) - else: - logger.debug("Trying to load env variables from the .env file") - # from a local .env file - load_dotenv() - - self.username = os.getenv("ENA_WEBIN") - self.password = os.getenv("ENA_WEBIN_PASSWORD") - - if not self.username or not self.password: - logger.error("ENA Webin username or password are empty") - sys.exit(1) self.tpa = True if self.args.tpa else False self.centre_name = self.args.centre_name @@ -883,6 +750,8 @@ def __init__(self, argv=sys.argv[1:]): workDir = self.args.out if self.args.out else os.getcwd() self.upload_dir = self.generate_genomes_upload_dir(workDir, self.genomeType) + + self.private = self.args.private def parse_args(self, argv): parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, @@ -902,8 +771,6 @@ def parse_args(self, argv): parser.add_argument('--tpa', action='store_true', help="Select if uploading TPA-generated genomes") # Users can provide their credentials and centre name manually or using a config file - parser.add_argument('--webin', required=False, help="Webin id") - parser.add_argument('--password', required=False, help="Webin password") parser.add_argument('--centre_name', required=False, help="Name of the centre uploading genomes") parser.add_argument('--private', required=False, help="if data is private", action='store_true', default=False) @@ -917,6 +784,116 @@ def parse_args(self, argv): return args + def extract_ENA_info(self, genomeInfo, uploadDir, private=False): + logger.info('Retrieving project and run info from ENA (this might take a while)...') + + # retrieving metadata from runs (and runs from assembly accessions if provided) + allRuns = [] + for g in genomeInfo: + if genomeInfo[g]["accessionType"] == "assembly": + derivedRuns = [] + for acc in genomeInfo[g]["accessions"]: + ena_query = EnaQuery(acc, "run_assembly", self.private) + derivedRuns.append(ena_query.build_query()) + genomeInfo[g]["accessions"] = derivedRuns + allRuns.extend(genomeInfo[g]["accessions"]) + + runsSet, studySet, samplesDict, tempDict = set(allRuns), set(), {}, {} + for r in runsSet: + ena_query = EnaQuery(r, "run", self.private) + run_info = ena_query.build_query() + studySet.add(run_info["secondary_study_accession"]) + samplesDict[r] = run_info["sample_accession"] + + if not studySet: + raise ValueError("No study corresponding to runs found.") + + backupFile = os.path.join(uploadDir, "ENA_backup.json") + counter = 0 + if not os.path.exists(backupFile): + with open(backupFile, 'w') as file: + pass + with open(backupFile, "r+") as file: + try: + backupDict = json.load(file) + tempDict = dict(backupDict) + logger.info(f"A backup file {backupFile} for ENA sample metadata has been found.") + except json.decoder.JSONDecodeError: + backupDict = {} + for s in studySet: + ena_query = EnaQuery(s, "study", self.private) + studyInfo = ena_query.build_query() + projectDescription = studyInfo["study_description"] + + ena_query = EnaQuery(s, "study_runs", self.private) + ENA_info = ena_query.build_query() + if ENA_info == []: + raise IOError("No runs found on ENA for project {}.".format(s)) + + for run, item in enumerate(ENA_info): + runAccession = ENA_info[run]["run_accession"] + if runAccession not in backupDict: + if runAccession in runsSet: + sampleAccession = ENA_info[run]["sample_accession"] + ena_query = EnaQuery(sampleAccession, "sample", self.private) + sampleInfo = ena_query.build_query() + + if 'latitude' in sampleInfo and 'longitude' in sampleInfo: + latitude = sampleInfo['latitude'] + longitude = sampleInfo['longitude'] + else: + location = sampleInfo["location"] + latitude, longitude = None, None + if 'N' in location: + latitude = location.split('N')[0].strip() + longitude = location.split('N')[1].strip() + elif 'S' in location: + latitude = '-' + location.split('S')[0].strip() + longitude = location.split('S')[1].strip() + + if 'W' in longitude: + longitude = '-' + longitude.split('W')[0].strip() + elif longitude.endswith('E'): + longitude = longitude.split('E')[0].strip() + + if latitude: + latitude = "{:.{}f}".format(round(float(latitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) + else: + latitude = "not provided" + + if longitude: + longitude = "{:.{}f}".format(round(float(longitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) + else: + longitude = "not provided" + + country = sampleInfo["country"].split(':')[0] + if not country in GEOGRAPHIC_LOCATIONS: + country = "not provided" + + collectionDate = sampleInfo["collection_date"] + if collectionDate == "" or collectionDate == "missing": + collectionDate = "not provided" + + tempDict[runAccession] = { + "instrumentModel" : ENA_info[run]["instrument_model"], + "collectionDate" : collectionDate, + "country" : country, + "latitude" : latitude, + "longitude" : longitude, + "projectDescription" : projectDescription, + "study" : s, + "sampleAccession" : samplesDict[runAccession] + } + counter += 1 + + if (counter%10 == 0) or (len(runsSet) - len(backupDict) == counter): + file.seek(0) + file.write(json.dumps(tempDict)) + file.truncate() + tempDict = {**tempDict, **backupDict} + combine_ENA_info(genomeInfo, tempDict) + + def generate_genomes_upload_dir(self, dir, genomeType): uploadName = "MAG_upload" if genomeType == "bins": From ea275f27a4dcfa1933bb1bbda8fc989bdfd78e78 Mon Sep 17 00:00:00 2001 From: Varsha Date: Wed, 18 Dec 2024 11:23:04 +0000 Subject: [PATCH 09/20] move lists to contstants, pull arguments out of class, reformat camelCase --- genomeuploader/constants.py | 25 ++ genomeuploader/ena.py | 52 +-- genomeuploader/genome_upload.py | 710 ++++++++++++++++---------------- 3 files changed, 401 insertions(+), 386 deletions(-) diff --git a/genomeuploader/constants.py b/genomeuploader/constants.py index 11d9a5e..5a27b39 100644 --- a/genomeuploader/constants.py +++ b/genomeuploader/constants.py @@ -635,3 +635,28 @@ "Zambia", "Zimbabwe", ] + +BIN_MANDATORY_FIELDS = [ + "genome_name", + "accessions", + "assembly_software", + "binning_software", + "binning_parameters", + "stats_generation_software", + "NCBI_lineage", + "broad_environment", + "local_environment", + "environmental_medium", + "metagenome", + "co-assembly", + "genome_coverage", + "genome_path" +] + +MAG_MANDATORY_FIELDS = [ + "rRNA_presence", + "completeness", + "contamination" +] + +GEOGRAPHY_DIGIT_COORDS = 8 \ No newline at end of file diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py index 7682799..7faab70 100644 --- a/genomeuploader/ena.py +++ b/genomeuploader/ena.py @@ -129,15 +129,15 @@ def query_taxid(taxid): return res.get("scientificName", "") -def query_scientific_name(scientificName, searchRank=False): - url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{}".format(scientificName) +def query_scientific_name(scientific_name, search_rank=False): + url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{}".format(scientific_name) response = requests.get(url) try: # Will raise exception if response status code is non-200 response.raise_for_status() except requests.exceptions.HTTPError as e: - if searchRank: + if search_rank: return False, "", "" else: return False, "" @@ -145,7 +145,7 @@ def query_scientific_name(scientificName, searchRank=False): try: res = response.json()[0] except IndexError: - if searchRank: + if search_rank: return False, "", "" else: return False, "" @@ -154,7 +154,7 @@ def query_scientific_name(scientificName, searchRank=False): taxid = res.get("taxId", "") rank = res.get("rank", "") - if searchRank: + if search_rank: return submittable, taxid, rank else: return submittable, taxid @@ -423,13 +423,13 @@ def __init__(self, sample_xml, submission_xml, live=False): def handle_genomes_registration(self): - liveSub, mode = "", "live" + live_sub, mode = "", "live" if not self.live: - liveSub = "dev" + live_sub = "dev" mode = "test" - url = "https://www{}.ebi.ac.uk/ena/submit/drop-box/submit/".format(liveSub) + url = "https://www{}.ebi.ac.uk/ena/submit/drop-box/submit/".format(live_sub) logger.info('Registering sample xml in {} mode.'.format(mode)) @@ -438,38 +438,38 @@ def handle_genomes_registration(self): 'SAMPLE': open(self.sample_xml, 'r') } - submissionResponse = requests.post(url, files = f, auth = self.auth) + submission_response = requests.post(url, files = f, auth = self.auth) - if submissionResponse.status_code != 200: - if str(submissionResponse.status_code).startswith('5'): + if submission_response.status_code != 200: + if str(submission_response.status_code).startswith('5'): raise Exception("Genomes could not be submitted to ENA as the server " + "does not respond. Please again try later.") else: raise Exception("Genomes could not be submitted to ENA. HTTP response: " + - submissionResponse.reason) + submission_response.reason) - receiptXml = minidom.parseString((submissionResponse.content).decode("utf-8")) - receipt = receiptXml.getElementsByTagName("RECEIPT") + receipt_xml = minidom.parseString((submission_response.content).decode("utf-8")) + receipt = receipt_xml.getElementsByTagName("RECEIPT") success = receipt[0].attributes["success"].value if success == "true": - aliasDict = {} - samples = receiptXml.getElementsByTagName("SAMPLE") + alias_dict = {} + samples = receipt_xml.getElementsByTagName("SAMPLE") for s in samples: - sraAcc = s.attributes["accession"].value + sra_acc = s.attributes["accession"].value alias = s.attributes["alias"].value - aliasDict[alias] = sraAcc + alias_dict[alias] = sra_acc elif success == "false": - errors = receiptXml.getElementsByTagName("ERROR") - finalError = "\tSome genomes could not be submitted to ENA. Please, check the errors below." + errors = receipt_xml.getElementsByTagName("ERROR") + final_error = "\tSome genomes could not be submitted to ENA. Please, check the errors below." for error in errors: - finalError += "\n\t" + error.firstChild.data - finalError += "\n\tIf you wish to validate again your data and metadata, " - finalError += "please use the --force option." - raise Exception(finalError) + final_error += "\n\t" + error.firstChild.data + final_error += "\n\tIf you wish to validate again your data and metadata, " + final_error += "please use the --force option." + raise Exception(final_error) - logger.info('{} genome samples successfully registered.'.format(str(len(aliasDict)))) + logger.info('{} genome samples successfully registered.'.format(str(len(alias_dict)))) - return aliasDict + return alias_dict diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 9a54f5e..9136118 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -30,13 +30,33 @@ import ena from ena import EnaQuery, EnaSubmit -from constants import METAGENOMES, GEOGRAPHIC_LOCATIONS, MQ, HQ +from constants import * logging.basicConfig(level=logging.DEBUG) - logger = logging.getLogger(__name__) -GEOGRAPHY_DIGIT_COORDS = 8 +def parse_args(argv): + parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, + description="Create xmls and manifest files for genome upload to ENA") + + parser.add_argument('-u', '--upload_study', type=str, help="Study accession for genomes upload") + parser.add_argument('--genome_info', type=str, required=True, help="Genomes metadata file") + + genomeType = parser.add_mutually_exclusive_group(required=True) + genomeType.add_argument('-m', '--mags', action='store_true', help="Select for MAG upload") + genomeType.add_argument('-b', '--bins', action='store_true', help="Select for bin upload") + + parser.add_argument('--out', type=str, help="Output folder. Default: working directory") + parser.add_argument('--force', action='store_true', help="Forces reset of sample xml's backups") + parser.add_argument('--live', action='store_true', help="Uploads on ENA. Omitting this " + + "option allows to validate samples beforehand") + parser.add_argument('--tpa', action='store_true', help="Select if uploading TPA-generated genomes") + + # Users can provide their credentials and centre name manually or using a config file + parser.add_argument('--centre_name', required=False, help="Name of the centre uploading genomes") + parser.add_argument('--private', required=False, help="if data is private", action='store_true', default=False) + + return parser.parse_args(argv) ''' Input table: expects the following parameters: @@ -58,55 +78,49 @@ genome_coverage : genome coverage genome_path: path to genome to upload ''' -def read_and_cleanse_metadata_tsv(inputFile, genomeType, live): + +def validate_metadata_tsv(input_file, genome_type, live): logger.info('Retrieving info for genomes to submit...') - binMandatoryFields = ["genome_name", "accessions", - "assembly_software", "binning_software", - "binning_parameters", "stats_generation_software", "NCBI_lineage", - "broad_environment", "local_environment", "environmental_medium", "metagenome", - "co-assembly", "genome_coverage", "genome_path"] - MAGMandatoryFields = ["rRNA_presence", "completeness", "contamination"] - - allFields = MAGMandatoryFields + binMandatoryFields - metadata = pd.read_csv(inputFile, sep='\t', usecols=allFields) + all_fields = MAG_MANDATORY_FIELDS + BIN_MANDATORY_FIELDS + metadata = pd.read_csv(input_file, sep='\t', usecols=all_fields) # make sure there are no empty cells - cleanColumns = list(metadata.dropna(axis=1)) - if genomeType == "MAGs": - missingValues = [item for item in allFields if item not in cleanColumns] + clean_columns = list(metadata.dropna(axis=1)) + if genome_type == "MAGs": + missing_values = [item for item in all_fields if item not in clean_columns] else: - missingValues = [item for item in binMandatoryFields if item not in cleanColumns] + missing_values = [item for item in BIN_MANDATORY_FIELDS if item not in clean_columns] - if missingValues: + if missing_values: raise ValueError("The following mandatory fields have missing values in " + - "the input file: {}".format(", ".join(missingValues))) + "the input file: {}".format(", ".join(missing_values))) # check amount of genomes to register at the same time if len(metadata) >= 5000: raise ValueError("Genomes need to be registered in batches of 5000 genomes or smaller.") # check whether accessions follow the right format - accessions_regExp = re.compile(r"([E|S|D]R[R|Z]\d{6,})") + accessions_reg_exp = re.compile(r"([E|S|D]R[R|Z]\d{6,})") - accessionComparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", + accession_comparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", "correct", "mismatching", "co-assembly"]) - accessionComparison["genome_name"] = metadata["genome_name"] + accession_comparison["genome_name"] = metadata["genome_name"] - accessionComparison["attemptive_accessions"] = metadata["accessions"].map( + accession_comparison["attemptive_accessions"] = metadata["accessions"].map( lambda a: len(a.split(','))) - accessionComparison["correct"] = metadata["accessions"].map( - lambda a: len(accessions_regExp.findall(a))) + accession_comparison["correct"] = metadata["accessions"].map( + lambda a: len(accessions_reg_exp.findall(a))) - accessionComparison["mismatching"] = accessionComparison.apply(lambda row: + accession_comparison["mismatching"] = accession_comparison.apply(lambda row: True if row["attemptive_accessions"] == row["correct"] else None, axis=1).isna() - mismatchingAccessions = accessionComparison[accessionComparison["mismatching"]]["genome_name"] - if not mismatchingAccessions.empty: + mismatching_accessions = accession_comparison[accession_comparison["mismatching"]]["genome_name"] + if not mismatching_accessions.empty: raise ValueError("Run accessions are not correctly formatted for the following " + - "genomes: " + ','.join(mismatchingAccessions.values)) + "genomes: " + ','.join(mismatching_accessions.values)) # check whether completeness and contamination are floats try: @@ -117,14 +131,14 @@ def read_and_cleanse_metadata_tsv(inputFile, genomeType, live): raise ValueError("Completeness, contamination or coverage values should be formatted as floats") # check whether all co-assemblies have more than one run associated and viceversa - accessionComparison["co-assembly"] = metadata["co-assembly"] - coassemblyDiscrepancy = metadata[( - (accessionComparison["correct"] < 2) & (accessionComparison["co-assembly"])) | - ((accessionComparison["correct"] > 1) & (~accessionComparison["co-assembly"]) + accession_comparison["co-assembly"] = metadata["co-assembly"] + coassembly_discrepancy = metadata[( + (accession_comparison["correct"] < 2) & (accession_comparison["co-assembly"])) | + ((accession_comparison["correct"] > 1) & (~accession_comparison["co-assembly"]) )]["genome_name"] - if not coassemblyDiscrepancy.empty: + if not coassembly_discrepancy.empty: raise ValueError("The following genomes show discrepancy between number of runs " - "involved and co-assembly status: " + ','.join(coassemblyDiscrepancy.values)) + "involved and co-assembly status: " + ','.join(coassembly_discrepancy.values)) # are provided metagenomes part of the accepted metagenome list? if False in metadata.apply(lambda row: @@ -150,97 +164,97 @@ def read_and_cleanse_metadata_tsv(inputFile, genomeType, live): timestamp = str(int(dt.timestamp(dt.now()))) timestamp_names = [row["genome_name"] + '_' + timestamp for index, row in metadata.iterrows()] metadata["unique_genome_name"] = timestamp_names - genomeInfo = metadata.set_index("unique_genome_name").transpose().to_dict() + genome_info = metadata.set_index("unique_genome_name").transpose().to_dict() else: - genomeInfo = metadata.set_index("genome_name").transpose().to_dict() + genome_info = metadata.set_index("genome_name").transpose().to_dict() else: raise ValueError("Duplicate names found in genome names") - return genomeInfo + return genome_info def round_stats(stats): - newStat = round(float(stats), 2) - if newStat == 100.0: - newStat = 100 - if newStat == 0: - newStat = 0.0 + new_stat = round(float(stats), 2) + if new_stat == 100.0: + new_stat = 100 + if new_stat == 0: + new_stat = 0.0 - return newStat + return new_stat -def compute_MAG_quality(completeness, contamination, RNApresence): - RNApresent = str(RNApresence).lower() in ["true", "yes", "y"] +def compute_mag_quality(completeness, contamination, rna_presence): + rna_presence = str(rna_presence).lower() in ["true", "yes", "y"] quality = MQ - if float(completeness) >= 90 and float(contamination) <= 5 and RNApresent: + if float(completeness) >= 90 and float(contamination) <= 5 and rna_presence: quality = HQ return quality, completeness, contamination -def extract_tax_info(taxInfo): +def extract_tax_info(tax_info): # if unclassified, block the execution - lineage, kingdomPositionInLineage, digitAnnotation = taxInfo.split(';'), 0, False - lineageFirst = lineage[0] - if "Unclassified " in lineageFirst: - if "Archaea" in lineageFirst: - scientificName = "uncultured archaeon" - elif "Bacteria" in lineageFirst: - scientificName = "uncultured bacterium" - elif "Eukaryota" in lineageFirst: - scientificName = "uncultured eukaryote" - submittable, taxid, rank = ena.query_scientific_name(scientificName, searchRank=True) - return taxid, scientificName + lineage, kingdom_position_lineage, digit_annotation = tax_info.split(';'), 0, False + lineage_first = lineage[0] + if "Unclassified " in lineage_first: + if "Archaea" in lineage_first: + scientific_name = "uncultured archaeon" + elif "Bacteria" in lineage_first: + scientific_name = "uncultured bacterium" + elif "Eukaryota" in lineage_first: + scientific_name = "uncultured eukaryote" + submittable, taxid, rank = ena.query_scientific_name(scientific_name, search_rank=True) + return taxid, scientific_name kingdoms = ["Archaea", "Bacteria", "Eukaryota"] - kingdomTaxa = ["2157", "2", "2759"] + kingdom_taxa = ["2157", "2", "2759"] - selectedKingdom, finalKingdom = kingdoms, "" + selected_kingdom, final_kingdom = kingdoms, "" if lineage[1].isdigit(): - selectedKingdom = kingdomTaxa - kingdomPositionInLineage = 2 - digitAnnotation = True - for index, k in enumerate(selectedKingdom): - if digitAnnotation: - if k == lineage[kingdomPositionInLineage]: - finalKingdom = selectedKingdom[index] + selected_kingdom = kingdom_taxa + kingdom_position_lineage = 2 + digit_annotation = True + for index, k in enumerate(selected_kingdom): + if digit_annotation: + if k == lineage[kingdom_position_lineage]: + final_kingdom = selected_kingdom[index] break else: - if k in lineage[kingdomPositionInLineage]: - finalKingdom = selectedKingdom[index] + if k in lineage[kingdom_position_lineage]: + final_kingdom = selected_kingdom[index] break iterator = len(lineage)-1 submittable = False rank = "" while iterator != -1 and not submittable: - scientificName = lineage[iterator].strip() - if digitAnnotation: - if not '*' in scientificName: - scientificName = ena.query_taxid(scientificName) + scientific_name = lineage[iterator].strip() + if digit_annotation: + if not '*' in scientific_name: + scientific_name = ena.query_taxid(scientific_name) else: iterator -= 1 continue - elif "__" in scientificName: - scientificName = scientificName.split("__")[1] + elif "__" in scientific_name: + scientific_name = scientific_name.split("__")[1] else: - raise ValueError("Unrecognised taxonomy format: " + scientificName) - submittable, taxid, rank = ena.query_scientific_name(scientificName, searchRank=True) + raise ValueError("Unrecognised taxonomy format: " + scientific_name) + submittable, taxid, rank = ena.query_scientific_name(scientific_name, search_rank=True) if not submittable: - if finalKingdom == "Archaea" or finalKingdom == "2157": - submittable, scientificName, taxid = extract_Archaea_info(scientificName, rank) - elif finalKingdom == "Bacteria" or finalKingdom == "2": - submittable, scientificName, taxid = extract_Bacteria_info(scientificName, rank) - elif finalKingdom == "Eukaryota" or finalKingdom == "2759": - submittable, scientificName, taxid = extract_Eukaryota_info(scientificName, rank) + if final_kingdom == "Archaea" or final_kingdom == "2157": + submittable, scientific_name, taxid = extract_archaea_info(scientific_name, rank) + elif final_kingdom == "Bacteria" or final_kingdom == "2": + submittable, scientific_name, taxid = extract_bacteria_info(scientific_name, rank) + elif final_kingdom == "Eukaryota" or final_kingdom == "2759": + submittable, scientific_name, taxid = extract_eukaryota_info(scientific_name, rank) iterator -= 1 - return taxid, scientificName + return taxid, scientific_name -def extract_Eukaryota_info(name, rank): - nonSubmittable = (False, "", 0) +def extract_eukaryota_info(name, rank): + non_submittable = (False, "", 0) # Asterisks in given taxonomy suggest the classification might be not confident enough. if '*' in name: - return nonSubmittable + return non_submittable if rank == "super kingdom": name = "uncultured eukaryote" @@ -262,9 +276,9 @@ def extract_Eukaryota_info(name, rank): if submittable: return submittable, name, taxid else: - return nonSubmittable + return non_submittable -def extract_Bacteria_info(name, rank): +def extract_bacteria_info(name, rank): if rank == "species": name = name elif rank == "superkingdom": @@ -274,7 +288,7 @@ def extract_Bacteria_info(name, rank): elif rank == "genus": name = "uncultured {} sp.".format(name) - submittable, taxid, rank = ena.query_scientific_name(name, searchRank=True) + submittable, taxid, rank = ena.query_scientific_name(name, search_rank=True) if not submittable: if rank in ["species", "genus"] and name.lower().endswith("bacteria"): name = "uncultured {}".format(name.lower().replace("bacteria", "bacterium")) @@ -285,7 +299,7 @@ def extract_Bacteria_info(name, rank): return submittable, name, taxid -def extract_Archaea_info(name, rank): +def extract_archaea_info(name, rank): if rank == "species": name = name elif rank == "superkingdom": @@ -302,7 +316,7 @@ def extract_Archaea_info(name, rank): elif rank == "genus": name = "uncultured {} sp.".format(name) - submittable, taxid, rank = ena.query_scientific_name(name, searchRank=True) + submittable, taxid, rank = ena.query_scientific_name(name, search_rank=True) if not submittable: if "Candidatus" in name: if rank == "phylum": @@ -313,149 +327,149 @@ def extract_Archaea_info(name, rank): return submittable, name, taxid -def extract_genomes_info(inputFile, genomeType, live): - genomeInfo = read_and_cleanse_metadata_tsv(inputFile, genomeType, live) - for gen in genomeInfo: - genomeInfo[gen]["accessions"] = genomeInfo[gen]["accessions"].split(',') - accessionType = "run" - assembly_regExp = re.compile(r"([E|S|D]RZ\d{6,})") - if assembly_regExp.findall(genomeInfo[gen]["accessions"][0]): - accessionType = "assembly" - genomeInfo[gen]["accessionType"] = accessionType - - genomeInfo[gen]["isolationSource"] = genomeInfo[gen]["metagenome"] +def extract_genomes_info(input_file, genome_type, live): + genome_info = validate_metadata_tsv(input_file, genome_type, live) + for gen in genome_info: + genome_info[gen]["accessions"] = genome_info[gen]["accessions"].split(',') + accession_type = "run" + assembly_reg_exp = re.compile(r"([E|S|D]RZ\d{6,})") + if assembly_reg_exp.findall(genome_info[gen]["accessions"][0]): + accession_type = "assembly" + genome_info[gen]["accessionType"] = accession_type + + genome_info[gen]["isolationSource"] = genome_info[gen]["metagenome"] try: - (genomeInfo[gen]["MAG_quality"], - genomeInfo[gen]["completeness"], - genomeInfo[gen]["contamination"]) = compute_MAG_quality( - str(round_stats(genomeInfo[gen]["completeness"])), - str(round_stats(genomeInfo[gen]["contamination"])), - genomeInfo[gen]["rRNA_presence"]) + (genome_info[gen]["MAG_quality"], + genome_info[gen]["completeness"], + genome_info[gen]["contamination"]) = compute_mag_quality( + str(round_stats(genome_info[gen]["completeness"])), + str(round_stats(genome_info[gen]["contamination"])), + genome_info[gen]["rRNA_presence"]) except IndexError: pass - if str(genomeInfo[gen]["co-assembly"]).lower() in ["yes", "y", "true"]: - genomeInfo[gen]["co-assembly"] = True + if str(genome_info[gen]["co-assembly"]).lower() in ["yes", "y", "true"]: + genome_info[gen]["co-assembly"] = True else: - genomeInfo[gen]["co-assembly"] = False + genome_info[gen]["co-assembly"] = False - genomeInfo[gen]["alias"] = gen + genome_info[gen]["alias"] = gen - taxID, scientificName = extract_tax_info(genomeInfo[gen]["NCBI_lineage"]) - genomeInfo[gen]["taxID"] = taxID - genomeInfo[gen]["scientific_name"] = scientificName + tax_id, scientific_name = extract_tax_info(genome_info[gen]["NCBI_lineage"]) + genome_info[gen]["taxID"] = tax_id + genome_info[gen]["scientific_name"] = scientific_name - return genomeInfo + return genome_info -def multipleElementSet(metadataList): - return len(set(metadataList))>1 +def multiple_element_set(metadata_list): + return len(set(metadata_list))>1 -def combine_ENA_info(genomeInfo, ENADict): - for g in genomeInfo: +def combine_ENA_info(genome_info, ena_dict): + for g in genome_info: # TODO: optimise all the part below - if genomeInfo[g]["co-assembly"]: - instrumentList, collectionList, countryList = [], [], [] - studyList, descriptionList, samplesList = [], [], [] - longList, latitList = [], [] - for run in genomeInfo[g]["accessions"]: - instrumentList.append(ENADict[run]["instrumentModel"]) - collectionList.append(ENADict[run]["collectionDate"]) - countryList.append(ENADict[run]["country"]) - studyList.append(ENADict[run]["study"]) - descriptionList.append(ENADict[run]["projectDescription"]) - samplesList.append(ENADict[run]["sampleAccession"]) - longList.append(ENADict[run]["longitude"]) - latitList.append(ENADict[run]["latitude"]) - - genomeInfo[g]["study"] = studyList[0] - genomeInfo[g]["description"] = descriptionList[0] + if genome_info[g]["co-assembly"]: + instrument_list, collection_list, country_list = [], [], [] + study_list, description_list, samples_list = [], [], [] + long_list, latit_list = [], [] + for run in genome_info[g]["accessions"]: + instrument_list.append(ena_dict[run]["instrumentModel"]) + collection_list.append(ena_dict[run]["collectionDate"]) + country_list.append(ena_dict[run]["country"]) + study_list.append(ena_dict[run]["study"]) + description_list.append(ena_dict[run]["projectDescription"]) + samples_list.append(ena_dict[run]["sampleAccession"]) + long_list.append(ena_dict[run]["longitude"]) + latit_list.append(ena_dict[run]["latitude"]) + + genome_info[g]["study"] = study_list[0] + genome_info[g]["description"] = description_list[0] - instrument = instrumentList[0] - if multipleElementSet(instrumentList): - instrument = ','.join(instrumentList) - genomeInfo[g]["sequencingMethod"] = instrument - - collectionDate = collectionList[0] - if multipleElementSet(collectionList): - collectionDate = "not provided" - genomeInfo[g]["collectionDate"] = collectionDate - - country = countryList[0] - if multipleElementSet(countryList): + instrument = instrument_list[0] + if multiple_element_set(instrument_list): + instrument = ','.join(instrument_list) + genome_info[g]["sequencingMethod"] = instrument + + collection_date = collection_list[0] + if multiple_element_set(collection_list): + collection_date = "not provided" + genome_info[g]["collectionDate"] = collection_date + + country = country_list[0] + if multiple_element_set(country_list): country = "not applicable" - genomeInfo[g]["country"] = country + genome_info[g]["country"] = country - latitude = latitList[0] - if multipleElementSet(latitList): + latitude = latit_list[0] + if multiple_element_set(latit_list): latitude = "not provided" - genomeInfo[g]["latitude"] = str(round(float(latitude), GEOGRAPHY_DIGIT_COORDS)) + genome_info[g]["latitude"] = str(round(float(latitude), GEOGRAPHY_DIGIT_COORDS)) - longitude = longList[0] - if multipleElementSet(longList): + longitude = long_list[0] + if multiple_element_set(long_list): longitude = "not provided" - genomeInfo[g]["longitude"] = str(round(float(longitude), GEOGRAPHY_DIGIT_COORDS)) + genome_info[g]["longitude"] = str(round(float(longitude), GEOGRAPHY_DIGIT_COORDS)) - samples = samplesList[0] - if multipleElementSet(samplesList): - samples = ','.join(samplesList) - genomeInfo[g]["sample_accessions"] = samples + samples = samples_list[0] + if multiple_element_set(samples_list): + samples = ','.join(samples_list) + genome_info[g]["sample_accessions"] = samples else: - run = genomeInfo[g]["accessions"][0] - genomeInfo[g]["sequencingMethod"] = ENADict[run]["instrumentModel"] - genomeInfo[g]["collectionDate"] = ENADict[run]["collectionDate"] - genomeInfo[g]["study"] = ENADict[run]["study"] - genomeInfo[g]["description"] = ENADict[run]["projectDescription"] - genomeInfo[g]["sample_accessions"] = ENADict[run]["sampleAccession"] - genomeInfo[g]["country"] = ENADict[run]["country"] - genomeInfo[g]["longitude"] = ENADict[run]["longitude"] - genomeInfo[g]["latitude"] = ENADict[run]["latitude"] + run = genome_info[g]["accessions"][0] + genome_info[g]["sequencingMethod"] = ena_dict[run]["instrumentModel"] + genome_info[g]["collectionDate"] = ena_dict[run]["collectionDate"] + genome_info[g]["study"] = ena_dict[run]["study"] + genome_info[g]["description"] = ena_dict[run]["projectDescription"] + genome_info[g]["sample_accessions"] = ena_dict[run]["sampleAccession"] + genome_info[g]["country"] = ena_dict[run]["country"] + genome_info[g]["longitude"] = ena_dict[run]["longitude"] + genome_info[g]["latitude"] = ena_dict[run]["latitude"] - genomeInfo[g]["accessions"] = ','.join(genomeInfo[g]["accessions"]) + genome_info[g]["accessions"] = ','.join(genome_info[g]["accessions"]) -def getAccessions(accessionsFile): - accessionDict = {} - with open(accessionsFile, 'r') as f: +def get_accessions(accessions_file): + accession_dict = {} + with open(accessions_file, 'r') as f: for line in f: line = line.split('\t') alias = line[0] accession = line[1].rstrip('\n') - accessionDict[alias] = accession + accession_dict[alias] = accession - return accessionDict + return accession_dict -def saveAccessions(aliasAccessionDict, accessionsFile, writeMode): - with open(accessionsFile, writeMode) as f: - for elem in aliasAccessionDict: - f.write("{}\t{}\n".format(elem, aliasAccessionDict[elem])) +def save_accessions(alias_accession_dict, accessions_file, write_mode): + with open(accessions_file, write_mode) as f: + for elem in alias_accession_dict: + f.write("{}\t{}\n".format(elem, alias_accession_dict[elem])) -def create_manifest_dictionary(run, alias, assemblySoftware, sequencingMethod, - MAGpath, gen, study, coverage, isCoassembly): - manifestDict = { +def create_manifest_dictionary(run, alias, assembly_software, sequencing_method, + mag_path, gen, study, coverage, is_coassembly): + manifest_dict = { "accessions" : run, "alias" : alias, - "assembler" : assemblySoftware, - "sequencingMethod" : sequencingMethod, - "genome_path" : MAGpath, + "assembler" : assembly_software, + "sequencingMethod" : sequencing_method, + "genome_path" : mag_path, "genome_name" : gen, "study" : study, "coverageDepth" : coverage, - "co-assembly" : isCoassembly + "co-assembly" : is_coassembly } - return manifestDict + return manifest_dict def compute_manifests(genomes): - manifestInfo = {} + manifest_info = {} for g in genomes: - manifestInfo[g] = create_manifest_dictionary(genomes[g]["accessions"], + manifest_info[g] = create_manifest_dictionary(genomes[g]["accessions"], genomes[g]["alias"], genomes[g]["assembly_software"], genomes[g]["sequencingMethod"], genomes[g]["genome_path"], g, genomes[g]["study"], genomes[g]["genome_coverage"], genomes[g]["co-assembly"]) - return manifestInfo + return manifest_info def get_study_from_xml(sample): description = sample.childNodes[5].childNodes[0].data @@ -463,7 +477,7 @@ def get_study_from_xml(sample): return study -def recover_info_from_xml(genomeDict, sample_xml, live_mode): +def recover_info_from_xml(genome_dict, sample_xml, live_mode): logger.info("Retrieving data for genome submission...") # extract list of genomes (samples) to be registered @@ -474,42 +488,42 @@ def recover_info_from_xml(genomeDict, sample_xml, live_mode): study = get_study_from_xml(s) # extract alias from xml and find a match with genomes the user is uploading - XMLalias = s.attributes["alias"].value + xml_alias = s.attributes["alias"].value if not live_mode: # remove time stamp if test mode is selected - aliasSplit = XMLalias.split("_") - XMLalias = '_'.join(aliasSplit[:-1]) - for gen in genomeDict: + alias_split = xml_alias.split("_") + xml_alias = '_'.join(alias_split[:-1]) + for gen in genome_dict: # if match is found, associate attributes listed in the xml file # with genomes to upload - if XMLalias == gen: + if xml_alias == gen: if not live_mode: - currentTimestamp = str(int(dt.timestamp(dt.now()))) - XMLalias = gen + '_' + currentTimestamp - s.attributes["alias"].value = XMLalias - sampleTitle = s.getElementsByTagName("TITLE")[0] - sampleTitleValue = sampleTitle.firstChild.nodeValue.split("_") - sampleTitleValue[-1] = currentTimestamp - newSampleTitle = '_'.join(sampleTitleValue) - s.getElementsByTagName("TITLE")[0].firstChild.replaceWholeText(newSampleTitle) + current_time_stamp = str(int(dt.timestamp(dt.now()))) + xml_alias = gen + '_' + current_time_stamp + s.attributes["alias"].value = xml_alias + sample_title = s.getElementsByTagName("TITLE")[0] + sample_title_value = sample_title.firstChild.nodeValue.split("_") + sample_title_value[-1] = current_time_stamp + new_sample_title = '_'.join(sample_title_value) + s.getElementsByTagName("TITLE")[0].firstChild.replaceWholeText(new_sample_title) attributes = s.childNodes[7].getElementsByTagName("SAMPLE_ATTRIBUTE") - seqMethod, assSoftware = "", "" + seq_method, ass_software = "", "" for a in attributes: - tagElem = a.getElementsByTagName("TAG") - tag = tagElem[0].childNodes[0].nodeValue + tag_elem = a.getElementsByTagName("TAG") + tag = tag_elem[0].childNodes[0].nodeValue if tag == "sequencing method": - seqMethodElem = a.getElementsByTagName("VALUE") - seqMethod = seqMethodElem[0].childNodes[0].nodeValue + seq_method_elem = a.getElementsByTagName("VALUE") + seq_method = seq_method_elem[0].childNodes[0].nodeValue elif tag == "assembly software": - assSoftwareElem = a.getElementsByTagName("VALUE") - assSoftware = assSoftwareElem[0].childNodes[0].nodeValue - if not seqMethod == "" and not assSoftware == "": + ass_software_elem = a.getElementsByTagName("VALUE") + ass_software = ass_software_elem[0].childNodes[0].nodeValue + if not seq_method == "" and not ass_software == "": break - genomeDict[gen]["accessions"] = ','.join(genomeDict[gen]["accessions"]) - genomeDict[gen]["alias"] = XMLalias - genomeDict[gen]["assembly_software"] = assSoftware - genomeDict[gen]["sequencingMethod"] = seqMethod - genomeDict[gen]["study"] = study + genome_dict[gen]["accessions"] = ','.join(genome_dict[gen]["accessions"]) + genome_dict[gen]["alias"] = xml_alias + genome_dict[gen]["assembly_software"] = ass_software + genome_dict[gen]["sequencingMethod"] = seq_method + genome_dict[gen]["study"] = study break if not live_mode: @@ -537,7 +551,7 @@ def create_sample_attribute(sample_attributes, data_list, mag_data=None): if units: ET.SubElement(new_sample_attr, 'UNITS').text = units -def write_genomes_xml(genomes, xml_path, genomeType, centreName, tpa): +def write_genomes_xml(genomes, xml_path, genome_type, centre_name, tpa): map_sample_attributes = [ # tag - value - unit (optional) ["project name", "description"], @@ -561,10 +575,10 @@ def write_genomes_xml(genomes, xml_path, genomeType, centreName, tpa): ["metagenomic source", "metagenome"], ] - checklist, assemblyType = "ERC000047", "Metagenome-assembled genome" - if genomeType == "bins": + checklist, assembly_type = "ERC000047", "Metagenome-assembled genome" + if genome_type == "bins": checklist = "ERC000050" - assemblyType = "binned metagenome" + assembly_type = "binned metagenome" constant_sample_attributes = [ # tag - value @@ -573,9 +587,9 @@ def write_genomes_xml(genomes, xml_path, genomeType, centreName, tpa): ["ENA-CHECKLIST", checklist], ] - tpaDescription = "" + tpa_description = "" if tpa: - tpaDescription = "Third Party Annotation (TPA) " + tpa_description = "Third Party Annotation (TPA) " sample_set = ET.Element("SAMPLE_SET") @@ -584,15 +598,15 @@ def write_genomes_xml(genomes, xml_path, genomeType, centreName, tpa): if genomes[g]["co-assembly"]: plural = 's' description = ("This sample represents a {}{} assembled from the " - "metagenomic run{} {} of study {}.".format(tpaDescription, - assemblyType, plural, genomes[g]["accessions"], + "metagenomic run{} {} of study {}.".format(tpa_description, + assembly_type, plural, genomes[g]["accessions"], genomes[g]["study"])) sample = ET.SubElement(sample_set, "SAMPLE") sample.set("alias", genomes[g]["alias"]) - sample.set("center_name", centreName) + sample.set("center_name", centre_name) - ET.SubElement(sample, 'TITLE').text = ("{}: {}".format(assemblyType, + ET.SubElement(sample, 'TITLE').text = ("{}: {}".format(assembly_type, genomes[g]["alias"])) sample_name = ET.SubElement(sample, "SAMPLE_NAME") ET.SubElement(sample_name, "TAXON_ID").text = genomes[g]["taxID"] @@ -641,35 +655,35 @@ def write_submission_xml(upload_dir, centre_name, study=True): return sub_xml -def generate_genome_manifest(genomeInfo, study, manifestsRoot, aliasToSample, genomeType, tpa): - manifest_path = os.path.join(manifestsRoot, f'{genomeInfo["genome_name"]}.manifest') +def generate_genome_manifest(genome_info, study, manifests_root, alias_to_sample, genome_type, tpa): + manifest_path = os.path.join(manifests_root, f'{genome_info["genome_name"]}.manifest') - tpaAddition, multipleRuns = "", "" + tpa_addition, multiple_runs = "", "" if tpa: - tpaAddition = "Third Party Annotation (TPA) " - if genomeInfo["co-assembly"]: - multipleRuns = "s" - assemblyType = "Metagenome-Assembled Genome (MAG)" - if genomeType == "bins": - assemblyType = "binned metagenome" + tpa_addition = "Third Party Annotation (TPA) " + if genome_info["co-assembly"]: + multiple_runs = "s" + assembly_type = "Metagenome-Assembled Genome (MAG)" + if genome_type == "bins": + assembly_type = "binned metagenome" values = ( ('STUDY', study), - ('SAMPLE', aliasToSample[genomeInfo["alias"]]), - ('ASSEMBLYNAME', genomeInfo["alias"]), - ('ASSEMBLY_TYPE', assemblyType), - ('COVERAGE', genomeInfo["coverageDepth"]), - ('PROGRAM', genomeInfo["assembler"]), - ('PLATFORM', genomeInfo["sequencingMethod"]), + ('SAMPLE', alias_to_sample[genome_info["alias"]]), + ('ASSEMBLYNAME', genome_info["alias"]), + ('ASSEMBLY_TYPE', assembly_type), + ('COVERAGE', genome_info["coverageDepth"]), + ('PROGRAM', genome_info["assembler"]), + ('PLATFORM', genome_info["sequencingMethod"]), ('MOLECULETYPE', "genomic DNA"), ('DESCRIPTION', ("This is a {}bin derived from the primary whole genome " "shotgun (WGS) data set {}. This sample represents a {} from the " - "metagenomic run{} {}.".format(tpaAddition, genomeInfo["study"], - assemblyType, multipleRuns, genomeInfo["accessions"]))), - ('RUN_REF', genomeInfo["accessions"]), - ('FASTA', os.path.abspath(genomeInfo["genome_path"])) + "metagenomic run{} {}.".format(tpa_addition, genome_info["study"], + assembly_type, multiple_runs, genome_info["accessions"]))), + ('RUN_REF', genome_info["accessions"]), + ('FASTA', os.path.abspath(genome_info["genome_path"])) ) - logger.info("Writing manifest file (.manifest) for {}.".format(genomeInfo["alias"])) + logger.info("Writing manifest file (.manifest) for {}.".format(genome_info["alias"])) with open(manifest_path, "w") as outfile: for (k, v) in values: manifest = f'{k}\t{v}\n' @@ -684,63 +698,62 @@ def main(): logger.warning("Warning: genome submission is not in live mode, " + "files will be validated, but not uploaded.") - xmlGenomeFile, xmlSubFile = "genome_samples.xml", "submission.xml" - samples_xml = os.path.join(ENA_uploader.upload_dir, xmlGenomeFile) - submissionXmlPath = os.path.join(ENA_uploader.upload_dir, xmlSubFile) - submission_xml = submissionXmlPath - genomes, manifestInfo = {}, {} + xml_genome_file, xml_sub_file = "genome_samples.xml", "submission.xml" + samples_xml = os.path.join(ENA_uploader.upload_dir, xml_genome_file) + submission_xml_path = os.path.join(ENA_uploader.upload_dir, xml_sub_file) + submission_xml = submission_xml_path + genomes, manifest_info = {}, {} # submission xml existence - if not os.path.exists(submissionXmlPath): + if not os.path.exists(submission_xml_path): submission_xml = write_submission_xml(ENA_uploader.upload_dir, ENA_uploader.centre_name, False) # sample xml generation or recovery genomes = ENA_uploader.create_genome_dictionary(samples_xml) # manifests creation - manifestDir = os.path.join(ENA_uploader.upload_dir, "manifests") - os.makedirs(manifestDir, exist_ok=True) + manifest_dir = os.path.join(ENA_uploader.upload_dir, "manifests") + os.makedirs(manifest_dir, exist_ok=True) accessionsgen = "registered_MAGs.tsv" - if ENA_uploader.genomeType == "bins": + if ENA_uploader.genome_type == "bins": accessionsgen = accessionsgen.replace("MAG", "bin") if not ENA_uploader.live: accessionsgen = accessionsgen.replace(".tsv", "_test.tsv") - accessionsFile = os.path.join(ENA_uploader.upload_dir, accessionsgen) + accessions_file = os.path.join(ENA_uploader.upload_dir, accessionsgen) save = False - writeMode = 'a' - if os.path.exists(accessionsFile): + write_mode = 'a' + if os.path.exists(accessions_file): if not ENA_uploader.live: save = True if ENA_uploader.force: - writeMode = 'w' + write_mode = 'w' if not save: logger.info("Genome samples already registered, reading ERS accessions...") - aliasToNewSampleAccession = getAccessions(accessionsFile) + alias_to_new_sample_accession = get_accessions(accessions_file) else: save = True if save: logger.info("Registering genome samples XMLs...") ena_submit = EnaSubmit(samples_xml, submission_xml, ENA_uploader.live) - aliasToNewSampleAccession = ena_submit.handle_genomes_registration() - saveAccessions(aliasToNewSampleAccession, accessionsFile, writeMode) + alias_to_new_sample_accession = ena_submit.handle_genomes_registration() + save_accessions(alias_to_new_sample_accession, accessions_file, write_mode) logger.info("Generating manifest files...") - manifestInfo = compute_manifests(genomes) + manifest_info = compute_manifests(genomes) - for m in manifestInfo: - generate_genome_manifest(manifestInfo[m], ENA_uploader.upStudy, - manifestDir, aliasToNewSampleAccession, ENA_uploader.genomeType, ENA_uploader.tpa) + for m in manifest_info: + generate_genome_manifest(manifest_info[m], ENA_uploader.up_study, + manifest_dir, alias_to_new_sample_accession, ENA_uploader.genome_type, ENA_uploader.tpa) class GenomeUpload: def __init__(self, argv=sys.argv[1:]): self.args = self.parse_args(argv) - self.upStudy = self.args.upload_study - self.genomeMetadata = self.args.genome_info - self.genomeType = "bins" if self.args.bins else "MAGs" + self.up_study = self.args.upload_study + self.genome_type = "bins" if self.args.bins else "MAGs" self.live = True if self.args.live else False self.private = self.args.private @@ -748,101 +761,78 @@ def __init__(self, argv=sys.argv[1:]): self.centre_name = self.args.centre_name self.force = True if self.args.force else False - workDir = self.args.out if self.args.out else os.getcwd() - self.upload_dir = self.generate_genomes_upload_dir(workDir, self.genomeType) + work_dir = self.args.out if self.args.out else os.getcwd() + self.upload_dir = self.generate_genomes_upload_dir(work_dir, self.genome_type) self.private = self.args.private - - def parse_args(self, argv): - parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, - description="Create xmls and manifest files for genome upload to ENA") - - parser.add_argument('-u', '--upload_study', type=str, help="Study accession for genomes upload") - parser.add_argument('--genome_info', type=str, required=True, help="Genomes metadata file") - genomeType = parser.add_mutually_exclusive_group(required=True) - genomeType.add_argument('-m', '--mags', action='store_true', help="Select for MAG upload") - genomeType.add_argument('-b', '--bins', action='store_true', help="Select for bin upload") - - parser.add_argument('--out', type=str, help="Output folder. Default: working directory") - parser.add_argument('--force', action='store_true', help="Forces reset of sample xml's backups") - parser.add_argument('--live', action='store_true', help="Uploads on ENA. Omitting this " + - "option allows to validate samples beforehand") - parser.add_argument('--tpa', action='store_true', help="Select if uploading TPA-generated genomes") - - # Users can provide their credentials and centre name manually or using a config file - parser.add_argument('--centre_name', required=False, help="Name of the centre uploading genomes") - parser.add_argument('--private', required=False, help="if data is private", action='store_true', default=False) - - args = parser.parse_args(argv) - - if not args.upload_study: + if not self.args.upload_study: raise ValueError("No project selected for genome upload [-u, --upload_study].") - - if not os.path.exists(args.genome_info): - raise FileNotFoundError('Genome metadata file "{}" does not exist'.format(args.genome_info)) + self.upload_study = self.args.upload_study - return args + if not os.path.exists(self.args.genome_info): + raise FileNotFoundError('Genome metadata file "{}" does not exist'.format(self.args.genome_info)) + self.genome_metadata = self.args.genome_info - def extract_ENA_info(self, genomeInfo, uploadDir, private=False): + def extract_ena_info(self, genome_info, upload_dir): logger.info('Retrieving project and run info from ENA (this might take a while)...') # retrieving metadata from runs (and runs from assembly accessions if provided) - allRuns = [] - for g in genomeInfo: - if genomeInfo[g]["accessionType"] == "assembly": - derivedRuns = [] - for acc in genomeInfo[g]["accessions"]: + all_runs = [] + for g in genome_info: + if genome_info[g]["accessionType"] == "assembly": + derived_runs = [] + for acc in genome_info[g]["accessions"]: ena_query = EnaQuery(acc, "run_assembly", self.private) - derivedRuns.append(ena_query.build_query()) - genomeInfo[g]["accessions"] = derivedRuns - allRuns.extend(genomeInfo[g]["accessions"]) + derived_runs.append(ena_query.build_query()) + genome_info[g]["accessions"] = derived_runs + all_runs.extend(genome_info[g]["accessions"]) - runsSet, studySet, samplesDict, tempDict = set(allRuns), set(), {}, {} - for r in runsSet: + runs_set, study_set, samples_dict, temp_dict = set(all_runs), set(), {}, {} + for r in runs_set: ena_query = EnaQuery(r, "run", self.private) run_info = ena_query.build_query() - studySet.add(run_info["secondary_study_accession"]) - samplesDict[r] = run_info["sample_accession"] + study_set.add(run_info["secondary_study_accession"]) + samples_dict[r] = run_info["sample_accession"] - if not studySet: + if not study_set: raise ValueError("No study corresponding to runs found.") - backupFile = os.path.join(uploadDir, "ENA_backup.json") + backup_file = os.path.join(upload_dir, "ENA_backup.json") counter = 0 - if not os.path.exists(backupFile): - with open(backupFile, 'w') as file: + if not os.path.exists(backup_file): + with open(backup_file, 'w') as file: pass - with open(backupFile, "r+") as file: + with open(backup_file, "r+") as file: try: - backupDict = json.load(file) - tempDict = dict(backupDict) - logger.info(f"A backup file {backupFile} for ENA sample metadata has been found.") + backup_dict = json.load(file) + temp_dict = dict(backup_dict) + logger.info(f"A backup file {backup_file} for ENA sample metadata has been found.") except json.decoder.JSONDecodeError: - backupDict = {} - for s in studySet: + backup_dict = {} + for s in study_set: ena_query = EnaQuery(s, "study", self.private) - studyInfo = ena_query.build_query() - projectDescription = studyInfo["study_description"] + study_info = ena_query.build_query() + project_description = study_info["study_description"] ena_query = EnaQuery(s, "study_runs", self.private) - ENA_info = ena_query.build_query() - if ENA_info == []: + ena_info = ena_query.build_query() + if ena_info == []: raise IOError("No runs found on ENA for project {}.".format(s)) - for run, item in enumerate(ENA_info): - runAccession = ENA_info[run]["run_accession"] - if runAccession not in backupDict: - if runAccession in runsSet: - sampleAccession = ENA_info[run]["sample_accession"] - ena_query = EnaQuery(sampleAccession, "sample", self.private) - sampleInfo = ena_query.build_query() - - if 'latitude' in sampleInfo and 'longitude' in sampleInfo: - latitude = sampleInfo['latitude'] - longitude = sampleInfo['longitude'] + for run, item in enumerate(ena_info): + run_accession = ena_info[run]["run_accession"] + if run_accession not in backup_dict: + if run_accession in runs_set: + sample_accession = ena_info[run]["sample_accession"] + ena_query = EnaQuery(sample_accession, "sample", self.private) + sample_info = ena_query.build_query() + + if 'latitude' in sample_info and 'longitude' in sample_info: + latitude = sample_info['latitude'] + longitude = sample_info['longitude'] else: - location = sampleInfo["location"] + location = sample_info["location"] latitude, longitude = None, None if 'N' in location: latitude = location.split('N')[0].strip() @@ -866,58 +856,58 @@ def extract_ENA_info(self, genomeInfo, uploadDir, private=False): else: longitude = "not provided" - country = sampleInfo["country"].split(':')[0] + country = sample_info["country"].split(':')[0] if not country in GEOGRAPHIC_LOCATIONS: country = "not provided" - collectionDate = sampleInfo["collection_date"] + collectionDate = sample_info["collection_date"] if collectionDate == "" or collectionDate == "missing": collectionDate = "not provided" - tempDict[runAccession] = { - "instrumentModel" : ENA_info[run]["instrument_model"], + temp_dict[run_accession] = { + "instrumentModel" : ena_info[run]["instrument_model"], "collectionDate" : collectionDate, "country" : country, "latitude" : latitude, "longitude" : longitude, - "projectDescription" : projectDescription, + "projectDescription" : project_description, "study" : s, - "sampleAccession" : samplesDict[runAccession] + "sampleAccession" : samples_dict[run_accession] } counter += 1 - if (counter%10 == 0) or (len(runsSet) - len(backupDict) == counter): + if (counter%10 == 0) or (len(runs_set) - len(backup_dict) == counter): file.seek(0) - file.write(json.dumps(tempDict)) + file.write(json.dumps(temp_dict)) file.truncate() - tempDict = {**tempDict, **backupDict} - combine_ENA_info(genomeInfo, tempDict) + temp_dict = {**temp_dict, **backup_dict} + combine_ENA_info(genome_info, temp_dict) - def generate_genomes_upload_dir(self, dir, genomeType): - uploadName = "MAG_upload" - if genomeType == "bins": - uploadName = uploadName.replace("MAG", "bin") - upload_dir = os.path.join(dir, uploadName) + def generate_genomes_upload_dir(self, dir, genome_type): + upload_name = "MAG_upload" + if genome_type == "bins": + upload_name = upload_name.replace("MAG", "bin") + upload_dir = os.path.join(dir, upload_name) os.makedirs(upload_dir, exist_ok=True) return upload_dir def create_genome_dictionary(self, samples_xml): logger.info('Retrieving data for MAG submission...') - genomeInfo = extract_genomes_info(self.genomeMetadata, self.genomeType, self.live) + genome_info = extract_genomes_info(self.genome_metadata, self.genome_type, self.live) if not os.path.exists(samples_xml) or self.force: - extract_ENA_info(genomeInfo, self.upload_dir, self.username, self.password, self.private) + self.extract_ena_info(genome_info, self.upload_dir) logger.info("Writing genome registration XML...") - write_genomes_xml(genomeInfo, samples_xml, self.genomeType, + write_genomes_xml(genome_info, samples_xml, self.genome_type, self.centre_name, self.tpa) logger.info("All files have been written to " + self.upload_dir) else: - recover_info_from_xml(genomeInfo, samples_xml, self.live) + recover_info_from_xml(genome_info, samples_xml, self.live) - return genomeInfo + return genome_info if __name__ == "__main__": main() From d85b0c6aa9b360174bdcf08552b26f3f516141c1 Mon Sep 17 00:00:00 2001 From: Varsha Date: Thu, 19 Dec 2024 12:02:46 +0000 Subject: [PATCH 10/20] modify class and argument parsing --- genomeuploader/genome_upload.py | 470 +++++++++++++++++--------------- 1 file changed, 251 insertions(+), 219 deletions(-) diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 9136118..d34435c 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -37,24 +37,24 @@ def parse_args(argv): parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, - description="Create xmls and manifest files for genome upload to ENA") + description="Create xmls and manifest files for genome upload and upload to ENA") - parser.add_argument('-u', '--upload_study', type=str, help="Study accession for genomes upload") + parser.add_argument('-u', '--upload_study', type=str, required=True, help="Study accession for genomes upload") parser.add_argument('--genome_info', type=str, required=True, help="Genomes metadata file") - genomeType = parser.add_mutually_exclusive_group(required=True) + genomeType = parser.add_mutually_exclusive_group(required=True, default=False) genomeType.add_argument('-m', '--mags', action='store_true', help="Select for MAG upload") genomeType.add_argument('-b', '--bins', action='store_true', help="Select for bin upload") parser.add_argument('--out', type=str, help="Output folder. Default: working directory") - parser.add_argument('--force', action='store_true', help="Forces reset of sample xml's backups") - parser.add_argument('--live', action='store_true', help="Uploads on ENA. Omitting this " + + parser.add_argument('--force', action='store_true', required=False, default=False, help="Forces reset of sample xml's backups") + parser.add_argument('--live', action='store_true', required=False, default=False, help="Uploads on ENA. Omitting this " + "option allows to validate samples beforehand") - parser.add_argument('--tpa', action='store_true', help="Select if uploading TPA-generated genomes") + parser.add_argument('--tpa', action='store_true', required=False, default=False, help="Select if uploading TPA-generated genomes") - # Users can provide their credentials and centre name manually or using a config file - parser.add_argument('--centre_name', required=False, help="Name of the centre uploading genomes") - parser.add_argument('--private', required=False, help="if data is private", action='store_true', default=False) + # Users can provide their credentials in environment variables or using a config file + parser.add_argument('--centre_name', required=True, help="Name of the centre uploading genomes") + parser.add_argument('--private', required=False, action='store_true', default=False, help="if data is private",) return parser.parse_args(argv) @@ -79,99 +79,6 @@ def parse_args(argv): genome_path: path to genome to upload ''' -def validate_metadata_tsv(input_file, genome_type, live): - logger.info('Retrieving info for genomes to submit...') - - all_fields = MAG_MANDATORY_FIELDS + BIN_MANDATORY_FIELDS - metadata = pd.read_csv(input_file, sep='\t', usecols=all_fields) - - # make sure there are no empty cells - clean_columns = list(metadata.dropna(axis=1)) - if genome_type == "MAGs": - missing_values = [item for item in all_fields if item not in clean_columns] - else: - missing_values = [item for item in BIN_MANDATORY_FIELDS if item not in clean_columns] - - if missing_values: - raise ValueError("The following mandatory fields have missing values in " + - "the input file: {}".format(", ".join(missing_values))) - - # check amount of genomes to register at the same time - if len(metadata) >= 5000: - raise ValueError("Genomes need to be registered in batches of 5000 genomes or smaller.") - - # check whether accessions follow the right format - accessions_reg_exp = re.compile(r"([E|S|D]R[R|Z]\d{6,})") - - accession_comparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", - "correct", "mismatching", "co-assembly"]) - accession_comparison["genome_name"] = metadata["genome_name"] - - accession_comparison["attemptive_accessions"] = metadata["accessions"].map( - lambda a: len(a.split(','))) - - accession_comparison["correct"] = metadata["accessions"].map( - lambda a: len(accessions_reg_exp.findall(a))) - - accession_comparison["mismatching"] = accession_comparison.apply(lambda row: - True if row["attemptive_accessions"] == row["correct"] - else None, axis=1).isna() - - mismatching_accessions = accession_comparison[accession_comparison["mismatching"]]["genome_name"] - if not mismatching_accessions.empty: - raise ValueError("Run accessions are not correctly formatted for the following " + - "genomes: " + ','.join(mismatching_accessions.values)) - - # check whether completeness and contamination are floats - try: - pd.to_numeric(metadata["completeness"]) - pd.to_numeric(metadata["contamination"]) - pd.to_numeric(metadata["genome_coverage"]) - except: - raise ValueError("Completeness, contamination or coverage values should be formatted as floats") - - # check whether all co-assemblies have more than one run associated and viceversa - accession_comparison["co-assembly"] = metadata["co-assembly"] - coassembly_discrepancy = metadata[( - (accession_comparison["correct"] < 2) & (accession_comparison["co-assembly"])) | - ((accession_comparison["correct"] > 1) & (~accession_comparison["co-assembly"]) - )]["genome_name"] - if not coassembly_discrepancy.empty: - raise ValueError("The following genomes show discrepancy between number of runs " - "involved and co-assembly status: " + ','.join(coassembly_discrepancy.values)) - - # are provided metagenomes part of the accepted metagenome list? - if False in metadata.apply(lambda row: - True if row["metagenome"] in METAGENOMES - else False, axis=1).unique(): - raise ValueError("Metagenomes associated with each genome need to belong to ENA's " + - "approved metagenomes list.") - - # do provided file paths exist? - if False in metadata.apply(lambda row: - True if os.path.exists(row["genome_path"]) - else False, axis =1).unique(): - raise FileNotFoundError("Some genome paths do not exist.") - - # check genome name lengths - #if not (metadata["genome_name"].map(lambda a: len(a) < 20).all()): - # raise ValueError("Genome names must be shorter than 20 characters.") - - # create dictionary while checking genome name uniqueness - uniqueness = metadata["genome_name"].nunique() == metadata["genome_name"].size - if uniqueness: - if not live: - timestamp = str(int(dt.timestamp(dt.now()))) - timestamp_names = [row["genome_name"] + '_' + timestamp for index, row in metadata.iterrows()] - metadata["unique_genome_name"] = timestamp_names - genome_info = metadata.set_index("unique_genome_name").transpose().to_dict() - else: - genome_info = metadata.set_index("genome_name").transpose().to_dict() - else: - raise ValueError("Duplicate names found in genome names") - - return genome_info - def round_stats(stats): new_stat = round(float(stats), 2) if new_stat == 100.0: @@ -327,45 +234,10 @@ def extract_archaea_info(name, rank): return submittable, name, taxid -def extract_genomes_info(input_file, genome_type, live): - genome_info = validate_metadata_tsv(input_file, genome_type, live) - for gen in genome_info: - genome_info[gen]["accessions"] = genome_info[gen]["accessions"].split(',') - accession_type = "run" - assembly_reg_exp = re.compile(r"([E|S|D]RZ\d{6,})") - if assembly_reg_exp.findall(genome_info[gen]["accessions"][0]): - accession_type = "assembly" - genome_info[gen]["accessionType"] = accession_type - - genome_info[gen]["isolationSource"] = genome_info[gen]["metagenome"] - - try: - (genome_info[gen]["MAG_quality"], - genome_info[gen]["completeness"], - genome_info[gen]["contamination"]) = compute_mag_quality( - str(round_stats(genome_info[gen]["completeness"])), - str(round_stats(genome_info[gen]["contamination"])), - genome_info[gen]["rRNA_presence"]) - except IndexError: - pass - - if str(genome_info[gen]["co-assembly"]).lower() in ["yes", "y", "true"]: - genome_info[gen]["co-assembly"] = True - else: - genome_info[gen]["co-assembly"] = False - - genome_info[gen]["alias"] = gen - - tax_id, scientific_name = extract_tax_info(genome_info[gen]["NCBI_lineage"]) - genome_info[gen]["taxID"] = tax_id - genome_info[gen]["scientific_name"] = scientific_name - - return genome_info - def multiple_element_set(metadata_list): return len(set(metadata_list))>1 -def combine_ENA_info(genome_info, ena_dict): +def combine_ena_info(genome_info, ena_dict): for g in genome_info: # TODO: optimise all the part below if genome_info[g]["co-assembly"]: @@ -428,7 +300,6 @@ def combine_ENA_info(genome_info, ena_dict): genome_info[g]["accessions"] = ','.join(genome_info[g]["accessions"]) - def get_accessions(accessions_file): accession_dict = {} with open(accessions_file, 'r') as f: @@ -691,90 +562,182 @@ def generate_genome_manifest(genome_info, study, manifests_root, alias_to_sample if tpa: outfile.write("TPA\ttrue\n") -def main(): - ENA_uploader = GenomeUpload() - - if not ENA_uploader.live: - logger.warning("Warning: genome submission is not in live mode, " + - "files will be validated, but not uploaded.") - - xml_genome_file, xml_sub_file = "genome_samples.xml", "submission.xml" - samples_xml = os.path.join(ENA_uploader.upload_dir, xml_genome_file) - submission_xml_path = os.path.join(ENA_uploader.upload_dir, xml_sub_file) - submission_xml = submission_xml_path - genomes, manifest_info = {}, {} - - # submission xml existence - if not os.path.exists(submission_xml_path): - submission_xml = write_submission_xml(ENA_uploader.upload_dir, ENA_uploader.centre_name, False) - - # sample xml generation or recovery - genomes = ENA_uploader.create_genome_dictionary(samples_xml) + +class GenomeUpload: + def __init__( + self, + upload_study: str, + centre_name: str, + genome_info: str, + bins: bool = False, + live: bool = False, + private: bool = False, + tpa: bool = False, + force: bool = False, + out: str = None, + ): + f""" + Submission of genomes. + + :param upload_study: Study accession for genomes upload. + :param centre_name: Name of the centre uploading genomes. + :param genome_info: Genomes metadata file. + :param bins: Performs bin upload. + :params live: Live upload to ENA. + :param private: Is this a private study? + :param tpa: Is this a third-party assembly? + :param force: Resets sample XML backups. + :param out: Output folder. + """ + + self.genome_type = "bins" if bins else "MAGs" + self.live = live + self.private = private + + self.tpa = tpa + self.centre_name = centre_name + self.force = force + + self.work_dir = out if out is not None else os.getcwd() + self.upload_dir = self.generate_genomes_upload_dir() + + if not upload_study: + raise ValueError("No project selected for genome upload [-u, --upload_study].") + self.upload_study = upload_study + + if not os.path.exists(genome_info): + raise FileNotFoundError(f"Genome metadata file {genome_info} does not exist") + self.genome_metadata = genome_info + + def validate_metadata_tsv(self): + logger.info('Retrieving info for genomes to submit...') - # manifests creation - manifest_dir = os.path.join(ENA_uploader.upload_dir, "manifests") - os.makedirs(manifest_dir, exist_ok=True) - - accessionsgen = "registered_MAGs.tsv" - if ENA_uploader.genome_type == "bins": - accessionsgen = accessionsgen.replace("MAG", "bin") - if not ENA_uploader.live: - accessionsgen = accessionsgen.replace(".tsv", "_test.tsv") - - accessions_file = os.path.join(ENA_uploader.upload_dir, accessionsgen) - save = False - write_mode = 'a' - if os.path.exists(accessions_file): - if not ENA_uploader.live: - save = True - if ENA_uploader.force: - write_mode = 'w' - if not save: - logger.info("Genome samples already registered, reading ERS accessions...") - alias_to_new_sample_accession = get_accessions(accessions_file) - else: - save = True + all_fields = MAG_MANDATORY_FIELDS + BIN_MANDATORY_FIELDS + metadata = pd.read_csv(self.genome_metadata, sep='\t', usecols=all_fields) - if save: - logger.info("Registering genome samples XMLs...") - ena_submit = EnaSubmit(samples_xml, submission_xml, ENA_uploader.live) - alias_to_new_sample_accession = ena_submit.handle_genomes_registration() - save_accessions(alias_to_new_sample_accession, accessions_file, write_mode) + # make sure there are no empty cells + clean_columns = list(metadata.dropna(axis=1)) + if self.genome_type == "MAGs": + missing_values = [item for item in all_fields if item not in clean_columns] + else: + missing_values = [item for item in BIN_MANDATORY_FIELDS if item not in clean_columns] + + if missing_values: + raise ValueError("The following mandatory fields have missing values in " + + "the input file: {}".format(", ".join(missing_values))) - logger.info("Generating manifest files...") - - manifest_info = compute_manifests(genomes) + # check amount of genomes to register at the same time + if len(metadata) >= 5000: + raise ValueError("Genomes need to be registered in batches of 5000 genomes or smaller.") - for m in manifest_info: - generate_genome_manifest(manifest_info[m], ENA_uploader.up_study, - manifest_dir, alias_to_new_sample_accession, ENA_uploader.genome_type, ENA_uploader.tpa) + # check whether accessions follow the right format + accessions_reg_exp = re.compile(r"([E|S|D]R[R|Z]\d{6,})") -class GenomeUpload: - def __init__(self, argv=sys.argv[1:]): - self.args = self.parse_args(argv) - self.up_study = self.args.upload_study - self.genome_type = "bins" if self.args.bins else "MAGs" - self.live = True if self.args.live else False - self.private = self.args.private + accession_comparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", + "correct", "mismatching", "co-assembly"]) + accession_comparison["genome_name"] = metadata["genome_name"] - self.tpa = True if self.args.tpa else False - self.centre_name = self.args.centre_name - self.force = True if self.args.force else False + accession_comparison["attemptive_accessions"] = metadata["accessions"].map( + lambda a: len(a.split(','))) - work_dir = self.args.out if self.args.out else os.getcwd() - self.upload_dir = self.generate_genomes_upload_dir(work_dir, self.genome_type) + accession_comparison["correct"] = metadata["accessions"].map( + lambda a: len(accessions_reg_exp.findall(a))) - self.private = self.args.private + accession_comparison["mismatching"] = accession_comparison.apply(lambda row: + True if row["attemptive_accessions"] == row["correct"] + else None, axis=1).isna() - if not self.args.upload_study: - raise ValueError("No project selected for genome upload [-u, --upload_study].") - self.upload_study = self.args.upload_study + mismatching_accessions = accession_comparison[accession_comparison["mismatching"]]["genome_name"] + if not mismatching_accessions.empty: + raise ValueError("Run accessions are not correctly formatted for the following " + + "genomes: " + ','.join(mismatching_accessions.values)) + + # check whether completeness and contamination are floats + try: + pd.to_numeric(metadata["completeness"]) + pd.to_numeric(metadata["contamination"]) + pd.to_numeric(metadata["genome_coverage"]) + except: + raise ValueError("Completeness, contamination or coverage values should be formatted as floats") + + # check whether all co-assemblies have more than one run associated and viceversa + accession_comparison["co-assembly"] = metadata["co-assembly"] + coassembly_discrepancy = metadata[( + (accession_comparison["correct"] < 2) & (accession_comparison["co-assembly"])) | + ((accession_comparison["correct"] > 1) & (~accession_comparison["co-assembly"]) + )]["genome_name"] + if not coassembly_discrepancy.empty: + raise ValueError("The following genomes show discrepancy between number of runs " + "involved and co-assembly status: " + ','.join(coassembly_discrepancy.values)) + + # are provided metagenomes part of the accepted metagenome list? + if False in metadata.apply(lambda row: + True if row["metagenome"] in METAGENOMES + else False, axis=1).unique(): + raise ValueError("Metagenomes associated with each genome need to belong to ENA's " + + "approved metagenomes list.") + + # do provided file paths exist? + if False in metadata.apply(lambda row: + True if os.path.exists(row["genome_path"]) + else False, axis =1).unique(): + raise FileNotFoundError("Some genome paths do not exist.") + + # check genome name lengths + #if not (metadata["genome_name"].map(lambda a: len(a) < 20).all()): + # raise ValueError("Genome names must be shorter than 20 characters.") + + # create dictionary while checking genome name uniqueness + uniqueness = metadata["genome_name"].nunique() == metadata["genome_name"].size + if uniqueness: + if not self.live: + timestamp = str(int(dt.timestamp(dt.now()))) + timestamp_names = [row["genome_name"] + '_' + timestamp for index, row in metadata.iterrows()] + metadata["unique_genome_name"] = timestamp_names + genome_info = metadata.set_index("unique_genome_name").transpose().to_dict() + else: + genome_info = metadata.set_index("genome_name").transpose().to_dict() + else: + raise ValueError("Duplicate names found in genome names") + + return genome_info + + def extract_genomes_info(self): + genome_info = self.validate_metadata_tsv() + for gen in genome_info: + genome_info[gen]["accessions"] = genome_info[gen]["accessions"].split(',') + accession_type = "run" + assembly_reg_exp = re.compile(r"([E|S|D]RZ\d{6,})") + if assembly_reg_exp.findall(genome_info[gen]["accessions"][0]): + accession_type = "assembly" + genome_info[gen]["accessionType"] = accession_type + + genome_info[gen]["isolationSource"] = genome_info[gen]["metagenome"] + + try: + (genome_info[gen]["MAG_quality"], + genome_info[gen]["completeness"], + genome_info[gen]["contamination"]) = compute_mag_quality( + str(round_stats(genome_info[gen]["completeness"])), + str(round_stats(genome_info[gen]["contamination"])), + genome_info[gen]["rRNA_presence"]) + except IndexError: + pass + + if str(genome_info[gen]["co-assembly"]).lower() in ["yes", "y", "true"]: + genome_info[gen]["co-assembly"] = True + else: + genome_info[gen]["co-assembly"] = False + + genome_info[gen]["alias"] = gen - if not os.path.exists(self.args.genome_info): - raise FileNotFoundError('Genome metadata file "{}" does not exist'.format(self.args.genome_info)) - self.genome_metadata = self.args.genome_info + tax_id, scientific_name = extract_tax_info(genome_info[gen]["NCBI_lineage"]) + genome_info[gen]["taxID"] = tax_id + genome_info[gen]["scientific_name"] = scientific_name - def extract_ena_info(self, genome_info, upload_dir): + return genome_info + + def extract_ena_info(self, genome_info): logger.info('Retrieving project and run info from ENA (this might take a while)...') # retrieving metadata from runs (and runs from assembly accessions if provided) @@ -798,7 +761,7 @@ def extract_ena_info(self, genome_info, upload_dir): if not study_set: raise ValueError("No study corresponding to runs found.") - backup_file = os.path.join(upload_dir, "ENA_backup.json") + backup_file = os.path.join(self.upload_dir, "ENA_backup.json") counter = 0 if not os.path.exists(backup_file): with open(backup_file, 'w') as file: @@ -881,24 +844,24 @@ def extract_ena_info(self, genome_info, upload_dir): file.write(json.dumps(temp_dict)) file.truncate() temp_dict = {**temp_dict, **backup_dict} - combine_ENA_info(genome_info, temp_dict) + combine_ena_info(genome_info, temp_dict) - def generate_genomes_upload_dir(self, dir, genome_type): + def generate_genomes_upload_dir(self): upload_name = "MAG_upload" - if genome_type == "bins": + if self.genome_type == "bins": upload_name = upload_name.replace("MAG", "bin") - upload_dir = os.path.join(dir, upload_name) + upload_dir = os.path.join(self.work_dir, upload_name) os.makedirs(upload_dir, exist_ok=True) return upload_dir def create_genome_dictionary(self, samples_xml): logger.info('Retrieving data for MAG submission...') - genome_info = extract_genomes_info(self.genome_metadata, self.genome_type, self.live) + genome_info = self.extract_genomes_info() if not os.path.exists(samples_xml) or self.force: - self.extract_ena_info(genome_info, self.upload_dir) + self.extract_ena_info(genome_info) logger.info("Writing genome registration XML...") write_genomes_xml(genome_info, samples_xml, self.genome_type, @@ -908,7 +871,76 @@ def create_genome_dictionary(self, samples_xml): recover_info_from_xml(genome_info, samples_xml, self.live) return genome_info + + def genome_upload(self): + if not self.live: + logger.warning("Warning: genome submission is not in live mode, " + + "files will be validated, but not uploaded.") + samples_xml = os.path.join(self.upload_dir, "genome_samples.xml") + submission_xml = os.path.join(self.upload_dir, "submission.xml") + genomes, manifest_info = {}, {} + + # submission xml existence + if not os.path.exists(submission_xml): + submission_xml = write_submission_xml(self.upload_dir, self.centre_name, False) + + # sample xml generation or recovery + genomes = self.create_genome_dictionary(samples_xml) + + # manifests creation + manifest_dir = os.path.join(self.upload_dir, "manifests") + os.makedirs(manifest_dir, exist_ok=True) + + accessionsgen = "registered_MAGs.tsv" + if self.genome_type == "bins": + accessionsgen = accessionsgen.replace("MAG", "bin") + if not self.live: + accessionsgen = accessionsgen.replace(".tsv", "_test.tsv") + + accessions_file = os.path.join(self.upload_dir, accessionsgen) + save = False + write_mode = 'a' + if os.path.exists(accessions_file): + if not self.live: + save = True + if self.force: + write_mode = 'w' + if not save: + logger.info("Genome samples already registered, reading ERS accessions...") + alias_to_new_sample_accession = get_accessions(accessions_file) + else: + save = True + + if save: + logger.info("Registering genome samples XMLs...") + ena_submit = EnaSubmit(samples_xml, submission_xml, self.live) + alias_to_new_sample_accession = ena_submit.handle_genomes_registration() + save_accessions(alias_to_new_sample_accession, accessions_file, write_mode) + logger.info("Generating manifest files...") + + manifest_info = compute_manifests(genomes) + + for m in manifest_info: + generate_genome_manifest(manifest_info[m], self.upload_study, + manifest_dir, alias_to_new_sample_accession, self.genome_type, self.tpa) + + +def main(): + args = parse_args(sys.argv[1:]) + ena_upload = GenomeUpload( + upload_study=args.upload_study, + centre_name=args.centre_name, + genome_info=args.genome_info, + bins=args.bins, + live=args.live, + private=args.private, + tpa=args.tpa, + force=args.force, + out = args.out, + ) + ena_upload.genome_upload() + if __name__ == "__main__": main() logger.info('Completed') From 753645601c248667c416ee387e1f8e3c86c34f3d Mon Sep 17 00:00:00 2001 From: Varsha Date: Thu, 19 Dec 2024 13:38:54 +0000 Subject: [PATCH 11/20] precommit checks --- genomeuploader/__init__.py | 2 +- genomeuploader/constants.py | 40 +-- genomeuploader/ena.py | 195 +++++-------- genomeuploader/genome_upload.py | 474 +++++++++++++++++--------------- 4 files changed, 346 insertions(+), 365 deletions(-) diff --git a/genomeuploader/__init__.py b/genomeuploader/__init__.py index 55fa725..58039f5 100644 --- a/genomeuploader/__init__.py +++ b/genomeuploader/__init__.py @@ -1 +1 @@ -__version__ = '2.1.1' +__version__ = "2.1.1" diff --git a/genomeuploader/constants.py b/genomeuploader/constants.py index 5a27b39..7c74036 100644 --- a/genomeuploader/constants.py +++ b/genomeuploader/constants.py @@ -14,14 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -HQ = ( - "Multiple fragments where gaps span repetitive regions. Presence of the " - "23S, 16S, and 5S rRNA genes and at least 18 tRNAs." -) -MQ = ( - "Many fragments with little to no review of assembly other than reporting " - "of standard assembly statistics." -) +HQ = "Multiple fragments where gaps span repetitive regions. Presence of the " "23S, 16S, and 5S rRNA genes and at least 18 tRNAs." +MQ = "Many fragments with little to no review of assembly other than reporting " "of standard assembly statistics." METAGENOMES = [ "activated carbon metagenome", @@ -637,26 +631,22 @@ ] BIN_MANDATORY_FIELDS = [ - "genome_name", + "genome_name", "accessions", - "assembly_software", - "binning_software", - "binning_parameters", - "stats_generation_software", + "assembly_software", + "binning_software", + "binning_parameters", + "stats_generation_software", "NCBI_lineage", - "broad_environment", - "local_environment", - "environmental_medium", + "broad_environment", + "local_environment", + "environmental_medium", "metagenome", - "co-assembly", - "genome_coverage", - "genome_path" + "co-assembly", + "genome_coverage", + "genome_path", ] -MAG_MANDATORY_FIELDS = [ - "rRNA_presence", - "completeness", - "contamination" -] +MAG_MANDATORY_FIELDS = ["rRNA_presence", "completeness", "contamination"] -GEOGRAPHY_DIGIT_COORDS = 8 \ No newline at end of file +GEOGRAPHY_DIGIT_COORDS = 8 diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py index 7faab70..d968e55 100644 --- a/genomeuploader/ena.py +++ b/genomeuploader/ena.py @@ -1,4 +1,3 @@ - #!/usr/bin/env python # -*- coding: utf-8 -*- @@ -16,16 +15,16 @@ # limitations under the License. -import requests import json import logging import os -import sys -from time import sleep +import sys +import xml.dom.minidom as minidom from pathlib import Path -from dotenv import load_dotenv +from time import sleep -import xml.dom.minidom as minidom +import requests +from dotenv import load_dotenv from requests.exceptions import ConnectionError, HTTPError, RequestException, Timeout logging.basicConfig(level=logging.INFO) @@ -37,32 +36,20 @@ class NoDataException(ValueError): pass -RUN_DEFAULT_FIELDS = ','.join([ - 'secondary_study_accession', - 'sample_accession' -]) +RUN_DEFAULT_FIELDS = ",".join(["secondary_study_accession", "sample_accession"]) -STUDY_RUN_DEFAULT_FIELDS = ','.join([ - 'sample_accession', - 'run_accession', - 'instrument_model' -]) +STUDY_RUN_DEFAULT_FIELDS = ",".join(["sample_accession", "run_accession", "instrument_model"]) -ASSEMBLY_DEFAULT_FIELDS = 'sample_accession' +ASSEMBLY_DEFAULT_FIELDS = "sample_accession" -SAMPLE_DEFAULT_FIELDS = ','.join([ - 'sample_accession', - 'secondary_sample_accession', - 'collection_date', - 'country', - 'location' -]) +SAMPLE_DEFAULT_FIELDS = ",".join(["sample_accession", "secondary_sample_accession", "collection_date", "country", "location"]) -STUDY_DEFAULT_FIELDS = 'study_description' +STUDY_DEFAULT_FIELDS = "study_description" RETRY_COUNT = 3 + def get_default_connection_headers(): return { "headers": { @@ -71,12 +58,10 @@ def get_default_connection_headers(): } } + def get_default_params(): - return { - 'format': 'json', - 'includeMetagenomes': True, - 'dataPortal': 'ena' - } + return {"format": "json", "includeMetagenomes": True, "dataPortal": "ena"} + def parse_accession(accession): if accession.startswith("PRJ"): @@ -93,11 +78,12 @@ def parse_accession(accession): logging.error(f"{accession} is not a valid accession") sys.exit() + def configure_credentials(): # Config file user_config = Path.home() / ".genome_uploader.config.env" if user_config.exists(): - logger.debug("Loading the env variables from ".format(user_config)) + logger.debug("Loading the env variables from ".format()) load_dotenv(str(user_config)) else: cwd_config = Path.cwd() / ".genome_uploader.config.env" @@ -114,34 +100,36 @@ def configure_credentials(): return username, password + def query_taxid(taxid): url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid) response = requests.get(url) try: - # Will raise exception if response status code is non-200 + # Will raise exception if response status code is non-200 response.raise_for_status() except requests.exceptions.HTTPError as e: print("Request failed {} with error {}".format(url, e)) return False - + res = response.json() - + return res.get("scientificName", "") + def query_scientific_name(scientific_name, search_rank=False): url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{}".format(scientific_name) response = requests.get(url) - + try: - # Will raise exception if response status code is non-200 + # Will raise exception if response status code is non-200 response.raise_for_status() - except requests.exceptions.HTTPError as e: + except requests.exceptions.HTTPError: if search_rank: return False, "", "" else: return False, "" - + try: res = response.json()[0] except IndexError: @@ -160,8 +148,7 @@ def query_scientific_name(scientific_name, search_rank=False): return submittable, taxid - -class EnaQuery(): +class EnaQuery: def __init__(self, accession, query_type, private=False): self.private_url = "https://www.ebi.ac.uk/ena/submit/report" self.public_url = "https://www.ebi.ac.uk/ena/portal/api/search" @@ -179,9 +166,7 @@ def __init__(self, accession, query_type, private=False): self.query_type = query_type def post_request(self, data): - response = requests.post( - self.public_url, data=data, **get_default_connection_headers() - ) + response = requests.post(self.public_url, data=data, **get_default_connection_headers()) return response def get_request(self, url): @@ -196,24 +181,20 @@ def get_data_or_handle_error(self, response, xml=False, full_json=False): data_txt = response.text if data_txt is None: if self.private: - logging.error( - f"{self.accession} private data is not present in the specified Webin account" - ) + logging.error(f"{self.accession} private data is not present in the specified Webin account") else: logging.error(f"{self.accession} public data does not exist") else: if xml: - return minidom.parseString(data_txt) - elif full_json: + return minidom.parseString(data_txt) + elif full_json: # return the full json when more than one entry expected return response.json() else: - # only return the fist element in the list is the default. In these cases there should only be one entry returned + # only return the fist element in the list is the default. In these cases there should only be one entry returned return json.loads(response.txt)[0] except (IndexError, TypeError, ValueError, KeyError): - logging.error( - f"Failed to fetch {self.accession}, returned error: {response.text}" - ) + logging.error(f"Failed to fetch {self.accession}, returned error: {response.text}") def retry_or_handle_request_error(self, request, *args, **kwargs): attempt = 0 @@ -226,9 +207,7 @@ def retry_or_handle_request_error(self, request, *args, **kwargs): except (Timeout, ConnectionError) as retry_err: attempt += 1 if attempt >= RETRY_COUNT: - raise ValueError( - f"Could not find {self.accession} in ENA after {RETRY_COUNT} attempts. Error: {retry_err}" - ) + raise ValueError(f"Could not find {self.accession} in ENA after {RETRY_COUNT} attempts. Error: {retry_err}") sleep(1) except HTTPError as http_err: print(f"HTTP response has an error status: {http_err}") @@ -247,7 +226,7 @@ def _get_private_run(self): run = self.get_data_or_handle_error(response) run_data = run["report"] reformatted_data = { - "secondary_study_accession": run_data['studyID'], + "secondary_study_accession": run_data["studyID"], "sample_accession": run_data["sampleId"], } logging.info(f"{self.accession} private run returned from ENA") @@ -257,25 +236,22 @@ def _get_public_run(self): data = get_default_params() data.update( { - "result": "read_run", - "query": f'run_accession="{self.accession}"', - "fields": "secondary_study_accession,sample_accession", - } + "result": "read_run", + "query": f'run_accession="{self.accession}"', + "fields": "secondary_study_accession,sample_accession", + } ) response = self.retry_or_handle_request_error(self.post_request, data) run = self.get_data_or_handle_error(response) logging.info(f"{self.accession} public run returned from ENA") - return run + return run - def _get_private_study(self): url = f"{self.private_url}/studies/xml/{self.accession}" response = self.retry_or_handle_request_error(self.get_request, url) study = self.get_data_or_handle_error(response, xml=True) study_desc = study.getElementsByTagName("STUDY_DESCRIPTION")[0].firstChild.nodeValue - reformatted_data = { - "study_description" : study_desc - } + reformatted_data = {"study_description": study_desc} logging.info(f"{self.accession} private study returned from ENA") return reformatted_data @@ -283,15 +259,15 @@ def _get_public_study(self): data = get_default_params() data.update( { - "result": "study", - "query": f'{self.acc_type}="{self.accession}"', - "fields": STUDY_DEFAULT_FIELDS, - } + "result": "study", + "query": f'{self.acc_type}="{self.accession}"', + "fields": STUDY_DEFAULT_FIELDS, + } ) response = self.retry_or_handle_request_error(self.post_request, data) study = self.get_data_or_handle_error(response) logging.info(f"{self.accession} public study returned from ENA") - return study + return study def _get_private_run_from_assembly(self): url = f"{self.private_url}/analyses/xml/{self.accession}" @@ -310,7 +286,7 @@ def _get_public_run_from_assembly(self): run = run_ref[0].attributes["accession"].value logging.info(f"public run from the assembly {self.accession} returned from ENA") - return run + return run def _get_private_study_runs(self): url = f"{self.private_url}/runs/{self.accession}" @@ -318,66 +294,51 @@ def _get_private_study_runs(self): runs = self.get_data_or_handle_error(response, full_json=True) reformatted_data = [] for run in runs: - run_data = run['report'] - if 'sampleId' in run_data: - run_data['sample_accession'] = run_data.pop('sampleId') - if 'id' in run_data: - run_data['run_accession'] = run_data.pop('id') - if 'instrumentModel' in run_data: - run_data['instrument_model'] = run_data.pop('instrumentModel') + run_data = run["report"] + if "sampleId" in run_data: + run_data["sample_accession"] = run_data.pop("sampleId") + if "id" in run_data: + run_data["run_accession"] = run_data.pop("id") + if "instrumentModel" in run_data: + run_data["instrument_model"] = run_data.pop("instrumentModel") reformatted_data.append(run_data) logging.info(f"private runs from study {self.accession}, returned from ENA") return reformatted_data - def _get_public_study_runs(self): data = get_default_params() - data.update( - { - "result": "read_run", - "fields": STUDY_RUN_DEFAULT_FIELDS, - "query": f"{self.acc_type}={self.accession}" - } - ) + data.update({"result": "read_run", "fields": STUDY_RUN_DEFAULT_FIELDS, "query": f"{self.acc_type}={self.accession}"}) response = self.retry_or_handle_request_error(self.post_request, data) runs = self.get_data_or_handle_error(response, full_json=True) logging.info(f"public runs from study {self.accession}, returned from ENA") return runs - + def _get_private_sample(self): url = f"{self.private_url}/samples/xml/{self.accession}" reponse = self.retry_or_handle_request_error(self.get_request, url) reformatted_data = {} parsed_xml = self.get_data_or_handle_error(reponse, xml=True) - sample_attributes = parsed_xml.getElementsByTagName('SAMPLE_ATTRIBUTE') + sample_attributes = parsed_xml.getElementsByTagName("SAMPLE_ATTRIBUTE") for attribute in sample_attributes: - tag = attribute.getElementsByTagName('TAG')[0].firstChild.nodeValue + tag = attribute.getElementsByTagName("TAG")[0].firstChild.nodeValue if tag == "geographic location (country and/or sea)": - reformatted_data['country'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue + reformatted_data["country"] = attribute.getElementsByTagName("VALUE")[0].firstChild.nodeValue if tag == "geographic location (latitude)": - reformatted_data['latitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue + reformatted_data["latitude"] = attribute.getElementsByTagName("VALUE")[0].firstChild.nodeValue if tag == "geographic location (longitude)": - reformatted_data['longitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue + reformatted_data["longitude"] = attribute.getElementsByTagName("VALUE")[0].firstChild.nodeValue if tag == "collection date": - reformatted_data['collection_date'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue - logging.info(f"{self.accession} private sample returned from ENA") + reformatted_data["collection_date"] = attribute.getElementsByTagName("VALUE")[0].firstChild.nodeValue + logging.info(f"{self.accession} private sample returned from ENA") return reformatted_data - def _get_public_sample(self): data = get_default_params() - data.update( - { - "result": "sample", - "fields": SAMPLE_DEFAULT_FIELDS, - "query": f"{self.acc_type}={self.accession}" - } - ) + data.update({"result": "sample", "fields": SAMPLE_DEFAULT_FIELDS, "query": f"{self.acc_type}={self.accession}"}) response = self.retry_or_handle_request_error(self.post_request, data) sample = self.get_data_or_handle_error(response) logging.info(f"{self.accession} public sample returned from ENA") - return sample - + return sample def build_query(self): if self.query_type == "study": @@ -408,7 +369,7 @@ def build_query(self): return ena_response -class EnaSubmit(): +class EnaSubmit: def __init__(self, sample_xml, submission_xml, live=False): self.sample_xml = sample_xml self.submission_xml = submission_xml @@ -420,7 +381,6 @@ def __init__(self, sample_xml, submission_xml, live=False): self.auth = (username, password) else: self.auth = None - def handle_genomes_registration(self): live_sub, mode = "", "live" @@ -431,22 +391,17 @@ def handle_genomes_registration(self): url = "https://www{}.ebi.ac.uk/ena/submit/drop-box/submit/".format(live_sub) - logger.info('Registering sample xml in {} mode.'.format(mode)) + logger.info("Registering sample xml in {} mode.".format(mode)) - f = { - 'SUBMISSION': open(self.submission_xml, 'r'), - 'SAMPLE': open(self.sample_xml, 'r') - } + f = {"SUBMISSION": open(self.submission_xml, "r"), "SAMPLE": open(self.sample_xml, "r")} - submission_response = requests.post(url, files = f, auth = self.auth) + submission_response = requests.post(url, files=f, auth=self.auth) if submission_response.status_code != 200: - if str(submission_response.status_code).startswith('5'): - raise Exception("Genomes could not be submitted to ENA as the server " + - "does not respond. Please again try later.") + if str(submission_response.status_code).startswith("5"): + raise Exception("Genomes could not be submitted to ENA as the server " + "does not respond. Please again try later.") else: - raise Exception("Genomes could not be submitted to ENA. HTTP response: " + - submission_response.reason) + raise Exception("Genomes could not be submitted to ENA. HTTP response: " + submission_response.reason) receipt_xml = minidom.parseString((submission_response.content).decode("utf-8")) receipt = receipt_xml.getElementsByTagName("RECEIPT") @@ -466,11 +421,7 @@ def handle_genomes_registration(self): final_error += "\n\tIf you wish to validate again your data and metadata, " final_error += "please use the --force option." raise Exception(final_error) - - logger.info('{} genome samples successfully registered.'.format(str(len(alias_dict)))) - - return alias_dict - + logger.info("{} genome samples successfully registered.".format(str(len(alias_dict)))) - + return alias_dict diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index d34435c..264d57d 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -14,51 +14,72 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import sys -import logging import argparse -import re import json -import pandas as pd -from datetime import date, datetime as dt - -import xml.etree.ElementTree as ET +import logging +import os +import re +import sys import xml.dom.minidom as minidom -import requests +import xml.etree.ElementTree as ET +from datetime import date +from datetime import datetime as dt import ena +import pandas as pd +from constants import ( + BIN_MANDATORY_FIELDS, + GEOGRAPHIC_LOCATIONS, + GEOGRAPHY_DIGIT_COORDS, + HQ, + MAG_MANDATORY_FIELDS, + METAGENOMES, + MQ, +) from ena import EnaQuery, EnaSubmit -from constants import * - logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) + def parse_args(argv): - parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, - description="Create xmls and manifest files for genome upload and upload to ENA") - - parser.add_argument('-u', '--upload_study', type=str, required=True, help="Study accession for genomes upload") - parser.add_argument('--genome_info', type=str, required=True, help="Genomes metadata file") + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Create xmls and manifest files for genome upload and upload to ENA", + ) + + parser.add_argument("-u", "--upload_study", type=str, required=True, help="Study accession for genomes upload") + parser.add_argument("--genome_info", type=str, required=True, help="Genomes metadata file") genomeType = parser.add_mutually_exclusive_group(required=True, default=False) - genomeType.add_argument('-m', '--mags', action='store_true', help="Select for MAG upload") - genomeType.add_argument('-b', '--bins', action='store_true', help="Select for bin upload") - - parser.add_argument('--out', type=str, help="Output folder. Default: working directory") - parser.add_argument('--force', action='store_true', required=False, default=False, help="Forces reset of sample xml's backups") - parser.add_argument('--live', action='store_true', required=False, default=False, help="Uploads on ENA. Omitting this " + - "option allows to validate samples beforehand") - parser.add_argument('--tpa', action='store_true', required=False, default=False, help="Select if uploading TPA-generated genomes") - + genomeType.add_argument("-m", "--mags", action="store_true", help="Select for MAG upload") + genomeType.add_argument("-b", "--bins", action="store_true", help="Select for bin upload") + + parser.add_argument("--out", type=str, help="Output folder. Default: working directory") + parser.add_argument("--force", action="store_true", required=False, default=False, help="Forces reset of sample xml's backups") + parser.add_argument( + "--live", + action="store_true", + required=False, + default=False, + help="Uploads on ENA. Omitting this " + "option allows to validate samples beforehand", + ) + parser.add_argument("--tpa", action="store_true", required=False, default=False, help="Select if uploading TPA-generated genomes") + # Users can provide their credentials in environment variables or using a config file - parser.add_argument('--centre_name', required=True, help="Name of the centre uploading genomes") - parser.add_argument('--private', required=False, action='store_true', default=False, help="if data is private",) + parser.add_argument("--centre_name", required=True, help="Name of the centre uploading genomes") + parser.add_argument( + "--private", + required=False, + action="store_true", + default=False, + help="if data is private", + ) return parser.parse_args(argv) -''' + +""" Input table: expects the following parameters: genome_name: genome file name accessions: run(s) or assembly(ies) the genome was generated from @@ -77,7 +98,8 @@ def parse_args(argv): co-assembly: True/False, whether the genome was generated from a co-assembly genome_coverage : genome coverage genome_path: path to genome to upload -''' +""" + def round_stats(stats): new_stat = round(float(stats), 2) @@ -88,17 +110,19 @@ def round_stats(stats): return new_stat + def compute_mag_quality(completeness, contamination, rna_presence): rna_presence = str(rna_presence).lower() in ["true", "yes", "y"] quality = MQ if float(completeness) >= 90 and float(contamination) <= 5 and rna_presence: quality = HQ - + return quality, completeness, contamination + def extract_tax_info(tax_info): # if unclassified, block the execution - lineage, kingdom_position_lineage, digit_annotation = tax_info.split(';'), 0, False + lineage, kingdom_position_lineage, digit_annotation = tax_info.split(";"), 0, False lineage_first = lineage[0] if "Unclassified " in lineage_first: if "Archaea" in lineage_first: @@ -128,13 +152,13 @@ def extract_tax_info(tax_info): final_kingdom = selected_kingdom[index] break - iterator = len(lineage)-1 + iterator = len(lineage) - 1 submittable = False rank = "" while iterator != -1 and not submittable: scientific_name = lineage[iterator].strip() if digit_annotation: - if not '*' in scientific_name: + if "*" not in scientific_name: scientific_name = ena.query_taxid(scientific_name) else: iterator -= 1 @@ -156,11 +180,12 @@ def extract_tax_info(tax_info): return taxid, scientific_name + def extract_eukaryota_info(name, rank): non_submittable = (False, "", 0) # Asterisks in given taxonomy suggest the classification might be not confident enough. - if '*' in name: + if "*" in name: return non_submittable if rank == "super kingdom": @@ -178,23 +203,24 @@ def extract_eukaryota_info(name, rank): if submittable: return submittable, name, taxid else: - name = name.replace(" sp.", '') + name = name.replace(" sp.", "") submittable, taxid = ena.query_scientific_name(name) if submittable: return submittable, name, taxid else: return non_submittable + def extract_bacteria_info(name, rank): if rank == "species": name = name elif rank == "superkingdom": - name = "uncultured bacterium".format(name) + name = "uncultured bacterium".format() elif rank in ["family", "order", "class", "phylum"]: name = "uncultured {} bacterium".format(name) elif rank == "genus": name = "uncultured {} sp.".format(name) - + submittable, taxid, rank = ena.query_scientific_name(name, search_rank=True) if not submittable: if rank in ["species", "genus"] and name.lower().endswith("bacteria"): @@ -206,6 +232,7 @@ def extract_bacteria_info(name, rank): return submittable, name, taxid + def extract_archaea_info(name, rank): if rank == "species": name = name @@ -222,20 +249,22 @@ def extract_archaea_info(name, rank): name = "uncultured {} archaeon".format(name) elif rank == "genus": name = "uncultured {} sp.".format(name) - + submittable, taxid, rank = ena.query_scientific_name(name, search_rank=True) if not submittable: if "Candidatus" in name: if rank == "phylum": - name = name.replace("Candidatus ", '') + name = name.replace("Candidatus ", "") elif rank == "family": - name = name.replace("uncultured ", '') + name = name.replace("uncultured ", "") submittable, taxid = ena.query_scientific_name(name) - + return submittable, name, taxid + def multiple_element_set(metadata_list): - return len(set(metadata_list))>1 + return len(set(metadata_list)) > 1 + def combine_ena_info(genome_info, ena_dict): for g in genome_info: @@ -254,12 +283,12 @@ def combine_ena_info(genome_info, ena_dict): long_list.append(ena_dict[run]["longitude"]) latit_list.append(ena_dict[run]["latitude"]) - genome_info[g]["study"] = study_list[0] + genome_info[g]["study"] = study_list[0] genome_info[g]["description"] = description_list[0] - + instrument = instrument_list[0] if multiple_element_set(instrument_list): - instrument = ','.join(instrument_list) + instrument = ",".join(instrument_list) genome_info[g]["sequencingMethod"] = instrument collection_date = collection_list[0] @@ -284,7 +313,7 @@ def combine_ena_info(genome_info, ena_dict): samples = samples_list[0] if multiple_element_set(samples_list): - samples = ','.join(samples_list) + samples = ",".join(samples_list) genome_info[g]["sample_accessions"] = samples else: run = genome_info[g]["accessions"][0] @@ -296,85 +325,96 @@ def combine_ena_info(genome_info, ena_dict): genome_info[g]["country"] = ena_dict[run]["country"] genome_info[g]["longitude"] = ena_dict[run]["longitude"] genome_info[g]["latitude"] = ena_dict[run]["latitude"] - - genome_info[g]["accessions"] = ','.join(genome_info[g]["accessions"]) + + genome_info[g]["accessions"] = ",".join(genome_info[g]["accessions"]) def get_accessions(accessions_file): accession_dict = {} - with open(accessions_file, 'r') as f: + with open(accessions_file, "r") as f: for line in f: - line = line.split('\t') + line = line.split("\t") alias = line[0] - accession = line[1].rstrip('\n') + accession = line[1].rstrip("\n") accession_dict[alias] = accession return accession_dict + def save_accessions(alias_accession_dict, accessions_file, write_mode): with open(accessions_file, write_mode) as f: for elem in alias_accession_dict: f.write("{}\t{}\n".format(elem, alias_accession_dict[elem])) -def create_manifest_dictionary(run, alias, assembly_software, sequencing_method, - mag_path, gen, study, coverage, is_coassembly): + +def create_manifest_dictionary(run, alias, assembly_software, sequencing_method, mag_path, gen, study, coverage, is_coassembly): manifest_dict = { - "accessions" : run, - "alias" : alias, - "assembler" : assembly_software, - "sequencingMethod" : sequencing_method, - "genome_path" : mag_path, - "genome_name" : gen, - "study" : study, - "coverageDepth" : coverage, - "co-assembly" : is_coassembly + "accessions": run, + "alias": alias, + "assembler": assembly_software, + "sequencingMethod": sequencing_method, + "genome_path": mag_path, + "genome_name": gen, + "study": study, + "coverageDepth": coverage, + "co-assembly": is_coassembly, } return manifest_dict + def compute_manifests(genomes): manifest_info = {} for g in genomes: - manifest_info[g] = create_manifest_dictionary(genomes[g]["accessions"], - genomes[g]["alias"], genomes[g]["assembly_software"], - genomes[g]["sequencingMethod"], genomes[g]["genome_path"], g, - genomes[g]["study"], genomes[g]["genome_coverage"], genomes[g]["co-assembly"]) - + manifest_info[g] = create_manifest_dictionary( + genomes[g]["accessions"], + genomes[g]["alias"], + genomes[g]["assembly_software"], + genomes[g]["sequencingMethod"], + genomes[g]["genome_path"], + g, + genomes[g]["study"], + genomes[g]["genome_coverage"], + genomes[g]["co-assembly"], + ) + return manifest_info + def get_study_from_xml(sample): description = sample.childNodes[5].childNodes[0].data - study = description.split(' ')[-1][:-1] + study = description.split(" ")[-1][:-1] return study + def recover_info_from_xml(genome_dict, sample_xml, live_mode): logger.info("Retrieving data for genome submission...") # extract list of genomes (samples) to be registered xml_structure = minidom.parse(sample_xml) samples = xml_structure.getElementsByTagName("SAMPLE") - + for s in samples: study = get_study_from_xml(s) # extract alias from xml and find a match with genomes the user is uploading xml_alias = s.attributes["alias"].value - if not live_mode: # remove time stamp if test mode is selected + if not live_mode: # remove time stamp if test mode is selected alias_split = xml_alias.split("_") - xml_alias = '_'.join(alias_split[:-1]) + xml_alias = "_".join(alias_split[:-1]) for gen in genome_dict: # if match is found, associate attributes listed in the xml file # with genomes to upload if xml_alias == gen: if not live_mode: current_time_stamp = str(int(dt.timestamp(dt.now()))) - xml_alias = gen + '_' + current_time_stamp + xml_alias = gen + "_" + current_time_stamp s.attributes["alias"].value = xml_alias sample_title = s.getElementsByTagName("TITLE")[0] sample_title_value = sample_title.firstChild.nodeValue.split("_") sample_title_value[-1] = current_time_stamp - new_sample_title = '_'.join(sample_title_value) + new_sample_title = "_".join(sample_title_value) s.getElementsByTagName("TITLE")[0].firstChild.replaceWholeText(new_sample_title) attributes = s.childNodes[7].getElementsByTagName("SAMPLE_ATTRIBUTE") seq_method, ass_software = "", "" @@ -390,7 +430,7 @@ def recover_info_from_xml(genome_dict, sample_xml, live_mode): if not seq_method == "" and not ass_software == "": break - genome_dict[gen]["accessions"] = ','.join(genome_dict[gen]["accessions"]) + genome_dict[gen]["accessions"] = ",".join(genome_dict[gen]["accessions"]) genome_dict[gen]["alias"] = xml_alias genome_dict[gen]["assembly_software"] = ass_software genome_dict[gen]["sequencingMethod"] = seq_method @@ -400,12 +440,13 @@ def recover_info_from_xml(genome_dict, sample_xml, live_mode): if not live_mode: for s in samples: xml_structure.firstChild.appendChild(s) - - with open(sample_xml, 'wb') as f: + + with open(sample_xml, "wb") as f: dom_string = xml_structure.toprettyxml().encode("utf-8") - dom_string = b'\n'.join([s for s in dom_string.splitlines() if s.strip()]) + dom_string = b"\n".join([s for s in dom_string.splitlines() if s.strip()]) f.write(dom_string) + def create_sample_attribute(sample_attributes, data_list, mag_data=None): tag = data_list[0] value = data_list[1] @@ -414,13 +455,14 @@ def create_sample_attribute(sample_attributes, data_list, mag_data=None): units = None if len(data_list) == 3: units = data_list[2] - + new_sample_attr = ET.SubElement(sample_attributes, "SAMPLE_ATTRIBUTE") - ET.SubElement(new_sample_attr, 'TAG').text = tag - ET.SubElement(new_sample_attr, 'VALUE').text = value + ET.SubElement(new_sample_attr, "TAG").text = tag + ET.SubElement(new_sample_attr, "VALUE").text = value if units: - ET.SubElement(new_sample_attr, 'UNITS').text = units + ET.SubElement(new_sample_attr, "UNITS").text = units + def write_genomes_xml(genomes, xml_path, genome_type, centre_name, tpa): map_sample_attributes = [ @@ -467,18 +509,16 @@ def write_genomes_xml(genomes, xml_path, genome_type, centre_name, tpa): for g in genomes: plural = "" if genomes[g]["co-assembly"]: - plural = 's' - description = ("This sample represents a {}{} assembled from the " - "metagenomic run{} {} of study {}.".format(tpa_description, - assembly_type, plural, genomes[g]["accessions"], - genomes[g]["study"])) - + plural = "s" + description = "This sample represents a {}{} assembled from the " "metagenomic run{} {} of study {}.".format( + tpa_description, assembly_type, plural, genomes[g]["accessions"], genomes[g]["study"] + ) + sample = ET.SubElement(sample_set, "SAMPLE") sample.set("alias", genomes[g]["alias"]) sample.set("center_name", centre_name) - ET.SubElement(sample, 'TITLE').text = ("{}: {}".format(assembly_type, - genomes[g]["alias"])) + ET.SubElement(sample, "TITLE").text = "{}: {}".format(assembly_type, genomes[g]["alias"]) sample_name = ET.SubElement(sample, "SAMPLE_NAME") ET.SubElement(sample_name, "TAXON_ID").text = genomes[g]["taxID"] ET.SubElement(sample_name, "SCIENTIFIC_NAME").text = genomes[g]["scientific_name"] @@ -494,41 +534,39 @@ def write_genomes_xml(genomes, xml_path, genome_type, centre_name, tpa): for constant in constant_sample_attributes: create_sample_attribute(sample_attributes, constant) - with open(xml_path, 'wb') as f: - dom = minidom.parseString( - ET.tostring(sample_set, encoding="utf-8") - ) + with open(xml_path, "wb") as f: + dom = minidom.parseString(ET.tostring(sample_set, encoding="utf-8")) f.write(dom.toprettyxml().encode("utf-8")) + def write_submission_xml(upload_dir, centre_name, study=True): today = str(date.today()) - sub_xml = os.path.join(upload_dir, 'submission.xml') + sub_xml = os.path.join(upload_dir, "submission.xml") - submission = ET.Element('SUBMISSION') - submission.set('center_name', centre_name) + submission = ET.Element("SUBMISSION") + submission.set("center_name", centre_name) # template - actions = ET.SubElement(submission, 'ACTIONS') - action_sub = ET.SubElement(actions, 'ACTION') - ET.SubElement(action_sub, 'ADD') + actions = ET.SubElement(submission, "ACTIONS") + action_sub = ET.SubElement(actions, "ACTION") + ET.SubElement(action_sub, "ADD") # attributes: function and hold date if study: - action_hold = ET.SubElement(actions, 'ACTION') - hold = ET.SubElement(action_hold, 'HOLD') - hold.set('HoldUntilDate', today) + action_hold = ET.SubElement(actions, "ACTION") + hold = ET.SubElement(action_hold, "HOLD") + hold.set("HoldUntilDate", today) - with open(sub_xml, 'wb') as submission_file: - dom = minidom.parseString( - ET.tostring(submission, encoding="utf-8") - ) + with open(sub_xml, "wb") as submission_file: + dom = minidom.parseString(ET.tostring(submission, encoding="utf-8")) submission_file.write(dom.toprettyxml().encode("utf-8")) return sub_xml + def generate_genome_manifest(genome_info, study, manifests_root, alias_to_sample, genome_type, tpa): manifest_path = os.path.join(manifests_root, f'{genome_info["genome_name"]}.manifest') - + tpa_addition, multiple_runs = "", "" if tpa: tpa_addition = "Third Party Annotation (TPA) " @@ -539,25 +577,29 @@ def generate_genome_manifest(genome_info, study, manifests_root, alias_to_sample assembly_type = "binned metagenome" values = ( - ('STUDY', study), - ('SAMPLE', alias_to_sample[genome_info["alias"]]), - ('ASSEMBLYNAME', genome_info["alias"]), - ('ASSEMBLY_TYPE', assembly_type), - ('COVERAGE', genome_info["coverageDepth"]), - ('PROGRAM', genome_info["assembler"]), - ('PLATFORM', genome_info["sequencingMethod"]), - ('MOLECULETYPE', "genomic DNA"), - ('DESCRIPTION', ("This is a {}bin derived from the primary whole genome " - "shotgun (WGS) data set {}. This sample represents a {} from the " - "metagenomic run{} {}.".format(tpa_addition, genome_info["study"], - assembly_type, multiple_runs, genome_info["accessions"]))), - ('RUN_REF', genome_info["accessions"]), - ('FASTA', os.path.abspath(genome_info["genome_path"])) + ("STUDY", study), + ("SAMPLE", alias_to_sample[genome_info["alias"]]), + ("ASSEMBLYNAME", genome_info["alias"]), + ("ASSEMBLY_TYPE", assembly_type), + ("COVERAGE", genome_info["coverageDepth"]), + ("PROGRAM", genome_info["assembler"]), + ("PLATFORM", genome_info["sequencingMethod"]), + ("MOLECULETYPE", "genomic DNA"), + ( + "DESCRIPTION", + ( + "This is a {}bin derived from the primary whole genome " + "shotgun (WGS) data set {}. This sample represents a {} from the " + "metagenomic run{} {}.".format(tpa_addition, genome_info["study"], assembly_type, multiple_runs, genome_info["accessions"]) + ), + ), + ("RUN_REF", genome_info["accessions"]), + ("FASTA", os.path.abspath(genome_info["genome_path"])), ) logger.info("Writing manifest file (.manifest) for {}.".format(genome_info["alias"])) with open(manifest_path, "w") as outfile: - for (k, v) in values: - manifest = f'{k}\t{v}\n' + for k, v in values: + manifest = f"{k}\t{v}\n" outfile.write(manifest) if tpa: outfile.write("TPA\ttrue\n") @@ -575,8 +617,8 @@ def __init__( tpa: bool = False, force: bool = False, out: str = None, - ): - f""" + ): + """ Submission of genomes. :param upload_study: Study accession for genomes upload. @@ -610,21 +652,22 @@ def __init__( self.genome_metadata = genome_info def validate_metadata_tsv(self): - logger.info('Retrieving info for genomes to submit...') - + logger.info("Retrieving info for genomes to submit...") + all_fields = MAG_MANDATORY_FIELDS + BIN_MANDATORY_FIELDS - metadata = pd.read_csv(self.genome_metadata, sep='\t', usecols=all_fields) - + metadata = pd.read_csv(self.genome_metadata, sep="\t", usecols=all_fields) + # make sure there are no empty cells clean_columns = list(metadata.dropna(axis=1)) if self.genome_type == "MAGs": missing_values = [item for item in all_fields if item not in clean_columns] else: missing_values = [item for item in BIN_MANDATORY_FIELDS if item not in clean_columns] - + if missing_values: - raise ValueError("The following mandatory fields have missing values in " + - "the input file: {}".format(", ".join(missing_values))) + raise ValueError( + "The following mandatory fields have missing values in " + "the input file: {}".format(", ".join(missing_values)) + ) # check amount of genomes to register at the same time if len(metadata) >= 5000: @@ -633,58 +676,53 @@ def validate_metadata_tsv(self): # check whether accessions follow the right format accessions_reg_exp = re.compile(r"([E|S|D]R[R|Z]\d{6,})") - accession_comparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", - "correct", "mismatching", "co-assembly"]) + accession_comparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", "correct", "mismatching", "co-assembly"]) accession_comparison["genome_name"] = metadata["genome_name"] - accession_comparison["attemptive_accessions"] = metadata["accessions"].map( - lambda a: len(a.split(','))) + accession_comparison["attemptive_accessions"] = metadata["accessions"].map(lambda a: len(a.split(","))) - accession_comparison["correct"] = metadata["accessions"].map( - lambda a: len(accessions_reg_exp.findall(a))) + accession_comparison["correct"] = metadata["accessions"].map(lambda a: len(accessions_reg_exp.findall(a))) - accession_comparison["mismatching"] = accession_comparison.apply(lambda row: - True if row["attemptive_accessions"] == row["correct"] - else None, axis=1).isna() + accession_comparison["mismatching"] = accession_comparison.apply( + lambda row: True if row["attemptive_accessions"] == row["correct"] else None, axis=1 + ).isna() mismatching_accessions = accession_comparison[accession_comparison["mismatching"]]["genome_name"] if not mismatching_accessions.empty: - raise ValueError("Run accessions are not correctly formatted for the following " + - "genomes: " + ','.join(mismatching_accessions.values)) + raise ValueError( + "Run accessions are not correctly formatted for the following " + "genomes: " + ",".join(mismatching_accessions.values) + ) # check whether completeness and contamination are floats try: pd.to_numeric(metadata["completeness"]) pd.to_numeric(metadata["contamination"]) pd.to_numeric(metadata["genome_coverage"]) - except: + except ValueError: raise ValueError("Completeness, contamination or coverage values should be formatted as floats") # check whether all co-assemblies have more than one run associated and viceversa accession_comparison["co-assembly"] = metadata["co-assembly"] - coassembly_discrepancy = metadata[( - (accession_comparison["correct"] < 2) & (accession_comparison["co-assembly"])) | - ((accession_comparison["correct"] > 1) & (~accession_comparison["co-assembly"]) - )]["genome_name"] + coassembly_discrepancy = metadata[ + ((accession_comparison["correct"] < 2) & (accession_comparison["co-assembly"])) + | ((accession_comparison["correct"] > 1) & (~accession_comparison["co-assembly"])) + ]["genome_name"] if not coassembly_discrepancy.empty: - raise ValueError("The following genomes show discrepancy between number of runs " - "involved and co-assembly status: " + ','.join(coassembly_discrepancy.values)) + raise ValueError( + "The following genomes show discrepancy between number of runs " + "involved and co-assembly status: " + ",".join(coassembly_discrepancy.values) + ) # are provided metagenomes part of the accepted metagenome list? - if False in metadata.apply(lambda row: - True if row["metagenome"] in METAGENOMES - else False, axis=1).unique(): - raise ValueError("Metagenomes associated with each genome need to belong to ENA's " + - "approved metagenomes list.") + if False in metadata.apply(lambda row: True if row["metagenome"] in METAGENOMES else False, axis=1).unique(): + raise ValueError("Metagenomes associated with each genome need to belong to ENA's " + "approved metagenomes list.") # do provided file paths exist? - if False in metadata.apply(lambda row: - True if os.path.exists(row["genome_path"]) - else False, axis =1).unique(): + if False in metadata.apply(lambda row: True if os.path.exists(row["genome_path"]) else False, axis=1).unique(): raise FileNotFoundError("Some genome paths do not exist.") # check genome name lengths - #if not (metadata["genome_name"].map(lambda a: len(a) < 20).all()): + # if not (metadata["genome_name"].map(lambda a: len(a) < 20).all()): # raise ValueError("Genome names must be shorter than 20 characters.") # create dictionary while checking genome name uniqueness @@ -692,7 +730,7 @@ def validate_metadata_tsv(self): if uniqueness: if not self.live: timestamp = str(int(dt.timestamp(dt.now()))) - timestamp_names = [row["genome_name"] + '_' + timestamp for index, row in metadata.iterrows()] + timestamp_names = [row["genome_name"] + "_" + timestamp for index, row in metadata.iterrows()] metadata["unique_genome_name"] = timestamp_names genome_info = metadata.set_index("unique_genome_name").transpose().to_dict() else: @@ -705,7 +743,7 @@ def validate_metadata_tsv(self): def extract_genomes_info(self): genome_info = self.validate_metadata_tsv() for gen in genome_info: - genome_info[gen]["accessions"] = genome_info[gen]["accessions"].split(',') + genome_info[gen]["accessions"] = genome_info[gen]["accessions"].split(",") accession_type = "run" assembly_reg_exp = re.compile(r"([E|S|D]RZ\d{6,})") if assembly_reg_exp.findall(genome_info[gen]["accessions"][0]): @@ -713,14 +751,17 @@ def extract_genomes_info(self): genome_info[gen]["accessionType"] = accession_type genome_info[gen]["isolationSource"] = genome_info[gen]["metagenome"] - + try: - (genome_info[gen]["MAG_quality"], - genome_info[gen]["completeness"], - genome_info[gen]["contamination"]) = compute_mag_quality( - str(round_stats(genome_info[gen]["completeness"])), - str(round_stats(genome_info[gen]["contamination"])), - genome_info[gen]["rRNA_presence"]) + ( + genome_info[gen]["MAG_quality"], + genome_info[gen]["completeness"], + genome_info[gen]["contamination"], + ) = compute_mag_quality( + str(round_stats(genome_info[gen]["completeness"])), + str(round_stats(genome_info[gen]["contamination"])), + genome_info[gen]["rRNA_presence"], + ) except IndexError: pass @@ -728,7 +769,7 @@ def extract_genomes_info(self): genome_info[gen]["co-assembly"] = True else: genome_info[gen]["co-assembly"] = False - + genome_info[gen]["alias"] = gen tax_id, scientific_name = extract_tax_info(genome_info[gen]["NCBI_lineage"]) @@ -738,8 +779,8 @@ def extract_genomes_info(self): return genome_info def extract_ena_info(self, genome_info): - logger.info('Retrieving project and run info from ENA (this might take a while)...') - + logger.info("Retrieving project and run info from ENA (this might take a while)...") + # retrieving metadata from runs (and runs from assembly accessions if provided) all_runs = [] for g in genome_info: @@ -757,14 +798,14 @@ def extract_ena_info(self, genome_info): run_info = ena_query.build_query() study_set.add(run_info["secondary_study_accession"]) samples_dict[r] = run_info["sample_accession"] - + if not study_set: raise ValueError("No study corresponding to runs found.") backup_file = os.path.join(self.upload_dir, "ENA_backup.json") counter = 0 if not os.path.exists(backup_file): - with open(backup_file, 'w') as file: + with open(backup_file, "w") as file: pass with open(backup_file, "r+") as file: try: @@ -782,7 +823,7 @@ def extract_ena_info(self, genome_info): ena_info = ena_query.build_query() if ena_info == []: raise IOError("No runs found on ENA for project {}.".format(s)) - + for run, item in enumerate(ena_info): run_accession = ena_info[run]["run_accession"] if run_accession not in backup_dict: @@ -791,23 +832,23 @@ def extract_ena_info(self, genome_info): ena_query = EnaQuery(sample_accession, "sample", self.private) sample_info = ena_query.build_query() - if 'latitude' in sample_info and 'longitude' in sample_info: - latitude = sample_info['latitude'] - longitude = sample_info['longitude'] + if "latitude" in sample_info and "longitude" in sample_info: + latitude = sample_info["latitude"] + longitude = sample_info["longitude"] else: location = sample_info["location"] latitude, longitude = None, None - if 'N' in location: - latitude = location.split('N')[0].strip() - longitude = location.split('N')[1].strip() - elif 'S' in location: - latitude = '-' + location.split('S')[0].strip() - longitude = location.split('S')[1].strip() - - if 'W' in longitude: - longitude = '-' + longitude.split('W')[0].strip() - elif longitude.endswith('E'): - longitude = longitude.split('E')[0].strip() + if "N" in location: + latitude = location.split("N")[0].strip() + longitude = location.split("N")[1].strip() + elif "S" in location: + latitude = "-" + location.split("S")[0].strip() + longitude = location.split("S")[1].strip() + + if "W" in longitude: + longitude = "-" + longitude.split("W")[0].strip() + elif longitude.endswith("E"): + longitude = longitude.split("E")[0].strip() if latitude: latitude = "{:.{}f}".format(round(float(latitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) @@ -818,35 +859,34 @@ def extract_ena_info(self, genome_info): longitude = "{:.{}f}".format(round(float(longitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS) else: longitude = "not provided" - - country = sample_info["country"].split(':')[0] - if not country in GEOGRAPHIC_LOCATIONS: + + country = sample_info["country"].split(":")[0] + if country not in GEOGRAPHIC_LOCATIONS: country = "not provided" - + collectionDate = sample_info["collection_date"] if collectionDate == "" or collectionDate == "missing": collectionDate = "not provided" temp_dict[run_accession] = { - "instrumentModel" : ena_info[run]["instrument_model"], - "collectionDate" : collectionDate, - "country" : country, - "latitude" : latitude, - "longitude" : longitude, - "projectDescription" : project_description, - "study" : s, - "sampleAccession" : samples_dict[run_accession] + "instrumentModel": ena_info[run]["instrument_model"], + "collectionDate": collectionDate, + "country": country, + "latitude": latitude, + "longitude": longitude, + "projectDescription": project_description, + "study": s, + "sampleAccession": samples_dict[run_accession], } counter += 1 - if (counter%10 == 0) or (len(runs_set) - len(backup_dict) == counter): + if (counter % 10 == 0) or (len(runs_set) - len(backup_dict) == counter): file.seek(0) file.write(json.dumps(temp_dict)) file.truncate() temp_dict = {**temp_dict, **backup_dict} combine_ena_info(genome_info, temp_dict) - def generate_genomes_upload_dir(self): upload_name = "MAG_upload" if self.genome_type == "bins": @@ -856,7 +896,7 @@ def generate_genomes_upload_dir(self): return upload_dir def create_genome_dictionary(self, samples_xml): - logger.info('Retrieving data for MAG submission...') + logger.info("Retrieving data for MAG submission...") genome_info = self.extract_genomes_info() @@ -864,18 +904,16 @@ def create_genome_dictionary(self, samples_xml): self.extract_ena_info(genome_info) logger.info("Writing genome registration XML...") - write_genomes_xml(genome_info, samples_xml, self.genome_type, - self.centre_name, self.tpa) + write_genomes_xml(genome_info, samples_xml, self.genome_type, self.centre_name, self.tpa) logger.info("All files have been written to " + self.upload_dir) else: recover_info_from_xml(genome_info, samples_xml, self.live) return genome_info - + def genome_upload(self): if not self.live: - logger.warning("Warning: genome submission is not in live mode, " + - "files will be validated, but not uploaded.") + logger.warning("Warning: genome submission is not in live mode, " + "files will be validated, but not uploaded.") samples_xml = os.path.join(self.upload_dir, "genome_samples.xml") submission_xml = os.path.join(self.upload_dir, "submission.xml") genomes, manifest_info = {}, {} @@ -883,7 +921,7 @@ def genome_upload(self): # submission xml existence if not os.path.exists(submission_xml): submission_xml = write_submission_xml(self.upload_dir, self.centre_name, False) - + # sample xml generation or recovery genomes = self.create_genome_dictionary(samples_xml) @@ -899,18 +937,18 @@ def genome_upload(self): accessions_file = os.path.join(self.upload_dir, accessionsgen) save = False - write_mode = 'a' + write_mode = "a" if os.path.exists(accessions_file): if not self.live: save = True if self.force: - write_mode = 'w' + write_mode = "w" if not save: logger.info("Genome samples already registered, reading ERS accessions...") alias_to_new_sample_accession = get_accessions(accessions_file) else: save = True - + if save: logger.info("Registering genome samples XMLs...") ena_submit = EnaSubmit(samples_xml, submission_xml, self.live) @@ -922,8 +960,9 @@ def genome_upload(self): manifest_info = compute_manifests(genomes) for m in manifest_info: - generate_genome_manifest(manifest_info[m], self.upload_study, - manifest_dir, alias_to_new_sample_accession, self.genome_type, self.tpa) + generate_genome_manifest( + manifest_info[m], self.upload_study, manifest_dir, alias_to_new_sample_accession, self.genome_type, self.tpa + ) def main(): @@ -937,10 +976,11 @@ def main(): private=args.private, tpa=args.tpa, force=args.force, - out = args.out, + out=args.out, ) ena_upload.genome_upload() - + + if __name__ == "__main__": main() - logger.info('Completed') + logger.info("Completed") From 77dcaea6dab8274b5ffe5c389d85c96859a95bdd Mon Sep 17 00:00:00 2001 From: Varsha Date: Thu, 19 Dec 2024 14:06:49 +0000 Subject: [PATCH 12/20] always try public APIs if private fails --- genomeuploader/ena.py | 67 +++++++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 15 deletions(-) diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py index d968e55..d181d82 100644 --- a/genomeuploader/ena.py +++ b/genomeuploader/ena.py @@ -343,31 +343,68 @@ def _get_public_sample(self): def build_query(self): if self.query_type == "study": if self.private: - ena_response = self._get_private_study() - else: - ena_response = self._get_public_study() + try: + ena_response = self._get_private_study() + except Exception: + logging.info("Private API failed, trying public") + ena_response = self._get_public_study() elif self.query_type == "run": if self.private: - ena_response = self._get_private_run() - else: - ena_response = self._get_public_run() + try: + ena_response = self._get_private_run() + except Exception: + logging.info("Private API failed, trying public") + + ena_response = self._get_public_run() elif self.query_type == "run_assembly": if self.private: - ena_response = self._get_private_run_from_assembly() - else: - ena_response = self._get_public_run_from_assembly() + try: + ena_response = self._get_private_run_from_assembly() + except Exception: + logging.info("Private API failed, trying public") + ena_response = self._get_public_run_from_assembly() elif self.query_type == "study_runs": if self.private: - ena_response = self._get_private_study_runs() - else: - ena_response = self._get_public_study_runs() + try: + ena_response = self._get_private_study_runs() + except Exception: + logging.info("Private API failed, trying public") + ena_response = self._get_public_study_runs() elif self.query_type == "sample": if self.private: - ena_response = self._get_private_sample() - else: - ena_response = self._get_public_sample() + try: + ena_response = self._get_private_sample() + except Exception: + logging.info("Private API failed, trying public") + ena_response = self._get_public_sample() return ena_response + def build_query(self): + """If the private flag is given, assume private data and try private APIs. + ENA also has cases where a run may be private but the sample may be public etc. Hence always try + public if private fails""" + api_map = { + "study": (self._get_private_study, self._get_public_study), + "run": (self._get_private_run, self._get_public_run), + "run_assembly": (self._get_private_run_from_assembly, self._get_public_run_from_assembly), + "study_runs": (self._get_private_study_runs, self._get_public_study_runs), + "sample": (self._get_private_sample, self._get_public_sample) + } + + private_api, public_api = api_map.get(self.query_type, (None, None)) + + if self.private: + try: + ena_response = private_api() + except Exception: + logging.info(f"Private API for {self.query_type} failed, trying public.") + ena_response = public_api() + else: + ena_response = public_api() + + return ena_response + + class EnaSubmit: def __init__(self, sample_xml, submission_xml, live=False): From b136582edd0356275bcd934123ae9a3d84e92a38 Mon Sep 17 00:00:00 2001 From: Varsha Date: Thu, 19 Dec 2024 18:43:42 +0000 Subject: [PATCH 13/20] add unit tests for ena queries --- genomeuploader/ena.py | 62 ++----- genomeuploader/genome_upload.py | 2 +- tests/conftest.py | 100 +++++++++++ tests/fixtures/responses/private_run.json | 17 ++ .../responses/private_study_runs.json | 152 +++++++++++++++++ tests/fixtures/responses/public_run.json | 7 + tests/fixtures/responses/public_sample.json | 8 + tests/fixtures/responses/public_study.json | 6 + .../fixtures/responses/public_study_runs.json | 52 ++++++ tests/unit/test_ena.py | 158 ++++++++++++++++++ 10 files changed, 514 insertions(+), 50 deletions(-) create mode 100644 tests/conftest.py create mode 100644 tests/fixtures/responses/private_run.json create mode 100644 tests/fixtures/responses/private_study_runs.json create mode 100644 tests/fixtures/responses/public_run.json create mode 100644 tests/fixtures/responses/public_sample.json create mode 100644 tests/fixtures/responses/public_study.json create mode 100644 tests/fixtures/responses/public_study_runs.json create mode 100644 tests/unit/test_ena.py diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py index d181d82..f5ae0b9 100644 --- a/genomeuploader/ena.py +++ b/genomeuploader/ena.py @@ -43,9 +43,9 @@ class NoDataException(ValueError): ASSEMBLY_DEFAULT_FIELDS = "sample_accession" -SAMPLE_DEFAULT_FIELDS = ",".join(["sample_accession", "secondary_sample_accession", "collection_date", "country", "location"]) +SAMPLE_DEFAULT_FIELDS = ",".join(["sample_accession", "collection_date", "country", "location"]) -STUDY_DEFAULT_FIELDS = "study_description" +STUDY_DEFAULT_FIELDS = "study_accession,study_description" RETRY_COUNT = 3 @@ -71,9 +71,11 @@ def parse_accession(accession): elif "RR" in accession: return "run_accession" elif "SAM" in accession: - return "secondary_sample_accession" - elif "RS" in accession: return "sample_accession" + elif "RS" in accession: + return "secondary_sample_accession" + elif "RZ" in accession: + return "analysis_accession" else: logging.error(f"{accession} is not a valid accession") sys.exit() @@ -192,7 +194,7 @@ def get_data_or_handle_error(self, response, xml=False, full_json=False): return response.json() else: # only return the fist element in the list is the default. In these cases there should only be one entry returned - return json.loads(response.txt)[0] + return json.loads(response.text)[0] except (IndexError, TypeError, ValueError, KeyError): logging.error(f"Failed to fetch {self.accession}, returned error: {response.text}") @@ -226,7 +228,8 @@ def _get_private_run(self): run = self.get_data_or_handle_error(response) run_data = run["report"] reformatted_data = { - "secondary_study_accession": run_data["studyID"], + "run_accession": run_data["id"], + "secondary_study_accession": run_data["studyId"], "sample_accession": run_data["sampleId"], } logging.info(f"{self.accession} private run returned from ENA") @@ -251,7 +254,7 @@ def _get_private_study(self): response = self.retry_or_handle_request_error(self.get_request, url) study = self.get_data_or_handle_error(response, xml=True) study_desc = study.getElementsByTagName("STUDY_DESCRIPTION")[0].firstChild.nodeValue - reformatted_data = {"study_description": study_desc} + reformatted_data = {"study_accession": self.accession, "study_description": study_desc} logging.info(f"{self.accession} private study returned from ENA") return reformatted_data @@ -317,6 +320,7 @@ def _get_private_sample(self): url = f"{self.private_url}/samples/xml/{self.accession}" reponse = self.retry_or_handle_request_error(self.get_request, url) reformatted_data = {} + reformatted_data["sample_accession"] = self.accession parsed_xml = self.get_data_or_handle_error(reponse, xml=True) sample_attributes = parsed_xml.getElementsByTagName("SAMPLE_ATTRIBUTE") for attribute in sample_attributes: @@ -340,55 +344,16 @@ def _get_public_sample(self): logging.info(f"{self.accession} public sample returned from ENA") return sample - def build_query(self): - if self.query_type == "study": - if self.private: - try: - ena_response = self._get_private_study() - except Exception: - logging.info("Private API failed, trying public") - ena_response = self._get_public_study() - elif self.query_type == "run": - if self.private: - try: - ena_response = self._get_private_run() - except Exception: - logging.info("Private API failed, trying public") - - ena_response = self._get_public_run() - elif self.query_type == "run_assembly": - if self.private: - try: - ena_response = self._get_private_run_from_assembly() - except Exception: - logging.info("Private API failed, trying public") - ena_response = self._get_public_run_from_assembly() - elif self.query_type == "study_runs": - if self.private: - try: - ena_response = self._get_private_study_runs() - except Exception: - logging.info("Private API failed, trying public") - ena_response = self._get_public_study_runs() - elif self.query_type == "sample": - if self.private: - try: - ena_response = self._get_private_sample() - except Exception: - logging.info("Private API failed, trying public") - ena_response = self._get_public_sample() - return ena_response - def build_query(self): """If the private flag is given, assume private data and try private APIs. - ENA also has cases where a run may be private but the sample may be public etc. Hence always try + ENA also has cases where a run may be private but the sample may be public etc. Hence always try public if private fails""" api_map = { "study": (self._get_private_study, self._get_public_study), "run": (self._get_private_run, self._get_public_run), "run_assembly": (self._get_private_run_from_assembly, self._get_public_run_from_assembly), "study_runs": (self._get_private_study_runs, self._get_public_study_runs), - "sample": (self._get_private_sample, self._get_public_sample) + "sample": (self._get_private_sample, self._get_public_sample), } private_api, public_api = api_map.get(self.query_type, (None, None)) @@ -405,7 +370,6 @@ def build_query(self): return ena_response - class EnaSubmit: def __init__(self, sample_xml, submission_xml, live=False): self.sample_xml = sample_xml diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py index 264d57d..4436195 100755 --- a/genomeuploader/genome_upload.py +++ b/genomeuploader/genome_upload.py @@ -51,7 +51,7 @@ def parse_args(argv): parser.add_argument("-u", "--upload_study", type=str, required=True, help="Study accession for genomes upload") parser.add_argument("--genome_info", type=str, required=True, help="Genomes metadata file") - genomeType = parser.add_mutually_exclusive_group(required=True, default=False) + genomeType = parser.add_mutually_exclusive_group(required=True) genomeType.add_argument("-m", "--mags", action="store_true", help="Select for MAG upload") genomeType.add_argument("-b", "--bins", action="store_true", help="Select for bin upload") diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..c08e123 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,100 @@ +from pathlib import Path + +import pytest + +@pytest.fixture(scope="module") +def test_file_dir(): + return Path(__file__).resolve().parent / Path("fixtures/responses") + +@pytest.fixture(scope="module") +def public_study_json(test_file_dir): + return test_file_dir / Path("public_study.json") + +@pytest.fixture(scope="module") +def private_study_xml(test_file_dir): + return test_file_dir / Path("private_study.xml") + +@pytest.fixture(scope="module") +def public_run_json(test_file_dir): + return test_file_dir / Path("public_run.json") + +@pytest.fixture(scope="module") +def private_run_json(test_file_dir): + return test_file_dir / Path("private_run.json") + +@pytest.fixture(scope="module") +def public_run_from_assembly_xml(test_file_dir): + return test_file_dir / Path("public_run_from_assembly.xml") + +@pytest.fixture(scope="module") +def private_run_from_assembly_xml(test_file_dir): + return test_file_dir / Path("private_run_from_assembly.xml") + +@pytest.fixture(scope="module") +def public_study_runs_json(test_file_dir): + return test_file_dir / Path("public_study_runs.json") + +@pytest.fixture(scope="module") +def private_study_runs_json(test_file_dir): + return test_file_dir / Path("private_study_runs.json") + +@pytest.fixture(scope="module") +def public_sample_json(test_file_dir): + return test_file_dir / Path("public_sample.json") + +@pytest.fixture(scope="module") +def private_sample_xml(test_file_dir): + return test_file_dir / Path("private_sample.xml") + + +@pytest.fixture +def public_study_data(): + return { + "study_accession": "PRJEB41657", + "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265])." + } + +# private XML has no PRJ study ID +@pytest.fixture +def private_study_data(): + return { + "study_accession": "ERP125469", + "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265])." + } + +@pytest.fixture +def public_run_data(): + return { + "run_accession": "ERR4918394", + "sample_accession": "SAMEA7687881", + "secondary_study_accession": "ERP125469" + } + +# private json only has ERS sample IDs +@pytest.fixture +def private_run_data(): + return { + "run_accession": "ERR4918394", + "sample_accession": "ERS5444411", + "secondary_study_accession": "ERP125469" + } + +@pytest.fixture +def public_sample_data(): + return { + "sample_accession": "SAMEA7687881", + "collection_date": "2019-06-11", + "country": "Norway", + "location": "66.079905 N 12.587848 E" + } + +# private xml can only query with RS accession and lat and long are split +@pytest.fixture +def private_sample_data(): + return { + "sample_accession": "ERS5444411", + "collection_date": "2019-06-11", + "country": "Norway", + 'latitude': '66.079905', + 'longitude': '12.587848' + } diff --git a/tests/fixtures/responses/private_run.json b/tests/fixtures/responses/private_run.json new file mode 100644 index 0000000..09c0a03 --- /dev/null +++ b/tests/fixtures/responses/private_run.json @@ -0,0 +1,17 @@ +[ + { + "report": { + "id": "ERR4918394", + "alias": "webin-reads-SA01_02C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:14:11", + "firstPublic": "2022-08-02T17:24:21", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783303", + "sampleId": "ERS5444411" + }, + "links": [] + } +] diff --git a/tests/fixtures/responses/private_study_runs.json b/tests/fixtures/responses/private_study_runs.json new file mode 100644 index 0000000..b3f7149 --- /dev/null +++ b/tests/fixtures/responses/private_study_runs.json @@ -0,0 +1,152 @@ +[ + { + "report": { + "id": "ERR4918752", + "alias": "webin-reads-SB06_18C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:32:19", + "firstPublic": "2022-08-02T17:24:21", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783661", + "sampleId": "ERS5444665" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918751", + "alias": "webin-reads-SB06_13C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:31:46", + "firstPublic": "2022-08-02T17:24:21", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783660", + "sampleId": "ERS5444660" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918750", + "alias": "webin-reads-SB10_16C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:31:17", + "firstPublic": "2022-08-02T17:25:05", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783659", + "sampleId": "ERS5444722" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918749", + "alias": "webin-reads-SB10_30C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:30:42", + "firstPublic": "2022-08-02T17:24:20", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783658", + "sampleId": "ERS5444736" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918748", + "alias": "webin-reads-SB11_25C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:30:20", + "firstPublic": "2022-08-02T17:25:03", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783657", + "sampleId": "ERS5444756" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918747", + "alias": "webin-reads-SB11_13C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:30:12", + "firstPublic": "2022-08-02T17:25:04", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783656", + "sampleId": "ERS5444744" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918746", + "alias": "webin-reads-SB11_29C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:29:52", + "firstPublic": "2022-08-02T17:24:21", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783655", + "sampleId": "ERS5444760" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918745", + "alias": "webin-reads-SA09_02C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:29:29", + "firstPublic": "2022-08-02T17:24:20", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783654", + "sampleId": "ERS5444529" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918744", + "alias": "webin-reads-SB08_05C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:29:27", + "firstPublic": "2022-08-02T17:24:21", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783653", + "sampleId": "ERS5444707" + }, + "links": [] + }, + { + "report": { + "id": "ERR4918743", + "alias": "webin-reads-SB11_28C1a_metaG", + "instrumentModel": "DNBSEQ-G400", + "firstCreated": "2020-12-04T12:29:25", + "firstPublic": "2022-08-02T17:25:05", + "releaseStatus": "PUBLIC", + "submissionAccountId": "Webin-51990", + "studyId": "ERP125469", + "experimentId": "ERX4783652", + "sampleId": "ERS5444759" + }, + "links": [] + } +] diff --git a/tests/fixtures/responses/public_run.json b/tests/fixtures/responses/public_run.json new file mode 100644 index 0000000..cc790e8 --- /dev/null +++ b/tests/fixtures/responses/public_run.json @@ -0,0 +1,7 @@ +[ + { + "run_accession": "ERR4918394", + "sample_accession": "SAMEA7687881", + "secondary_study_accession": "ERP125469" + } +] diff --git a/tests/fixtures/responses/public_sample.json b/tests/fixtures/responses/public_sample.json new file mode 100644 index 0000000..d59db96 --- /dev/null +++ b/tests/fixtures/responses/public_sample.json @@ -0,0 +1,8 @@ +[ + { + "sample_accession": "SAMEA7687881", + "collection_date": "2019-06-11", + "country": "Norway", + "location": "66.079905 N 12.587848 E" + } +] diff --git a/tests/fixtures/responses/public_study.json b/tests/fixtures/responses/public_study.json new file mode 100644 index 0000000..482c952 --- /dev/null +++ b/tests/fixtures/responses/public_study.json @@ -0,0 +1,6 @@ +[ + { + "study_accession": "PRJEB41657", + "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265])." + } + ] \ No newline at end of file diff --git a/tests/fixtures/responses/public_study_runs.json b/tests/fixtures/responses/public_study_runs.json new file mode 100644 index 0000000..ae8c790 --- /dev/null +++ b/tests/fixtures/responses/public_study_runs.json @@ -0,0 +1,52 @@ +[ + { + "run_accession": "ERR4918394", + "sample_accession": "SAMEA7687881", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918397", + "sample_accession": "SAMEA7687882", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918398", + "sample_accession": "SAMEA7678309", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918404", + "sample_accession": "SAMEA7687910", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918405", + "sample_accession": "SAMEA7687902", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918406", + "sample_accession": "SAMEA7687895", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918408", + "sample_accession": "SAMEA7687899", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918413", + "sample_accession": "SAMEA7687903", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918414", + "sample_accession": "SAMEA7687912", + "instrument_model": "DNBSEQ-G400" + }, + { + "run_accession": "ERR4918416", + "sample_accession": "SAMEA7687892", + "instrument_model": "DNBSEQ-G400" + } +] diff --git a/tests/unit/test_ena.py b/tests/unit/test_ena.py new file mode 100644 index 0000000..8917fca --- /dev/null +++ b/tests/unit/test_ena.py @@ -0,0 +1,158 @@ +import pytest +import responses +from requests.exceptions import ConnectionError, HTTPError +import json + +from genomeuploader.ena import EnaQuery +#from assembly_uploader.webin_utils import ENA_WEBIN, ENA_WEBIN_PASSWORD + + +def read_json(path): + with open(path, 'r') as jpath: + return json.load(jpath) + +def read_xml(path): + with open(path, 'r') as xpath: + return xpath.read() + + +def test_ena_study(public_study_data, private_study_data, public_study_json, private_study_xml): + responses.add( + responses.POST, + "https://www.ebi.ac.uk/ena/portal/api/search", + json= read_json(public_study_json) + ) + + responses.add( + responses.GET, + "https://www.ebi.ac.uk/ena/submit/report/studies/xml/ERP125469", + body= read_xml(private_study_xml), + content_type='application/xml' + ) + + ena_study_public = EnaQuery( + accession="ERP125469", + query_type="study", + private=False + ) + + ena_study_private = EnaQuery( + accession="ERP125469", + query_type="study", + private=True, + ) + assert ena_study_public.build_query() == public_study_data + assert ena_study_private.build_query() == private_study_data + + +def test_ena_run(public_run_data, private_run_data, public_run_json, private_run_json): + responses.add( + responses.POST, + "https://www.ebi.ac.uk/ena/portal/api/search", + json = read_json(public_run_json) + ) + + responses.add( + responses.GET, + "https://www.ebi.ac.uk/ena/submit/report/runs/ERR4918394", + json = read_json(private_run_json) + ) + + ena_run_public = EnaQuery( + accession="ERR4918394", + query_type="run", + private=False + ) + + ena_run_private = EnaQuery( + accession="ERR4918394", + query_type="run", + private=True + ) + assert ena_run_public.build_query() == public_run_data + assert ena_run_private.build_query() == private_run_data + +def test_ena_run_from_assembly(public_run_from_assembly_xml, private_run_from_assembly_xml): + responses.add( + responses.GET, + "https://www.ebi.ac.uk/ena/browser/api/xml/ERZ2626953", + body = read_xml(public_run_from_assembly_xml), + content_type='application/xml' + ) + + responses.add( + responses.GET, + "https://www.ebi.ac.uk/ena/submit/report/analyses/xml/ERZ2626953", + body = read_xml(private_run_from_assembly_xml), + content_type='application/xml' + ) + + ena_run_from_assembly_public = EnaQuery( + accession="ERZ2626953", + query_type="run_assembly", + private=False + ) + + ena_run_from_assembly_public = EnaQuery( + accession="ERZ2626953", + query_type="run_assembly", + private=True + ) + + assert ena_run_from_assembly_public.build_query() and ena_run_from_assembly_public.build_query() == "ERR4918394" + +def test_ena_study_runs(public_study_runs_json, private_study_runs_json): + """response mock limited to 10 runs for the study, need to check if there is page limit""" + responses.add( + responses.POST, + "https://www.ebi.ac.uk/ena/portal/api/search", + json = read_json(public_study_runs_json) + ) + + responses.add( + responses.GET, + "https://www.ebi.ac.uk/ena/submit/report/runs/ERP125469", + json = read_json(private_study_runs_json) + ) + + ena_study_runs_public = EnaQuery( + accession="ERP125469", + query_type="study_runs", + private=False + ) + + ena_study_runs_private = EnaQuery( + accession="ERP125469", + query_type="study_runs", + private=True + ) + + assert len(ena_study_runs_public.build_query()) and len(ena_study_runs_private.build_query()) == 10 + +def test_ena_sample(public_sample_data, private_sample_data, public_sample_json, private_sample_xml): + responses.add( + responses.POST, + "https://www.ebi.ac.uk/ena/portal/api/search", + json = read_json(public_sample_json) + ) + + responses.add( + responses.GET, + "https://www.ebi.ac.uk/ena/submit/report/samples/xml/ERS5444411", + body = read_xml(private_sample_xml) + ) + + ena_sample_public = EnaQuery( + accession="SAMEA7687881", + query_type="sample", + private=False + ) + + ena_sample_private = EnaQuery( + accession="ERS5444411", + query_type="sample", + private=True + ) + + assert ena_sample_public.build_query() == public_sample_data + assert ena_sample_private.build_query() == private_sample_data \ No newline at end of file From d249395c69e899eaa4858102c7fe884a6662fde4 Mon Sep 17 00:00:00 2001 From: Varsha Date: Fri, 27 Dec 2024 16:27:41 +0000 Subject: [PATCH 14/20] add exception test --- tests/conftest.py | 49 ++++++------ tests/unit/test_ena.py | 174 ++++++++++++++++++++--------------------- 2 files changed, 112 insertions(+), 111 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index c08e123..032779d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,46 +2,57 @@ import pytest + @pytest.fixture(scope="module") def test_file_dir(): return Path(__file__).resolve().parent / Path("fixtures/responses") + @pytest.fixture(scope="module") def public_study_json(test_file_dir): return test_file_dir / Path("public_study.json") + @pytest.fixture(scope="module") def private_study_xml(test_file_dir): return test_file_dir / Path("private_study.xml") + @pytest.fixture(scope="module") def public_run_json(test_file_dir): return test_file_dir / Path("public_run.json") + @pytest.fixture(scope="module") def private_run_json(test_file_dir): return test_file_dir / Path("private_run.json") + @pytest.fixture(scope="module") def public_run_from_assembly_xml(test_file_dir): return test_file_dir / Path("public_run_from_assembly.xml") + @pytest.fixture(scope="module") def private_run_from_assembly_xml(test_file_dir): return test_file_dir / Path("private_run_from_assembly.xml") + @pytest.fixture(scope="module") def public_study_runs_json(test_file_dir): return test_file_dir / Path("public_study_runs.json") + @pytest.fixture(scope="module") def private_study_runs_json(test_file_dir): return test_file_dir / Path("private_study_runs.json") + @pytest.fixture(scope="module") def public_sample_json(test_file_dir): return test_file_dir / Path("public_sample.json") + @pytest.fixture(scope="module") def private_sample_xml(test_file_dir): return test_file_dir / Path("private_sample.xml") @@ -50,43 +61,35 @@ def private_sample_xml(test_file_dir): @pytest.fixture def public_study_data(): return { - "study_accession": "PRJEB41657", - "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265])." + "study_accession": "PRJEB41657", + "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265]).", } + # private XML has no PRJ study ID @pytest.fixture def private_study_data(): return { - "study_accession": "ERP125469", - "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265])." + "study_accession": "ERP125469", + "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265]).", } + @pytest.fixture def public_run_data(): - return { - "run_accession": "ERR4918394", - "sample_accession": "SAMEA7687881", - "secondary_study_accession": "ERP125469" - } + return {"run_accession": "ERR4918394", "sample_accession": "SAMEA7687881", "secondary_study_accession": "ERP125469"} + # private json only has ERS sample IDs @pytest.fixture def private_run_data(): - return { - "run_accession": "ERR4918394", - "sample_accession": "ERS5444411", - "secondary_study_accession": "ERP125469" - } + return {"run_accession": "ERR4918394", "sample_accession": "ERS5444411", "secondary_study_accession": "ERP125469"} + @pytest.fixture def public_sample_data(): - return { - "sample_accession": "SAMEA7687881", - "collection_date": "2019-06-11", - "country": "Norway", - "location": "66.079905 N 12.587848 E" - } + return {"sample_accession": "SAMEA7687881", "collection_date": "2019-06-11", "country": "Norway", "location": "66.079905 N 12.587848 E"} + # private xml can only query with RS accession and lat and long are split @pytest.fixture @@ -95,6 +98,6 @@ def private_sample_data(): "sample_accession": "ERS5444411", "collection_date": "2019-06-11", "country": "Norway", - 'latitude': '66.079905', - 'longitude': '12.587848' - } + "latitude": "66.079905", + "longitude": "12.587848", + } diff --git a/tests/unit/test_ena.py b/tests/unit/test_ena.py index 8917fca..902cf13 100644 --- a/tests/unit/test_ena.py +++ b/tests/unit/test_ena.py @@ -1,40 +1,36 @@ +import json + import pytest import responses from requests.exceptions import ConnectionError, HTTPError -import json from genomeuploader.ena import EnaQuery -#from assembly_uploader.webin_utils import ENA_WEBIN, ENA_WEBIN_PASSWORD + +ENA_WEBIN = "ENA_WEBIN" +ENA_WEBIN_PASSWORD = "ENA_WEBIN_PASSWORD" def read_json(path): - with open(path, 'r') as jpath: + with open(path, "r") as jpath: return json.load(jpath) + def read_xml(path): - with open(path, 'r') as xpath: + with open(path, "r") as xpath: return xpath.read() def test_ena_study(public_study_data, private_study_data, public_study_json, private_study_xml): - responses.add( - responses.POST, - "https://www.ebi.ac.uk/ena/portal/api/search", - json= read_json(public_study_json) - ) + responses.add(responses.POST, "https://www.ebi.ac.uk/ena/portal/api/search", json=read_json(public_study_json)) responses.add( responses.GET, "https://www.ebi.ac.uk/ena/submit/report/studies/xml/ERP125469", - body= read_xml(private_study_xml), - content_type='application/xml' + body=read_xml(private_study_xml), + content_type="application/xml", ) - ena_study_public = EnaQuery( - accession="ERP125469", - query_type="study", - private=False - ) + ena_study_public = EnaQuery(accession="ERP125469", query_type="study", private=False) ena_study_private = EnaQuery( accession="ERP125469", @@ -46,113 +42,115 @@ def test_ena_study(public_study_data, private_study_data, public_study_json, pri def test_ena_run(public_run_data, private_run_data, public_run_json, private_run_json): - responses.add( - responses.POST, - "https://www.ebi.ac.uk/ena/portal/api/search", - json = read_json(public_run_json) - ) + responses.add(responses.POST, "https://www.ebi.ac.uk/ena/portal/api/search", json=read_json(public_run_json)) - responses.add( - responses.GET, - "https://www.ebi.ac.uk/ena/submit/report/runs/ERR4918394", - json = read_json(private_run_json) - ) + responses.add(responses.GET, "https://www.ebi.ac.uk/ena/submit/report/runs/ERR4918394", json=read_json(private_run_json)) - ena_run_public = EnaQuery( - accession="ERR4918394", - query_type="run", - private=False - ) + ena_run_public = EnaQuery(accession="ERR4918394", query_type="run", private=False) - ena_run_private = EnaQuery( - accession="ERR4918394", - query_type="run", - private=True - ) + ena_run_private = EnaQuery(accession="ERR4918394", query_type="run", private=True) assert ena_run_public.build_query() == public_run_data assert ena_run_private.build_query() == private_run_data + def test_ena_run_from_assembly(public_run_from_assembly_xml, private_run_from_assembly_xml): responses.add( responses.GET, "https://www.ebi.ac.uk/ena/browser/api/xml/ERZ2626953", - body = read_xml(public_run_from_assembly_xml), - content_type='application/xml' + body=read_xml(public_run_from_assembly_xml), + content_type="application/xml", ) responses.add( responses.GET, "https://www.ebi.ac.uk/ena/submit/report/analyses/xml/ERZ2626953", - body = read_xml(private_run_from_assembly_xml), - content_type='application/xml' + body=read_xml(private_run_from_assembly_xml), + content_type="application/xml", ) - ena_run_from_assembly_public = EnaQuery( - accession="ERZ2626953", - query_type="run_assembly", - private=False - ) + ena_run_from_assembly_public = EnaQuery(accession="ERZ2626953", query_type="run_assembly", private=False) - ena_run_from_assembly_public = EnaQuery( - accession="ERZ2626953", - query_type="run_assembly", - private=True - ) + ena_run_from_assembly_public = EnaQuery(accession="ERZ2626953", query_type="run_assembly", private=True) assert ena_run_from_assembly_public.build_query() and ena_run_from_assembly_public.build_query() == "ERR4918394" + def test_ena_study_runs(public_study_runs_json, private_study_runs_json): """response mock limited to 10 runs for the study, need to check if there is page limit""" - responses.add( - responses.POST, - "https://www.ebi.ac.uk/ena/portal/api/search", - json = read_json(public_study_runs_json) - ) + responses.add(responses.POST, "https://www.ebi.ac.uk/ena/portal/api/search", json=read_json(public_study_runs_json)) - responses.add( - responses.GET, - "https://www.ebi.ac.uk/ena/submit/report/runs/ERP125469", - json = read_json(private_study_runs_json) - ) + responses.add(responses.GET, "https://www.ebi.ac.uk/ena/submit/report/runs/ERP125469", json=read_json(private_study_runs_json)) - ena_study_runs_public = EnaQuery( - accession="ERP125469", - query_type="study_runs", - private=False - ) + ena_study_runs_public = EnaQuery(accession="ERP125469", query_type="study_runs", private=False) - ena_study_runs_private = EnaQuery( - accession="ERP125469", - query_type="study_runs", - private=True - ) + ena_study_runs_private = EnaQuery(accession="ERP125469", query_type="study_runs", private=True) + + assert len(ena_study_runs_public.build_query()) and len(ena_study_runs_private.build_query()) == 10 - assert len(ena_study_runs_public.build_query()) and len(ena_study_runs_private.build_query()) == 10 def test_ena_sample(public_sample_data, private_sample_data, public_sample_json, private_sample_xml): - responses.add( - responses.POST, - "https://www.ebi.ac.uk/ena/portal/api/search", - json = read_json(public_sample_json) + responses.add(responses.POST, "https://www.ebi.ac.uk/ena/portal/api/search", json=read_json(public_sample_json)) + + responses.add(responses.GET, "https://www.ebi.ac.uk/ena/submit/report/samples/xml/ERS5444411", body=read_xml(private_sample_xml)) + + ena_sample_public = EnaQuery(accession="SAMEA7687881", query_type="sample", private=False) + + ena_sample_private = EnaQuery(accession="ERS5444411", query_type="sample", private=True) + + assert ena_sample_public.build_query() == public_sample_data + assert ena_sample_private.build_query() == private_sample_data + + +def test_ena_exceptions(monkeypatch): + monkeypatch.setenv(ENA_WEBIN, "fake-webin-999") + monkeypatch.setenv(ENA_WEBIN_PASSWORD, "fakewebinpw") + + ena_query = EnaQuery( + accession="ERP125469", + query_type="study", + private=True, ) + public_api_data = { + "format": "json", + "includeMetagenomes": True, + "dataPortal": "ena", + "result": "study", + "query": "secondary_study_accession=ERP125469", + "fields": "study_accession,study_description", + } + responses.add( responses.GET, - "https://www.ebi.ac.uk/ena/submit/report/samples/xml/ERS5444411", - body = read_xml(private_sample_xml) + "https://www.ebi.ac.uk/ena/submit/report/studies/ERP125469", + body=ConnectionError("Test retry private error"), ) - ena_sample_public = EnaQuery( - accession="SAMEA7687881", - query_type="sample", - private=False - ) + responses.add(responses.POST, "https://www.ebi.ac.uk/ena/portal/api/search", body=ConnectionError("Test retry public error")) + + with pytest.raises( + ValueError, + match="Could not find ERP125469 in ENA after 3 attempts. Error: Test retry private error", + ): + ena_query.retry_or_handle_request_error( + request=ena_query.get_request, + url="https://www.ebi.ac.uk/ena/submit/report/studies/ERP125469", + ) + assert responses.assert_call_count("https://www.ebi.ac.uk/ena/submit/report/studies/ERP125469", 3) + + with pytest.raises(ValueError, match="Could not find ERP125469 in ENA after 3 attempts. Error: Test retry public error"): + ena_query.retry_or_handle_request_error(request=ena_query.post_request, data=public_api_data) + assert responses.assert_call_count("https://www.ebi.ac.uk/ena/portal/api/search", 3) - ena_sample_private = EnaQuery( - accession="ERS5444411", - query_type="sample", - private=True + responses.add( + responses.GET, + "https://www.ebi.ac.uk/ena/submit/report/studies/ERPXYZ", + body=HTTPError("Test failure error"), ) - assert ena_sample_public.build_query() == public_sample_data - assert ena_sample_private.build_query() == private_sample_data \ No newline at end of file + with pytest.raises(HTTPError): + ena_query.retry_or_handle_request_error( + request=ena_query.get_request, + url="https://www.ebi.ac.uk/ena/submit/report/studies/ERPXYZ", + ) + assert responses.assert_call_count("https://www.ebi.ac.uk/ena/submit/report/studies/ERPXYZ", 1) From 9c7da3d83dee4d2c8c183f4fcd98093881fd1138 Mon Sep 17 00:00:00 2001 From: Varsha Date: Fri, 27 Dec 2024 16:29:26 +0000 Subject: [PATCH 15/20] precommit reformatting and removal of dummy test file --- .env.example | 4 ++-- .github/workflows/pytest_workflow.yml | 2 +- README.md | 16 ++++++++-------- pytest.ini | 1 + requirements-test.yml | 2 +- tests/test_dummy.py | 5 ----- 6 files changed, 13 insertions(+), 17 deletions(-) delete mode 100644 tests/test_dummy.py diff --git a/.env.example b/.env.example index 41fba3f..3e69e4e 100644 --- a/.env.example +++ b/.env.example @@ -1,2 +1,2 @@ -ENA_WEBIN="" -ENA_WEBIN_PASSWORD="" \ No newline at end of file +ENA_WEBIN="ENA_WEBIN" +ENA_WEBIN_PASSWORD="ENA_WEBIN_PASSWORD" diff --git a/.github/workflows/pytest_workflow.yml b/.github/workflows/pytest_workflow.yml index c0b190f..3c9a0c1 100644 --- a/.github/workflows/pytest_workflow.yml +++ b/.github/workflows/pytest_workflow.yml @@ -27,7 +27,7 @@ jobs: emoji: false verbose: true job-summary: true - + pytest_39: name: Run pytest 3.9 needs: diff --git a/README.md b/README.md index 05a3b8c..05354d6 100755 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Public bins and MAGs uploader -Python script to upload bins and MAGs in fasta format to ENA (European Nucleotide Archive). This script generates xmls and manifests necessary for submission with webin-cli. +Python script to upload bins and MAGs in fasta format to ENA (European Nucleotide Archive). This script generates xmls and manifests necessary for submission with webin-cli. It takes as input one tsv (tab-separated values) table in the following format: @@ -26,20 +26,20 @@ With columns indicating: * _local_environment_: `string` (explanation following) * _environmental_medium_: `string` (explanation following) -According to ENA checklist's guidelines, 'broad_environment' describes the broad ecological context of a sample - desert, taiga, coral reef, ... 'local_environment' is more local - lake, harbour, cliff, ... 'environmental_medium' is either the material displaced by the sample, or the one in which the sample was embedded prior to the sampling event - air, soil, water, ... +According to ENA checklist's guidelines, 'broad_environment' describes the broad ecological context of a sample - desert, taiga, coral reef, ... 'local_environment' is more local - lake, harbour, cliff, ... 'environmental_medium' is either the material displaced by the sample, or the one in which the sample was embedded prior to the sampling event - air, soil, water, ... For host-associated metagenomic samples, the three variables can be defined similarly to the following example for the chicken gut metagenome: "chicken digestive system", "digestive tube", "caecum". More information can be found at [ERC000050]() for bins and [ERC000047]() for MAGs under field names "broad-scale environmental context", "local environmental context", "environmental medium" Another example can be found [here](examples/input_example.tsv) ### Warnings -Raw-read runs from which genomes were generated should already be available on the INSDC (ENA by EBI, GenBank by NCBI, or DDBJ), hence at least one DRR|ERR|SRR accession should be available for every genome to be uploaded. Assembly accessions (ERZ|SRZ|DRZ) are also supported. +Raw-read runs from which genomes were generated should already be available on the INSDC (ENA by EBI, GenBank by NCBI, or DDBJ), hence at least one DRR|ERR|SRR accession should be available for every genome to be uploaded. Assembly accessions (ERZ|SRZ|DRZ) are also supported. -If uploading TPA (Third PArty) genomes, you will need to contact [ENA support]() before using the script. They will provide instructions on how to correctly register a TPA project where to submit your genomes. If both TPA and non-TPA genomes need to be uploaded, please divide them in two batches and use the `--tpa` flag only with TPA genomes. +If uploading TPA (Third PArty) genomes, you will need to contact [ENA support]() before using the script. They will provide instructions on how to correctly register a TPA project where to submit your genomes. If both TPA and non-TPA genomes need to be uploaded, please divide them in two batches and use the `--tpa` flag only with TPA genomes. -Files to be uploaded will need to be compressed (e.g. already in .gz format). +Files to be uploaded will need to be compressed (e.g. already in .gz format). -No more than 5000 genomes can be submitted at the same time. +No more than 5000 genomes can be submitted at the same time. ## Register samples and generate pre-upload files The script needs `python`, `pandas`, `requests`, and `ena-webin-cli` to run. We provide a yaml file for the generation of a conda environment: @@ -68,9 +68,9 @@ where * `--centre_name CENTRE_NAME`: name of the centre generating and uploading genomes * `--tpa`: if uploading TPA (Third PArty) generated genomes -It is recommended to validate your genomes in test mode (i.e. without `--live` in the registration step and with `-test` during the upload) before attempting the final upload. Launching the registration in test mode will add a timestamp to the genome name to allow multiple executions of the test process. +It is recommended to validate your genomes in test mode (i.e. without `--live` in the registration step and with `-test` during the upload) before attempting the final upload. Launching the registration in test mode will add a timestamp to the genome name to allow multiple executions of the test process. -Sample xmls won't be regenerated automatically if a previous xml already exists. If any metadata or value in the tsv table changes, `--force` will allow xml regeneration. +Sample xmls won't be regenerated automatically if a previous xml already exists. If any metadata or value in the tsv table changes, `--force` will allow xml regeneration. ### Produced files: The script produces the following files and folders: diff --git a/pytest.ini b/pytest.ini index 8e78522..654f24a 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,4 @@ [pytest] python_files = tests/*.py testpaths = tests +required_plugins = pytest-workflow diff --git a/requirements-test.yml b/requirements-test.yml index 333f06c..914434e 100644 --- a/requirements-test.yml +++ b/requirements-test.yml @@ -1 +1 @@ --r requirements.yml \ No newline at end of file +-r requirements.yml diff --git a/tests/test_dummy.py b/tests/test_dummy.py deleted file mode 100644 index 9095527..0000000 --- a/tests/test_dummy.py +++ /dev/null @@ -1,5 +0,0 @@ - -class TestDummy: - - def test_dummy(self): - assert 1 == 1 \ No newline at end of file From d47e0414d8c9f208d126d11d244c44991a6673bc Mon Sep 17 00:00:00 2001 From: Varsha Date: Fri, 27 Dec 2024 16:39:47 +0000 Subject: [PATCH 16/20] run test on PR to dev --- .github/workflows/pytest_workflow.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pytest_workflow.yml b/.github/workflows/pytest_workflow.yml index 3c9a0c1..4effa1e 100644 --- a/.github/workflows/pytest_workflow.yml +++ b/.github/workflows/pytest_workflow.yml @@ -4,7 +4,9 @@ on: push: branches: [main] pull_request: - branches: [main] + branches: + - main + - dev jobs: pytest_38: From 35751c41490c58326f2bbada63f90e7d9dfedee2 Mon Sep 17 00:00:00 2001 From: Varsha Date: Fri, 27 Dec 2024 16:42:14 +0000 Subject: [PATCH 17/20] add responses to test requirements --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 82527f3..293445a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ test = [ "pytest-md==0.2.0", "pytest-workflow==2.0.1", "pytest-cov==3.0.0", + "pytest-responses==0.5.1" ] [build-system] From 4e8d1081c45830eb55a802df3a18d69905257b24 Mon Sep 17 00:00:00 2001 From: Varsha Date: Fri, 27 Dec 2024 16:42:20 +0000 Subject: [PATCH 18/20] add responses to test requirements --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 293445a..968ffa2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ test = [ "pytest-md==0.2.0", "pytest-workflow==2.0.1", "pytest-cov==3.0.0", - "pytest-responses==0.5.1" + "pytest-responses==0.5.1", ] [build-system] From 677001c198fd492db319d6a6f2a10054b0b30f4c Mon Sep 17 00:00:00 2001 From: Varsha Date: Fri, 27 Dec 2024 16:45:30 +0000 Subject: [PATCH 19/20] add responses for private data --- .../responses/private_run_from_assembly.xml | 40 +++++ tests/fixtures/responses/private_sample.xml | 159 ++++++++++++++++++ tests/fixtures/responses/private_study.xml | 16 ++ .../responses/public_run_from_assembly.xml | 25 +++ tests/fixtures/responses/public_study.json | 2 +- 5 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 tests/fixtures/responses/private_run_from_assembly.xml create mode 100644 tests/fixtures/responses/private_sample.xml create mode 100644 tests/fixtures/responses/private_study.xml create mode 100644 tests/fixtures/responses/public_run_from_assembly.xml diff --git a/tests/fixtures/responses/private_run_from_assembly.xml b/tests/fixtures/responses/private_run_from_assembly.xml new file mode 100644 index 0000000..a387933 --- /dev/null +++ b/tests/fixtures/responses/private_run_from_assembly.xml @@ -0,0 +1,40 @@ + + + + + ERZ2626953 + webin-genome-ERR4918394_62952e961c26f0859b4ebdd83d93e4e7 + + Genome assembly: ERR4918394_62952e961c26f0859b4ebdd83d93e4e7 + + + ERP125469 + PRJEB41657 + + + + + ERS5444411 + + + + + ERR4918394 + + + + + ERR4918394_62952e961c26f0859b4ebdd83d93e4e7 + primary metagenome + false + 17.0 + metaSPAdes v3.14.1 + DNBSEQ-G400 + genomic DNA + + + + + + + diff --git a/tests/fixtures/responses/private_sample.xml b/tests/fixtures/responses/private_sample.xml new file mode 100644 index 0000000..e0a50bd --- /dev/null +++ b/tests/fixtures/responses/private_sample.xml @@ -0,0 +1,159 @@ + + + + + ERS5444411 + SA01_02C1a_metaG + + SA01.02C1a + + 1602388 + fish gut metagenome + + Salmon metagenomic DNA data from Distal gut content; animal SA01.02 from tank SA01 with treatment Tiger + + + adapters + AAGTCGGAGGCCAAGCGGTCTTAGGAAGACAA;GAACGACATGGCTACGATCCGACTT + + + collection date + 2019-06-11 + + + geographic location (country and/or sea) + Norway + + + geographic location (latitude) + 66.079905 + DD + + + geographic location (longitude) + 12.587848 + DD + + + geographic location (region and locality) + Dønna; Nordland county + + + host body site + Distal gut content + + + host common name + Atlantic salmon + + + host diet + Fermented algae meal (added in in oil coating) + + + host diet treatment + Tiger: SA Control + + + host disease status + Wounded:- + + + host scientific name + Salmo salar + + + host subject id + SA01.02 + + + host taxid + 8030 + + + host total mass + 290.4 + g + + + nucleic acid extraction + D-rex protocol + + + pcr primers + N/A + + + project name + HoloFood Salmon - Metagenomic DNA + + + reference host genome for decontamination + GCA_000000000.0 + + + sample storage buffer + Shield + + + sample storage container + 2ml E-matrix + + + sample storage location + UCPH + + + sample storage temperature + -20 + °C + + + sample volume or weight for DNA extraction + 0.2 + mL + + + sequencing method + MGISEQ-2000 + + + trial timepoint + 0 + days + + + host storage container temperature + 12.76 + °C + + + host storage container pH + 7.05 + + + host gutted mass + 246.6 + g + + + host length + 29.5 + cm + + + host diet treatment concentration + 0 + % mass + + + host storage container + LetSea Tank: Tiger + + + ENA-CHECKLIST + ERC000052 + + + + diff --git a/tests/fixtures/responses/private_study.xml b/tests/fixtures/responses/private_study.xml new file mode 100644 index 0000000..32ef2c1 --- /dev/null +++ b/tests/fixtures/responses/private_study.xml @@ -0,0 +1,16 @@ + + + + + ERP125469 + ena-STUDY-UNIVERSITY OF COPENHAGEN-02-12-2020-08:09:52:762-111 + + + HoloFood Salmon Trial A+B Gut Metagenome + Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265]). + Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265]). + HoloFood Salmon Trial A+B Gut Metagenome + + + + \ No newline at end of file diff --git a/tests/fixtures/responses/public_run_from_assembly.xml b/tests/fixtures/responses/public_run_from_assembly.xml new file mode 100644 index 0000000..7d5b620 --- /dev/null +++ b/tests/fixtures/responses/public_run_from_assembly.xml @@ -0,0 +1,25 @@ + + + + ERZ2626953 + webin-genome-ERR4918394_62952e961c26f0859b4ebdd83d93e4e7 + + Genome assembly: ERR4918394_62952e961c26f0859b4ebdd83d93e4e7 + + + ERP125469 + PRJEB41657 + + + + + ERS5444411 + SAMEA7687881 + + + + + ERR4918394 + + + diff --git a/tests/fixtures/responses/public_study.json b/tests/fixtures/responses/public_study.json index 482c952..eb360c6 100644 --- a/tests/fixtures/responses/public_study.json +++ b/tests/fixtures/responses/public_study.json @@ -3,4 +3,4 @@ "study_accession": "PRJEB41657", "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265])." } - ] \ No newline at end of file + ] From b601790e6feaa1a917496881085b8019097cf489 Mon Sep 17 00:00:00 2001 From: Varsha Date: Fri, 27 Dec 2024 17:38:33 +0000 Subject: [PATCH 20/20] max private study runs set to 2000 --- genomeuploader/ena.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py index f5ae0b9..0f3c5b6 100644 --- a/genomeuploader/ena.py +++ b/genomeuploader/ena.py @@ -186,6 +186,7 @@ def get_data_or_handle_error(self, response, xml=False, full_json=False): logging.error(f"{self.accession} private data is not present in the specified Webin account") else: logging.error(f"{self.accession} public data does not exist") + return None else: if xml: return minidom.parseString(data_txt) @@ -194,7 +195,7 @@ def get_data_or_handle_error(self, response, xml=False, full_json=False): return response.json() else: # only return the fist element in the list is the default. In these cases there should only be one entry returned - return json.loads(response.text)[0] + return json.loads(response.text)[0] except (IndexError, TypeError, ValueError, KeyError): logging.error(f"Failed to fetch {self.accession}, returned error: {response.text}") @@ -290,9 +291,10 @@ def _get_public_run_from_assembly(self): logging.info(f"public run from the assembly {self.accession} returned from ENA") return run - + def _get_private_study_runs(self): - url = f"{self.private_url}/runs/{self.accession}" + # Pagination documentation unclear - offest not working. Using hardcoded 2000 default. TO MODIFY + url = f"{self.private_url}/runs/{self.accession}?format=json&max-results=2000" response = self.retry_or_handle_request_error(self.get_request, url) runs = self.get_data_or_handle_error(response, full_json=True) reformatted_data = [] @@ -305,6 +307,7 @@ def _get_private_study_runs(self): if "instrumentModel" in run_data: run_data["instrument_model"] = run_data.pop("instrumentModel") reformatted_data.append(run_data) + logging.info(f"found {len(reformatted_data)} runs for study {self.accession}") logging.info(f"private runs from study {self.accession}, returned from ENA") return reformatted_data