Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/private submission #21

Open
wants to merge 20 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
223 changes: 142 additions & 81 deletions genomeuploader/ena.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ class NoDataException(ValueError):

# Quoted in get_sample's "could not find sample" error message as the attempt count.
RETRY_COUNT = 5

# Base URL of the ENA submission report endpoint; requests to it are sent
# with the Webin id/password so that private records can be read.
PRIVATE_DATA_URL = "https://www.ebi.ac.uk/ena/submit/report/"
# ENA Portal API search endpoint used for public records.
PUBLIC_DATA_URL = "https://www.ebi.ac.uk/ena/portal/api/search"


class ENA():
def get_default_params(self):
Expand All @@ -68,28 +71,36 @@ def get_default_params(self):
'dataPortal': 'ena'
}

def post_request(self, data, webin=None, password=None):
    """Send a search query to the ENA Portal API search endpoint.

    Credentials are optional: callers that query public data pass only
    ``data``. When both ``webin`` and ``password`` are supplied they are
    forwarded as HTTP basic auth, so call sites written against the
    earlier ``post_request(data, webin, password)`` form keep working.

    :param data: form fields of the search (result, fields, query, ...).
    :param webin: optional Webin id.
    :param password: optional Webin password.
    :return: the ``requests.Response``; the status is not checked here.
    """
    default_connection_headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "*/*"
    }
    # Only attach basic auth when a complete credential pair was given.
    auth = (webin, password) if webin and password else None
    response = requests.post(PUBLIC_DATA_URL, data=data, auth=auth,
                             headers=default_connection_headers)

    return response

def get_run(self, run_accession, webin, password, attempt=0, search_params=None):
data = self.get_default_params()
data['result'] = 'read_run'
data['fields'] = RUN_DEFAULT_FIELDS
data['query'] = 'run_accession=\"{}\"'.format(run_accession)

if search_params:
data.update(search_params)

response = self.post_request(data, webin, password)
def get_request(self, url, webin, password):
    """Issue a GET to ``url`` with the Webin id/password as HTTP basic auth.

    :return: the ``requests.Response`` as received; callers check the status.
    """
    return requests.get(url, auth=(webin, password))

def get_run(self, run_accession, webin, password, private=False, attempt=0, search_params=None):
if not private:
data = self.get_default_params()
data['result'] = 'read_run'
data['fields'] = RUN_DEFAULT_FIELDS
data['query'] = 'run_accession=\"{}\"'.format(run_accession)

if search_params:
data.update(search_params)
response = self.post_request(data)
else:
url = f'{PRIVATE_DATA_URL}runs/{run_accession}'
response = self.get_request(url, webin, password)

if not response.ok and attempt > 2:
raise ValueError("Could not retrieve run with accession {}, returned "
"message: {}".format(run_accession, response.text))
Expand All @@ -107,58 +118,77 @@ def get_run(self, run_accession, webin, password, attempt=0, search_params=None)
raise ValueError("Could not find run {} in ENA.".format(run_accession))
except:
raise Exception("Could not query ENA API: {}".format(response.text))

if private:
run_data = run['report']
final_data = {'secondary_study_accession': run_data['studyId'], 'sample_accession': run_data['sampleId']}
return final_data
else:
return run

def get_run_from_assembly(self, assembly_name, webin, password, private=False):
    """Return the run accession referenced by an assembly's XML record.

    :param assembly_name: assembly/analysis accession to look up.
    :param webin: Webin id; only used when ``private`` is true.
    :param password: Webin password; only used when ``private`` is true.
    :param private: when true, read the record from the authenticated
        submission report endpoint instead of the public browser API.
    :return: value of the ``accession`` attribute of the first ``RUN_REF``.
    :raises ValueError: if the request fails or the record has no ``RUN_REF``.
    """
    if private:
        url = "https://www.ebi.ac.uk/ena/submit/report/analyses/xml/" + assembly_name
        response = self.get_request(url, webin, password)
    else:
        response = requests.get("https://www.ebi.ac.uk/ena/browser/api/xml/" + assembly_name)

    # Fail with the server's message instead of an XML parse error on an
    # error page (e.g. wrong credentials or unknown accession).
    if not response.ok:
        raise ValueError("Could not retrieve assembly {}, returned "
                         "message: {}".format(assembly_name, response.text))

    manifestXml = minidom.parseString(response.text)
    run_ref = manifestXml.getElementsByTagName("RUN_REF")
    if not run_ref:
        raise ValueError("No RUN_REF found for assembly {} in ENA.".format(assembly_name))

    return run_ref[0].attributes["accession"].value

def get_study(self, study_accession, webin, password, private=False):
    """Fetch the metadata of one study from ENA.

    :param study_accession: primary or secondary study accession.
    :param webin: Webin id; only used when ``private`` is true.
    :param password: Webin password; only used when ``private`` is true.
    :param private: when true, read the study XML from the authenticated
        submission report endpoint; the result then carries only the
        ``study_description`` key.
    :return: dict describing the study (first search hit for public data).
    :raises ValueError: if the study cannot be retrieved or parsed.
    """
    if private:
        url = f"https://www.ebi.ac.uk/ena/submit/report/studies/xml/{study_accession}"
        response = self.get_request(url, webin, password)
        if not response.ok:
            raise ValueError("Could not retrieve study {}, returned "
                             "message: {}".format(study_accession, response.text))
        manifestXml = minidom.parseString(response.text)
        descriptions = manifestXml.getElementsByTagName("STUDY_DESCRIPTION")
        # An absent or empty element has no text node to read.
        if not descriptions or descriptions[0].firstChild is None:
            raise ValueError('Could not find a description for study {} in ENA.'.format(study_accession))
        return {'study_description': descriptions[0].firstChild.nodeValue}

    data = self.get_default_params()
    data['result'] = "study"
    data['fields'] = STUDY_DEFAULT_FIELDS
    data['query'] = 'study_accession="{}" OR secondary_study_accession="{}"' \
        .format(study_accession, study_accession)
    data['dataPortal'] = "ena"

    response = self.post_request(data)
    if response.status_code == 204:
        # 204 means the search matched nothing.
        print("No info found to fetch study {}".format(study_accession))
    else:
        try:
            return json.loads(response.text)[0]
        except (IndexError, TypeError, ValueError, KeyError):
            print("Failed to fetch study {}, returned error: {}".format(study_accession, response.text))

    raise ValueError('Could not find study {} in ENA.'.format(study_accession))


def get_study_runs(self, study_accession, webin, password, private= False, fields=None, search_params=None):
if not private:
data = self.get_default_params()
data['result'] = 'read_run'
data['fields'] = fields or RUN_DEFAULT_FIELDS
data['query'] = '(study_accession=\"{}\" OR secondary_study_accession=\"{}\")'.format(study_accession, study_accession)

if search_params:
data.update(search_params)

response = self.post_request(data)
else:
url = f'{PRIVATE_DATA_URL}runs/{study_accession}'
response = self.get_request(url, webin, password)

if not response.ok:
raise ValueError("Could not retrieve runs for study %s.", study_acc)
raise ValueError("Could not retrieve runs for study %s.", study_accession)

if response.status_code == 204:
return []
Expand All @@ -168,38 +198,68 @@ def get_study_runs(self, study_acc, webin, password, fields=None, search_params=
except:
raise ValueError("Query against ENA API did not work. Returned "
"message: {}".format(response.text))

if private:
final_data = []
for run in runs:
run_data = run['report']
if 'sampleId' in run_data:
run_data['sample_accession'] = run_data.pop('sampleId')
if 'id' in run_data:
run_data['run_accession'] = run_data.pop('id')
if 'instrumentModel' in run_data:
run_data['instrument_model'] = run_data.pop('instrumentModel')
final_data.append(run_data)
return(final_data)
else:
return runs

return runs

def get_sample(self, sample_accession, webin, password, private=False, fields=None, search_params=None, attempt=0):
    """Fetch the metadata of one sample from ENA.

    Public samples are looked up through the search API; on an empty
    (204) answer the lookup is repeated against the other data portal
    ('ena' <-> 'metagenome') until ``attempt`` reaches 2.

    Private samples are read from the authenticated sample XML report and
    reduced to the keys ``country``, ``latitude``, ``longitude`` and
    ``collection_date`` (each present only if the XML carries a value).

    :param sample_accession: primary or secondary sample accession.
    :param webin: Webin id; only used when ``private`` is true.
    :param password: Webin password; only used when ``private`` is true.
    :param private: read from the authenticated report endpoint.
    :param fields: search fields to request (public lookups only).
    :param search_params: extra search parameters merged into the query.
    :param attempt: number of lookups already made; used by the retry.
    :return: dict with the sample metadata.
    :raises ValueError: if the sample cannot be retrieved.
    """
    if private:
        url = f"https://www.ebi.ac.uk/ena/submit/report/samples/xml/{sample_accession}"
        response = self.get_request(url, webin, password)
        if not response.ok:
            raise ValueError("Could not retrieve sample with accession {}. "
                             "Returned message: {}".format(sample_accession, response.text))

        # Sample attribute tag -> key under which its value is returned.
        wanted_tags = {
            "geographic location (country and/or sea)": 'country',
            "geographic location (latitude)": 'latitude',
            "geographic location (longitude)": 'longitude',
            "collection date": 'collection_date',
        }
        final_data = {}
        manifestXml = minidom.parseString(response.text)
        for attribute in manifestXml.getElementsByTagName('SAMPLE_ATTRIBUTE'):
            tag_nodes = attribute.getElementsByTagName('TAG')
            if not tag_nodes or tag_nodes[0].firstChild is None:
                continue
            key = wanted_tags.get(tag_nodes[0].firstChild.nodeValue)
            if key is None:
                continue
            value_nodes = attribute.getElementsByTagName('VALUE')
            # An attribute with no VALUE, or an empty one, has no text to read.
            if not value_nodes or value_nodes[0].firstChild is None:
                continue
            final_data[key] = value_nodes[0].firstChild.nodeValue
        return final_data

    data = self.get_default_params()
    data['result'] = 'sample'
    data['fields'] = fields or SAMPLE_DEFAULT_FIELDS
    data['query'] = ('(sample_accession=\"{acc}\" OR secondary_sample_accession'
                     '=\"{acc}\") ').format(acc=sample_accession)

    if search_params:
        data.update(search_params)

    response = self.post_request(data)

    if response.status_code == 200:
        sample = response.json()
        # Explicit check rather than assert, which is stripped under -O.
        if len(sample) != 1:
            raise ValueError("Expected exactly one record for sample {}, "
                             "got {}.".format(sample_accession, len(sample)))
        return sample[0]

    if response.status_code != 204:
        raise ValueError("Could not retrieve sample with accession {}. "
                         "Returned message: {}".format(sample_accession, response.text))

    if attempt >= 2:
        # Report the number of lookups actually made (attempt is 0-based).
        raise ValueError("Could not find sample {} in ENA after "
                         "{} attempts.".format(sample_accession, attempt + 1))

    # Nothing found: retry against the other data portal.
    new_params = {'dataPortal': 'metagenome' if data['dataPortal'] == 'ena' else 'ena'}
    return self.get_sample(sample_accession, webin, password, fields=fields,
                           search_params=new_params, attempt=attempt + 1)

def query_taxid(self, taxid):
url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid)
Expand Down Expand Up @@ -294,3 +354,4 @@ def handle_genomes_registration(self, sample_xml, submission_xml, webin, passwor
logger.info('{} genome samples successfully registered.'.format(str(len(aliasDict))))

return aliasDict

48 changes: 27 additions & 21 deletions genomeuploader/genome_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ def extract_genomes_info(inputFile, genomeType, live):

return genomeInfo

def extract_ENA_info(genomeInfo, uploadDir, webin, password):
def extract_ENA_info(genomeInfo, uploadDir, webin, password, private=False):
logger.info('Retrieving project and run info from ENA (this might take a while)...')

# retrieving metadata from runs (and runs from assembly accessions if provided)
Expand All @@ -360,13 +360,13 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password):
if genomeInfo[g]["accessionType"] == "assembly":
derivedRuns = []
for acc in genomeInfo[g]["accessions"]:
derivedRuns.append(ena.get_run_from_assembly(acc))
derivedRuns.append(ena.get_run_from_assembly(acc, private))
genomeInfo[g]["accessions"] = derivedRuns
allRuns.extend(genomeInfo[g]["accessions"])

runsSet, studySet, samplesDict, tempDict = set(allRuns), set(), {}, {}
for r in runsSet:
run_info = ena.get_run(r, webin, password)
run_info = ena.get_run(r, webin, password, private)
studySet.add(run_info["secondary_study_accession"])
samplesDict[r] = run_info["sample_accession"]

Expand All @@ -386,10 +386,10 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password):
except json.decoder.JSONDecodeError:
backupDict = {}
for s in studySet:
studyInfo = ena.get_study(webin, password, s)
studyInfo = ena.get_study(s, webin, password, private)
projectDescription = studyInfo["study_description"]

ENA_info = ena.get_study_runs(s, webin, password)
ENA_info = ena.get_study_runs(s, webin, password, private)
if ENA_info == []:
raise IOError("No runs found on ENA for project {}.".format(s))

Expand All @@ -398,21 +398,25 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password):
if runAccession not in backupDict:
if runAccession in runsSet:
sampleAccession = ENA_info[run]["sample_accession"]
sampleInfo = ena.get_sample(sampleAccession, webin, password)

location = sampleInfo["location"]
latitude, longitude = None, None
if 'N' in location:
latitude = location.split('N')[0].strip()
longitude = location.split('N')[1].strip()
elif 'S' in location:
latitude = '-' + location.split('S')[0].strip()
longitude = location.split('S')[1].strip()

if 'W' in longitude:
longitude = '-' + longitude.split('W')[0].strip()
elif longitude.endswith('E'):
longitude = longitude.split('E')[0].strip()
sampleInfo = ena.get_sample(sampleAccession, webin, password, private)

if 'latitude' in sampleInfo and 'longitude' in sampleInfo:
latitude = sampleInfo['latitude']
longitude = sampleInfo['longitude']
else:
location = sampleInfo["location"]
latitude, longitude = None, None
if 'N' in location:
latitude = location.split('N')[0].strip()
longitude = location.split('N')[1].strip()
elif 'S' in location:
latitude = '-' + location.split('S')[0].strip()
longitude = location.split('S')[1].strip()

if 'W' in longitude:
longitude = '-' + longitude.split('W')[0].strip()
elif longitude.endswith('E'):
longitude = longitude.split('E')[0].strip()

if latitude:
latitude = "{:.{}f}".format(round(float(latitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS)
Expand Down Expand Up @@ -845,6 +849,7 @@ def __init__(self, argv=sys.argv[1:]):
self.genomeMetadata = self.args.genome_info
self.genomeType = "bins" if self.args.bins else "MAGs"
self.live = True if self.args.live else False
self.private = self.args.private

if self.args.webin and self.args.password:
self.username = self.args.webin
Expand Down Expand Up @@ -900,6 +905,7 @@ def parse_args(self, argv):
parser.add_argument('--webin', required=False, help="Webin id")
parser.add_argument('--password', required=False, help="Webin password")
parser.add_argument('--centre_name', required=False, help="Name of the centre uploading genomes")
parser.add_argument('--private', required=False, help="if data is private", action='store_true', default=False)

args = parser.parse_args(argv)

Expand All @@ -925,7 +931,7 @@ def create_genome_dictionary(self, samples_xml):
genomeInfo = extract_genomes_info(self.genomeMetadata, self.genomeType, self.live)

if not os.path.exists(samples_xml) or self.force:
extract_ENA_info(genomeInfo, self.upload_dir, self.username, self.password)
extract_ENA_info(genomeInfo, self.upload_dir, self.username, self.password, self.private)
logger.info("Writing genome registration XML...")

write_genomes_xml(genomeInfo, samples_xml, self.genomeType,
Expand Down