From e5c1e4626845e63a9112ea253c2a5630b7a8c1d4 Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Sun, 13 Oct 2024 16:58:43 +0100
Subject: [PATCH 01/20] change endpoints for private data

---
 genomeuploader/ena.py           | 224 ++++++++++++++++++++------------
 genomeuploader/genome_upload.py |  48 ++++---
 2 files changed, 169 insertions(+), 103 deletions(-)

diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py
index 51ab45c..5230aee 100644
--- a/genomeuploader/ena.py
+++ b/genomeuploader/ena.py
@@ -59,6 +59,9 @@ class NoDataException(ValueError):
 
 RETRY_COUNT = 5
 
+PRIVATE_DATA_URL = "https://www.ebi.ac.uk/ena/submit/report/"
+PUBLIC_DATA_URL = "https://www.ebi.ac.uk/ena/portal/api/search"
+
 
 class ENA():
     def get_default_params(self):
@@ -68,28 +71,36 @@ def get_default_params(self):
             'dataPortal': 'ena'
         }
 
-    def post_request(self, data, webin, password):
-        url = "https://www.ebi.ac.uk/ena/portal/api/search"
-        auth = (webin, password)
+    def post_request(self, data):
+        url = PUBLIC_DATA_URL
         default_connection_headers = {
             "Content-Type": "application/x-www-form-urlencoded",
             "Accept": "*/*"
         }
-        response = requests.post(url, data=data, auth=auth, headers=default_connection_headers)
+        response = requests.post(url, data=data, headers=default_connection_headers)
         
         return response
 
-    def get_run(self, run_accession, webin, password, attempt=0, search_params=None):
-        data = self.get_default_params()
-        data['result'] = 'read_run'
-        data['fields'] = RUN_DEFAULT_FIELDS
-        data['query'] = 'run_accession=\"{}\"'.format(run_accession)
-
-        if search_params:
-            data.update(search_params)
-
-        response = self.post_request(data, webin, password)
+    def get_request(self, url, webin, password):
+        auth = (webin, password)
+        response = requests.get(url, auth=auth)
         
+        return response        
+
+    def get_run(self, run_accession, webin, password,  private=False, attempt=0, search_params=None):
+        if not private:
+            data = self.get_default_params()
+            data['result'] = 'read_run'
+            data['fields'] = RUN_DEFAULT_FIELDS
+            data['query'] = 'run_accession=\"{}\"'.format(run_accession)
+
+            if search_params:
+                data.update(search_params)
+            response = self.post_request(data)
+        else:
+            url = f'{PRIVATE_DATA_URL}runs/{run_accession}'
+            response = self.get_request(url, webin, password)
+            
         if not response.ok and attempt > 2:
             raise ValueError("Could not retrieve run with accession {}, returned "
                 "message: {}".format(run_accession, response.text))
@@ -107,99 +118,147 @@ def get_run(self, run_accession, webin, password, attempt=0, search_params=None)
             raise ValueError("Could not find run {} in ENA.".format(run_accession))
         except:
             raise Exception("Could not query ENA API: {}".format(response.text))
-
-        return run
-
-    def get_run_from_assembly(self, assembly_name):
-        manifestXml = minidom.parseString(requests.get("https://www.ebi.ac.uk" +
-            "/ena/browser/api/xml/" + assembly_name).text)
+        
+        if private:
+            run_data = run['report']
+            final_data = {'secondary_study_accession': run_data['studyId'], 'sample_accession': run_data['sampleId']}
+            return final_data
+        else:
+            return run
+        
+    #   will not work for private, needs modification after checking best endpoint with ENA for run ref
+    def get_run_from_assembly(self, assembly_name, webin, password, private=False):
+        if not private:
+            manifestXml = minidom.parseString(requests.get("https://www.ebi.ac.uk" +
+                "/ena/browser/api/xml/" + assembly_name).text)
+        else:
+            url = f"https://www.ebi.ac.uk/ena/submit/report/analyses/xml/{assembly_name}"
+            manifestXml = minidom.parseString(requests.get(url, auth=(webin, password)).text)
 
         run_ref = manifestXml.getElementsByTagName("RUN_REF")
         run = run_ref[0].attributes["accession"].value
         
         return run
 
-    def get_study(self, webin, password, study_accession):
-        data = self.get_default_params()
-        data['result'] = "study"
-        data['fields'] = STUDY_DEFAULT_FIELDS
-        data['query'] = 'study_accession="{}" OR secondary_study_accession="{}"' \
-            .format(study_accession, study_accession)
-
-        data['dataPortal'] = "ena"
+    def get_study(self, study_accession, webin, password, private=False):
+        if not private:
+            data = self.get_default_params()
+            data['result'] = "study"
+            data['fields'] = STUDY_DEFAULT_FIELDS
+            data['query'] = 'study_accession="{}" OR secondary_study_accession="{}"' \
+                .format(study_accession, study_accession)
 
-        try:
-            response = self.post_request(data, webin, password)
-            if response.status_code == 204:
-                raise NoDataException()
+            data['dataPortal'] = "ena"
             try:
-                studyList = response.json()
-                assert len(studyList) == 1
-                study = studyList[0]
-            except (IndexError, TypeError, ValueError, KeyError) as e:
-                raise e
-            return study
-        except NoDataException:
-            print("No info found to fetch study {}".format(study_accession))
-        except (IndexError, TypeError, ValueError, KeyError):
-            print("Failed to fetch study {}, returned error: {}".format(study_accession, response.text))
-
-        raise ValueError('Could not find study {} in ENA.'.format(study_accession))
-
-    def get_study_runs(self, study_acc, webin, password, fields=None, search_params=None):
-        data = self.get_default_params()
-        data['result'] = 'read_run'
-        data['fields'] = fields or RUN_DEFAULT_FIELDS
-        data['query'] = '(study_accession=\"{}\" OR secondary_study_accession=\"{}\")'.format(study_acc, study_acc)
-
-        if search_params:
-            data.update(search_params)
-
-        response = self.post_request(data, webin, password)
+                response = self.post_request(data)
+                if response.status_code == 204:
+                    raise NoDataException()
+                try:
+                    study = json.loads(response.text)[0]
+                except (IndexError, TypeError, ValueError, KeyError) as e:
+                    raise e
+                return study
+        
+            except NoDataException:
+                print("No info found to fetch study {}".format(study_accession))
+            except (IndexError, TypeError, ValueError, KeyError):
+                print("Failed to fetch study {}, returned error: {}".format(study_accession, response.text))
+
+            raise ValueError('Could not find study {} in ENA.'.format(study_accession))
+        else:
+            url = f"https://www.ebi.ac.uk/ena/submit/report/studies/xml/{study_accession}"
+            manifestXml = minidom.parseString(requests.get(url, auth=(webin, password)).text)
+            study_desc= manifestXml.getElementsByTagName("STUDY_DESCRIPTION")[0].firstChild.nodeValue
+            final_data = {'study_description': study_desc}
+            return final_data
+
+
+    def get_study_runs(self, study_accession, webin, password, private= False, fields=None, search_params=None):
+        if not private:
+            data = self.get_default_params()
+            data['result'] = 'read_run'
+            data['fields'] = fields or RUN_DEFAULT_FIELDS
+            data['query'] = '(study_accession=\"{}\" OR secondary_study_accession=\"{}\")'.format(study_accession, study_accession)
+
+            if search_params:
+                data.update(search_params)
+
+            response = self.post_request(data)
+        else:
+            url = f'{PRIVATE_DATA_URL}runs/{study_accession}'
+            response = self.get_request(url, webin, password)
         
         if not response.ok:
-            raise ValueError("Could not retrieve runs for study %s.", study_acc)
+            raise ValueError("Could not retrieve runs for study %s.", study_accession)
         
         if response.status_code == 204:
             return []
 
         try:
-            runs = response.json()
+            runs = response.json()[0:2]
         except:
             raise ValueError("Query against ENA API did not work. Returned "
                 "message: {}".format(response.text))
+        
+        if private:
+            final_data = []
+            for run in runs:
+                run_data = run['report']
+                if 'sampleId' in run_data:
+                    run_data['sample_accession'] = run_data.pop('sampleId')
+                if 'id' in run_data:
+                    run_data['run_accession'] = run_data.pop('id')
+                final_data.append(run_data)
+            return(final_data)
+        else:
+            return runs
 
-        return runs
-
-    def get_sample(self, sample_accession, webin, password, fields=None, search_params=None, attempt=0):
-        data = self.get_default_params()
-        data['result'] = 'sample'
-        data['fields'] = fields or SAMPLE_DEFAULT_FIELDS
-        data['query'] = ('(sample_accession=\"{acc}\" OR secondary_sample_accession'
-            '=\"{acc}\") ').format(acc=sample_accession)
+    def get_sample(self, sample_accession, webin, password, private=False, fields=None, search_params=None, attempt=0):
+        if not private:
+            data = self.get_default_params()
+            data['result'] = 'sample'
+            data['fields'] = fields or SAMPLE_DEFAULT_FIELDS
+            data['query'] = ('(sample_accession=\"{acc}\" OR secondary_sample_accession'
+                '=\"{acc}\") ').format(acc=sample_accession)
 
-        if search_params:
-            data.update(search_params)
+            if search_params:
+                data.update(search_params)
 
-        response = self.post_request(data, webin, password)
+            response = self.post_request(data)
         
-        if response.status_code == 200:
-            sample = response.json()
-            assert len(sample) == 1 
-            return sample[0]
+            if response.status_code == 200:
+                sample = response.json()
+                assert len(sample) == 1 
+                return sample[0]
 
-        if response.status_code == 204:
-            if attempt < 2:
-                new_params = {'dataPortal': 'metagenome' if data['dataPortal'] == 'ena' else 'ena'}
-                attempt += 1
-                return self.get_sample(sample_accession, webin, password, fields=fields, 
-                    search_params=new_params, attempt=attempt)
+            if response.status_code == 204:
+                if attempt < 2:
+                    new_params = {'dataPortal': 'metagenome' if data['dataPortal'] == 'ena' else 'ena'}
+                    attempt += 1
+                    return self.get_sample(sample_accession, webin, password, fields=fields, 
+                        search_params=new_params, attempt=attempt)
+                else:
+                    raise ValueError("Could not find sample {} in ENA after "
+                        "{} attempts.".format(sample_accession, RETRY_COUNT))
             else:
-                raise ValueError("Could not find sample {} in ENA after "
-                    "{} attempts.".format(sample_accession, RETRY_COUNT))
+                raise ValueError("Could not retrieve sample with accession {}. "
+                    "Returned message: {}".format(sample_accession, response.text))
         else:
-            raise ValueError("Could not retrieve sample with accession {}. "
-                "Returned message: {}".format(sample_accession, response.text))
+            url = f"https://www.ebi.ac.uk/ena/submit/report/samples/xml/{sample_accession}"
+            final_data = {}
+            manifestXml = minidom.parseString(requests.get(url, auth=(webin, password)).text)
+            sample_attributes = manifestXml.getElementsByTagName('SAMPLE_ATTRIBUTE')
+            for attribute in sample_attributes:
+                tag = attribute.getElementsByTagName('TAG')[0].firstChild.nodeValue
+                if tag == "geographic location (country and/or sea)":
+                    final_data['country'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
+                if tag == "geographic location (latitude)":
+                    final_data['latitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
+                if tag == "geographic location (longitude)":
+                    final_data['longitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
+                if tag == "collection date":
+                    final_data['collection_date'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
+            return final_data
 
     def query_taxid(self, taxid):
         url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid)
@@ -294,3 +353,4 @@ def handle_genomes_registration(self, sample_xml, submission_xml, webin, passwor
         logger.info('{} genome samples successfully registered.'.format(str(len(aliasDict))))
 
         return aliasDict
+    
\ No newline at end of file
diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py
index 99cf3fb..6ec1f65 100755
--- a/genomeuploader/genome_upload.py
+++ b/genomeuploader/genome_upload.py
@@ -351,7 +351,7 @@ def extract_genomes_info(inputFile, genomeType, live):
 
     return genomeInfo
 
-def extract_ENA_info(genomeInfo, uploadDir, webin, password):
+def extract_ENA_info(genomeInfo, uploadDir, webin, password, private=False):
     logger.info('Retrieving project and run info from ENA (this might take a while)...')
     
     # retrieving metadata from runs (and runs from assembly accessions if provided)
@@ -360,13 +360,13 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password):
         if genomeInfo[g]["accessionType"] == "assembly":
             derivedRuns = []
             for acc in genomeInfo[g]["accessions"]:
-                derivedRuns.append(ena.get_run_from_assembly(acc))
+                derivedRuns.append(ena.get_run_from_assembly(acc, private))
             genomeInfo[g]["accessions"] = derivedRuns
         allRuns.extend(genomeInfo[g]["accessions"])
 
     runsSet, studySet, samplesDict, tempDict = set(allRuns), set(), {}, {}
     for r in runsSet:
-        run_info = ena.get_run(r, webin, password)
+        run_info = ena.get_run(r, webin, password, private)
         studySet.add(run_info["secondary_study_accession"])
         samplesDict[r] = run_info["sample_accession"]
     
@@ -386,10 +386,10 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password):
         except json.decoder.JSONDecodeError:
             backupDict = {}
         for s in studySet:
-            studyInfo = ena.get_study(webin, password, s)
+            studyInfo = ena.get_study(s, webin, password, private=False)
             projectDescription = studyInfo["study_description"]
 
-            ENA_info = ena.get_study_runs(s, webin, password)
+            ENA_info = ena.get_study_runs(s, webin, password, private=False)
             if ENA_info == []:
                 raise IOError("No runs found on ENA for project {}.".format(s))
             
@@ -398,21 +398,25 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password):
                 if runAccession not in backupDict:
                     if runAccession in runsSet:
                         sampleAccession = ENA_info[run]["sample_accession"]
-                        sampleInfo = ena.get_sample(sampleAccession, webin, password)
-
-                        location = sampleInfo["location"]
-                        latitude, longitude = None, None
-                        if 'N' in location:
-                            latitude = location.split('N')[0].strip()
-                            longitude = location.split('N')[1].strip()
-                        elif 'S' in location:
-                            latitude = '-' + location.split('S')[0].strip()
-                            longitude = location.split('S')[1].strip()
-
-                        if 'W' in longitude:
-                            longitude = '-' + longitude.split('W')[0].strip()
-                        elif longitude.endswith('E'):
-                            longitude = longitude.split('E')[0].strip()
+                        sampleInfo = ena.get_sample(sampleAccession, webin, password, private=False)
+
+                        if sampleInfo['latitude'] and sampleInfo['longitude']:
+                            latitude =  sampleInfo['latitude']
+                            longitude = ['longitude']
+                        else:
+                            location = sampleInfo["location"]
+                            latitude, longitude = None, None
+                            if 'N' in location:
+                                latitude = location.split('N')[0].strip()
+                                longitude = location.split('N')[1].strip()
+                            elif 'S' in location:
+                                latitude = '-' + location.split('S')[0].strip()
+                                longitude = location.split('S')[1].strip()
+
+                            if 'W' in longitude:
+                                longitude = '-' + longitude.split('W')[0].strip()
+                            elif longitude.endswith('E'):
+                                longitude = longitude.split('E')[0].strip()
 
                         if latitude:
                             latitude = "{:.{}f}".format(round(float(latitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS)
@@ -845,6 +849,7 @@ def __init__(self, argv=sys.argv[1:]):
         self.genomeMetadata = self.args.genome_info
         self.genomeType = "bins" if self.args.bins else "MAGs"
         self.live = True if self.args.live else False
+        self.private = self.args.private
         
         if self.args.webin and self.args.password:
             self.username = self.args.webin
@@ -900,6 +905,7 @@ def parse_args(self, argv):
         parser.add_argument('--webin', required=False, help="Webin id")
         parser.add_argument('--password', required=False, help="Webin password")
         parser.add_argument('--centre_name', required=False, help="Name of the centre uploading genomes")
+        parser.add_argument('--private', required=False, help="if data is private", action='store_true', default=False)
 
         args = parser.parse_args(argv)
 
@@ -925,7 +931,7 @@ def create_genome_dictionary(self, samples_xml):
         genomeInfo = extract_genomes_info(self.genomeMetadata, self.genomeType, self.live)
 
         if not os.path.exists(samples_xml) or self.force:
-            extract_ENA_info(genomeInfo, self.upload_dir, self.username, self.password)
+            extract_ENA_info(genomeInfo, self.upload_dir, self.username, self.password, self.private)
             logger.info("Writing genome registration XML...")
 
             write_genomes_xml(genomeInfo, samples_xml, self.genomeType, 

From 2e4b8ac89f6d8ba08838013c76ccbd57f18f97bf Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Sun, 13 Oct 2024 17:00:09 +0100
Subject: [PATCH 02/20] remove comment

---
 genomeuploader/ena.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py
index 5230aee..9b60095 100644
--- a/genomeuploader/ena.py
+++ b/genomeuploader/ena.py
@@ -126,7 +126,6 @@ def get_run(self, run_accession, webin, password,  private=False, attempt=0, sea
         else:
             return run
         
-    #   will not work for private, needs modification after checking best endpoint with ENA for run ref
     def get_run_from_assembly(self, assembly_name, webin, password, private=False):
         if not private:
             manifestXml = minidom.parseString(requests.get("https://www.ebi.ac.uk" +

From c816a0c1aecf3f0c5a8de2a251227639a280d19e Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Sun, 13 Oct 2024 17:01:20 +0100
Subject: [PATCH 03/20] remove subset for test

---
 genomeuploader/ena.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py
index 9b60095..ef0082b 100644
--- a/genomeuploader/ena.py
+++ b/genomeuploader/ena.py
@@ -194,7 +194,7 @@ def get_study_runs(self, study_accession, webin, password, private= False, field
             return []
 
         try:
-            runs = response.json()[0:2]
+            runs = response.json()
         except:
             raise ValueError("Query against ENA API did not work. Returned "
                 "message: {}".format(response.text))

From 7c6e7c952ecc3c6c37ee133da857c18beedaf9cf Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Sun, 13 Oct 2024 17:06:34 +0100
Subject: [PATCH 04/20] code corrections

---
 genomeuploader/ena.py           | 2 +-
 genomeuploader/genome_upload.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py
index ef0082b..534b2ea 100644
--- a/genomeuploader/ena.py
+++ b/genomeuploader/ena.py
@@ -167,7 +167,7 @@ def get_study(self, study_accession, webin, password, private=False):
         else:
             url = f"https://www.ebi.ac.uk/ena/submit/report/studies/xml/{study_accession}"
             manifestXml = minidom.parseString(requests.get(url, auth=(webin, password)).text)
-            study_desc= manifestXml.getElementsByTagName("STUDY_DESCRIPTION")[0].firstChild.nodeValue
+            study_desc = manifestXml.getElementsByTagName("STUDY_DESCRIPTION")[0].firstChild.nodeValue
             final_data = {'study_description': study_desc}
             return final_data
 
diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py
index 6ec1f65..5b9a3c8 100755
--- a/genomeuploader/genome_upload.py
+++ b/genomeuploader/genome_upload.py
@@ -386,10 +386,10 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password, private=False):
         except json.decoder.JSONDecodeError:
             backupDict = {}
         for s in studySet:
-            studyInfo = ena.get_study(s, webin, password, private=False)
+            studyInfo = ena.get_study(s, webin, password, private)
             projectDescription = studyInfo["study_description"]
 
-            ENA_info = ena.get_study_runs(s, webin, password, private=False)
+            ENA_info = ena.get_study_runs(s, webin, password, private)
             if ENA_info == []:
                 raise IOError("No runs found on ENA for project {}.".format(s))
             
@@ -398,11 +398,11 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password, private=False):
                 if runAccession not in backupDict:
                     if runAccession in runsSet:
                         sampleAccession = ENA_info[run]["sample_accession"]
-                        sampleInfo = ena.get_sample(sampleAccession, webin, password, private=False)
+                        sampleInfo = ena.get_sample(sampleAccession, webin, password, private)
 
                         if sampleInfo['latitude'] and sampleInfo['longitude']:
                             latitude =  sampleInfo['latitude']
-                            longitude = ['longitude']
+                            longitude = sampleInfo['longitude']
                         else:
                             location = sampleInfo["location"]
                             latitude, longitude = None, None

From 2cbae287955ff7f93c02cfff0442b0b9c267e440 Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Tue, 15 Oct 2024 16:21:10 +0100
Subject: [PATCH 05/20] add instrument model to run data

---
 genomeuploader/ena.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py
index 534b2ea..c62a168 100644
--- a/genomeuploader/ena.py
+++ b/genomeuploader/ena.py
@@ -128,12 +128,12 @@ def get_run(self, run_accession, webin, password,  private=False, attempt=0, sea
         
     def get_run_from_assembly(self, assembly_name, webin, password, private=False):
         if not private:
-            manifestXml = minidom.parseString(requests.get("https://www.ebi.ac.uk" +
-                "/ena/browser/api/xml/" + assembly_name).text)
+            response = requests.get("https://www.ebi.ac.uk/ena/browser/api/xml/" + assembly_name)
+            
         else:
-            url = f"https://www.ebi.ac.uk/ena/submit/report/analyses/xml/{assembly_name}"
-            manifestXml = minidom.parseString(requests.get(url, auth=(webin, password)).text)
+            response = requests.get("https://www.ebi.ac.uk/ena/submit/report/analyses/xml/" + assembly_name, auth=(webin, password))
 
+        manifestXml = minidom.parseString(response.text)
         run_ref = manifestXml.getElementsByTagName("RUN_REF")
         run = run_ref[0].attributes["accession"].value
         
@@ -207,6 +207,8 @@ def get_study_runs(self, study_accession, webin, password, private= False, field
                     run_data['sample_accession'] = run_data.pop('sampleId')
                 if 'id' in run_data:
                     run_data['run_accession'] = run_data.pop('id')
+                if 'instrumentModel' in run_data:
+                    run_data['instrument_model'] = run_data.pop('instrumentModel')
                 final_data.append(run_data)
             return(final_data)
         else:

From 3b09a7e8f1add94fde8a1472724e5694b15307bd Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Wed, 16 Oct 2024 09:44:29 +0100
Subject: [PATCH 06/20] fix code bug

---
 genomeuploader/genome_upload.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py
index 5b9a3c8..e62c89f 100755
--- a/genomeuploader/genome_upload.py
+++ b/genomeuploader/genome_upload.py
@@ -400,7 +400,7 @@ def extract_ENA_info(genomeInfo, uploadDir, webin, password, private=False):
                         sampleAccession = ENA_info[run]["sample_accession"]
                         sampleInfo = ena.get_sample(sampleAccession, webin, password, private)
 
-                        if sampleInfo['latitude'] and sampleInfo['longitude']:
+                        if 'latitude' in sampleInfo and 'longitude' in sampleInfo:
                             latitude =  sampleInfo['latitude']
                             longitude = sampleInfo['longitude']
                         else:

From 5ee563597dab1efa01b16aa7e6bb27b7fc6436e1 Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Tue, 17 Dec 2024 15:18:54 +0000
Subject: [PATCH 07/20] separate public and private API calls and streamline
 exceptions

---
 genomeuploader/ena.py | 444 +++++++++++++++++++++++-------------------
 1 file changed, 247 insertions(+), 197 deletions(-)

diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py
index c62a168..110f979 100644
--- a/genomeuploader/ena.py
+++ b/genomeuploader/ena.py
@@ -19,9 +19,12 @@
 import requests
 import json
 import logging
+import os 
 from time import sleep
+import sys
 
 import xml.dom.minidom as minidom
+from requests.exceptions import ConnectionError, HTTPError, RequestException, Timeout
 
 logging.basicConfig(level=logging.INFO)
 
@@ -33,15 +36,19 @@ class NoDataException(ValueError):
 
 
 RUN_DEFAULT_FIELDS = ','.join([
-    'study_accession',
     'secondary_study_accession',
-    'instrument_model',
-    'run_accession',
     'sample_accession'
 ])
 
+STUDY_RUN_DEFAULT_FIELDS = ','.join([
+    'sample_accession',
+    'run_accession',
+    'instrument_model'
+])
+
 ASSEMBLY_DEFAULT_FIELDS = 'sample_accession'
 
+
 SAMPLE_DEFAULT_FIELDS = ','.join([
     'sample_accession',
     'secondary_sample_accession',
@@ -50,216 +57,259 @@ class NoDataException(ValueError):
     'location'
 ])
 
-STUDY_DEFAULT_FIELDS = ','.join([
-    'study_accession',
-    'secondary_study_accession',
-    'study_description',
-    'study_title'
-])
+STUDY_DEFAULT_FIELDS = 'study_description'
 
-RETRY_COUNT = 5
+RETRY_COUNT = 3
 
-PRIVATE_DATA_URL = "https://www.ebi.ac.uk/ena/submit/report/"
-PUBLIC_DATA_URL = "https://www.ebi.ac.uk/ena/portal/api/search"
+def get_default_connection_headers():
+    return {
+        "headers": {
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Accept": "*/*",
+        }
+    }
+
+def get_default_params():
+    return {
+        'format': 'json',
+        'includeMetagenomes': True,
+        'dataPortal': 'ena'
+    }
+
+def parse_accession(accession):
+    if accession.startswith("PRJ"):
+        return "study_accession"
+    elif "RP" in accession:
+        return "secondary_study_accession"
+    elif "RR" in accession:
+        return "run_accession"
+    elif "SAM" in accession:
+        return "secondary_sample_accession"
+    elif "RS" in accession:
+        return "sample_accession"
+    else:
+        logging.error(f"{accession} is not a valid accession")
+        sys.exit()
 
 
 class ENA():
-    def get_default_params(self):
-        return {
-            'format': 'json',
-            'includeMetagenomes': True,
-            'dataPortal': 'ena'
-        }
+    def __init__(self, accession, private=False):
+        self.private_url = "https://www.ebi.ac.uk/ena/submit/report"
+        self.public_url = "https://www.ebi.ac.uk/ena/portal/api/search"
+        self.browser_url = "https://www.ebi.ac.uk/ena/browser/api/xml"
+        self.accession = accession
+        self.acc_type = parse_accession(accession)
+        username = os.getenv("ENA_WEBIN")
+        password = os.getenv("ENA_WEBIN_PASSWORD")
+        if username is None or password is None:
+            logging.error("ENA_WEBIN and ENA_WEBIN_PASSWORD are not set")
+        if username and password:
+            self.auth = (username, password)
+        else:
+            self.auth = None
+        self.private = private
+
 
     def post_request(self, data):
-        url = PUBLIC_DATA_URL
-        default_connection_headers = {
-            "Content-Type": "application/x-www-form-urlencoded",
-            "Accept": "*/*"
-        }
-        response = requests.post(url, data=data, headers=default_connection_headers)
-        
+        response = requests.post(
+            self.public_url, data=data, **get_default_connection_headers()
+        )
         return response
 
-    def get_request(self, url, webin, password):
-        auth = (webin, password)
-        response = requests.get(url, auth=auth)
-        
-        return response        
-
-    def get_run(self, run_accession, webin, password,  private=False, attempt=0, search_params=None):
-        if not private:
-            data = self.get_default_params()
-            data['result'] = 'read_run'
-            data['fields'] = RUN_DEFAULT_FIELDS
-            data['query'] = 'run_accession=\"{}\"'.format(run_accession)
-
-            if search_params:
-                data.update(search_params)
-            response = self.post_request(data)
+    def get_request(self, url):
+        if self.private:
+            response = requests.get(url, auth=self.auth)
         else:
-            url = f'{PRIVATE_DATA_URL}runs/{run_accession}'
-            response = self.get_request(url, webin, password)
-            
-        if not response.ok and attempt > 2:
-            raise ValueError("Could not retrieve run with accession {}, returned "
-                "message: {}".format(run_accession, response.text))
-        elif response.status_code == 204:
-            if attempt < 2:
+            response = requests.get(url)
+        return response
+
+    def get_data_or_handle_error(self, response, xml=False, full_json=False):
+        try:
+            data_txt = response.text
+            if data_txt is None:
+                if self.private:
+                    logging.error(
+                        f"{self.accession} private data is not present in the specified Webin account"
+                    )
+                else:
+                    logging.error(f"{self.accession} public data does not exist")
+            else:
+                if xml:
+                    return  minidom.parseString(data_txt)
+                elif full_json: 
+                    #   return the full json when more than one entry expected
+                    return response.json()
+                else:
+                    #   only return the fist element in the list is the default. In these cases there should only be one entry returned 
+                    return json.loads(response.txt)[0]
+        except (IndexError, TypeError, ValueError, KeyError):
+            logging.error(
+                f"Failed to fetch {self.accession}, returned error: {response.text}"
+            )     
+
+    def retry_or_handle_request_error(self, request, *args, **kwargs):
+        attempt = 0
+        while attempt < RETRY_COUNT:
+            try:
+                response = request(*args, **kwargs)
+                response.raise_for_status()
+                return response
+            #   all other RequestExceptions are raised below
+            except (Timeout, ConnectionError) as retry_err:
                 attempt += 1
+                if attempt >= RETRY_COUNT:
+                    raise ValueError(
+                        f"Could not find {self.accession} in ENA after {RETRY_COUNT} attempts. Error: {retry_err}"
+                    )
                 sleep(1)
-                return self.get_run(run_accession, webin, password, attempt)
-            else:
-                raise ValueError("Could not find run {} in ENA after {}"
-                    " attempts".format(run_accession, RETRY_COUNT))
-        try:
-            run = json.loads(response.text)[0]
-        except (IndexError, TypeError, ValueError):
-            raise ValueError("Could not find run {} in ENA.".format(run_accession))
-        except:
-            raise Exception("Could not query ENA API: {}".format(response.text))
-        
-        if private:
-            run_data = run['report']
-            final_data = {'secondary_study_accession': run_data['studyId'], 'sample_accession': run_data['sampleId']}
-            return final_data
-        else:
-            return run
-        
-    def get_run_from_assembly(self, assembly_name, webin, password, private=False):
-        if not private:
-            response = requests.get("https://www.ebi.ac.uk/ena/browser/api/xml/" + assembly_name)
-            
-        else:
-            response = requests.get("https://www.ebi.ac.uk/ena/submit/report/analyses/xml/" + assembly_name, auth=(webin, password))
-
-        manifestXml = minidom.parseString(response.text)
-        run_ref = manifestXml.getElementsByTagName("RUN_REF")
+            except HTTPError as http_err:
+                print(f"HTTP response has an error status: {http_err}")
+                raise
+            except RequestException as req_err:
+                print(f"Network-related error status: {req_err}")
+                raise
+            #   should hopefully encompass all other issues...
+            except Exception as e:
+                print(f"An unexpected error occurred: {e}")
+                raise
+
+    def _get_private_run(self):
+        url = f"{self.private_url}/runs/{self.accession}"
+        response = self.retry_or_handle_request_error(self.get_request, url)
+        run = self.get_data_or_handle_error(response)
+        run_data = run["report"]
+        reformatted_data = {
+            "secondary_study_accession": run_data['studyID'],
+            "sample_accession": run_data["sampleId"],
+        }
+        logging.info(f"{self.accession} private run returned from ENA")
+        return reformatted_data
+
+    def _get_public_run(self):
+        data = get_default_params()
+        data.update(
+            {
+            "result": "read_run",
+            "query": f'run_accession="{self.accession}"',
+            "fields": "secondary_study_accession,sample_accession",
+        }
+        )
+        response = self.retry_or_handle_request_error(self.post_request, data)
+        run = self.get_data_or_handle_error(response)
+        logging.info(f"{self.accession} public run returned from ENA")
+        return run   
+
+    
+    def _get_private_study(self):
+        url = f"{self.private_url}/studies/xml/{self.accession}"
+        response = self.retry_or_handle_request_error(self.get_request, url)
+        study = self.get_data_or_handle_error(response, xml=True)
+        study_desc = study.getElementsByTagName("STUDY_DESCRIPTION")[0].firstChild.nodeValue
+        reformatted_data = {
+            "study_description" : study_desc
+        }
+        logging.info(f"{self.accession} private study returned from ENA")
+        return reformatted_data
+
+    def _get_public_study(self):
+        data = get_default_params()
+        data.update(
+            {
+            "result": "study",
+            "query": f'{self.acc_type}="{self.accession}"',
+            "fields": STUDY_DEFAULT_FIELDS,
+        }
+        )
+        response = self.retry_or_handle_request_error(self.post_request, data)
+        study = self.get_data_or_handle_error(response)
+        logging.info(f"{self.accession} public study returned from ENA")
+        return study   
+
+    def _get_private_run_from_assembly(self):
+        url = f"{self.private_url}/analyses/xml/{self.accession}"
+        reponse = self.retry_or_handle_request_error(self.get_request, url)
+        parsed_xml = self.get_data_or_handle_error(reponse, xml=True)
+        run_ref = parsed_xml.getElementsByTagName("RUN_REF")
         run = run_ref[0].attributes["accession"].value
-        
+        logging.info(f"private run from the assembly {self.accession} returned from ENA")
         return run
 
-    def get_study(self, study_accession, webin, password, private=False):
-        if not private:
-            data = self.get_default_params()
-            data['result'] = "study"
-            data['fields'] = STUDY_DEFAULT_FIELDS
-            data['query'] = 'study_accession="{}" OR secondary_study_accession="{}"' \
-                .format(study_accession, study_accession)
-
-            data['dataPortal'] = "ena"
-            try:
-                response = self.post_request(data)
-                if response.status_code == 204:
-                    raise NoDataException()
-                try:
-                    study = json.loads(response.text)[0]
-                except (IndexError, TypeError, ValueError, KeyError) as e:
-                    raise e
-                return study
-        
-            except NoDataException:
-                print("No info found to fetch study {}".format(study_accession))
-            except (IndexError, TypeError, ValueError, KeyError):
-                print("Failed to fetch study {}, returned error: {}".format(study_accession, response.text))
-
-            raise ValueError('Could not find study {} in ENA.'.format(study_accession))
-        else:
-            url = f"https://www.ebi.ac.uk/ena/submit/report/studies/xml/{study_accession}"
-            manifestXml = minidom.parseString(requests.get(url, auth=(webin, password)).text)
-            study_desc = manifestXml.getElementsByTagName("STUDY_DESCRIPTION")[0].firstChild.nodeValue
-            final_data = {'study_description': study_desc}
-            return final_data
-
-
-    def get_study_runs(self, study_accession, webin, password, private= False, fields=None, search_params=None):
-        if not private:
-            data = self.get_default_params()
-            data['result'] = 'read_run'
-            data['fields'] = fields or RUN_DEFAULT_FIELDS
-            data['query'] = '(study_accession=\"{}\" OR secondary_study_accession=\"{}\")'.format(study_accession, study_accession)
-
-            if search_params:
-                data.update(search_params)
-
-            response = self.post_request(data)
-        else:
-            url = f'{PRIVATE_DATA_URL}runs/{study_accession}'
-            response = self.get_request(url, webin, password)
-        
-        if not response.ok:
-            raise ValueError("Could not retrieve runs for study %s.", study_accession)
-        
-        if response.status_code == 204:
-            return []
-
-        try:
-            runs = response.json()
-        except:
-            raise ValueError("Query against ENA API did not work. Returned "
-                "message: {}".format(response.text))
-        
-        if private:
-            final_data = []
-            for run in runs:
-                run_data = run['report']
-                if 'sampleId' in run_data:
-                    run_data['sample_accession'] = run_data.pop('sampleId')
-                if 'id' in run_data:
-                    run_data['run_accession'] = run_data.pop('id')
-                if 'instrumentModel' in run_data:
-                    run_data['instrument_model'] = run_data.pop('instrumentModel')
-                final_data.append(run_data)
-            return(final_data)
-        else:
-            return runs
+    def _get_public_run_from_assembly(self):
+        url = f"{self.browser_url}/{self.accession}"
+        reponse = self.retry_or_handle_request_error(self.get_request, url)
+        parsed_xml = self.get_data_or_handle_error(reponse, xml=True)
+        run_ref = parsed_xml.getElementsByTagName("RUN_REF")
+        run = run_ref[0].attributes["accession"].value
+        logging.info(f"public run from the assembly {self.accession} returned from ENA")
 
-    def get_sample(self, sample_accession, webin, password, private=False, fields=None, search_params=None, attempt=0):
-        if not private:
-            data = self.get_default_params()
-            data['result'] = 'sample'
-            data['fields'] = fields or SAMPLE_DEFAULT_FIELDS
-            data['query'] = ('(sample_accession=\"{acc}\" OR secondary_sample_accession'
-                '=\"{acc}\") ').format(acc=sample_accession)
+        return run  
 
-            if search_params:
-                data.update(search_params)
+    def _get_private_study_runs(self):
+        url = f"{self.private_url}/runs/{self.accession}"
+        response = self.retry_or_handle_request_error(self.get_request, url)
+        runs = self.get_data_or_handle_error(response, full_json=True)
+        reformatted_data = []
+        for run in runs:
+            run_data = run['report']
+            if 'sampleId' in run_data:
+                run_data['sample_accession'] = run_data.pop('sampleId')
+            if 'id' in run_data:
+                run_data['run_accession'] = run_data.pop('id')
+            if 'instrumentModel' in run_data:
+                run_data['instrument_model'] = run_data.pop('instrumentModel')
+            reformatted_data.append(run_data)
+        logging.info(f"private runs from study {self.accession}, returned from ENA")
+        return reformatted_data
+
+
+    def _get_public_study_runs(self):
+        data = get_default_params()
+        data.update(
+            {
+                "result": "read_run",
+                "fields": STUDY_RUN_DEFAULT_FIELDS,
+                "query": f"{self.acc_type}={self.accession}"
+            }
+        )
+        response = self.retry_or_handle_request_error(self.post_request, data)
+        runs = self.get_data_or_handle_error(response, full_json=True)
+        logging.info(f"public runs from study {self.accession}, returned from ENA")
+        return runs
+    
+    def _get_private_sample(self):
+        url = f"{self.private_url}/samples/xml/{self.accession}"
+        reponse = self.retry_or_handle_request_error(self.get_request, url)
+        reformatted_data = {}
+        parsed_xml = self.get_data_or_handle_error(reponse, xml=True)
+        sample_attributes = parsed_xml.getElementsByTagName('SAMPLE_ATTRIBUTE')
+        for attribute in sample_attributes:
+            tag = attribute.getElementsByTagName('TAG')[0].firstChild.nodeValue
+            if tag == "geographic location (country and/or sea)":
+                reformatted_data['country'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
+            if tag == "geographic location (latitude)":
+                reformatted_data['latitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
+            if tag == "geographic location (longitude)":
+                reformatted_data['longitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
+            if tag == "collection date":
+                reformatted_data['collection_date'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
+        logging.info(f"{self.accession} private sample returned from ENA")        
+        return reformatted_data
+
+
+    def _get_public_sample(self):
+        data = get_default_params()
+        data.update(
+            {
+                "result": "sample",
+                "fields": SAMPLE_DEFAULT_FIELDS,
+                "query":  f"{self.acc_type}={self.accession}"
+            }
+        )
+        response = self.retry_or_handle_request_error(self.post_request, data)
+        sample = self.get_data_or_handle_error(response)
+        logging.info(f"{self.accession} public sample returned from ENA")
+        return sample         
 
-            response = self.post_request(data)
-        
-            if response.status_code == 200:
-                sample = response.json()
-                assert len(sample) == 1 
-                return sample[0]
-
-            if response.status_code == 204:
-                if attempt < 2:
-                    new_params = {'dataPortal': 'metagenome' if data['dataPortal'] == 'ena' else 'ena'}
-                    attempt += 1
-                    return self.get_sample(sample_accession, webin, password, fields=fields, 
-                        search_params=new_params, attempt=attempt)
-                else:
-                    raise ValueError("Could not find sample {} in ENA after "
-                        "{} attempts.".format(sample_accession, RETRY_COUNT))
-            else:
-                raise ValueError("Could not retrieve sample with accession {}. "
-                    "Returned message: {}".format(sample_accession, response.text))
-        else:
-            url = f"https://www.ebi.ac.uk/ena/submit/report/samples/xml/{sample_accession}"
-            final_data = {}
-            manifestXml = minidom.parseString(requests.get(url, auth=(webin, password)).text)
-            sample_attributes = manifestXml.getElementsByTagName('SAMPLE_ATTRIBUTE')
-            for attribute in sample_attributes:
-                tag = attribute.getElementsByTagName('TAG')[0].firstChild.nodeValue
-                if tag == "geographic location (country and/or sea)":
-                    final_data['country'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
-                if tag == "geographic location (latitude)":
-                    final_data['latitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
-                if tag == "geographic location (longitude)":
-                    final_data['longitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
-                if tag == "collection date":
-                    final_data['collection_date'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
-            return final_data
 
     def query_taxid(self, taxid):
         url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid)
@@ -306,7 +356,7 @@ def query_scientific_name(self, scientificName, searchRank=False):
         else:
             return submittable, taxid
 
-    def handle_genomes_registration(self, sample_xml, submission_xml, webin, password, live=False):
+    def handle_genomes_registration(self, sample_xml, submission_xml, live=False):
         liveSub, mode = "", "live"
 
         if not live:
@@ -322,7 +372,7 @@ def handle_genomes_registration(self, sample_xml, submission_xml, webin, passwor
             'SAMPLE': open(sample_xml, 'r')
         }
 
-        submissionResponse = requests.post(url, files = f, auth = (webin, password))
+        submissionResponse = requests.post(url, files = f, auth = self.auth)
 
         if submissionResponse.status_code != 200:
             if str(submissionResponse.status_code).startswith('5'):

From 0e5d8d055a68281517df5e4e300b0ff64b724512 Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Tue, 17 Dec 2024 17:15:58 +0000
Subject: [PATCH 08/20] remove webin credential parameter, remove webin
 credentials from genome upload script, redfine ena functions

---
 genomeuploader/ena.py           | 171 ++++++++++++++-------
 genomeuploader/genome_upload.py | 255 +++++++++++++++-----------------
 2 files changed, 236 insertions(+), 190 deletions(-)

diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py
index 110f979..7682799 100644
--- a/genomeuploader/ena.py
+++ b/genomeuploader/ena.py
@@ -19,9 +19,11 @@
 import requests
 import json
 import logging
-import os 
+import os
+import sys 
 from time import sleep
-import sys
+from pathlib import Path
+from dotenv import load_dotenv
 
 import xml.dom.minidom as minidom
 from requests.exceptions import ConnectionError, HTTPError, RequestException, Timeout
@@ -91,16 +93,82 @@ def parse_accession(accession):
         logging.error(f"{accession} is not a valid accession")
         sys.exit()
 
+def configure_credentials():
+    # Config file
+    user_config = Path.home() / ".genome_uploader.config.env"
+    if user_config.exists():
+        logger.debug("Loading the env variables from ".format(user_config))
+        load_dotenv(str(user_config))
+    else:
+        cwd_config = Path.cwd() / ".genome_uploader.config.env"
+        if cwd_config.exists():
+            logger.debug("Loading the variables from the current directory.")
+            load_dotenv(str(cwd_config))
+        else:
+            logger.debug("Trying to load env variables from the .env file")
+            # from a local .env file
+            load_dotenv()
+
+    username = os.getenv("ENA_WEBIN")
+    password = os.getenv("ENA_WEBIN_PASSWORD")
+
+    return username, password
+
+def query_taxid(taxid):
+    url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid)
+    response = requests.get(url)
+
+    try:
+        # Will raise exception if response status code is non-200 
+        response.raise_for_status()
+    except requests.exceptions.HTTPError as e:
+        print("Request failed {} with error {}".format(url, e))
+        return False
+    
+    res = response.json()
+    
+    return res.get("scientificName", "")
+
+def query_scientific_name(scientificName, searchRank=False):
+    url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{}".format(scientificName)
+    response = requests.get(url)
+    
+    try:
+        # Will raise exception if response status code is non-200 
+        response.raise_for_status()
+    except requests.exceptions.HTTPError as e:
+        if searchRank:
+            return False, "", ""
+        else:
+            return False, ""
+    
+    try:
+        res = response.json()[0]
+    except IndexError:
+        if searchRank:
+            return False, "", ""
+        else:
+            return False, ""
+
+    submittable = res.get("submittable", "").lower() == "true"
+    taxid = res.get("taxId", "")
+    rank = res.get("rank", "")
 
-class ENA():
-    def __init__(self, accession, private=False):
+    if searchRank:
+        return submittable, taxid, rank
+    else:
+        return submittable, taxid
+
+
+
+class EnaQuery():
+    def __init__(self, accession, query_type, private=False):
         self.private_url = "https://www.ebi.ac.uk/ena/submit/report"
         self.public_url = "https://www.ebi.ac.uk/ena/portal/api/search"
         self.browser_url = "https://www.ebi.ac.uk/ena/browser/api/xml"
         self.accession = accession
         self.acc_type = parse_accession(accession)
-        username = os.getenv("ENA_WEBIN")
-        password = os.getenv("ENA_WEBIN_PASSWORD")
+        username, password = configure_credentials()
         if username is None or password is None:
             logging.error("ENA_WEBIN and ENA_WEBIN_PASSWORD are not set")
         if username and password:
@@ -108,7 +176,7 @@ def __init__(self, accession, private=False):
         else:
             self.auth = None
         self.private = private
-
+        self.query_type = query_type
 
     def post_request(self, data):
         response = requests.post(
@@ -311,55 +379,53 @@ def _get_public_sample(self):
         return sample         
 
 
-    def query_taxid(self, taxid):
-        url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid)
-        response = requests.get(url)
-
-        try:
-            # Will raise exception if response status code is non-200 
-            response.raise_for_status()
-        except requests.exceptions.HTTPError as e:
-            print("Request failed {} with error {}".format(url, e))
-            return False
-        
-        res = response.json()
-        
-        return res.get("scientificName", "")
-
-    def query_scientific_name(self, scientificName, searchRank=False):
-        url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{}".format(scientificName)
-        response = requests.get(url)
-        
-        try:
-            # Will raise exception if response status code is non-200 
-            response.raise_for_status()
-        except requests.exceptions.HTTPError as e:
-            if searchRank:
-                return False, "", ""
+    def build_query(self):
+        if self.query_type == "study":
+            if self.private:
+                ena_response = self._get_private_study()
             else:
-                return False, ""
-        
-        try:
-            res = response.json()[0]
-        except IndexError:
-            if searchRank:
-                return False, "", ""
+                ena_response = self._get_public_study()
+        elif self.query_type == "run":
+            if self.private:
+                ena_response = self._get_private_run()
+            else:
+                ena_response = self._get_public_run()
+        elif self.query_type == "run_assembly":
+            if self.private:
+                ena_response = self._get_private_run_from_assembly()
+            else:
+                ena_response = self._get_public_run_from_assembly()
+        elif self.query_type == "study_runs":
+            if self.private:
+                ena_response = self._get_private_study_runs()
+            else:
+                ena_response = self._get_public_study_runs()
+        elif self.query_type == "sample":
+            if self.private:
+                ena_response = self._get_private_sample()
             else:
-                return False, ""
+                ena_response = self._get_public_sample()
+        return ena_response
 
-        submittable = res.get("submittable", "").lower() == "true"
-        taxid = res.get("taxId", "")
-        rank = res.get("rank", "")
 
-        if searchRank:
-            return submittable, taxid, rank
+class EnaSubmit():
+    def __init__(self, sample_xml, submission_xml, live=False):
+        self.sample_xml = sample_xml
+        self.submission_xml = submission_xml
+        self.live = live
+        username, password = configure_credentials()
+        if username is None or password is None:
+            logging.error("ENA_WEBIN and ENA_WEBIN_PASSWORD are not set")
+        if username and password:
+            self.auth = (username, password)
         else:
-            return submittable, taxid
+            self.auth = None
+        
 
-    def handle_genomes_registration(self, sample_xml, submission_xml, live=False):
+    def handle_genomes_registration(self):
         liveSub, mode = "", "live"
 
-        if not live:
+        if not self.live:
             liveSub = "dev"
             mode = "test"
 
@@ -368,8 +434,8 @@ def handle_genomes_registration(self, sample_xml, submission_xml, live=False):
         logger.info('Registering sample xml in {} mode.'.format(mode))
 
         f = {
-            'SUBMISSION': open(submission_xml, 'r'),
-            'SAMPLE': open(sample_xml, 'r')
+            'SUBMISSION': open(self.submission_xml, 'r'),
+            'SAMPLE': open(self.sample_xml, 'r')
         }
 
         submissionResponse = requests.post(url, files = f, auth = self.auth)
@@ -404,4 +470,7 @@ def handle_genomes_registration(self, sample_xml, submission_xml, live=False):
         logger.info('{} genome samples successfully registered.'.format(str(len(aliasDict))))
 
         return aliasDict
-    
\ No newline at end of file
+    
+
+
+        
diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py
index e62c89f..9a54f5e 100755
--- a/genomeuploader/genome_upload.py
+++ b/genomeuploader/genome_upload.py
@@ -22,14 +22,13 @@
 import json
 import pandas as pd
 from datetime import date, datetime as dt
-from dotenv import load_dotenv
-from pathlib import Path
 
 import xml.etree.ElementTree as ET
 import xml.dom.minidom as minidom
 import requests
 
-from ena import ENA
+import ena
+from ena import EnaQuery, EnaSubmit
 
 from constants import METAGENOMES, GEOGRAPHIC_LOCATIONS, MQ, HQ
 
@@ -37,8 +36,6 @@
 
 logger = logging.getLogger(__name__)
 
-ena = ENA()
-
 GEOGRAPHY_DIGIT_COORDS = 8
 
 '''
@@ -351,110 +348,6 @@ def extract_genomes_info(inputFile, genomeType, live):
 
     return genomeInfo
 
-def extract_ENA_info(genomeInfo, uploadDir, webin, password, private=False):
-    logger.info('Retrieving project and run info from ENA (this might take a while)...')
-    
-    # retrieving metadata from runs (and runs from assembly accessions if provided)
-    allRuns = []
-    for g in genomeInfo:
-        if genomeInfo[g]["accessionType"] == "assembly":
-            derivedRuns = []
-            for acc in genomeInfo[g]["accessions"]:
-                derivedRuns.append(ena.get_run_from_assembly(acc, private))
-            genomeInfo[g]["accessions"] = derivedRuns
-        allRuns.extend(genomeInfo[g]["accessions"])
-
-    runsSet, studySet, samplesDict, tempDict = set(allRuns), set(), {}, {}
-    for r in runsSet:
-        run_info = ena.get_run(r, webin, password, private)
-        studySet.add(run_info["secondary_study_accession"])
-        samplesDict[r] = run_info["sample_accession"]
-    
-    if not studySet:
-        raise ValueError("No study corresponding to runs found.")
-
-    backupFile = os.path.join(uploadDir, "ENA_backup.json")
-    counter = 0
-    if not os.path.exists(backupFile):
-        with open(backupFile, 'w') as file:
-            pass
-    with open(backupFile, "r+") as file:
-        try:
-            backupDict = json.load(file)
-            tempDict = dict(backupDict)
-            logger.info(f"A backup file {backupFile} for ENA sample metadata has been found.")
-        except json.decoder.JSONDecodeError:
-            backupDict = {}
-        for s in studySet:
-            studyInfo = ena.get_study(s, webin, password, private)
-            projectDescription = studyInfo["study_description"]
-
-            ENA_info = ena.get_study_runs(s, webin, password, private)
-            if ENA_info == []:
-                raise IOError("No runs found on ENA for project {}.".format(s))
-            
-            for run, item in enumerate(ENA_info):
-                runAccession = ENA_info[run]["run_accession"]
-                if runAccession not in backupDict:
-                    if runAccession in runsSet:
-                        sampleAccession = ENA_info[run]["sample_accession"]
-                        sampleInfo = ena.get_sample(sampleAccession, webin, password, private)
-
-                        if 'latitude' in sampleInfo and 'longitude' in sampleInfo:
-                            latitude =  sampleInfo['latitude']
-                            longitude = sampleInfo['longitude']
-                        else:
-                            location = sampleInfo["location"]
-                            latitude, longitude = None, None
-                            if 'N' in location:
-                                latitude = location.split('N')[0].strip()
-                                longitude = location.split('N')[1].strip()
-                            elif 'S' in location:
-                                latitude = '-' + location.split('S')[0].strip()
-                                longitude = location.split('S')[1].strip()
-
-                            if 'W' in longitude:
-                                longitude = '-' + longitude.split('W')[0].strip()
-                            elif longitude.endswith('E'):
-                                longitude = longitude.split('E')[0].strip()
-
-                        if latitude:
-                            latitude = "{:.{}f}".format(round(float(latitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS)
-                        else:
-                            latitude = "not provided"
-
-                        if longitude:
-                            longitude = "{:.{}f}".format(round(float(longitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS)
-                        else:
-                            longitude = "not provided"
- 
-                        country = sampleInfo["country"].split(':')[0]
-                        if not country in GEOGRAPHIC_LOCATIONS:
-                            country = "not provided"
-                        
-                        collectionDate = sampleInfo["collection_date"]
-                        if collectionDate == "" or collectionDate == "missing":
-                            collectionDate = "not provided"
-
-                        tempDict[runAccession] = {
-                            "instrumentModel" : ENA_info[run]["instrument_model"],
-                            "collectionDate" : collectionDate,
-                            "country" : country,
-                            "latitude" : latitude,
-                            "longitude" : longitude,
-                            "projectDescription" : projectDescription,
-                            "study" : s,
-                            "sampleAccession" : samplesDict[runAccession]
-                        }
-                        counter += 1
-
-                        if (counter%10 == 0) or (len(runsSet) - len(backupDict) == counter):
-                            file.seek(0)
-                            file.write(json.dumps(tempDict))
-                            file.truncate()
-    tempDict = {**tempDict, **backupDict}
-    combine_ENA_info(genomeInfo, tempDict)
-
 def multipleElementSet(metadataList):
     return len(set(metadataList))>1
 
@@ -830,8 +723,8 @@ def main():
         
     if save:
         logger.info("Registering genome samples XMLs...")
-        aliasToNewSampleAccession = ena.handle_genomes_registration(samples_xml, 
-            submission_xml, ENA_uploader.username, ENA_uploader.password, ENA_uploader.live)
+        ena_submit = EnaSubmit(samples_xml, submission_xml, ENA_uploader.live)
+        aliasToNewSampleAccession = ena_submit.handle_genomes_registration()
         saveAccessions(aliasToNewSampleAccession, accessionsFile, writeMode)
 
     logger.info("Generating manifest files...")
@@ -850,32 +743,6 @@ def __init__(self, argv=sys.argv[1:]):
         self.genomeType = "bins" if self.args.bins else "MAGs"
         self.live = True if self.args.live else False
         self.private = self.args.private
-        
-        if self.args.webin and self.args.password:
-            self.username = self.args.webin
-            self.password = self.args.password
-        else:
-            # Config file
-            user_config = Path.home() / ".genome_uploader.config.env"
-            if user_config.exists():
-                logger.debug("Loading the env variables from ".format(user_config))
-                load_dotenv(str(user_config))
-            else:
-                cwd_config = Path.cwd() / ".genome_uploader.config.env"
-                if cwd_config.exists():
-                    logger.debug("Loading the variables from the current directory.")
-                    load_dotenv(str(cwd_config))
-                else:
-                    logger.debug("Trying to load env variables from the .env file")
-                    # from a local .env file
-                    load_dotenv()
-
-            self.username = os.getenv("ENA_WEBIN")
-            self.password = os.getenv("ENA_WEBIN_PASSWORD")
-        
-        if not self.username or not self.password:
-            logger.error("ENA Webin username or password are empty")
-            sys.exit(1)
 
         self.tpa = True if self.args.tpa else False
         self.centre_name = self.args.centre_name
@@ -883,6 +750,8 @@ def __init__(self, argv=sys.argv[1:]):
 
         workDir = self.args.out if self.args.out else os.getcwd()
         self.upload_dir = self.generate_genomes_upload_dir(workDir, self.genomeType)
+
+        self.private = self.args.private
     
     def parse_args(self, argv):
         parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter,
@@ -902,8 +771,6 @@ def parse_args(self, argv):
         parser.add_argument('--tpa', action='store_true', help="Select if uploading TPA-generated genomes")
         
         # Users can provide their credentials and centre name manually or using a config file
-        parser.add_argument('--webin', required=False, help="Webin id")
-        parser.add_argument('--password', required=False, help="Webin password")
         parser.add_argument('--centre_name', required=False, help="Name of the centre uploading genomes")
         parser.add_argument('--private', required=False, help="if data is private", action='store_true', default=False)
 
@@ -917,6 +784,116 @@ def parse_args(self, argv):
 
         return args
 
+    def extract_ENA_info(self, genomeInfo, uploadDir, private=False):
+        logger.info('Retrieving project and run info from ENA (this might take a while)...')
+        
+        # retrieving metadata from runs (and runs from assembly accessions if provided)
+        allRuns = []
+        for g in genomeInfo:
+            if genomeInfo[g]["accessionType"] == "assembly":
+                derivedRuns = []
+                for acc in genomeInfo[g]["accessions"]:
+                    ena_query = EnaQuery(acc, "run_assembly", self.private)
+                    derivedRuns.append(ena_query.build_query())
+                genomeInfo[g]["accessions"] = derivedRuns
+            allRuns.extend(genomeInfo[g]["accessions"])
+
+        runsSet, studySet, samplesDict, tempDict = set(allRuns), set(), {}, {}
+        for r in runsSet:
+            ena_query = EnaQuery(r, "run", self.private)
+            run_info = ena_query.build_query()
+            studySet.add(run_info["secondary_study_accession"])
+            samplesDict[r] = run_info["sample_accession"]
+        
+        if not studySet:
+            raise ValueError("No study corresponding to runs found.")
+
+        backupFile = os.path.join(uploadDir, "ENA_backup.json")
+        counter = 0
+        if not os.path.exists(backupFile):
+            with open(backupFile, 'w') as file:
+                pass
+        with open(backupFile, "r+") as file:
+            try:
+                backupDict = json.load(file)
+                tempDict = dict(backupDict)
+                logger.info(f"A backup file {backupFile} for ENA sample metadata has been found.")
+            except json.decoder.JSONDecodeError:
+                backupDict = {}
+            for s in studySet:
+                ena_query = EnaQuery(s, "study", self.private)
+                studyInfo = ena_query.build_query()
+                projectDescription = studyInfo["study_description"]
+
+                ena_query = EnaQuery(s, "study_runs", self.private)
+                ENA_info = ena_query.build_query()
+                if ENA_info == []:
+                    raise IOError("No runs found on ENA for project {}.".format(s))
+                
+                for run, item in enumerate(ENA_info):
+                    runAccession = ENA_info[run]["run_accession"]
+                    if runAccession not in backupDict:
+                        if runAccession in runsSet:
+                            sampleAccession = ENA_info[run]["sample_accession"]
+                            ena_query = EnaQuery(sampleAccession, "sample", self.private)
+                            sampleInfo = ena_query.build_query()
+
+                            if 'latitude' in sampleInfo and 'longitude' in sampleInfo:
+                                latitude =  sampleInfo['latitude']
+                                longitude = sampleInfo['longitude']
+                            else:
+                                location = sampleInfo["location"]
+                                latitude, longitude = None, None
+                                if 'N' in location:
+                                    latitude = location.split('N')[0].strip()
+                                    longitude = location.split('N')[1].strip()
+                                elif 'S' in location:
+                                    latitude = '-' + location.split('S')[0].strip()
+                                    longitude = location.split('S')[1].strip()
+
+                                if 'W' in longitude:
+                                    longitude = '-' + longitude.split('W')[0].strip()
+                                elif longitude.endswith('E'):
+                                    longitude = longitude.split('E')[0].strip()
+
+                            if latitude:
+                                latitude = "{:.{}f}".format(round(float(latitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS)
+                            else:
+                                latitude = "not provided"
+
+                            if longitude:
+                                longitude = "{:.{}f}".format(round(float(longitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS)
+                            else:
+                                longitude = "not provided"
+    
+                            country = sampleInfo["country"].split(':')[0]
+                            if not country in GEOGRAPHIC_LOCATIONS:
+                                country = "not provided"
+                            
+                            collectionDate = sampleInfo["collection_date"]
+                            if collectionDate == "" or collectionDate == "missing":
+                                collectionDate = "not provided"
+
+                            tempDict[runAccession] = {
+                                "instrumentModel" : ENA_info[run]["instrument_model"],
+                                "collectionDate" : collectionDate,
+                                "country" : country,
+                                "latitude" : latitude,
+                                "longitude" : longitude,
+                                "projectDescription" : projectDescription,
+                                "study" : s,
+                                "sampleAccession" : samplesDict[runAccession]
+                            }
+                            counter += 1
+
+                            if (counter%10 == 0) or (len(runsSet) - len(backupDict) == counter):
+                                file.seek(0)
+                                file.write(json.dumps(tempDict))
+                                file.truncate()
+        tempDict = {**tempDict, **backupDict}
+        combine_ENA_info(genomeInfo, tempDict)
+
+
     def generate_genomes_upload_dir(self, dir, genomeType):
         uploadName = "MAG_upload"
         if genomeType == "bins":

From ea275f27a4dcfa1933bb1bbda8fc989bdfd78e78 Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Wed, 18 Dec 2024 11:23:04 +0000
Subject: [PATCH 09/20] move lists to contstants, pull arguments out of class,
 reformat camelCase

---
 genomeuploader/constants.py     |  25 ++
 genomeuploader/ena.py           |  52 +--
 genomeuploader/genome_upload.py | 710 ++++++++++++++++----------------
 3 files changed, 401 insertions(+), 386 deletions(-)

diff --git a/genomeuploader/constants.py b/genomeuploader/constants.py
index 11d9a5e..5a27b39 100644
--- a/genomeuploader/constants.py
+++ b/genomeuploader/constants.py
@@ -635,3 +635,28 @@
     "Zambia",
     "Zimbabwe",
 ]
+
+BIN_MANDATORY_FIELDS = [
+    "genome_name", 
+    "accessions",
+    "assembly_software", 
+    "binning_software", 
+    "binning_parameters", 
+    "stats_generation_software", 
+    "NCBI_lineage",
+    "broad_environment", 
+    "local_environment", 
+    "environmental_medium", 
+    "metagenome",
+    "co-assembly", 
+    "genome_coverage", 
+    "genome_path"
+]
+
+MAG_MANDATORY_FIELDS = [
+    "rRNA_presence", 
+    "completeness", 
+    "contamination"
+]
+
+GEOGRAPHY_DIGIT_COORDS = 8
\ No newline at end of file
diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py
index 7682799..7faab70 100644
--- a/genomeuploader/ena.py
+++ b/genomeuploader/ena.py
@@ -129,15 +129,15 @@ def query_taxid(taxid):
     
     return res.get("scientificName", "")
 
-def query_scientific_name(scientificName, searchRank=False):
-    url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{}".format(scientificName)
+def query_scientific_name(scientific_name, search_rank=False):
+    url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{}".format(scientific_name)
     response = requests.get(url)
     
     try:
         # Will raise exception if response status code is non-200 
         response.raise_for_status()
     except requests.exceptions.HTTPError as e:
-        if searchRank:
+        if search_rank:
             return False, "", ""
         else:
             return False, ""
@@ -145,7 +145,7 @@ def query_scientific_name(scientificName, searchRank=False):
     try:
         res = response.json()[0]
     except IndexError:
-        if searchRank:
+        if search_rank:
             return False, "", ""
         else:
             return False, ""
@@ -154,7 +154,7 @@ def query_scientific_name(scientificName, searchRank=False):
     taxid = res.get("taxId", "")
     rank = res.get("rank", "")
 
-    if searchRank:
+    if search_rank:
         return submittable, taxid, rank
     else:
         return submittable, taxid
@@ -423,13 +423,13 @@ def __init__(self, sample_xml, submission_xml, live=False):
         
 
     def handle_genomes_registration(self):
-        liveSub, mode = "", "live"
+        live_sub, mode = "", "live"
 
         if not self.live:
-            liveSub = "dev"
+            live_sub = "dev"
             mode = "test"
 
-        url = "https://www{}.ebi.ac.uk/ena/submit/drop-box/submit/".format(liveSub)
+        url = "https://www{}.ebi.ac.uk/ena/submit/drop-box/submit/".format(live_sub)
 
         logger.info('Registering sample xml in {} mode.'.format(mode))
 
@@ -438,38 +438,38 @@ def handle_genomes_registration(self):
             'SAMPLE': open(self.sample_xml, 'r')
         }
 
-        submissionResponse = requests.post(url, files = f, auth = self.auth)
+        submission_response = requests.post(url, files = f, auth = self.auth)
 
-        if submissionResponse.status_code != 200:
-            if str(submissionResponse.status_code).startswith('5'):
+        if submission_response.status_code != 200:
+            if str(submission_response.status_code).startswith('5'):
                 raise Exception("Genomes could not be submitted to ENA as the server " +
                     "does not respond. Please again try later.")
             else:
                 raise Exception("Genomes could not be submitted to ENA. HTTP response: " +
-                    submissionResponse.reason)
+                    submission_response.reason)
 
-        receiptXml = minidom.parseString((submissionResponse.content).decode("utf-8"))
-        receipt = receiptXml.getElementsByTagName("RECEIPT")
+        receipt_xml = minidom.parseString((submission_response.content).decode("utf-8"))
+        receipt = receipt_xml.getElementsByTagName("RECEIPT")
         success = receipt[0].attributes["success"].value
         if success == "true":
-            aliasDict = {}
-            samples = receiptXml.getElementsByTagName("SAMPLE")
+            alias_dict = {}
+            samples = receipt_xml.getElementsByTagName("SAMPLE")
             for s in samples:
-                sraAcc = s.attributes["accession"].value
+                sra_acc = s.attributes["accession"].value
                 alias = s.attributes["alias"].value
-                aliasDict[alias] = sraAcc
+                alias_dict[alias] = sra_acc
         elif success == "false":
-            errors = receiptXml.getElementsByTagName("ERROR")
-            finalError = "\tSome genomes could not be submitted to ENA. Please, check the errors below."
+            errors = receipt_xml.getElementsByTagName("ERROR")
+            final_error = "\tSome genomes could not be submitted to ENA. Please, check the errors below."
             for error in errors:
-                finalError += "\n\t" + error.firstChild.data
-            finalError += "\n\tIf you wish to validate again your data and metadata, "
-            finalError += "please use the --force option."
-            raise Exception(finalError)
+                final_error += "\n\t" + error.firstChild.data
+            final_error += "\n\tIf you wish to validate again your data and metadata, "
+            final_error += "please use the --force option."
+            raise Exception(final_error)
         
-        logger.info('{} genome samples successfully registered.'.format(str(len(aliasDict))))
+        logger.info('{} genome samples successfully registered.'.format(str(len(alias_dict))))
 
-        return aliasDict
+        return alias_dict
     
 
 
diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py
index 9a54f5e..9136118 100755
--- a/genomeuploader/genome_upload.py
+++ b/genomeuploader/genome_upload.py
@@ -30,13 +30,33 @@
 import ena
 from ena import EnaQuery, EnaSubmit
 
-from constants import METAGENOMES, GEOGRAPHIC_LOCATIONS, MQ, HQ
+from constants import *
 
 logging.basicConfig(level=logging.DEBUG)
-
 logger = logging.getLogger(__name__)
 
-GEOGRAPHY_DIGIT_COORDS = 8
+def parse_args(argv):
+    parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter,
+        description="Create xmls and manifest files for genome upload to ENA")
+    
+    parser.add_argument('-u', '--upload_study', type=str, help="Study accession for genomes upload")
+    parser.add_argument('--genome_info', type=str, required=True, help="Genomes metadata file")
+
+    genomeType = parser.add_mutually_exclusive_group(required=True)
+    genomeType.add_argument('-m', '--mags', action='store_true', help="Select for MAG upload")
+    genomeType.add_argument('-b', '--bins', action='store_true', help="Select for bin upload")
+    
+    parser.add_argument('--out', type=str, help="Output folder. Default: working directory")
+    parser.add_argument('--force', action='store_true', help="Forces reset of sample xml's backups")
+    parser.add_argument('--live', action='store_true', help="Uploads on ENA. Omitting this " +
+        "option allows to validate samples beforehand")
+    parser.add_argument('--tpa', action='store_true', help="Select if uploading TPA-generated genomes")
+    
+    # Users can provide their credentials and centre name manually or using a config file
+    parser.add_argument('--centre_name', required=False, help="Name of the centre uploading genomes")
+    parser.add_argument('--private', required=False, help="if data is private", action='store_true', default=False)
+
+    return parser.parse_args(argv)
 
 '''
 Input table: expects the following parameters:
@@ -58,55 +78,49 @@
     genome_coverage : genome coverage
     genome_path: path to genome to upload
 '''
-def read_and_cleanse_metadata_tsv(inputFile, genomeType, live):
+
+def validate_metadata_tsv(input_file, genome_type, live):
     logger.info('Retrieving info for genomes to submit...')
     
-    binMandatoryFields = ["genome_name", "accessions",
-        "assembly_software", "binning_software", 
-        "binning_parameters", "stats_generation_software", "NCBI_lineage",
-        "broad_environment", "local_environment", "environmental_medium", "metagenome",
-        "co-assembly", "genome_coverage", "genome_path"]
-    MAGMandatoryFields = ["rRNA_presence", "completeness", "contamination"]
-    
-    allFields = MAGMandatoryFields + binMandatoryFields
-    metadata = pd.read_csv(inputFile, sep='\t', usecols=allFields)
+    all_fields = MAG_MANDATORY_FIELDS + BIN_MANDATORY_FIELDS
+    metadata = pd.read_csv(input_file, sep='\t', usecols=all_fields)
     
     # make sure there are no empty cells
-    cleanColumns = list(metadata.dropna(axis=1))
-    if genomeType == "MAGs":
-        missingValues = [item for item in allFields if item not in cleanColumns]
+    clean_columns = list(metadata.dropna(axis=1))
+    if genome_type == "MAGs":
+        missing_values = [item for item in all_fields if item not in clean_columns]
     else:
-        missingValues = [item for item in binMandatoryFields if item not in cleanColumns]
+        missing_values = [item for item in BIN_MANDATORY_FIELDS if item not in clean_columns]
     
-    if missingValues:
+    if missing_values:
         raise ValueError("The following mandatory fields have missing values in " +
-            "the input file: {}".format(", ".join(missingValues)))
+            "the input file: {}".format(", ".join(missing_values)))
 
     # check amount of genomes to register at the same time
     if len(metadata) >= 5000:
         raise ValueError("Genomes need to be registered in batches of 5000 genomes or smaller.")
 
     # check whether accessions follow the right format
-    accessions_regExp = re.compile(r"([E|S|D]R[R|Z]\d{6,})")
+    accessions_reg_exp = re.compile(r"([E|S|D]R[R|Z]\d{6,})")
 
-    accessionComparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", 
+    accession_comparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", 
         "correct", "mismatching", "co-assembly"])
-    accessionComparison["genome_name"] = metadata["genome_name"]
+    accession_comparison["genome_name"] = metadata["genome_name"]
 
-    accessionComparison["attemptive_accessions"] = metadata["accessions"].map(
+    accession_comparison["attemptive_accessions"] = metadata["accessions"].map(
         lambda a: len(a.split(',')))
 
-    accessionComparison["correct"] = metadata["accessions"].map(
-        lambda a: len(accessions_regExp.findall(a)))
+    accession_comparison["correct"] = metadata["accessions"].map(
+        lambda a: len(accessions_reg_exp.findall(a)))
 
-    accessionComparison["mismatching"] = accessionComparison.apply(lambda row: 
+    accession_comparison["mismatching"] = accession_comparison.apply(lambda row: 
         True if row["attemptive_accessions"] == row["correct"] 
         else None, axis=1).isna()
 
-    mismatchingAccessions = accessionComparison[accessionComparison["mismatching"]]["genome_name"]
-    if not mismatchingAccessions.empty:
+    mismatching_accessions = accession_comparison[accession_comparison["mismatching"]]["genome_name"]
+    if not mismatching_accessions.empty:
         raise ValueError("Run accessions are not correctly formatted for the following " + 
-            "genomes: " + ','.join(mismatchingAccessions.values))
+            "genomes: " + ','.join(mismatching_accessions.values))
 
     # check whether completeness and contamination are floats
     try:
@@ -117,14 +131,14 @@ def read_and_cleanse_metadata_tsv(inputFile, genomeType, live):
         raise ValueError("Completeness, contamination or coverage values should be formatted as floats")
 
     # check whether all co-assemblies have more than one run associated and viceversa
-    accessionComparison["co-assembly"] = metadata["co-assembly"]
-    coassemblyDiscrepancy = metadata[(
-        (accessionComparison["correct"] < 2) & (accessionComparison["co-assembly"])) |
-        ((accessionComparison["correct"] > 1) & (~accessionComparison["co-assembly"])
+    accession_comparison["co-assembly"] = metadata["co-assembly"]
+    coassembly_discrepancy = metadata[(
+        (accession_comparison["correct"] < 2) & (accession_comparison["co-assembly"])) |
+        ((accession_comparison["correct"] > 1) & (~accession_comparison["co-assembly"])
         )]["genome_name"]
-    if not coassemblyDiscrepancy.empty:
+    if not coassembly_discrepancy.empty:
         raise ValueError("The following genomes show discrepancy between number of runs "
-            "involved and co-assembly status: " + ','.join(coassemblyDiscrepancy.values))
+            "involved and co-assembly status: " + ','.join(coassembly_discrepancy.values))
 
     # are provided metagenomes part of the accepted metagenome list?
     if False in metadata.apply(lambda row: 
@@ -150,97 +164,97 @@ def read_and_cleanse_metadata_tsv(inputFile, genomeType, live):
             timestamp = str(int(dt.timestamp(dt.now())))
             timestamp_names = [row["genome_name"] + '_' + timestamp for index, row in metadata.iterrows()]
             metadata["unique_genome_name"] = timestamp_names
-            genomeInfo = metadata.set_index("unique_genome_name").transpose().to_dict()
+            genome_info = metadata.set_index("unique_genome_name").transpose().to_dict()
         else:
-            genomeInfo = metadata.set_index("genome_name").transpose().to_dict()
+            genome_info = metadata.set_index("genome_name").transpose().to_dict()
     else:
         raise ValueError("Duplicate names found in genome names")
 
-    return genomeInfo
+    return genome_info
 
 def round_stats(stats):
-    newStat = round(float(stats), 2)
-    if newStat == 100.0:
-        newStat = 100
-    if newStat == 0:
-        newStat = 0.0
+    new_stat = round(float(stats), 2)
+    if new_stat == 100.0:
+        new_stat = 100
+    if new_stat == 0:
+        new_stat = 0.0
 
-    return newStat
+    return new_stat
 
-def compute_MAG_quality(completeness, contamination, RNApresence):
-    RNApresent = str(RNApresence).lower() in ["true", "yes", "y"]
+def compute_mag_quality(completeness, contamination, rna_presence):
+    rna_presence = str(rna_presence).lower() in ["true", "yes", "y"]
     quality = MQ
-    if float(completeness) >= 90 and float(contamination) <= 5 and RNApresent:
+    if float(completeness) >= 90 and float(contamination) <= 5 and rna_presence:
         quality = HQ
     
     return quality, completeness, contamination
 
-def extract_tax_info(taxInfo):
+def extract_tax_info(tax_info):
     # if unclassified, block the execution
-    lineage, kingdomPositionInLineage, digitAnnotation = taxInfo.split(';'), 0, False
-    lineageFirst = lineage[0]
-    if "Unclassified " in lineageFirst:
-        if "Archaea" in lineageFirst:
-            scientificName = "uncultured archaeon"
-        elif "Bacteria" in lineageFirst:
-            scientificName = "uncultured bacterium"
-        elif "Eukaryota" in lineageFirst:
-            scientificName = "uncultured eukaryote"
-        submittable, taxid, rank = ena.query_scientific_name(scientificName, searchRank=True)
-        return taxid, scientificName
+    lineage, kingdom_position_lineage, digit_annotation = tax_info.split(';'), 0, False
+    lineage_first = lineage[0]
+    if "Unclassified " in lineage_first:
+        if "Archaea" in lineage_first:
+            scientific_name = "uncultured archaeon"
+        elif "Bacteria" in lineage_first:
+            scientific_name = "uncultured bacterium"
+        elif "Eukaryota" in lineage_first:
+            scientific_name = "uncultured eukaryote"
+        submittable, taxid, rank = ena.query_scientific_name(scientific_name, search_rank=True)
+        return taxid, scientific_name
 
     kingdoms = ["Archaea", "Bacteria", "Eukaryota"]
-    kingdomTaxa = ["2157", "2", "2759"]
+    kingdom_taxa = ["2157", "2", "2759"]
 
-    selectedKingdom, finalKingdom = kingdoms, ""
+    selected_kingdom, final_kingdom = kingdoms, ""
     if lineage[1].isdigit():
-        selectedKingdom = kingdomTaxa
-        kingdomPositionInLineage = 2
-        digitAnnotation = True
-    for index, k in enumerate(selectedKingdom):
-        if digitAnnotation:
-            if k == lineage[kingdomPositionInLineage]:
-                finalKingdom = selectedKingdom[index]
+        selected_kingdom = kingdom_taxa
+        kingdom_position_lineage = 2
+        digit_annotation = True
+    for index, k in enumerate(selected_kingdom):
+        if digit_annotation:
+            if k == lineage[kingdom_position_lineage]:
+                final_kingdom = selected_kingdom[index]
                 break
         else:
-            if k in lineage[kingdomPositionInLineage]:
-                finalKingdom = selectedKingdom[index]
+            if k in lineage[kingdom_position_lineage]:
+                final_kingdom = selected_kingdom[index]
                 break
 
     iterator = len(lineage)-1
     submittable = False
     rank = ""
     while iterator != -1 and not submittable:
-        scientificName = lineage[iterator].strip()
-        if digitAnnotation:
-            if not '*' in scientificName:
-                scientificName = ena.query_taxid(scientificName)
+        scientific_name = lineage[iterator].strip()
+        if digit_annotation:
+            if not '*' in scientific_name:
+                scientific_name = ena.query_taxid(scientific_name)
             else:
                 iterator -= 1
                 continue
-        elif "__" in scientificName:
-            scientificName = scientificName.split("__")[1]
+        elif "__" in scientific_name:
+            scientific_name = scientific_name.split("__")[1]
         else:
-            raise ValueError("Unrecognised taxonomy format: " + scientificName)
-        submittable, taxid, rank = ena.query_scientific_name(scientificName, searchRank=True)
+            raise ValueError("Unrecognised taxonomy format: " + scientific_name)
+        submittable, taxid, rank = ena.query_scientific_name(scientific_name, search_rank=True)
 
         if not submittable:
-            if finalKingdom == "Archaea" or finalKingdom == "2157":
-                submittable, scientificName, taxid = extract_Archaea_info(scientificName, rank)
-            elif finalKingdom == "Bacteria" or finalKingdom == "2":
-                submittable, scientificName, taxid = extract_Bacteria_info(scientificName, rank)
-            elif finalKingdom == "Eukaryota" or finalKingdom == "2759":
-                submittable, scientificName, taxid = extract_Eukaryota_info(scientificName, rank)
+            if final_kingdom == "Archaea" or final_kingdom == "2157":
+                submittable, scientific_name, taxid = extract_archaea_info(scientific_name, rank)
+            elif final_kingdom == "Bacteria" or final_kingdom == "2":
+                submittable, scientific_name, taxid = extract_bacteria_info(scientific_name, rank)
+            elif final_kingdom == "Eukaryota" or final_kingdom == "2759":
+                submittable, scientific_name, taxid = extract_eukaryota_info(scientific_name, rank)
         iterator -= 1
 
-    return taxid, scientificName
+    return taxid, scientific_name
 
-def extract_Eukaryota_info(name, rank):
-    nonSubmittable = (False, "", 0)
+def extract_eukaryota_info(name, rank):
+    non_submittable = (False, "", 0)
 
     # Asterisks in given taxonomy suggest the classification might be not confident enough.
     if '*' in name:
-        return nonSubmittable
+        return non_submittable
 
     if rank == "super kingdom":
         name = "uncultured eukaryote"
@@ -262,9 +276,9 @@ def extract_Eukaryota_info(name, rank):
                 if submittable:
                     return submittable, name, taxid
                 else:
-                    return nonSubmittable
+                    return non_submittable
 
-def extract_Bacteria_info(name, rank):
+def extract_bacteria_info(name, rank):
     if rank == "species":
         name = name
     elif rank == "superkingdom":
@@ -274,7 +288,7 @@ def extract_Bacteria_info(name, rank):
     elif rank == "genus":
         name = "uncultured {} sp.".format(name)
     
-    submittable, taxid, rank = ena.query_scientific_name(name, searchRank=True)
+    submittable, taxid, rank = ena.query_scientific_name(name, search_rank=True)
     if not submittable:
         if rank in ["species", "genus"] and name.lower().endswith("bacteria"):
             name = "uncultured {}".format(name.lower().replace("bacteria", "bacterium"))
@@ -285,7 +299,7 @@ def extract_Bacteria_info(name, rank):
 
     return submittable, name, taxid
 
-def extract_Archaea_info(name, rank):
+def extract_archaea_info(name, rank):
     if rank == "species":
         name = name
     elif rank == "superkingdom":
@@ -302,7 +316,7 @@ def extract_Archaea_info(name, rank):
     elif rank == "genus":
         name = "uncultured {} sp.".format(name)
     
-    submittable, taxid, rank = ena.query_scientific_name(name, searchRank=True)
+    submittable, taxid, rank = ena.query_scientific_name(name, search_rank=True)
     if not submittable:
         if "Candidatus" in name:
             if rank == "phylum":
@@ -313,149 +327,149 @@ def extract_Archaea_info(name, rank):
                             
     return submittable, name, taxid
 
-def extract_genomes_info(inputFile, genomeType, live):
-    genomeInfo = read_and_cleanse_metadata_tsv(inputFile, genomeType, live)
-    for gen in genomeInfo:
-        genomeInfo[gen]["accessions"] = genomeInfo[gen]["accessions"].split(',')
-        accessionType = "run"
-        assembly_regExp = re.compile(r"([E|S|D]RZ\d{6,})")
-        if assembly_regExp.findall(genomeInfo[gen]["accessions"][0]):
-            accessionType = "assembly"
-        genomeInfo[gen]["accessionType"] = accessionType
-
-        genomeInfo[gen]["isolationSource"] = genomeInfo[gen]["metagenome"]
+def extract_genomes_info(input_file, genome_type, live):
+    genome_info = validate_metadata_tsv(input_file, genome_type, live)
+    for gen in genome_info:
+        genome_info[gen]["accessions"] = genome_info[gen]["accessions"].split(',')
+        accession_type = "run"
+        assembly_reg_exp = re.compile(r"([E|S|D]RZ\d{6,})")
+        if assembly_reg_exp.findall(genome_info[gen]["accessions"][0]):
+            accession_type = "assembly"
+        genome_info[gen]["accessionType"] = accession_type
+
+        genome_info[gen]["isolationSource"] = genome_info[gen]["metagenome"]
         
         try:
-            (genomeInfo[gen]["MAG_quality"], 
-            genomeInfo[gen]["completeness"], 
-            genomeInfo[gen]["contamination"]) = compute_MAG_quality(
-                                    str(round_stats(genomeInfo[gen]["completeness"])),
-                                    str(round_stats(genomeInfo[gen]["contamination"])), 
-                                    genomeInfo[gen]["rRNA_presence"])
+            (genome_info[gen]["MAG_quality"], 
+            genome_info[gen]["completeness"], 
+            genome_info[gen]["contamination"]) = compute_mag_quality(
+                                    str(round_stats(genome_info[gen]["completeness"])),
+                                    str(round_stats(genome_info[gen]["contamination"])), 
+                                    genome_info[gen]["rRNA_presence"])
         except IndexError:
             pass
 
-        if str(genomeInfo[gen]["co-assembly"]).lower() in ["yes", "y", "true"]:
-            genomeInfo[gen]["co-assembly"] = True
+        if str(genome_info[gen]["co-assembly"]).lower() in ["yes", "y", "true"]:
+            genome_info[gen]["co-assembly"] = True
         else:
-            genomeInfo[gen]["co-assembly"] = False
+            genome_info[gen]["co-assembly"] = False
         
-        genomeInfo[gen]["alias"] = gen
+        genome_info[gen]["alias"] = gen
 
-        taxID, scientificName = extract_tax_info(genomeInfo[gen]["NCBI_lineage"])
-        genomeInfo[gen]["taxID"] = taxID
-        genomeInfo[gen]["scientific_name"] = scientificName
+        tax_id, scientific_name = extract_tax_info(genome_info[gen]["NCBI_lineage"])
+        genome_info[gen]["taxID"] = tax_id
+        genome_info[gen]["scientific_name"] = scientific_name
 
-    return genomeInfo
+    return genome_info
 
-def multipleElementSet(metadataList):
-    return len(set(metadataList))>1
+def multiple_element_set(metadata_list):
+    return len(set(metadata_list))>1
 
-def combine_ENA_info(genomeInfo, ENADict):
-    for g in genomeInfo:
+def combine_ENA_info(genome_info, ena_dict):
+    for g in genome_info:
         # TODO: optimise all the part below
-        if genomeInfo[g]["co-assembly"]:
-            instrumentList, collectionList, countryList = [], [], []
-            studyList, descriptionList, samplesList = [], [], []
-            longList, latitList = [], []
-            for run in genomeInfo[g]["accessions"]:
-                instrumentList.append(ENADict[run]["instrumentModel"])
-                collectionList.append(ENADict[run]["collectionDate"])
-                countryList.append(ENADict[run]["country"])
-                studyList.append(ENADict[run]["study"])
-                descriptionList.append(ENADict[run]["projectDescription"])
-                samplesList.append(ENADict[run]["sampleAccession"])
-                longList.append(ENADict[run]["longitude"])
-                latitList.append(ENADict[run]["latitude"])
-
-            genomeInfo[g]["study"] = studyList[0] 
-            genomeInfo[g]["description"] = descriptionList[0]
+        if genome_info[g]["co-assembly"]:
+            instrument_list, collection_list, country_list = [], [], []
+            study_list, description_list, samples_list = [], [], []
+            long_list, latit_list = [], []
+            for run in genome_info[g]["accessions"]:
+                instrument_list.append(ena_dict[run]["instrumentModel"])
+                collection_list.append(ena_dict[run]["collectionDate"])
+                country_list.append(ena_dict[run]["country"])
+                study_list.append(ena_dict[run]["study"])
+                description_list.append(ena_dict[run]["projectDescription"])
+                samples_list.append(ena_dict[run]["sampleAccession"])
+                long_list.append(ena_dict[run]["longitude"])
+                latit_list.append(ena_dict[run]["latitude"])
+
+            genome_info[g]["study"] = study_list[0] 
+            genome_info[g]["description"] = description_list[0]
             
-            instrument = instrumentList[0]
-            if multipleElementSet(instrumentList):
-                instrument = ','.join(instrumentList)
-            genomeInfo[g]["sequencingMethod"] = instrument
-
-            collectionDate = collectionList[0]
-            if multipleElementSet(collectionList):
-                collectionDate = "not provided"
-            genomeInfo[g]["collectionDate"] = collectionDate
-
-            country = countryList[0]
-            if multipleElementSet(countryList):
+            instrument = instrument_list[0]
+            if multiple_element_set(instrument_list):
+                instrument = ','.join(instrument_list)
+            genome_info[g]["sequencingMethod"] = instrument
+
+            collection_date = collection_list[0]
+            if multiple_element_set(collection_list):
+                collection_date = "not provided"
+            genome_info[g]["collectionDate"] = collection_date
+
+            country = country_list[0]
+            if multiple_element_set(country_list):
                 country = "not applicable"
-            genomeInfo[g]["country"] = country
+            genome_info[g]["country"] = country
 
-            latitude = latitList[0]
-            if multipleElementSet(latitList):
+            latitude = latit_list[0]
+            if multiple_element_set(latit_list):
                 latitude = "not provided"
-            genomeInfo[g]["latitude"] = str(round(float(latitude), GEOGRAPHY_DIGIT_COORDS))
+            genome_info[g]["latitude"] = str(round(float(latitude), GEOGRAPHY_DIGIT_COORDS))
 
-            longitude = longList[0]
-            if multipleElementSet(longList):
+            longitude = long_list[0]
+            if multiple_element_set(long_list):
                 longitude = "not provided"
-            genomeInfo[g]["longitude"] = str(round(float(longitude), GEOGRAPHY_DIGIT_COORDS))
+            genome_info[g]["longitude"] = str(round(float(longitude), GEOGRAPHY_DIGIT_COORDS))
 
-            samples = samplesList[0]
-            if multipleElementSet(samplesList):
-                samples = ','.join(samplesList)
-            genomeInfo[g]["sample_accessions"] = samples
+            samples = samples_list[0]
+            if multiple_element_set(samples_list):
+                samples = ','.join(samples_list)
+            genome_info[g]["sample_accessions"] = samples
         else:
-            run = genomeInfo[g]["accessions"][0]
-            genomeInfo[g]["sequencingMethod"] = ENADict[run]["instrumentModel"]
-            genomeInfo[g]["collectionDate"] = ENADict[run]["collectionDate"]
-            genomeInfo[g]["study"] = ENADict[run]["study"]
-            genomeInfo[g]["description"] = ENADict[run]["projectDescription"]
-            genomeInfo[g]["sample_accessions"] = ENADict[run]["sampleAccession"]
-            genomeInfo[g]["country"] = ENADict[run]["country"]
-            genomeInfo[g]["longitude"] = ENADict[run]["longitude"]
-            genomeInfo[g]["latitude"] = ENADict[run]["latitude"]
+            run = genome_info[g]["accessions"][0]
+            genome_info[g]["sequencingMethod"] = ena_dict[run]["instrumentModel"]
+            genome_info[g]["collectionDate"] = ena_dict[run]["collectionDate"]
+            genome_info[g]["study"] = ena_dict[run]["study"]
+            genome_info[g]["description"] = ena_dict[run]["projectDescription"]
+            genome_info[g]["sample_accessions"] = ena_dict[run]["sampleAccession"]
+            genome_info[g]["country"] = ena_dict[run]["country"]
+            genome_info[g]["longitude"] = ena_dict[run]["longitude"]
+            genome_info[g]["latitude"] = ena_dict[run]["latitude"]
         
-        genomeInfo[g]["accessions"] = ','.join(genomeInfo[g]["accessions"])
+        genome_info[g]["accessions"] = ','.join(genome_info[g]["accessions"])
 
 
 
-def getAccessions(accessionsFile):
-    accessionDict = {}
-    with open(accessionsFile, 'r') as f:
+def get_accessions(accessions_file):
+    accession_dict = {}
+    with open(accessions_file, 'r') as f:
         for line in f:
             line = line.split('\t')
             alias = line[0]
             accession = line[1].rstrip('\n')
-            accessionDict[alias] = accession
+            accession_dict[alias] = accession
 
-    return accessionDict
+    return accession_dict
 
-def saveAccessions(aliasAccessionDict, accessionsFile, writeMode):
-    with open(accessionsFile, writeMode) as f:
-        for elem in aliasAccessionDict:
-            f.write("{}\t{}\n".format(elem, aliasAccessionDict[elem]))
+def save_accessions(alias_accession_dict, accessions_file, write_mode):
+    with open(accessions_file, write_mode) as f:
+        for elem in alias_accession_dict:
+            f.write("{}\t{}\n".format(elem, alias_accession_dict[elem]))
 
-def create_manifest_dictionary(run, alias, assemblySoftware, sequencingMethod, 
-    MAGpath, gen, study, coverage, isCoassembly):
-    manifestDict = {
+def create_manifest_dictionary(run, alias, assembly_software, sequencing_method, 
+    mag_path, gen, study, coverage, is_coassembly):
+    manifest_dict = {
         "accessions" : run,
         "alias" : alias,
-        "assembler" : assemblySoftware, 
-        "sequencingMethod" : sequencingMethod, 
-        "genome_path" : MAGpath,
+        "assembler" : assembly_software, 
+        "sequencingMethod" : sequencing_method, 
+        "genome_path" : mag_path,
         "genome_name" : gen,
         "study" : study,
         "coverageDepth" : coverage, 
-        "co-assembly" : isCoassembly
+        "co-assembly" : is_coassembly
     }
 
-    return manifestDict
+    return manifest_dict
 
 def compute_manifests(genomes):
-    manifestInfo = {}
+    manifest_info = {}
     for g in genomes:
-        manifestInfo[g] = create_manifest_dictionary(genomes[g]["accessions"],
+        manifest_info[g] = create_manifest_dictionary(genomes[g]["accessions"],
             genomes[g]["alias"], genomes[g]["assembly_software"], 
             genomes[g]["sequencingMethod"], genomes[g]["genome_path"], g, 
             genomes[g]["study"], genomes[g]["genome_coverage"], genomes[g]["co-assembly"])
     
-    return manifestInfo
+    return manifest_info
 
 def get_study_from_xml(sample):
     description = sample.childNodes[5].childNodes[0].data
@@ -463,7 +477,7 @@ def get_study_from_xml(sample):
 
     return study
 
-def recover_info_from_xml(genomeDict, sample_xml, live_mode):
+def recover_info_from_xml(genome_dict, sample_xml, live_mode):
     logger.info("Retrieving data for genome submission...")
 
     # extract list of genomes (samples) to be registered
@@ -474,42 +488,42 @@ def recover_info_from_xml(genomeDict, sample_xml, live_mode):
         study = get_study_from_xml(s)
 
         # extract alias from xml and find a match with genomes the user is uploading
-        XMLalias = s.attributes["alias"].value
+        xml_alias = s.attributes["alias"].value
         if not live_mode:         # remove time stamp if test mode is selected
-            aliasSplit = XMLalias.split("_")
-            XMLalias = '_'.join(aliasSplit[:-1])
-        for gen in genomeDict:
+            alias_split = xml_alias.split("_")
+            xml_alias = '_'.join(alias_split[:-1])
+        for gen in genome_dict:
             # if match is found, associate attributes listed in the xml file
             # with genomes to upload
-            if XMLalias == gen:
+            if xml_alias == gen:
                 if not live_mode:
-                    currentTimestamp = str(int(dt.timestamp(dt.now())))
-                    XMLalias = gen + '_' + currentTimestamp
-                    s.attributes["alias"].value = XMLalias
-                    sampleTitle = s.getElementsByTagName("TITLE")[0]
-                    sampleTitleValue = sampleTitle.firstChild.nodeValue.split("_")
-                    sampleTitleValue[-1] = currentTimestamp
-                    newSampleTitle = '_'.join(sampleTitleValue)
-                    s.getElementsByTagName("TITLE")[0].firstChild.replaceWholeText(newSampleTitle)
+                    current_time_stamp = str(int(dt.timestamp(dt.now())))
+                    xml_alias = gen + '_' + current_time_stamp
+                    s.attributes["alias"].value = xml_alias
+                    sample_title = s.getElementsByTagName("TITLE")[0]
+                    sample_title_value = sample_title.firstChild.nodeValue.split("_")
+                    sample_title_value[-1] = current_time_stamp
+                    new_sample_title = '_'.join(sample_title_value)
+                    s.getElementsByTagName("TITLE")[0].firstChild.replaceWholeText(new_sample_title)
                 attributes = s.childNodes[7].getElementsByTagName("SAMPLE_ATTRIBUTE")
-                seqMethod, assSoftware = "", ""
+                seq_method, ass_software = "", ""
                 for a in attributes:
-                    tagElem = a.getElementsByTagName("TAG")
-                    tag = tagElem[0].childNodes[0].nodeValue
+                    tag_elem = a.getElementsByTagName("TAG")
+                    tag = tag_elem[0].childNodes[0].nodeValue
                     if tag == "sequencing method":
-                        seqMethodElem = a.getElementsByTagName("VALUE")
-                        seqMethod = seqMethodElem[0].childNodes[0].nodeValue
+                        seq_method_elem = a.getElementsByTagName("VALUE")
+                        seq_method = seq_method_elem[0].childNodes[0].nodeValue
                     elif tag == "assembly software":
-                        assSoftwareElem = a.getElementsByTagName("VALUE")
-                        assSoftware = assSoftwareElem[0].childNodes[0].nodeValue
-                    if not seqMethod == "" and not assSoftware == "":
+                        ass_software_elem = a.getElementsByTagName("VALUE")
+                        ass_software = ass_software_elem[0].childNodes[0].nodeValue
+                    if not seq_method == "" and not ass_software == "":
                         break
 
-                genomeDict[gen]["accessions"] = ','.join(genomeDict[gen]["accessions"])
-                genomeDict[gen]["alias"] = XMLalias
-                genomeDict[gen]["assembly_software"] = assSoftware
-                genomeDict[gen]["sequencingMethod"] = seqMethod
-                genomeDict[gen]["study"] = study
+                genome_dict[gen]["accessions"] = ','.join(genome_dict[gen]["accessions"])
+                genome_dict[gen]["alias"] = xml_alias
+                genome_dict[gen]["assembly_software"] = ass_software
+                genome_dict[gen]["sequencingMethod"] = seq_method
+                genome_dict[gen]["study"] = study
                 break
 
     if not live_mode:
@@ -537,7 +551,7 @@ def create_sample_attribute(sample_attributes, data_list, mag_data=None):
     if units:
         ET.SubElement(new_sample_attr, 'UNITS').text = units
 
-def write_genomes_xml(genomes, xml_path, genomeType, centreName, tpa):
+def write_genomes_xml(genomes, xml_path, genome_type, centre_name, tpa):
     map_sample_attributes = [
         # tag - value - unit (optional)
         ["project name", "description"],
@@ -561,10 +575,10 @@ def write_genomes_xml(genomes, xml_path, genomeType, centreName, tpa):
         ["metagenomic source", "metagenome"],
     ]
 
-    checklist, assemblyType = "ERC000047", "Metagenome-assembled genome"
-    if genomeType == "bins":
+    checklist, assembly_type = "ERC000047", "Metagenome-assembled genome"
+    if genome_type == "bins":
         checklist = "ERC000050"
-        assemblyType = "binned metagenome"
+        assembly_type = "binned metagenome"
 
     constant_sample_attributes = [
         # tag - value
@@ -573,9 +587,9 @@ def write_genomes_xml(genomes, xml_path, genomeType, centreName, tpa):
         ["ENA-CHECKLIST", checklist],
     ]
 
-    tpaDescription = ""
+    tpa_description = ""
     if tpa:
-        tpaDescription = "Third Party Annotation (TPA) "
+        tpa_description = "Third Party Annotation (TPA) "
 
     sample_set = ET.Element("SAMPLE_SET")
 
@@ -584,15 +598,15 @@ def write_genomes_xml(genomes, xml_path, genomeType, centreName, tpa):
         if genomes[g]["co-assembly"]:
             plural = 's'
         description = ("This sample represents a {}{} assembled from the "
-            "metagenomic run{} {} of study {}.".format(tpaDescription, 
-            assemblyType, plural, genomes[g]["accessions"], 
+            "metagenomic run{} {} of study {}.".format(tpa_description, 
+            assembly_type, plural, genomes[g]["accessions"], 
             genomes[g]["study"]))
         
         sample = ET.SubElement(sample_set, "SAMPLE")
         sample.set("alias", genomes[g]["alias"])
-        sample.set("center_name", centreName)
+        sample.set("center_name", centre_name)
 
-        ET.SubElement(sample, 'TITLE').text = ("{}: {}".format(assemblyType, 
+        ET.SubElement(sample, 'TITLE').text = ("{}: {}".format(assembly_type, 
             genomes[g]["alias"]))
         sample_name = ET.SubElement(sample, "SAMPLE_NAME")
         ET.SubElement(sample_name, "TAXON_ID").text = genomes[g]["taxID"]
@@ -641,35 +655,35 @@ def write_submission_xml(upload_dir, centre_name, study=True):
 
     return sub_xml
 
-def generate_genome_manifest(genomeInfo, study, manifestsRoot, aliasToSample, genomeType, tpa):
-    manifest_path = os.path.join(manifestsRoot, f'{genomeInfo["genome_name"]}.manifest')
+def generate_genome_manifest(genome_info, study, manifests_root, alias_to_sample, genome_type, tpa):
+    manifest_path = os.path.join(manifests_root, f'{genome_info["genome_name"]}.manifest')
     
-    tpaAddition, multipleRuns = "", ""
+    tpa_addition, multiple_runs = "", ""
     if tpa:
-        tpaAddition = "Third Party Annotation (TPA) "
-    if genomeInfo["co-assembly"]:
-        multipleRuns = "s"
-    assemblyType = "Metagenome-Assembled Genome (MAG)"
-    if genomeType == "bins":
-        assemblyType = "binned metagenome"
+        tpa_addition = "Third Party Annotation (TPA) "
+    if genome_info["co-assembly"]:
+        multiple_runs = "s"
+    assembly_type = "Metagenome-Assembled Genome (MAG)"
+    if genome_type == "bins":
+        assembly_type = "binned metagenome"
 
     values = (
         ('STUDY', study),
-        ('SAMPLE', aliasToSample[genomeInfo["alias"]]),
-        ('ASSEMBLYNAME', genomeInfo["alias"]),
-        ('ASSEMBLY_TYPE', assemblyType),
-        ('COVERAGE', genomeInfo["coverageDepth"]),
-        ('PROGRAM', genomeInfo["assembler"]),
-        ('PLATFORM', genomeInfo["sequencingMethod"]),
+        ('SAMPLE', alias_to_sample[genome_info["alias"]]),
+        ('ASSEMBLYNAME', genome_info["alias"]),
+        ('ASSEMBLY_TYPE', assembly_type),
+        ('COVERAGE', genome_info["coverageDepth"]),
+        ('PROGRAM', genome_info["assembler"]),
+        ('PLATFORM', genome_info["sequencingMethod"]),
         ('MOLECULETYPE', "genomic DNA"),
         ('DESCRIPTION', ("This is a {}bin derived from the primary whole genome "
             "shotgun (WGS) data set {}. This sample represents a {} from the "
-            "metagenomic run{} {}.".format(tpaAddition, genomeInfo["study"], 
-            assemblyType, multipleRuns, genomeInfo["accessions"]))),
-        ('RUN_REF', genomeInfo["accessions"]),
-        ('FASTA', os.path.abspath(genomeInfo["genome_path"]))
+            "metagenomic run{} {}.".format(tpa_addition, genome_info["study"], 
+            assembly_type, multiple_runs, genome_info["accessions"]))),
+        ('RUN_REF', genome_info["accessions"]),
+        ('FASTA', os.path.abspath(genome_info["genome_path"]))
     )
-    logger.info("Writing manifest file (.manifest) for {}.".format(genomeInfo["alias"]))
+    logger.info("Writing manifest file (.manifest) for {}.".format(genome_info["alias"]))
     with open(manifest_path, "w") as outfile:
         for (k, v) in values:
             manifest = f'{k}\t{v}\n'
@@ -684,63 +698,62 @@ def main():
         logger.warning("Warning: genome submission is not in live mode, " +
             "files will be validated, but not uploaded.")
 
-    xmlGenomeFile, xmlSubFile = "genome_samples.xml", "submission.xml"
-    samples_xml = os.path.join(ENA_uploader.upload_dir, xmlGenomeFile)
-    submissionXmlPath = os.path.join(ENA_uploader.upload_dir, xmlSubFile)
-    submission_xml = submissionXmlPath
-    genomes, manifestInfo = {}, {}
+    xml_genome_file, xml_sub_file = "genome_samples.xml", "submission.xml"
+    samples_xml = os.path.join(ENA_uploader.upload_dir, xml_genome_file)
+    submission_xml_path = os.path.join(ENA_uploader.upload_dir, xml_sub_file)
+    submission_xml = submission_xml_path
+    genomes, manifest_info = {}, {}
 
     # submission xml existence
-    if not os.path.exists(submissionXmlPath):
+    if not os.path.exists(submission_xml_path):
         submission_xml = write_submission_xml(ENA_uploader.upload_dir, ENA_uploader.centre_name, False)
 
     # sample xml generation or recovery
     genomes = ENA_uploader.create_genome_dictionary(samples_xml)
         
     # manifests creation
-    manifestDir = os.path.join(ENA_uploader.upload_dir, "manifests")
-    os.makedirs(manifestDir, exist_ok=True)
+    manifest_dir = os.path.join(ENA_uploader.upload_dir, "manifests")
+    os.makedirs(manifest_dir, exist_ok=True)
     
     accessionsgen = "registered_MAGs.tsv"
-    if ENA_uploader.genomeType == "bins":
+    if ENA_uploader.genome_type == "bins":
         accessionsgen = accessionsgen.replace("MAG", "bin")
     if not ENA_uploader.live:
         accessionsgen = accessionsgen.replace(".tsv", "_test.tsv")
     
-    accessionsFile = os.path.join(ENA_uploader.upload_dir, accessionsgen)
+    accessions_file = os.path.join(ENA_uploader.upload_dir, accessionsgen)
     save = False
-    writeMode = 'a'
-    if os.path.exists(accessionsFile):
+    write_mode = 'a'
+    if os.path.exists(accessions_file):
         if not ENA_uploader.live:
             save = True
             if ENA_uploader.force:
-                writeMode = 'w'
+                write_mode = 'w'
         if not save:
             logger.info("Genome samples already registered, reading ERS accessions...")
-            aliasToNewSampleAccession = getAccessions(accessionsFile)
+            alias_to_new_sample_accession = get_accessions(accessions_file)
     else:
         save = True
         
     if save:
         logger.info("Registering genome samples XMLs...")
         ena_submit = EnaSubmit(samples_xml, submission_xml, ENA_uploader.live)
-        aliasToNewSampleAccession = ena_submit.handle_genomes_registration()
-        saveAccessions(aliasToNewSampleAccession, accessionsFile, writeMode)
+        alias_to_new_sample_accession = ena_submit.handle_genomes_registration()
+        save_accessions(alias_to_new_sample_accession, accessions_file, write_mode)
 
     logger.info("Generating manifest files...")
     
-    manifestInfo = compute_manifests(genomes)
+    manifest_info = compute_manifests(genomes)
 
-    for m in manifestInfo:
-        generate_genome_manifest(manifestInfo[m], ENA_uploader.upStudy,  
-            manifestDir, aliasToNewSampleAccession, ENA_uploader.genomeType, ENA_uploader.tpa)
+    for m in manifest_info:
+        generate_genome_manifest(manifest_info[m], ENA_uploader.up_study,  
+            manifest_dir, alias_to_new_sample_accession, ENA_uploader.genome_type, ENA_uploader.tpa)
 
 class GenomeUpload:
     def __init__(self, argv=sys.argv[1:]):
         self.args = self.parse_args(argv)
-        self.upStudy = self.args.upload_study
-        self.genomeMetadata = self.args.genome_info
-        self.genomeType = "bins" if self.args.bins else "MAGs"
+        self.up_study = self.args.upload_study
+        self.genome_type = "bins" if self.args.bins else "MAGs"
         self.live = True if self.args.live else False
         self.private = self.args.private
 
@@ -748,101 +761,78 @@ def __init__(self, argv=sys.argv[1:]):
         self.centre_name = self.args.centre_name
         self.force = True if self.args.force else False
 
-        workDir = self.args.out if self.args.out else os.getcwd()
-        self.upload_dir = self.generate_genomes_upload_dir(workDir, self.genomeType)
+        work_dir = self.args.out if self.args.out else os.getcwd()
+        self.upload_dir = self.generate_genomes_upload_dir(work_dir, self.genome_type)
 
         self.private = self.args.private
-    
-    def parse_args(self, argv):
-        parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter,
-            description="Create xmls and manifest files for genome upload to ENA")
-        
-        parser.add_argument('-u', '--upload_study', type=str, help="Study accession for genomes upload")
-        parser.add_argument('--genome_info', type=str, required=True, help="Genomes metadata file")
 
-        genomeType = parser.add_mutually_exclusive_group(required=True)
-        genomeType.add_argument('-m', '--mags', action='store_true', help="Select for MAG upload")
-        genomeType.add_argument('-b', '--bins', action='store_true', help="Select for bin upload")
-        
-        parser.add_argument('--out', type=str, help="Output folder. Default: working directory")
-        parser.add_argument('--force', action='store_true', help="Forces reset of sample xml's backups")
-        parser.add_argument('--live', action='store_true', help="Uploads on ENA. Omitting this " +
-            "option allows to validate samples beforehand")
-        parser.add_argument('--tpa', action='store_true', help="Select if uploading TPA-generated genomes")
-        
-        # Users can provide their credentials and centre name manually or using a config file
-        parser.add_argument('--centre_name', required=False, help="Name of the centre uploading genomes")
-        parser.add_argument('--private', required=False, help="if data is private", action='store_true', default=False)
-
-        args = parser.parse_args(argv)
-
-        if not args.upload_study:
+        if not self.args.upload_study:
             raise ValueError("No project selected for genome upload [-u, --upload_study].")
-        
-        if not os.path.exists(args.genome_info):
-            raise FileNotFoundError('Genome metadata file "{}" does not exist'.format(args.genome_info))
+        self.upload_study = self.args.upload_study
 
-        return args
+        if not os.path.exists(self.args.genome_info):
+            raise FileNotFoundError('Genome metadata file "{}" does not exist'.format(self.args.genome_info))
+        self.genome_metadata = self.args.genome_info
 
-    def extract_ENA_info(self, genomeInfo, uploadDir, private=False):
+    def extract_ena_info(self, genome_info, upload_dir):
         logger.info('Retrieving project and run info from ENA (this might take a while)...')
         
         # retrieving metadata from runs (and runs from assembly accessions if provided)
-        allRuns = []
-        for g in genomeInfo:
-            if genomeInfo[g]["accessionType"] == "assembly":
-                derivedRuns = []
-                for acc in genomeInfo[g]["accessions"]:
+        all_runs = []
+        for g in genome_info:
+            if genome_info[g]["accessionType"] == "assembly":
+                derived_runs = []
+                for acc in genome_info[g]["accessions"]:
                     ena_query = EnaQuery(acc, "run_assembly", self.private)
-                    derivedRuns.append(ena_query.build_query())
-                genomeInfo[g]["accessions"] = derivedRuns
-            allRuns.extend(genomeInfo[g]["accessions"])
+                    derived_runs.append(ena_query.build_query())
+                genome_info[g]["accessions"] = derived_runs
+            all_runs.extend(genome_info[g]["accessions"])
 
-        runsSet, studySet, samplesDict, tempDict = set(allRuns), set(), {}, {}
-        for r in runsSet:
+        runs_set, study_set, samples_dict, temp_dict = set(all_runs), set(), {}, {}
+        for r in runs_set:
             ena_query = EnaQuery(r, "run", self.private)
             run_info = ena_query.build_query()
-            studySet.add(run_info["secondary_study_accession"])
-            samplesDict[r] = run_info["sample_accession"]
+            study_set.add(run_info["secondary_study_accession"])
+            samples_dict[r] = run_info["sample_accession"]
         
-        if not studySet:
+        if not study_set:
             raise ValueError("No study corresponding to runs found.")
 
-        backupFile = os.path.join(uploadDir, "ENA_backup.json")
+        backup_file = os.path.join(upload_dir, "ENA_backup.json")
         counter = 0
-        if not os.path.exists(backupFile):
-            with open(backupFile, 'w') as file:
+        if not os.path.exists(backup_file):
+            with open(backup_file, 'w') as file:
                 pass
-        with open(backupFile, "r+") as file:
+        with open(backup_file, "r+") as file:
             try:
-                backupDict = json.load(file)
-                tempDict = dict(backupDict)
-                logger.info(f"A backup file {backupFile} for ENA sample metadata has been found.")
+                backup_dict = json.load(file)
+                temp_dict = dict(backup_dict)
+                logger.info(f"A backup file {backup_file} for ENA sample metadata has been found.")
             except json.decoder.JSONDecodeError:
-                backupDict = {}
-            for s in studySet:
+                backup_dict = {}
+            for s in study_set:
                 ena_query = EnaQuery(s, "study", self.private)
-                studyInfo = ena_query.build_query()
-                projectDescription = studyInfo["study_description"]
+                study_info = ena_query.build_query()
+                project_description = study_info["study_description"]
 
                 ena_query = EnaQuery(s, "study_runs", self.private)
-                ENA_info = ena_query.build_query()
-                if ENA_info == []:
+                ena_info = ena_query.build_query()
+                if ena_info == []:
                     raise IOError("No runs found on ENA for project {}.".format(s))
                 
-                for run, item in enumerate(ENA_info):
-                    runAccession = ENA_info[run]["run_accession"]
-                    if runAccession not in backupDict:
-                        if runAccession in runsSet:
-                            sampleAccession = ENA_info[run]["sample_accession"]
-                            ena_query = EnaQuery(sampleAccession, "sample", self.private)
-                            sampleInfo = ena_query.build_query()
-
-                            if 'latitude' in sampleInfo and 'longitude' in sampleInfo:
-                                latitude =  sampleInfo['latitude']
-                                longitude = sampleInfo['longitude']
+                for run, item in enumerate(ena_info):
+                    run_accession = ena_info[run]["run_accession"]
+                    if run_accession not in backup_dict:
+                        if run_accession in runs_set:
+                            sample_accession = ena_info[run]["sample_accession"]
+                            ena_query = EnaQuery(sample_accession, "sample", self.private)
+                            sample_info = ena_query.build_query()
+
+                            if 'latitude' in sample_info and 'longitude' in sample_info:
+                                latitude =  sample_info['latitude']
+                                longitude = sample_info['longitude']
                             else:
-                                location = sampleInfo["location"]
+                                location = sample_info["location"]
                                 latitude, longitude = None, None
                                 if 'N' in location:
                                     latitude = location.split('N')[0].strip()
@@ -866,58 +856,58 @@ def extract_ENA_info(self, genomeInfo, uploadDir, private=False):
                             else:
                                 longitude = "not provided"
     
-                            country = sampleInfo["country"].split(':')[0]
+                            country = sample_info["country"].split(':')[0]
                             if not country in GEOGRAPHIC_LOCATIONS:
                                 country = "not provided"
                             
-                            collectionDate = sampleInfo["collection_date"]
+                            collectionDate = sample_info["collection_date"]
                             if collectionDate == "" or collectionDate == "missing":
                                 collectionDate = "not provided"
 
-                            tempDict[runAccession] = {
-                                "instrumentModel" : ENA_info[run]["instrument_model"],
+                            temp_dict[run_accession] = {
+                                "instrumentModel" : ena_info[run]["instrument_model"],
                                 "collectionDate" : collectionDate,
                                 "country" : country,
                                 "latitude" : latitude,
                                 "longitude" : longitude,
-                                "projectDescription" : projectDescription,
+                                "projectDescription" : project_description,
                                 "study" : s,
-                                "sampleAccession" : samplesDict[runAccession]
+                                "sampleAccession" : samples_dict[run_accession]
                             }
                             counter += 1
 
-                            if (counter%10 == 0) or (len(runsSet) - len(backupDict) == counter):
+                            if (counter%10 == 0) or (len(runs_set) - len(backup_dict) == counter):
                                 file.seek(0)
-                                file.write(json.dumps(tempDict))
+                                file.write(json.dumps(temp_dict))
                                 file.truncate()
-        tempDict = {**tempDict, **backupDict}
-        combine_ENA_info(genomeInfo, tempDict)
+        temp_dict = {**temp_dict, **backup_dict}
+        combine_ENA_info(genome_info, temp_dict)
 
 
-    def generate_genomes_upload_dir(self, dir, genomeType):
-        uploadName = "MAG_upload"
-        if genomeType == "bins":
-            uploadName = uploadName.replace("MAG", "bin")
-        upload_dir = os.path.join(dir, uploadName)
+    def generate_genomes_upload_dir(self, dir, genome_type):
+        upload_name = "MAG_upload"
+        if genome_type == "bins":
+            upload_name = upload_name.replace("MAG", "bin")
+        upload_dir = os.path.join(dir, upload_name)
         os.makedirs(upload_dir, exist_ok=True)
         return upload_dir
 
     def create_genome_dictionary(self, samples_xml):
         logger.info('Retrieving data for MAG submission...')
 
-        genomeInfo = extract_genomes_info(self.genomeMetadata, self.genomeType, self.live)
+        genome_info = extract_genomes_info(self.genome_metadata, self.genome_type, self.live)
 
         if not os.path.exists(samples_xml) or self.force:
-            extract_ENA_info(genomeInfo, self.upload_dir, self.username, self.password, self.private)
+            self.extract_ena_info(genome_info, self.upload_dir)
             logger.info("Writing genome registration XML...")
 
-            write_genomes_xml(genomeInfo, samples_xml, self.genomeType, 
+            write_genomes_xml(genome_info, samples_xml, self.genome_type, 
                               self.centre_name, self.tpa)
             logger.info("All files have been written to " + self.upload_dir)
         else:
-            recover_info_from_xml(genomeInfo, samples_xml, self.live)
+            recover_info_from_xml(genome_info, samples_xml, self.live)
 
-        return genomeInfo
+        return genome_info
 
 if __name__ == "__main__":
     main()

From d85b0c6aa9b360174bdcf08552b26f3f516141c1 Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Thu, 19 Dec 2024 12:02:46 +0000
Subject: [PATCH 10/20] modify class and argument parsing

---
 genomeuploader/genome_upload.py | 470 +++++++++++++++++---------------
 1 file changed, 251 insertions(+), 219 deletions(-)

diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py
index 9136118..d34435c 100755
--- a/genomeuploader/genome_upload.py
+++ b/genomeuploader/genome_upload.py
@@ -37,24 +37,24 @@
 
 def parse_args(argv):
     parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter,
-        description="Create xmls and manifest files for genome upload to ENA")
+        description="Create xmls and manifest files for genome upload and upload to ENA")
     
-    parser.add_argument('-u', '--upload_study', type=str, help="Study accession for genomes upload")
+    parser.add_argument('-u', '--upload_study', type=str, required=True, help="Study accession for genomes upload")
     parser.add_argument('--genome_info', type=str, required=True, help="Genomes metadata file")
 
-    genomeType = parser.add_mutually_exclusive_group(required=True)
+    genomeType = parser.add_mutually_exclusive_group(required=True, default=False)
     genomeType.add_argument('-m', '--mags', action='store_true', help="Select for MAG upload")
     genomeType.add_argument('-b', '--bins', action='store_true', help="Select for bin upload")
     
     parser.add_argument('--out', type=str, help="Output folder. Default: working directory")
-    parser.add_argument('--force', action='store_true', help="Forces reset of sample xml's backups")
-    parser.add_argument('--live', action='store_true', help="Uploads on ENA. Omitting this " +
+    parser.add_argument('--force', action='store_true', required=False, default=False, help="Forces reset of sample xml's backups")
+    parser.add_argument('--live', action='store_true', required=False, default=False, help="Uploads on ENA. Omitting this " +
         "option allows to validate samples beforehand")
-    parser.add_argument('--tpa', action='store_true', help="Select if uploading TPA-generated genomes")
+    parser.add_argument('--tpa', action='store_true', required=False, default=False, help="Select if uploading TPA-generated genomes")
     
-    # Users can provide their credentials and centre name manually or using a config file
-    parser.add_argument('--centre_name', required=False, help="Name of the centre uploading genomes")
-    parser.add_argument('--private', required=False, help="if data is private", action='store_true', default=False)
+    # Users can provide their credentials in environment variables or using a config file
+    parser.add_argument('--centre_name', required=True, help="Name of the centre uploading genomes")
+    parser.add_argument('--private', required=False, action='store_true', default=False, help="if data is private",)
 
     return parser.parse_args(argv)
 
@@ -79,99 +79,6 @@ def parse_args(argv):
     genome_path: path to genome to upload
 '''
 
-def validate_metadata_tsv(input_file, genome_type, live):
-    logger.info('Retrieving info for genomes to submit...')
-    
-    all_fields = MAG_MANDATORY_FIELDS + BIN_MANDATORY_FIELDS
-    metadata = pd.read_csv(input_file, sep='\t', usecols=all_fields)
-    
-    # make sure there are no empty cells
-    clean_columns = list(metadata.dropna(axis=1))
-    if genome_type == "MAGs":
-        missing_values = [item for item in all_fields if item not in clean_columns]
-    else:
-        missing_values = [item for item in BIN_MANDATORY_FIELDS if item not in clean_columns]
-    
-    if missing_values:
-        raise ValueError("The following mandatory fields have missing values in " +
-            "the input file: {}".format(", ".join(missing_values)))
-
-    # check amount of genomes to register at the same time
-    if len(metadata) >= 5000:
-        raise ValueError("Genomes need to be registered in batches of 5000 genomes or smaller.")
-
-    # check whether accessions follow the right format
-    accessions_reg_exp = re.compile(r"([E|S|D]R[R|Z]\d{6,})")
-
-    accession_comparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", 
-        "correct", "mismatching", "co-assembly"])
-    accession_comparison["genome_name"] = metadata["genome_name"]
-
-    accession_comparison["attemptive_accessions"] = metadata["accessions"].map(
-        lambda a: len(a.split(',')))
-
-    accession_comparison["correct"] = metadata["accessions"].map(
-        lambda a: len(accessions_reg_exp.findall(a)))
-
-    accession_comparison["mismatching"] = accession_comparison.apply(lambda row: 
-        True if row["attemptive_accessions"] == row["correct"] 
-        else None, axis=1).isna()
-
-    mismatching_accessions = accession_comparison[accession_comparison["mismatching"]]["genome_name"]
-    if not mismatching_accessions.empty:
-        raise ValueError("Run accessions are not correctly formatted for the following " + 
-            "genomes: " + ','.join(mismatching_accessions.values))
-
-    # check whether completeness and contamination are floats
-    try:
-        pd.to_numeric(metadata["completeness"])
-        pd.to_numeric(metadata["contamination"])
-        pd.to_numeric(metadata["genome_coverage"])
-    except:
-        raise ValueError("Completeness, contamination or coverage values should be formatted as floats")
-
-    # check whether all co-assemblies have more than one run associated and viceversa
-    accession_comparison["co-assembly"] = metadata["co-assembly"]
-    coassembly_discrepancy = metadata[(
-        (accession_comparison["correct"] < 2) & (accession_comparison["co-assembly"])) |
-        ((accession_comparison["correct"] > 1) & (~accession_comparison["co-assembly"])
-        )]["genome_name"]
-    if not coassembly_discrepancy.empty:
-        raise ValueError("The following genomes show discrepancy between number of runs "
-            "involved and co-assembly status: " + ','.join(coassembly_discrepancy.values))
-
-    # are provided metagenomes part of the accepted metagenome list?
-    if False in metadata.apply(lambda row: 
-        True if row["metagenome"] in METAGENOMES 
-        else False, axis=1).unique():
-        raise ValueError("Metagenomes associated with each genome need to belong to ENA's " +
-            "approved metagenomes list.")
-
-    # do provided file paths exist?
-    if False in metadata.apply(lambda row: 
-        True if os.path.exists(row["genome_path"]) 
-        else False, axis =1).unique():
-        raise FileNotFoundError("Some genome paths do not exist.")
-
-    # check genome name lengths
-    #if not (metadata["genome_name"].map(lambda a: len(a) < 20).all()):
-    #    raise ValueError("Genome names must be shorter than 20 characters.")
-
-    # create dictionary while checking genome name uniqueness
-    uniqueness = metadata["genome_name"].nunique() == metadata["genome_name"].size
-    if uniqueness:
-        if not live:
-            timestamp = str(int(dt.timestamp(dt.now())))
-            timestamp_names = [row["genome_name"] + '_' + timestamp for index, row in metadata.iterrows()]
-            metadata["unique_genome_name"] = timestamp_names
-            genome_info = metadata.set_index("unique_genome_name").transpose().to_dict()
-        else:
-            genome_info = metadata.set_index("genome_name").transpose().to_dict()
-    else:
-        raise ValueError("Duplicate names found in genome names")
-
-    return genome_info
-
 def round_stats(stats):
     new_stat = round(float(stats), 2)
     if new_stat == 100.0:
@@ -327,45 +234,10 @@ def extract_archaea_info(name, rank):
                             
     return submittable, name, taxid
 
-def extract_genomes_info(input_file, genome_type, live):
-    genome_info = validate_metadata_tsv(input_file, genome_type, live)
-    for gen in genome_info:
-        genome_info[gen]["accessions"] = genome_info[gen]["accessions"].split(',')
-        accession_type = "run"
-        assembly_reg_exp = re.compile(r"([E|S|D]RZ\d{6,})")
-        if assembly_reg_exp.findall(genome_info[gen]["accessions"][0]):
-            accession_type = "assembly"
-        genome_info[gen]["accessionType"] = accession_type
-
-        genome_info[gen]["isolationSource"] = genome_info[gen]["metagenome"]
-        
-        try:
-            (genome_info[gen]["MAG_quality"], 
-            genome_info[gen]["completeness"], 
-            genome_info[gen]["contamination"]) = compute_mag_quality(
-                                    str(round_stats(genome_info[gen]["completeness"])),
-                                    str(round_stats(genome_info[gen]["contamination"])), 
-                                    genome_info[gen]["rRNA_presence"])
-        except IndexError:
-            pass
-
-        if str(genome_info[gen]["co-assembly"]).lower() in ["yes", "y", "true"]:
-            genome_info[gen]["co-assembly"] = True
-        else:
-            genome_info[gen]["co-assembly"] = False
-        
-        genome_info[gen]["alias"] = gen
-
-        tax_id, scientific_name = extract_tax_info(genome_info[gen]["NCBI_lineage"])
-        genome_info[gen]["taxID"] = tax_id
-        genome_info[gen]["scientific_name"] = scientific_name
-
-    return genome_info
-
 def multiple_element_set(metadata_list):
     return len(set(metadata_list))>1
 
-def combine_ENA_info(genome_info, ena_dict):
+def combine_ena_info(genome_info, ena_dict):
     for g in genome_info:
         # TODO: optimise all the part below
         if genome_info[g]["co-assembly"]:
@@ -428,7 +300,6 @@ def combine_ENA_info(genome_info, ena_dict):
         genome_info[g]["accessions"] = ','.join(genome_info[g]["accessions"])
 
 
-
 def get_accessions(accessions_file):
     accession_dict = {}
     with open(accessions_file, 'r') as f:
@@ -691,90 +562,182 @@ def generate_genome_manifest(genome_info, study, manifests_root, alias_to_sample
         if tpa:
             outfile.write("TPA\ttrue\n")
 
-def main():
-    ENA_uploader = GenomeUpload()
-    
-    if not ENA_uploader.live:
-        logger.warning("Warning: genome submission is not in live mode, " +
-            "files will be validated, but not uploaded.")
-
-    xml_genome_file, xml_sub_file = "genome_samples.xml", "submission.xml"
-    samples_xml = os.path.join(ENA_uploader.upload_dir, xml_genome_file)
-    submission_xml_path = os.path.join(ENA_uploader.upload_dir, xml_sub_file)
-    submission_xml = submission_xml_path
-    genomes, manifest_info = {}, {}
-
-    # submission xml existence
-    if not os.path.exists(submission_xml_path):
-        submission_xml = write_submission_xml(ENA_uploader.upload_dir, ENA_uploader.centre_name, False)
-
-    # sample xml generation or recovery
-    genomes = ENA_uploader.create_genome_dictionary(samples_xml)
+
+class GenomeUpload:
+    def __init__(
+        self,
+        upload_study: str,
+        centre_name: str,
+        genome_info: str,
+        bins: bool = False,
+        live: bool = False,
+        private: bool = False,
+        tpa: bool = False,
+        force: bool = False,
+        out: str = None,
+        ):
+        f"""
+        Submission of genomes.
+
+        :param upload_study: Study accession for genomes upload.
+        :param centre_name: Name of the centre uploading genomes.
+        :param genome_info: Genomes metadata file.
+        :param bins: Performs bin upload.
+        :params live: Live upload to ENA.
+        :param private: Is this a private study?
+        :param tpa: Is this a third-party assembly?
+        :param force: Resets sample XML backups.
+        :param out: Output folder.
+        """
+
+        self.genome_type = "bins" if bins else "MAGs"
+        self.live = live
+        self.private = private
+
+        self.tpa = tpa
+        self.centre_name = centre_name
+        self.force = force
+
+        self.work_dir = out if out is not None else os.getcwd()
+        self.upload_dir = self.generate_genomes_upload_dir()
+
+        if not upload_study:
+            raise ValueError("No project selected for genome upload [-u, --upload_study].")
+        self.upload_study = upload_study
+
+        if not os.path.exists(genome_info):
+            raise FileNotFoundError(f"Genome metadata file {genome_info} does not exist")
+        self.genome_metadata = genome_info
+
+    def validate_metadata_tsv(self):
+        logger.info('Retrieving info for genomes to submit...')
         
-    # manifests creation
-    manifest_dir = os.path.join(ENA_uploader.upload_dir, "manifests")
-    os.makedirs(manifest_dir, exist_ok=True)
-    
-    accessionsgen = "registered_MAGs.tsv"
-    if ENA_uploader.genome_type == "bins":
-        accessionsgen = accessionsgen.replace("MAG", "bin")
-    if not ENA_uploader.live:
-        accessionsgen = accessionsgen.replace(".tsv", "_test.tsv")
-    
-    accessions_file = os.path.join(ENA_uploader.upload_dir, accessionsgen)
-    save = False
-    write_mode = 'a'
-    if os.path.exists(accessions_file):
-        if not ENA_uploader.live:
-            save = True
-            if ENA_uploader.force:
-                write_mode = 'w'
-        if not save:
-            logger.info("Genome samples already registered, reading ERS accessions...")
-            alias_to_new_sample_accession = get_accessions(accessions_file)
-    else:
-        save = True
+        all_fields = MAG_MANDATORY_FIELDS + BIN_MANDATORY_FIELDS
+        metadata = pd.read_csv(self.genome_metadata, sep='\t', usecols=all_fields)
         
-    if save:
-        logger.info("Registering genome samples XMLs...")
-        ena_submit = EnaSubmit(samples_xml, submission_xml, ENA_uploader.live)
-        alias_to_new_sample_accession = ena_submit.handle_genomes_registration()
-        save_accessions(alias_to_new_sample_accession, accessions_file, write_mode)
+        # make sure there are no empty cells
+        clean_columns = list(metadata.dropna(axis=1))
+        if self.genome_type == "MAGs":
+            missing_values = [item for item in all_fields if item not in clean_columns]
+        else:
+            missing_values = [item for item in BIN_MANDATORY_FIELDS if item not in clean_columns]
+        
+        if missing_values:
+            raise ValueError("The following mandatory fields have missing values in " +
+                "the input file: {}".format(", ".join(missing_values)))
 
-    logger.info("Generating manifest files...")
-    
-    manifest_info = compute_manifests(genomes)
+        # check amount of genomes to register at the same time
+        if len(metadata) >= 5000:
+            raise ValueError("Genomes need to be registered in batches of 5000 genomes or smaller.")
 
-    for m in manifest_info:
-        generate_genome_manifest(manifest_info[m], ENA_uploader.up_study,  
-            manifest_dir, alias_to_new_sample_accession, ENA_uploader.genome_type, ENA_uploader.tpa)
+        # check whether accessions follow the right format
+        accessions_reg_exp = re.compile(r"([E|S|D]R[R|Z]\d{6,})")
 
-class GenomeUpload:
-    def __init__(self, argv=sys.argv[1:]):
-        self.args = self.parse_args(argv)
-        self.up_study = self.args.upload_study
-        self.genome_type = "bins" if self.args.bins else "MAGs"
-        self.live = True if self.args.live else False
-        self.private = self.args.private
+        accession_comparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", 
+            "correct", "mismatching", "co-assembly"])
+        accession_comparison["genome_name"] = metadata["genome_name"]
 
-        self.tpa = True if self.args.tpa else False
-        self.centre_name = self.args.centre_name
-        self.force = True if self.args.force else False
+        accession_comparison["attemptive_accessions"] = metadata["accessions"].map(
+            lambda a: len(a.split(',')))
 
-        work_dir = self.args.out if self.args.out else os.getcwd()
-        self.upload_dir = self.generate_genomes_upload_dir(work_dir, self.genome_type)
+        accession_comparison["correct"] = metadata["accessions"].map(
+            lambda a: len(accessions_reg_exp.findall(a)))
 
-        self.private = self.args.private
+        accession_comparison["mismatching"] = accession_comparison.apply(lambda row: 
+            True if row["attemptive_accessions"] == row["correct"] 
+            else None, axis=1).isna()
 
-        if not self.args.upload_study:
-            raise ValueError("No project selected for genome upload [-u, --upload_study].")
-        self.upload_study = self.args.upload_study
+        mismatching_accessions = accession_comparison[accession_comparison["mismatching"]]["genome_name"]
+        if not mismatching_accessions.empty:
+            raise ValueError("Run accessions are not correctly formatted for the following " + 
+                "genomes: " + ','.join(mismatching_accessions.values))
+
+        # check whether completeness and contamination are floats
+        try:
+            pd.to_numeric(metadata["completeness"])
+            pd.to_numeric(metadata["contamination"])
+            pd.to_numeric(metadata["genome_coverage"])
+        except:
+            raise ValueError("Completeness, contamination or coverage values should be formatted as floats")
+
+        # check whether all co-assemblies have more than one run associated and viceversa
+        accession_comparison["co-assembly"] = metadata["co-assembly"]
+        coassembly_discrepancy = metadata[(
+            (accession_comparison["correct"] < 2) & (accession_comparison["co-assembly"])) |
+            ((accession_comparison["correct"] > 1) & (~accession_comparison["co-assembly"])
+            )]["genome_name"]
+        if not coassembly_discrepancy.empty:
+            raise ValueError("The following genomes show discrepancy between number of runs "
+                "involved and co-assembly status: " + ','.join(coassembly_discrepancy.values))
+
+        # are provided metagenomes part of the accepted metagenome list?
+        if False in metadata.apply(lambda row: 
+            True if row["metagenome"] in METAGENOMES 
+            else False, axis=1).unique():
+            raise ValueError("Metagenomes associated with each genome need to belong to ENA's " +
+                "approved metagenomes list.")
+
+        # do provided file paths exist?
+        if False in metadata.apply(lambda row: 
+            True if os.path.exists(row["genome_path"]) 
+            else False, axis =1).unique():
+            raise FileNotFoundError("Some genome paths do not exist.")
+
+        # check genome name lengths
+        #if not (metadata["genome_name"].map(lambda a: len(a) < 20).all()):
+        #    raise ValueError("Genome names must be shorter than 20 characters.")
+
+        # create dictionary while checking genome name uniqueness
+        uniqueness = metadata["genome_name"].nunique() == metadata["genome_name"].size
+        if uniqueness:
+            if not self.live:
+                timestamp = str(int(dt.timestamp(dt.now())))
+                timestamp_names = [row["genome_name"] + '_' + timestamp for index, row in metadata.iterrows()]
+                metadata["unique_genome_name"] = timestamp_names
+                genome_info = metadata.set_index("unique_genome_name").transpose().to_dict()
+            else:
+                genome_info = metadata.set_index("genome_name").transpose().to_dict()
+        else:
+            raise ValueError("Duplicate names found in genome names")
+
+        return genome_info
+
+    def extract_genomes_info(self):
+        genome_info = self.validate_metadata_tsv()
+        for gen in genome_info:
+            genome_info[gen]["accessions"] = genome_info[gen]["accessions"].split(',')
+            accession_type = "run"
+            assembly_reg_exp = re.compile(r"([E|S|D]RZ\d{6,})")
+            if assembly_reg_exp.findall(genome_info[gen]["accessions"][0]):
+                accession_type = "assembly"
+            genome_info[gen]["accessionType"] = accession_type
+
+            genome_info[gen]["isolationSource"] = genome_info[gen]["metagenome"]
+            
+            try:
+                (genome_info[gen]["MAG_quality"], 
+                genome_info[gen]["completeness"], 
+                genome_info[gen]["contamination"]) = compute_mag_quality(
+                                        str(round_stats(genome_info[gen]["completeness"])),
+                                        str(round_stats(genome_info[gen]["contamination"])), 
+                                        genome_info[gen]["rRNA_presence"])
+            except IndexError:
+                pass
+
+            if str(genome_info[gen]["co-assembly"]).lower() in ["yes", "y", "true"]:
+                genome_info[gen]["co-assembly"] = True
+            else:
+                genome_info[gen]["co-assembly"] = False
+            
+            genome_info[gen]["alias"] = gen
 
-        if not os.path.exists(self.args.genome_info):
-            raise FileNotFoundError('Genome metadata file "{}" does not exist'.format(self.args.genome_info))
-        self.genome_metadata = self.args.genome_info
+            tax_id, scientific_name = extract_tax_info(genome_info[gen]["NCBI_lineage"])
+            genome_info[gen]["taxID"] = tax_id
+            genome_info[gen]["scientific_name"] = scientific_name
 
-    def extract_ena_info(self, genome_info, upload_dir):
+        return genome_info
+
+    def extract_ena_info(self, genome_info):
         logger.info('Retrieving project and run info from ENA (this might take a while)...')
         
         # retrieving metadata from runs (and runs from assembly accessions if provided)
@@ -798,7 +761,7 @@ def extract_ena_info(self, genome_info, upload_dir):
         if not study_set:
             raise ValueError("No study corresponding to runs found.")
 
-        backup_file = os.path.join(upload_dir, "ENA_backup.json")
+        backup_file = os.path.join(self.upload_dir, "ENA_backup.json")
         counter = 0
         if not os.path.exists(backup_file):
             with open(backup_file, 'w') as file:
@@ -881,24 +844,24 @@ def extract_ena_info(self, genome_info, upload_dir):
                                 file.write(json.dumps(temp_dict))
                                 file.truncate()
         temp_dict = {**temp_dict, **backup_dict}
-        combine_ENA_info(genome_info, temp_dict)
+        combine_ena_info(genome_info, temp_dict)
 
 
-    def generate_genomes_upload_dir(self, dir, genome_type):
+    def generate_genomes_upload_dir(self):
         upload_name = "MAG_upload"
-        if genome_type == "bins":
+        if self.genome_type == "bins":
             upload_name = upload_name.replace("MAG", "bin")
-        upload_dir = os.path.join(dir, upload_name)
+        upload_dir = os.path.join(self.work_dir, upload_name)
         os.makedirs(upload_dir, exist_ok=True)
         return upload_dir
 
     def create_genome_dictionary(self, samples_xml):
         logger.info('Retrieving data for MAG submission...')
 
-        genome_info = extract_genomes_info(self.genome_metadata, self.genome_type, self.live)
+        genome_info = self.extract_genomes_info()
 
         if not os.path.exists(samples_xml) or self.force:
-            self.extract_ena_info(genome_info, self.upload_dir)
+            self.extract_ena_info(genome_info)
             logger.info("Writing genome registration XML...")
 
             write_genomes_xml(genome_info, samples_xml, self.genome_type, 
@@ -908,7 +871,76 @@ def create_genome_dictionary(self, samples_xml):
             recover_info_from_xml(genome_info, samples_xml, self.live)
 
         return genome_info
+    
+    def genome_upload(self):
+        if not self.live:
+            logger.warning("Warning: genome submission is not in live mode, " +
+                           "files will be validated, but not uploaded.")
+        samples_xml = os.path.join(self.upload_dir, "genome_samples.xml")
+        submission_xml = os.path.join(self.upload_dir, "submission.xml")
+        genomes, manifest_info = {}, {}
+
+        # submission xml existence
+        if not os.path.exists(submission_xml):
+            submission_xml = write_submission_xml(self.upload_dir, self.centre_name, False)
+        
+        # sample xml generation or recovery
+        genomes = self.create_genome_dictionary(samples_xml)
+
+        # manifests creation
+        manifest_dir = os.path.join(self.upload_dir, "manifests")
+        os.makedirs(manifest_dir, exist_ok=True)
+
+        accessionsgen = "registered_MAGs.tsv"
+        if self.genome_type == "bins":
+            accessionsgen = accessionsgen.replace("MAG", "bin")
+        if not self.live:
+            accessionsgen = accessionsgen.replace(".tsv", "_test.tsv")
+
+        accessions_file = os.path.join(self.upload_dir, accessionsgen)
+        save = False
+        write_mode = 'a'
+        if os.path.exists(accessions_file):
+            if not self.live:
+                save = True
+                if self.force:
+                    write_mode = 'w'
+            if not save:
+                logger.info("Genome samples already registered, reading ERS accessions...")
+                alias_to_new_sample_accession = get_accessions(accessions_file)
+        else:
+            save = True
+            
+        if save:
+            logger.info("Registering genome samples XMLs...")
+            ena_submit = EnaSubmit(samples_xml, submission_xml, self.live)
+            alias_to_new_sample_accession = ena_submit.handle_genomes_registration()
+            save_accessions(alias_to_new_sample_accession, accessions_file, write_mode)
 
+        logger.info("Generating manifest files...")
+
+        manifest_info = compute_manifests(genomes)
+
+        for m in manifest_info:
+            generate_genome_manifest(manifest_info[m], self.upload_study,  
+                manifest_dir, alias_to_new_sample_accession, self.genome_type, self.tpa)
+
+
+def main():
+    args = parse_args(sys.argv[1:])
+    ena_upload = GenomeUpload(
+        upload_study=args.upload_study,
+        centre_name=args.centre_name,
+        genome_info=args.genome_info,
+        bins=args.bins,
+        live=args.live,
+        private=args.private,
+        tpa=args.tpa,
+        force=args.force,
+        out = args.out,
+    )
+    ena_upload.genome_upload()
+    
 if __name__ == "__main__":
     main()
     logger.info('Completed')

From 753645601c248667c416ee387e1f8e3c86c34f3d Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Thu, 19 Dec 2024 13:38:54 +0000
Subject: [PATCH 11/20] precommit checks

---
 genomeuploader/__init__.py      |   2 +-
 genomeuploader/constants.py     |  40 +--
 genomeuploader/ena.py           | 195 +++++--------
 genomeuploader/genome_upload.py | 474 +++++++++++++++++---------------
 4 files changed, 346 insertions(+), 365 deletions(-)

diff --git a/genomeuploader/__init__.py b/genomeuploader/__init__.py
index 55fa725..58039f5 100644
--- a/genomeuploader/__init__.py
+++ b/genomeuploader/__init__.py
@@ -1 +1 @@
-__version__ = '2.1.1'
+__version__ = "2.1.1"
diff --git a/genomeuploader/constants.py b/genomeuploader/constants.py
index 5a27b39..7c74036 100644
--- a/genomeuploader/constants.py
+++ b/genomeuploader/constants.py
@@ -14,14 +14,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-HQ = (
-    "Multiple fragments where gaps span repetitive regions. Presence of the "
-    "23S, 16S, and 5S rRNA genes and at least 18 tRNAs."
-)
-MQ = (
-    "Many fragments with little to no review of assembly other than reporting "
-    "of standard assembly statistics."
-)
+HQ = "Multiple fragments where gaps span repetitive regions. Presence of the " "23S, 16S, and 5S rRNA genes and at least 18 tRNAs."
+MQ = "Many fragments with little to no review of assembly other than reporting " "of standard assembly statistics."
 
 METAGENOMES = [
     "activated carbon metagenome",
@@ -637,26 +631,22 @@
 ]
 
 BIN_MANDATORY_FIELDS = [
-    "genome_name", 
+    "genome_name",
     "accessions",
-    "assembly_software", 
-    "binning_software", 
-    "binning_parameters", 
-    "stats_generation_software", 
+    "assembly_software",
+    "binning_software",
+    "binning_parameters",
+    "stats_generation_software",
     "NCBI_lineage",
-    "broad_environment", 
-    "local_environment", 
-    "environmental_medium", 
+    "broad_environment",
+    "local_environment",
+    "environmental_medium",
     "metagenome",
-    "co-assembly", 
-    "genome_coverage", 
-    "genome_path"
+    "co-assembly",
+    "genome_coverage",
+    "genome_path",
 ]
 
-MAG_MANDATORY_FIELDS = [
-    "rRNA_presence", 
-    "completeness", 
-    "contamination"
-]
+MAG_MANDATORY_FIELDS = ["rRNA_presence", "completeness", "contamination"]
 
-GEOGRAPHY_DIGIT_COORDS = 8
\ No newline at end of file
+GEOGRAPHY_DIGIT_COORDS = 8
diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py
index 7faab70..d968e55 100644
--- a/genomeuploader/ena.py
+++ b/genomeuploader/ena.py
@@ -1,4 +1,3 @@
-
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
@@ -16,16 +15,16 @@
 # limitations under the License.
 
 
-import requests
 import json
 import logging
 import os
-import sys 
-from time import sleep
+import sys
+import xml.dom.minidom as minidom
 from pathlib import Path
-from dotenv import load_dotenv
+from time import sleep
 
-import xml.dom.minidom as minidom
+import requests
+from dotenv import load_dotenv
 from requests.exceptions import ConnectionError, HTTPError, RequestException, Timeout
 
 logging.basicConfig(level=logging.INFO)
@@ -37,32 +36,20 @@ class NoDataException(ValueError):
     pass
 
 
-RUN_DEFAULT_FIELDS = ','.join([
-    'secondary_study_accession',
-    'sample_accession'
-])
+RUN_DEFAULT_FIELDS = ",".join(["secondary_study_accession", "sample_accession"])
 
-STUDY_RUN_DEFAULT_FIELDS = ','.join([
-    'sample_accession',
-    'run_accession',
-    'instrument_model'
-])
+STUDY_RUN_DEFAULT_FIELDS = ",".join(["sample_accession", "run_accession", "instrument_model"])
 
-ASSEMBLY_DEFAULT_FIELDS = 'sample_accession'
+ASSEMBLY_DEFAULT_FIELDS = "sample_accession"
 
 
-SAMPLE_DEFAULT_FIELDS = ','.join([
-    'sample_accession',
-    'secondary_sample_accession',
-    'collection_date',
-    'country',
-    'location'
-])
+SAMPLE_DEFAULT_FIELDS = ",".join(["sample_accession", "secondary_sample_accession", "collection_date", "country", "location"])
 
-STUDY_DEFAULT_FIELDS = 'study_description'
+STUDY_DEFAULT_FIELDS = "study_description"
 
 RETRY_COUNT = 3
 
+
 def get_default_connection_headers():
     return {
         "headers": {
@@ -71,12 +58,10 @@ def get_default_connection_headers():
         }
     }
 
+
 def get_default_params():
-    return {
-        'format': 'json',
-        'includeMetagenomes': True,
-        'dataPortal': 'ena'
-    }
+    return {"format": "json", "includeMetagenomes": True, "dataPortal": "ena"}
+
 
 def parse_accession(accession):
     if accession.startswith("PRJ"):
@@ -93,11 +78,12 @@ def parse_accession(accession):
         logging.error(f"{accession} is not a valid accession")
         sys.exit()
 
+
 def configure_credentials():
     # Config file
     user_config = Path.home() / ".genome_uploader.config.env"
     if user_config.exists():
-        logger.debug("Loading the env variables from ".format(user_config))
+        logger.debug("Loading the env variables from ".format())
         load_dotenv(str(user_config))
     else:
         cwd_config = Path.cwd() / ".genome_uploader.config.env"
@@ -114,34 +100,36 @@ def configure_credentials():
 
     return username, password
 
+
 def query_taxid(taxid):
     url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid)
     response = requests.get(url)
 
     try:
-        # Will raise exception if response status code is non-200 
+        # Will raise exception if response status code is non-200
         response.raise_for_status()
     except requests.exceptions.HTTPError as e:
         print("Request failed {} with error {}".format(url, e))
         return False
-    
+
     res = response.json()
-    
+
     return res.get("scientificName", "")
 
+
 def query_scientific_name(scientific_name, search_rank=False):
     url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{}".format(scientific_name)
     response = requests.get(url)
-    
+
     try:
-        # Will raise exception if response status code is non-200 
+        # Will raise exception if response status code is non-200
         response.raise_for_status()
-    except requests.exceptions.HTTPError as e:
+    except requests.exceptions.HTTPError:
         if search_rank:
             return False, "", ""
         else:
             return False, ""
-    
+
     try:
         res = response.json()[0]
     except IndexError:
@@ -160,8 +148,7 @@ def query_scientific_name(scientific_name, search_rank=False):
         return submittable, taxid
 
 
-
-class EnaQuery():
+class EnaQuery:
     def __init__(self, accession, query_type, private=False):
         self.private_url = "https://www.ebi.ac.uk/ena/submit/report"
         self.public_url = "https://www.ebi.ac.uk/ena/portal/api/search"
@@ -179,9 +166,7 @@ def __init__(self, accession, query_type, private=False):
         self.query_type = query_type
 
     def post_request(self, data):
-        response = requests.post(
-            self.public_url, data=data, **get_default_connection_headers()
-        )
+        response = requests.post(self.public_url, data=data, **get_default_connection_headers())
         return response
 
     def get_request(self, url):
@@ -196,24 +181,20 @@ def get_data_or_handle_error(self, response, xml=False, full_json=False):
             data_txt = response.text
             if data_txt is None:
                 if self.private:
-                    logging.error(
-                        f"{self.accession} private data is not present in the specified Webin account"
-                    )
+                    logging.error(f"{self.accession} private data is not present in the specified Webin account")
                 else:
                     logging.error(f"{self.accession} public data does not exist")
             else:
                 if xml:
-                    return  minidom.parseString(data_txt)
-                elif full_json: 
+                    return minidom.parseString(data_txt)
+                elif full_json:
                     #   return the full json when more than one entry expected
                     return response.json()
                 else:
-                    #   only return the fist element in the list is the default. In these cases there should only be one entry returned 
+                    #   only return the fist element in the list is the default. In these cases there should only be one entry returned
                     return json.loads(response.txt)[0]
         except (IndexError, TypeError, ValueError, KeyError):
-            logging.error(
-                f"Failed to fetch {self.accession}, returned error: {response.text}"
-            )     
+            logging.error(f"Failed to fetch {self.accession}, returned error: {response.text}")
 
     def retry_or_handle_request_error(self, request, *args, **kwargs):
         attempt = 0
@@ -226,9 +207,7 @@ def retry_or_handle_request_error(self, request, *args, **kwargs):
             except (Timeout, ConnectionError) as retry_err:
                 attempt += 1
                 if attempt >= RETRY_COUNT:
-                    raise ValueError(
-                        f"Could not find {self.accession} in ENA after {RETRY_COUNT} attempts. Error: {retry_err}"
-                    )
+                    raise ValueError(f"Could not find {self.accession} in ENA after {RETRY_COUNT} attempts. Error: {retry_err}")
                 sleep(1)
             except HTTPError as http_err:
                 print(f"HTTP response has an error status: {http_err}")
@@ -247,7 +226,7 @@ def _get_private_run(self):
         run = self.get_data_or_handle_error(response)
         run_data = run["report"]
         reformatted_data = {
-            "secondary_study_accession": run_data['studyID'],
+            "secondary_study_accession": run_data["studyID"],
             "sample_accession": run_data["sampleId"],
         }
         logging.info(f"{self.accession} private run returned from ENA")
@@ -257,25 +236,22 @@ def _get_public_run(self):
         data = get_default_params()
         data.update(
             {
-            "result": "read_run",
-            "query": f'run_accession="{self.accession}"',
-            "fields": "secondary_study_accession,sample_accession",
-        }
+                "result": "read_run",
+                "query": f'run_accession="{self.accession}"',
+                "fields": "secondary_study_accession,sample_accession",
+            }
         )
         response = self.retry_or_handle_request_error(self.post_request, data)
         run = self.get_data_or_handle_error(response)
         logging.info(f"{self.accession} public run returned from ENA")
-        return run   
+        return run
 
-    
     def _get_private_study(self):
         url = f"{self.private_url}/studies/xml/{self.accession}"
         response = self.retry_or_handle_request_error(self.get_request, url)
         study = self.get_data_or_handle_error(response, xml=True)
         study_desc = study.getElementsByTagName("STUDY_DESCRIPTION")[0].firstChild.nodeValue
-        reformatted_data = {
-            "study_description" : study_desc
-        }
+        reformatted_data = {"study_description": study_desc}
         logging.info(f"{self.accession} private study returned from ENA")
         return reformatted_data
 
@@ -283,15 +259,15 @@ def _get_public_study(self):
         data = get_default_params()
         data.update(
             {
-            "result": "study",
-            "query": f'{self.acc_type}="{self.accession}"',
-            "fields": STUDY_DEFAULT_FIELDS,
-        }
+                "result": "study",
+                "query": f'{self.acc_type}="{self.accession}"',
+                "fields": STUDY_DEFAULT_FIELDS,
+            }
         )
         response = self.retry_or_handle_request_error(self.post_request, data)
         study = self.get_data_or_handle_error(response)
         logging.info(f"{self.accession} public study returned from ENA")
-        return study   
+        return study
 
     def _get_private_run_from_assembly(self):
         url = f"{self.private_url}/analyses/xml/{self.accession}"
@@ -310,7 +286,7 @@ def _get_public_run_from_assembly(self):
         run = run_ref[0].attributes["accession"].value
         logging.info(f"public run from the assembly {self.accession} returned from ENA")
 
-        return run  
+        return run
 
     def _get_private_study_runs(self):
         url = f"{self.private_url}/runs/{self.accession}"
@@ -318,66 +294,51 @@ def _get_private_study_runs(self):
         runs = self.get_data_or_handle_error(response, full_json=True)
         reformatted_data = []
         for run in runs:
-            run_data = run['report']
-            if 'sampleId' in run_data:
-                run_data['sample_accession'] = run_data.pop('sampleId')
-            if 'id' in run_data:
-                run_data['run_accession'] = run_data.pop('id')
-            if 'instrumentModel' in run_data:
-                run_data['instrument_model'] = run_data.pop('instrumentModel')
+            run_data = run["report"]
+            if "sampleId" in run_data:
+                run_data["sample_accession"] = run_data.pop("sampleId")
+            if "id" in run_data:
+                run_data["run_accession"] = run_data.pop("id")
+            if "instrumentModel" in run_data:
+                run_data["instrument_model"] = run_data.pop("instrumentModel")
             reformatted_data.append(run_data)
         logging.info(f"private runs from study {self.accession}, returned from ENA")
         return reformatted_data
 
-
     def _get_public_study_runs(self):
         data = get_default_params()
-        data.update(
-            {
-                "result": "read_run",
-                "fields": STUDY_RUN_DEFAULT_FIELDS,
-                "query": f"{self.acc_type}={self.accession}"
-            }
-        )
+        data.update({"result": "read_run", "fields": STUDY_RUN_DEFAULT_FIELDS, "query": f"{self.acc_type}={self.accession}"})
         response = self.retry_or_handle_request_error(self.post_request, data)
         runs = self.get_data_or_handle_error(response, full_json=True)
         logging.info(f"public runs from study {self.accession}, returned from ENA")
         return runs
-    
+
     def _get_private_sample(self):
         url = f"{self.private_url}/samples/xml/{self.accession}"
         reponse = self.retry_or_handle_request_error(self.get_request, url)
         reformatted_data = {}
         parsed_xml = self.get_data_or_handle_error(reponse, xml=True)
-        sample_attributes = parsed_xml.getElementsByTagName('SAMPLE_ATTRIBUTE')
+        sample_attributes = parsed_xml.getElementsByTagName("SAMPLE_ATTRIBUTE")
         for attribute in sample_attributes:
-            tag = attribute.getElementsByTagName('TAG')[0].firstChild.nodeValue
+            tag = attribute.getElementsByTagName("TAG")[0].firstChild.nodeValue
             if tag == "geographic location (country and/or sea)":
-                reformatted_data['country'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
+                reformatted_data["country"] = attribute.getElementsByTagName("VALUE")[0].firstChild.nodeValue
             if tag == "geographic location (latitude)":
-                reformatted_data['latitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
+                reformatted_data["latitude"] = attribute.getElementsByTagName("VALUE")[0].firstChild.nodeValue
             if tag == "geographic location (longitude)":
-                reformatted_data['longitude'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
+                reformatted_data["longitude"] = attribute.getElementsByTagName("VALUE")[0].firstChild.nodeValue
             if tag == "collection date":
-                reformatted_data['collection_date'] = attribute.getElementsByTagName('VALUE')[0].firstChild.nodeValue
-        logging.info(f"{self.accession} private sample returned from ENA")        
+                reformatted_data["collection_date"] = attribute.getElementsByTagName("VALUE")[0].firstChild.nodeValue
+        logging.info(f"{self.accession} private sample returned from ENA")
         return reformatted_data
 
-
     def _get_public_sample(self):
         data = get_default_params()
-        data.update(
-            {
-                "result": "sample",
-                "fields": SAMPLE_DEFAULT_FIELDS,
-                "query":  f"{self.acc_type}={self.accession}"
-            }
-        )
+        data.update({"result": "sample", "fields": SAMPLE_DEFAULT_FIELDS, "query": f"{self.acc_type}={self.accession}"})
         response = self.retry_or_handle_request_error(self.post_request, data)
         sample = self.get_data_or_handle_error(response)
         logging.info(f"{self.accession} public sample returned from ENA")
-        return sample         
-
+        return sample
 
     def build_query(self):
         if self.query_type == "study":
@@ -408,7 +369,7 @@ def build_query(self):
         return ena_response
 
 
-class EnaSubmit():
+class EnaSubmit:
     def __init__(self, sample_xml, submission_xml, live=False):
         self.sample_xml = sample_xml
         self.submission_xml = submission_xml
@@ -420,7 +381,6 @@ def __init__(self, sample_xml, submission_xml, live=False):
             self.auth = (username, password)
         else:
             self.auth = None
-        
 
     def handle_genomes_registration(self):
         live_sub, mode = "", "live"
@@ -431,22 +391,17 @@ def handle_genomes_registration(self):
 
         url = "https://www{}.ebi.ac.uk/ena/submit/drop-box/submit/".format(live_sub)
 
-        logger.info('Registering sample xml in {} mode.'.format(mode))
+        logger.info("Registering sample xml in {} mode.".format(mode))
 
-        f = {
-            'SUBMISSION': open(self.submission_xml, 'r'),
-            'SAMPLE': open(self.sample_xml, 'r')
-        }
+        f = {"SUBMISSION": open(self.submission_xml, "r"), "SAMPLE": open(self.sample_xml, "r")}
 
-        submission_response = requests.post(url, files = f, auth = self.auth)
+        submission_response = requests.post(url, files=f, auth=self.auth)
 
         if submission_response.status_code != 200:
-            if str(submission_response.status_code).startswith('5'):
-                raise Exception("Genomes could not be submitted to ENA as the server " +
-                    "does not respond. Please again try later.")
+            if str(submission_response.status_code).startswith("5"):
+                raise Exception("Genomes could not be submitted to ENA as the server " + "does not respond. Please again try later.")
             else:
-                raise Exception("Genomes could not be submitted to ENA. HTTP response: " +
-                    submission_response.reason)
+                raise Exception("Genomes could not be submitted to ENA. HTTP response: " + submission_response.reason)
 
         receipt_xml = minidom.parseString((submission_response.content).decode("utf-8"))
         receipt = receipt_xml.getElementsByTagName("RECEIPT")
@@ -466,11 +421,7 @@ def handle_genomes_registration(self):
             final_error += "\n\tIf you wish to validate again your data and metadata, "
             final_error += "please use the --force option."
             raise Exception(final_error)
-        
-        logger.info('{} genome samples successfully registered.'.format(str(len(alias_dict))))
-
-        return alias_dict
-    
 
+        logger.info("{} genome samples successfully registered.".format(str(len(alias_dict))))
 
-        
+        return alias_dict
diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py
index d34435c..264d57d 100755
--- a/genomeuploader/genome_upload.py
+++ b/genomeuploader/genome_upload.py
@@ -14,51 +14,72 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-import sys
-import logging
 import argparse
-import re
 import json
-import pandas as pd
-from datetime import date, datetime as dt
-
-import xml.etree.ElementTree as ET
+import logging
+import os
+import re
+import sys
 import xml.dom.minidom as minidom
-import requests
+import xml.etree.ElementTree as ET
+from datetime import date
+from datetime import datetime as dt
 
 import ena
+import pandas as pd
+from constants import (
+    BIN_MANDATORY_FIELDS,
+    GEOGRAPHIC_LOCATIONS,
+    GEOGRAPHY_DIGIT_COORDS,
+    HQ,
+    MAG_MANDATORY_FIELDS,
+    METAGENOMES,
+    MQ,
+)
 from ena import EnaQuery, EnaSubmit
 
-from constants import *
-
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
+
 def parse_args(argv):
-    parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter,
-        description="Create xmls and manifest files for genome upload and upload to ENA")
-    
-    parser.add_argument('-u', '--upload_study', type=str, required=True, help="Study accession for genomes upload")
-    parser.add_argument('--genome_info', type=str, required=True, help="Genomes metadata file")
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="Create xmls and manifest files for genome upload and upload to ENA",
+    )
+
+    parser.add_argument("-u", "--upload_study", type=str, required=True, help="Study accession for genomes upload")
+    parser.add_argument("--genome_info", type=str, required=True, help="Genomes metadata file")
 
     genomeType = parser.add_mutually_exclusive_group(required=True, default=False)
-    genomeType.add_argument('-m', '--mags', action='store_true', help="Select for MAG upload")
-    genomeType.add_argument('-b', '--bins', action='store_true', help="Select for bin upload")
-    
-    parser.add_argument('--out', type=str, help="Output folder. Default: working directory")
-    parser.add_argument('--force', action='store_true', required=False, default=False, help="Forces reset of sample xml's backups")
-    parser.add_argument('--live', action='store_true', required=False, default=False, help="Uploads on ENA. Omitting this " +
-        "option allows to validate samples beforehand")
-    parser.add_argument('--tpa', action='store_true', required=False, default=False, help="Select if uploading TPA-generated genomes")
-    
+    genomeType.add_argument("-m", "--mags", action="store_true", help="Select for MAG upload")
+    genomeType.add_argument("-b", "--bins", action="store_true", help="Select for bin upload")
+
+    parser.add_argument("--out", type=str, help="Output folder. Default: working directory")
+    parser.add_argument("--force", action="store_true", required=False, default=False, help="Forces reset of sample xml's backups")
+    parser.add_argument(
+        "--live",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Uploads on ENA. Omitting this " + "option allows to validate samples beforehand",
+    )
+    parser.add_argument("--tpa", action="store_true", required=False, default=False, help="Select if uploading TPA-generated genomes")
+
     # Users can provide their credentials in environment variables or using a config file
-    parser.add_argument('--centre_name', required=True, help="Name of the centre uploading genomes")
-    parser.add_argument('--private', required=False, action='store_true', default=False, help="if data is private",)
+    parser.add_argument("--centre_name", required=True, help="Name of the centre uploading genomes")
+    parser.add_argument(
+        "--private",
+        required=False,
+        action="store_true",
+        default=False,
+        help="if data is private",
+    )
 
     return parser.parse_args(argv)
 
-'''
+
+"""
 Input table: expects the following parameters:
     genome_name: genome file name
     accessions: run(s) or assembly(ies) the genome was generated from
@@ -77,7 +98,8 @@ def parse_args(argv):
     co-assembly: True/False, whether the genome was generated from a co-assembly
     genome_coverage : genome coverage
     genome_path: path to genome to upload
-'''
+"""
+
 
 def round_stats(stats):
     new_stat = round(float(stats), 2)
@@ -88,17 +110,19 @@ def round_stats(stats):
 
     return new_stat
 
+
 def compute_mag_quality(completeness, contamination, rna_presence):
     rna_presence = str(rna_presence).lower() in ["true", "yes", "y"]
     quality = MQ
     if float(completeness) >= 90 and float(contamination) <= 5 and rna_presence:
         quality = HQ
-    
+
     return quality, completeness, contamination
 
+
 def extract_tax_info(tax_info):
     # if unclassified, block the execution
-    lineage, kingdom_position_lineage, digit_annotation = tax_info.split(';'), 0, False
+    lineage, kingdom_position_lineage, digit_annotation = tax_info.split(";"), 0, False
     lineage_first = lineage[0]
     if "Unclassified " in lineage_first:
         if "Archaea" in lineage_first:
@@ -128,13 +152,13 @@ def extract_tax_info(tax_info):
                 final_kingdom = selected_kingdom[index]
                 break
 
-    iterator = len(lineage)-1
+    iterator = len(lineage) - 1
     submittable = False
     rank = ""
     while iterator != -1 and not submittable:
         scientific_name = lineage[iterator].strip()
         if digit_annotation:
-            if not '*' in scientific_name:
+            if "*" not in scientific_name:
                 scientific_name = ena.query_taxid(scientific_name)
             else:
                 iterator -= 1
@@ -156,11 +180,12 @@ def extract_tax_info(tax_info):
 
     return taxid, scientific_name
 
+
 def extract_eukaryota_info(name, rank):
     non_submittable = (False, "", 0)
 
     # Asterisks in given taxonomy suggest the classification might be not confident enough.
-    if '*' in name:
+    if "*" in name:
         return non_submittable
 
     if rank == "super kingdom":
@@ -178,23 +203,24 @@ def extract_eukaryota_info(name, rank):
             if submittable:
                 return submittable, name, taxid
             else:
-                name = name.replace(" sp.", '')
+                name = name.replace(" sp.", "")
                 submittable, taxid = ena.query_scientific_name(name)
                 if submittable:
                     return submittable, name, taxid
                 else:
                     return non_submittable
 
+
 def extract_bacteria_info(name, rank):
     if rank == "species":
         name = name
     elif rank == "superkingdom":
-        name = "uncultured bacterium".format(name)
+        name = "uncultured bacterium".format()
     elif rank in ["family", "order", "class", "phylum"]:
         name = "uncultured {} bacterium".format(name)
     elif rank == "genus":
         name = "uncultured {} sp.".format(name)
-    
+
     submittable, taxid, rank = ena.query_scientific_name(name, search_rank=True)
     if not submittable:
         if rank in ["species", "genus"] and name.lower().endswith("bacteria"):
@@ -206,6 +232,7 @@ def extract_bacteria_info(name, rank):
 
     return submittable, name, taxid
 
+
 def extract_archaea_info(name, rank):
     if rank == "species":
         name = name
@@ -222,20 +249,22 @@ def extract_archaea_info(name, rank):
         name = "uncultured {} archaeon".format(name)
     elif rank == "genus":
         name = "uncultured {} sp.".format(name)
-    
+
     submittable, taxid, rank = ena.query_scientific_name(name, search_rank=True)
     if not submittable:
         if "Candidatus" in name:
             if rank == "phylum":
-                name = name.replace("Candidatus ", '')
+                name = name.replace("Candidatus ", "")
             elif rank == "family":
-                name = name.replace("uncultured ", '')
+                name = name.replace("uncultured ", "")
             submittable, taxid = ena.query_scientific_name(name)
-                            
+
     return submittable, name, taxid
 
+
 def multiple_element_set(metadata_list):
-    return len(set(metadata_list))>1
+    return len(set(metadata_list)) > 1
+
 
 def combine_ena_info(genome_info, ena_dict):
     for g in genome_info:
@@ -254,12 +283,12 @@ def combine_ena_info(genome_info, ena_dict):
                 long_list.append(ena_dict[run]["longitude"])
                 latit_list.append(ena_dict[run]["latitude"])
 
-            genome_info[g]["study"] = study_list[0] 
+            genome_info[g]["study"] = study_list[0]
             genome_info[g]["description"] = description_list[0]
-            
+
             instrument = instrument_list[0]
             if multiple_element_set(instrument_list):
-                instrument = ','.join(instrument_list)
+                instrument = ",".join(instrument_list)
             genome_info[g]["sequencingMethod"] = instrument
 
             collection_date = collection_list[0]
@@ -284,7 +313,7 @@ def combine_ena_info(genome_info, ena_dict):
 
             samples = samples_list[0]
             if multiple_element_set(samples_list):
-                samples = ','.join(samples_list)
+                samples = ",".join(samples_list)
             genome_info[g]["sample_accessions"] = samples
         else:
             run = genome_info[g]["accessions"][0]
@@ -296,85 +325,96 @@ def combine_ena_info(genome_info, ena_dict):
             genome_info[g]["country"] = ena_dict[run]["country"]
             genome_info[g]["longitude"] = ena_dict[run]["longitude"]
             genome_info[g]["latitude"] = ena_dict[run]["latitude"]
-        
-        genome_info[g]["accessions"] = ','.join(genome_info[g]["accessions"])
+
+        genome_info[g]["accessions"] = ",".join(genome_info[g]["accessions"])
 
 
 def get_accessions(accessions_file):
     accession_dict = {}
-    with open(accessions_file, 'r') as f:
+    with open(accessions_file, "r") as f:
         for line in f:
-            line = line.split('\t')
+            line = line.split("\t")
             alias = line[0]
-            accession = line[1].rstrip('\n')
+            accession = line[1].rstrip("\n")
             accession_dict[alias] = accession
 
     return accession_dict
 
+
 def save_accessions(alias_accession_dict, accessions_file, write_mode):
     with open(accessions_file, write_mode) as f:
         for elem in alias_accession_dict:
             f.write("{}\t{}\n".format(elem, alias_accession_dict[elem]))
 
-def create_manifest_dictionary(run, alias, assembly_software, sequencing_method, 
-    mag_path, gen, study, coverage, is_coassembly):
+
+def create_manifest_dictionary(run, alias, assembly_software, sequencing_method, mag_path, gen, study, coverage, is_coassembly):
     manifest_dict = {
-        "accessions" : run,
-        "alias" : alias,
-        "assembler" : assembly_software, 
-        "sequencingMethod" : sequencing_method, 
-        "genome_path" : mag_path,
-        "genome_name" : gen,
-        "study" : study,
-        "coverageDepth" : coverage, 
-        "co-assembly" : is_coassembly
+        "accessions": run,
+        "alias": alias,
+        "assembler": assembly_software,
+        "sequencingMethod": sequencing_method,
+        "genome_path": mag_path,
+        "genome_name": gen,
+        "study": study,
+        "coverageDepth": coverage,
+        "co-assembly": is_coassembly,
     }
 
     return manifest_dict
 
+
 def compute_manifests(genomes):
     manifest_info = {}
     for g in genomes:
-        manifest_info[g] = create_manifest_dictionary(genomes[g]["accessions"],
-            genomes[g]["alias"], genomes[g]["assembly_software"], 
-            genomes[g]["sequencingMethod"], genomes[g]["genome_path"], g, 
-            genomes[g]["study"], genomes[g]["genome_coverage"], genomes[g]["co-assembly"])
-    
+        manifest_info[g] = create_manifest_dictionary(
+            genomes[g]["accessions"],
+            genomes[g]["alias"],
+            genomes[g]["assembly_software"],
+            genomes[g]["sequencingMethod"],
+            genomes[g]["genome_path"],
+            g,
+            genomes[g]["study"],
+            genomes[g]["genome_coverage"],
+            genomes[g]["co-assembly"],
+        )
+
     return manifest_info
 
+
 def get_study_from_xml(sample):
     description = sample.childNodes[5].childNodes[0].data
-    study = description.split(' ')[-1][:-1]
+    study = description.split(" ")[-1][:-1]
 
     return study
 
+
 def recover_info_from_xml(genome_dict, sample_xml, live_mode):
     logger.info("Retrieving data for genome submission...")
 
     # extract list of genomes (samples) to be registered
     xml_structure = minidom.parse(sample_xml)
     samples = xml_structure.getElementsByTagName("SAMPLE")
-    
+
     for s in samples:
         study = get_study_from_xml(s)
 
         # extract alias from xml and find a match with genomes the user is uploading
         xml_alias = s.attributes["alias"].value
-        if not live_mode:         # remove time stamp if test mode is selected
+        if not live_mode:  # remove time stamp if test mode is selected
             alias_split = xml_alias.split("_")
-            xml_alias = '_'.join(alias_split[:-1])
+            xml_alias = "_".join(alias_split[:-1])
         for gen in genome_dict:
             # if match is found, associate attributes listed in the xml file
             # with genomes to upload
             if xml_alias == gen:
                 if not live_mode:
                     current_time_stamp = str(int(dt.timestamp(dt.now())))
-                    xml_alias = gen + '_' + current_time_stamp
+                    xml_alias = gen + "_" + current_time_stamp
                     s.attributes["alias"].value = xml_alias
                     sample_title = s.getElementsByTagName("TITLE")[0]
                     sample_title_value = sample_title.firstChild.nodeValue.split("_")
                     sample_title_value[-1] = current_time_stamp
-                    new_sample_title = '_'.join(sample_title_value)
+                    new_sample_title = "_".join(sample_title_value)
                     s.getElementsByTagName("TITLE")[0].firstChild.replaceWholeText(new_sample_title)
                 attributes = s.childNodes[7].getElementsByTagName("SAMPLE_ATTRIBUTE")
                 seq_method, ass_software = "", ""
@@ -390,7 +430,7 @@ def recover_info_from_xml(genome_dict, sample_xml, live_mode):
                     if not seq_method == "" and not ass_software == "":
                         break
 
-                genome_dict[gen]["accessions"] = ','.join(genome_dict[gen]["accessions"])
+                genome_dict[gen]["accessions"] = ",".join(genome_dict[gen]["accessions"])
                 genome_dict[gen]["alias"] = xml_alias
                 genome_dict[gen]["assembly_software"] = ass_software
                 genome_dict[gen]["sequencingMethod"] = seq_method
@@ -400,12 +440,13 @@ def recover_info_from_xml(genome_dict, sample_xml, live_mode):
     if not live_mode:
         for s in samples:
             xml_structure.firstChild.appendChild(s)
-        
-        with open(sample_xml, 'wb') as f:
+
+        with open(sample_xml, "wb") as f:
             dom_string = xml_structure.toprettyxml().encode("utf-8")
-            dom_string = b'\n'.join([s for s in dom_string.splitlines() if s.strip()])
+            dom_string = b"\n".join([s for s in dom_string.splitlines() if s.strip()])
             f.write(dom_string)
 
+
 def create_sample_attribute(sample_attributes, data_list, mag_data=None):
     tag = data_list[0]
     value = data_list[1]
@@ -414,13 +455,14 @@ def create_sample_attribute(sample_attributes, data_list, mag_data=None):
     units = None
     if len(data_list) == 3:
         units = data_list[2]
-    
+
     new_sample_attr = ET.SubElement(sample_attributes, "SAMPLE_ATTRIBUTE")
-    ET.SubElement(new_sample_attr, 'TAG').text = tag
-    ET.SubElement(new_sample_attr, 'VALUE').text = value
+    ET.SubElement(new_sample_attr, "TAG").text = tag
+    ET.SubElement(new_sample_attr, "VALUE").text = value
 
     if units:
-        ET.SubElement(new_sample_attr, 'UNITS').text = units
+        ET.SubElement(new_sample_attr, "UNITS").text = units
+
 
 def write_genomes_xml(genomes, xml_path, genome_type, centre_name, tpa):
     map_sample_attributes = [
@@ -467,18 +509,16 @@ def write_genomes_xml(genomes, xml_path, genome_type, centre_name, tpa):
     for g in genomes:
         plural = ""
         if genomes[g]["co-assembly"]:
-            plural = 's'
-        description = ("This sample represents a {}{} assembled from the "
-            "metagenomic run{} {} of study {}.".format(tpa_description, 
-            assembly_type, plural, genomes[g]["accessions"], 
-            genomes[g]["study"]))
-        
+            plural = "s"
+        description = "This sample represents a {}{} assembled from the " "metagenomic run{} {} of study {}.".format(
+            tpa_description, assembly_type, plural, genomes[g]["accessions"], genomes[g]["study"]
+        )
+
         sample = ET.SubElement(sample_set, "SAMPLE")
         sample.set("alias", genomes[g]["alias"])
         sample.set("center_name", centre_name)
 
-        ET.SubElement(sample, 'TITLE').text = ("{}: {}".format(assembly_type, 
-            genomes[g]["alias"]))
+        ET.SubElement(sample, "TITLE").text = "{}: {}".format(assembly_type, genomes[g]["alias"])
         sample_name = ET.SubElement(sample, "SAMPLE_NAME")
         ET.SubElement(sample_name, "TAXON_ID").text = genomes[g]["taxID"]
         ET.SubElement(sample_name, "SCIENTIFIC_NAME").text = genomes[g]["scientific_name"]
@@ -494,41 +534,39 @@ def write_genomes_xml(genomes, xml_path, genome_type, centre_name, tpa):
         for constant in constant_sample_attributes:
             create_sample_attribute(sample_attributes, constant)
 
-    with open(xml_path, 'wb') as f:
-        dom = minidom.parseString(
-            ET.tostring(sample_set, encoding="utf-8")
-        )
+    with open(xml_path, "wb") as f:
+        dom = minidom.parseString(ET.tostring(sample_set, encoding="utf-8"))
         f.write(dom.toprettyxml().encode("utf-8"))
 
+
 def write_submission_xml(upload_dir, centre_name, study=True):
     today = str(date.today())
-    sub_xml = os.path.join(upload_dir, 'submission.xml')
+    sub_xml = os.path.join(upload_dir, "submission.xml")
 
-    submission = ET.Element('SUBMISSION')
-    submission.set('center_name', centre_name)
+    submission = ET.Element("SUBMISSION")
+    submission.set("center_name", centre_name)
 
     # template
-    actions = ET.SubElement(submission, 'ACTIONS')
-    action_sub = ET.SubElement(actions, 'ACTION')
-    ET.SubElement(action_sub, 'ADD')
+    actions = ET.SubElement(submission, "ACTIONS")
+    action_sub = ET.SubElement(actions, "ACTION")
+    ET.SubElement(action_sub, "ADD")
 
     # attributes: function and hold date
     if study:
-        action_hold = ET.SubElement(actions, 'ACTION')
-        hold = ET.SubElement(action_hold, 'HOLD')
-        hold.set('HoldUntilDate', today)
+        action_hold = ET.SubElement(actions, "ACTION")
+        hold = ET.SubElement(action_hold, "HOLD")
+        hold.set("HoldUntilDate", today)
 
-    with open(sub_xml, 'wb') as submission_file:
-        dom = minidom.parseString(
-            ET.tostring(submission, encoding="utf-8")
-        )
+    with open(sub_xml, "wb") as submission_file:
+        dom = minidom.parseString(ET.tostring(submission, encoding="utf-8"))
         submission_file.write(dom.toprettyxml().encode("utf-8"))
 
     return sub_xml
 
+
 def generate_genome_manifest(genome_info, study, manifests_root, alias_to_sample, genome_type, tpa):
     manifest_path = os.path.join(manifests_root, f'{genome_info["genome_name"]}.manifest')
-    
+
     tpa_addition, multiple_runs = "", ""
     if tpa:
         tpa_addition = "Third Party Annotation (TPA) "
@@ -539,25 +577,29 @@ def generate_genome_manifest(genome_info, study, manifests_root, alias_to_sample
         assembly_type = "binned metagenome"
 
     values = (
-        ('STUDY', study),
-        ('SAMPLE', alias_to_sample[genome_info["alias"]]),
-        ('ASSEMBLYNAME', genome_info["alias"]),
-        ('ASSEMBLY_TYPE', assembly_type),
-        ('COVERAGE', genome_info["coverageDepth"]),
-        ('PROGRAM', genome_info["assembler"]),
-        ('PLATFORM', genome_info["sequencingMethod"]),
-        ('MOLECULETYPE', "genomic DNA"),
-        ('DESCRIPTION', ("This is a {}bin derived from the primary whole genome "
-            "shotgun (WGS) data set {}. This sample represents a {} from the "
-            "metagenomic run{} {}.".format(tpa_addition, genome_info["study"], 
-            assembly_type, multiple_runs, genome_info["accessions"]))),
-        ('RUN_REF', genome_info["accessions"]),
-        ('FASTA', os.path.abspath(genome_info["genome_path"]))
+        ("STUDY", study),
+        ("SAMPLE", alias_to_sample[genome_info["alias"]]),
+        ("ASSEMBLYNAME", genome_info["alias"]),
+        ("ASSEMBLY_TYPE", assembly_type),
+        ("COVERAGE", genome_info["coverageDepth"]),
+        ("PROGRAM", genome_info["assembler"]),
+        ("PLATFORM", genome_info["sequencingMethod"]),
+        ("MOLECULETYPE", "genomic DNA"),
+        (
+            "DESCRIPTION",
+            (
+                "This is a {}bin derived from the primary whole genome "
+                "shotgun (WGS) data set {}. This sample represents a {} from the "
+                "metagenomic run{} {}.".format(tpa_addition, genome_info["study"], assembly_type, multiple_runs, genome_info["accessions"])
+            ),
+        ),
+        ("RUN_REF", genome_info["accessions"]),
+        ("FASTA", os.path.abspath(genome_info["genome_path"])),
     )
     logger.info("Writing manifest file (.manifest) for {}.".format(genome_info["alias"]))
     with open(manifest_path, "w") as outfile:
-        for (k, v) in values:
-            manifest = f'{k}\t{v}\n'
+        for k, v in values:
+            manifest = f"{k}\t{v}\n"
             outfile.write(manifest)
         if tpa:
             outfile.write("TPA\ttrue\n")
@@ -575,8 +617,8 @@ def __init__(
         tpa: bool = False,
         force: bool = False,
         out: str = None,
-        ):
-        f"""
+    ):
+        """
         Submission of genomes.
 
         :param upload_study: Study accession for genomes upload.
@@ -610,21 +652,22 @@ def __init__(
         self.genome_metadata = genome_info
 
     def validate_metadata_tsv(self):
-        logger.info('Retrieving info for genomes to submit...')
-        
+        logger.info("Retrieving info for genomes to submit...")
+
         all_fields = MAG_MANDATORY_FIELDS + BIN_MANDATORY_FIELDS
-        metadata = pd.read_csv(self.genome_metadata, sep='\t', usecols=all_fields)
-        
+        metadata = pd.read_csv(self.genome_metadata, sep="\t", usecols=all_fields)
+
         # make sure there are no empty cells
         clean_columns = list(metadata.dropna(axis=1))
         if self.genome_type == "MAGs":
             missing_values = [item for item in all_fields if item not in clean_columns]
         else:
             missing_values = [item for item in BIN_MANDATORY_FIELDS if item not in clean_columns]
-        
+
         if missing_values:
-            raise ValueError("The following mandatory fields have missing values in " +
-                "the input file: {}".format(", ".join(missing_values)))
+            raise ValueError(
+                "The following mandatory fields have missing values in " + "the input file: {}".format(", ".join(missing_values))
+            )
 
         # check amount of genomes to register at the same time
         if len(metadata) >= 5000:
@@ -633,58 +676,53 @@ def validate_metadata_tsv(self):
         # check whether accessions follow the right format
         accessions_reg_exp = re.compile(r"([E|S|D]R[R|Z]\d{6,})")
 
-        accession_comparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", 
-            "correct", "mismatching", "co-assembly"])
+        accession_comparison = pd.DataFrame(columns=["genome_name", "attemptive_accessions", "correct", "mismatching", "co-assembly"])
         accession_comparison["genome_name"] = metadata["genome_name"]
 
-        accession_comparison["attemptive_accessions"] = metadata["accessions"].map(
-            lambda a: len(a.split(',')))
+        accession_comparison["attemptive_accessions"] = metadata["accessions"].map(lambda a: len(a.split(",")))
 
-        accession_comparison["correct"] = metadata["accessions"].map(
-            lambda a: len(accessions_reg_exp.findall(a)))
+        accession_comparison["correct"] = metadata["accessions"].map(lambda a: len(accessions_reg_exp.findall(a)))
 
-        accession_comparison["mismatching"] = accession_comparison.apply(lambda row: 
-            True if row["attemptive_accessions"] == row["correct"] 
-            else None, axis=1).isna()
+        accession_comparison["mismatching"] = accession_comparison.apply(
+            lambda row: True if row["attemptive_accessions"] == row["correct"] else None, axis=1
+        ).isna()
 
         mismatching_accessions = accession_comparison[accession_comparison["mismatching"]]["genome_name"]
         if not mismatching_accessions.empty:
-            raise ValueError("Run accessions are not correctly formatted for the following " + 
-                "genomes: " + ','.join(mismatching_accessions.values))
+            raise ValueError(
+                "Run accessions are not correctly formatted for the following " + "genomes: " + ",".join(mismatching_accessions.values)
+            )
 
         # check whether completeness and contamination are floats
         try:
             pd.to_numeric(metadata["completeness"])
             pd.to_numeric(metadata["contamination"])
             pd.to_numeric(metadata["genome_coverage"])
-        except:
+        except ValueError:
             raise ValueError("Completeness, contamination or coverage values should be formatted as floats")
 
         # check whether all co-assemblies have more than one run associated and viceversa
         accession_comparison["co-assembly"] = metadata["co-assembly"]
-        coassembly_discrepancy = metadata[(
-            (accession_comparison["correct"] < 2) & (accession_comparison["co-assembly"])) |
-            ((accession_comparison["correct"] > 1) & (~accession_comparison["co-assembly"])
-            )]["genome_name"]
+        coassembly_discrepancy = metadata[
+            ((accession_comparison["correct"] < 2) & (accession_comparison["co-assembly"]))
+            | ((accession_comparison["correct"] > 1) & (~accession_comparison["co-assembly"]))
+        ]["genome_name"]
         if not coassembly_discrepancy.empty:
-            raise ValueError("The following genomes show discrepancy between number of runs "
-                "involved and co-assembly status: " + ','.join(coassembly_discrepancy.values))
+            raise ValueError(
+                "The following genomes show discrepancy between number of runs "
+                "involved and co-assembly status: " + ",".join(coassembly_discrepancy.values)
+            )
 
         # are provided metagenomes part of the accepted metagenome list?
-        if False in metadata.apply(lambda row: 
-            True if row["metagenome"] in METAGENOMES 
-            else False, axis=1).unique():
-            raise ValueError("Metagenomes associated with each genome need to belong to ENA's " +
-                "approved metagenomes list.")
+        if False in metadata.apply(lambda row: True if row["metagenome"] in METAGENOMES else False, axis=1).unique():
+            raise ValueError("Metagenomes associated with each genome need to belong to ENA's " + "approved metagenomes list.")
 
         # do provided file paths exist?
-        if False in metadata.apply(lambda row: 
-            True if os.path.exists(row["genome_path"]) 
-            else False, axis =1).unique():
+        if False in metadata.apply(lambda row: True if os.path.exists(row["genome_path"]) else False, axis=1).unique():
             raise FileNotFoundError("Some genome paths do not exist.")
 
         # check genome name lengths
-        #if not (metadata["genome_name"].map(lambda a: len(a) < 20).all()):
+        # if not (metadata["genome_name"].map(lambda a: len(a) < 20).all()):
         #    raise ValueError("Genome names must be shorter than 20 characters.")
 
         # create dictionary while checking genome name uniqueness
@@ -692,7 +730,7 @@ def validate_metadata_tsv(self):
         if uniqueness:
             if not self.live:
                 timestamp = str(int(dt.timestamp(dt.now())))
-                timestamp_names = [row["genome_name"] + '_' + timestamp for index, row in metadata.iterrows()]
+                timestamp_names = [row["genome_name"] + "_" + timestamp for index, row in metadata.iterrows()]
                 metadata["unique_genome_name"] = timestamp_names
                 genome_info = metadata.set_index("unique_genome_name").transpose().to_dict()
             else:
@@ -705,7 +743,7 @@ def validate_metadata_tsv(self):
     def extract_genomes_info(self):
         genome_info = self.validate_metadata_tsv()
         for gen in genome_info:
-            genome_info[gen]["accessions"] = genome_info[gen]["accessions"].split(',')
+            genome_info[gen]["accessions"] = genome_info[gen]["accessions"].split(",")
             accession_type = "run"
             assembly_reg_exp = re.compile(r"([E|S|D]RZ\d{6,})")
             if assembly_reg_exp.findall(genome_info[gen]["accessions"][0]):
@@ -713,14 +751,17 @@ def extract_genomes_info(self):
             genome_info[gen]["accessionType"] = accession_type
 
             genome_info[gen]["isolationSource"] = genome_info[gen]["metagenome"]
-            
+
             try:
-                (genome_info[gen]["MAG_quality"], 
-                genome_info[gen]["completeness"], 
-                genome_info[gen]["contamination"]) = compute_mag_quality(
-                                        str(round_stats(genome_info[gen]["completeness"])),
-                                        str(round_stats(genome_info[gen]["contamination"])), 
-                                        genome_info[gen]["rRNA_presence"])
+                (
+                    genome_info[gen]["MAG_quality"],
+                    genome_info[gen]["completeness"],
+                    genome_info[gen]["contamination"],
+                ) = compute_mag_quality(
+                    str(round_stats(genome_info[gen]["completeness"])),
+                    str(round_stats(genome_info[gen]["contamination"])),
+                    genome_info[gen]["rRNA_presence"],
+                )
             except IndexError:
                 pass
 
@@ -728,7 +769,7 @@ def extract_genomes_info(self):
                 genome_info[gen]["co-assembly"] = True
             else:
                 genome_info[gen]["co-assembly"] = False
-            
+
             genome_info[gen]["alias"] = gen
 
             tax_id, scientific_name = extract_tax_info(genome_info[gen]["NCBI_lineage"])
@@ -738,8 +779,8 @@ def extract_genomes_info(self):
         return genome_info
 
     def extract_ena_info(self, genome_info):
-        logger.info('Retrieving project and run info from ENA (this might take a while)...')
-        
+        logger.info("Retrieving project and run info from ENA (this might take a while)...")
+
         # retrieving metadata from runs (and runs from assembly accessions if provided)
         all_runs = []
         for g in genome_info:
@@ -757,14 +798,14 @@ def extract_ena_info(self, genome_info):
             run_info = ena_query.build_query()
             study_set.add(run_info["secondary_study_accession"])
             samples_dict[r] = run_info["sample_accession"]
-        
+
         if not study_set:
             raise ValueError("No study corresponding to runs found.")
 
         backup_file = os.path.join(self.upload_dir, "ENA_backup.json")
         counter = 0
         if not os.path.exists(backup_file):
-            with open(backup_file, 'w') as file:
+            with open(backup_file, "w") as file:
                 pass
         with open(backup_file, "r+") as file:
             try:
@@ -782,7 +823,7 @@ def extract_ena_info(self, genome_info):
                 ena_info = ena_query.build_query()
                 if ena_info == []:
                     raise IOError("No runs found on ENA for project {}.".format(s))
-                
+
                 for run, item in enumerate(ena_info):
                     run_accession = ena_info[run]["run_accession"]
                     if run_accession not in backup_dict:
@@ -791,23 +832,23 @@ def extract_ena_info(self, genome_info):
                             ena_query = EnaQuery(sample_accession, "sample", self.private)
                             sample_info = ena_query.build_query()
 
-                            if 'latitude' in sample_info and 'longitude' in sample_info:
-                                latitude =  sample_info['latitude']
-                                longitude = sample_info['longitude']
+                            if "latitude" in sample_info and "longitude" in sample_info:
+                                latitude = sample_info["latitude"]
+                                longitude = sample_info["longitude"]
                             else:
                                 location = sample_info["location"]
                                 latitude, longitude = None, None
-                                if 'N' in location:
-                                    latitude = location.split('N')[0].strip()
-                                    longitude = location.split('N')[1].strip()
-                                elif 'S' in location:
-                                    latitude = '-' + location.split('S')[0].strip()
-                                    longitude = location.split('S')[1].strip()
-
-                                if 'W' in longitude:
-                                    longitude = '-' + longitude.split('W')[0].strip()
-                                elif longitude.endswith('E'):
-                                    longitude = longitude.split('E')[0].strip()
+                                if "N" in location:
+                                    latitude = location.split("N")[0].strip()
+                                    longitude = location.split("N")[1].strip()
+                                elif "S" in location:
+                                    latitude = "-" + location.split("S")[0].strip()
+                                    longitude = location.split("S")[1].strip()
+
+                                if "W" in longitude:
+                                    longitude = "-" + longitude.split("W")[0].strip()
+                                elif longitude.endswith("E"):
+                                    longitude = longitude.split("E")[0].strip()
 
                             if latitude:
                                 latitude = "{:.{}f}".format(round(float(latitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS)
@@ -818,35 +859,34 @@ def extract_ena_info(self, genome_info):
                                 longitude = "{:.{}f}".format(round(float(longitude), GEOGRAPHY_DIGIT_COORDS), GEOGRAPHY_DIGIT_COORDS)
                             else:
                                 longitude = "not provided"
-    
-                            country = sample_info["country"].split(':')[0]
-                            if not country in GEOGRAPHIC_LOCATIONS:
+
+                            country = sample_info["country"].split(":")[0]
+                            if country not in GEOGRAPHIC_LOCATIONS:
                                 country = "not provided"
-                            
+
                             collectionDate = sample_info["collection_date"]
                             if collectionDate == "" or collectionDate == "missing":
                                 collectionDate = "not provided"
 
                             temp_dict[run_accession] = {
-                                "instrumentModel" : ena_info[run]["instrument_model"],
-                                "collectionDate" : collectionDate,
-                                "country" : country,
-                                "latitude" : latitude,
-                                "longitude" : longitude,
-                                "projectDescription" : project_description,
-                                "study" : s,
-                                "sampleAccession" : samples_dict[run_accession]
+                                "instrumentModel": ena_info[run]["instrument_model"],
+                                "collectionDate": collectionDate,
+                                "country": country,
+                                "latitude": latitude,
+                                "longitude": longitude,
+                                "projectDescription": project_description,
+                                "study": s,
+                                "sampleAccession": samples_dict[run_accession],
                             }
                             counter += 1
 
-                            if (counter%10 == 0) or (len(runs_set) - len(backup_dict) == counter):
+                            if (counter % 10 == 0) or (len(runs_set) - len(backup_dict) == counter):
                                 file.seek(0)
                                 file.write(json.dumps(temp_dict))
                                 file.truncate()
         temp_dict = {**temp_dict, **backup_dict}
         combine_ena_info(genome_info, temp_dict)
 
-
     def generate_genomes_upload_dir(self):
         upload_name = "MAG_upload"
         if self.genome_type == "bins":
@@ -856,7 +896,7 @@ def generate_genomes_upload_dir(self):
         return upload_dir
 
     def create_genome_dictionary(self, samples_xml):
-        logger.info('Retrieving data for MAG submission...')
+        logger.info("Retrieving data for MAG submission...")
 
         genome_info = self.extract_genomes_info()
 
@@ -864,18 +904,16 @@ def create_genome_dictionary(self, samples_xml):
             self.extract_ena_info(genome_info)
             logger.info("Writing genome registration XML...")
 
-            write_genomes_xml(genome_info, samples_xml, self.genome_type, 
-                              self.centre_name, self.tpa)
+            write_genomes_xml(genome_info, samples_xml, self.genome_type, self.centre_name, self.tpa)
             logger.info("All files have been written to " + self.upload_dir)
         else:
             recover_info_from_xml(genome_info, samples_xml, self.live)
 
         return genome_info
-    
+
     def genome_upload(self):
         if not self.live:
-            logger.warning("Warning: genome submission is not in live mode, " +
-                           "files will be validated, but not uploaded.")
+            logger.warning("Warning: genome submission is not in live mode, " + "files will be validated, but not uploaded.")
         samples_xml = os.path.join(self.upload_dir, "genome_samples.xml")
         submission_xml = os.path.join(self.upload_dir, "submission.xml")
         genomes, manifest_info = {}, {}
@@ -883,7 +921,7 @@ def genome_upload(self):
         # submission xml existence
         if not os.path.exists(submission_xml):
             submission_xml = write_submission_xml(self.upload_dir, self.centre_name, False)
-        
+
         # sample xml generation or recovery
         genomes = self.create_genome_dictionary(samples_xml)
 
@@ -899,18 +937,18 @@ def genome_upload(self):
 
         accessions_file = os.path.join(self.upload_dir, accessionsgen)
         save = False
-        write_mode = 'a'
+        write_mode = "a"
         if os.path.exists(accessions_file):
             if not self.live:
                 save = True
                 if self.force:
-                    write_mode = 'w'
+                    write_mode = "w"
             if not save:
                 logger.info("Genome samples already registered, reading ERS accessions...")
                 alias_to_new_sample_accession = get_accessions(accessions_file)
         else:
             save = True
-            
+
         if save:
             logger.info("Registering genome samples XMLs...")
             ena_submit = EnaSubmit(samples_xml, submission_xml, self.live)
@@ -922,8 +960,9 @@ def genome_upload(self):
         manifest_info = compute_manifests(genomes)
 
         for m in manifest_info:
-            generate_genome_manifest(manifest_info[m], self.upload_study,  
-                manifest_dir, alias_to_new_sample_accession, self.genome_type, self.tpa)
+            generate_genome_manifest(
+                manifest_info[m], self.upload_study, manifest_dir, alias_to_new_sample_accession, self.genome_type, self.tpa
+            )
 
 
 def main():
@@ -937,10 +976,11 @@ def main():
         private=args.private,
         tpa=args.tpa,
         force=args.force,
-        out = args.out,
+        out=args.out,
     )
     ena_upload.genome_upload()
-    
+
+
 if __name__ == "__main__":
     main()
-    logger.info('Completed')
+    logger.info("Completed")

From 77dcaea6dab8274b5ffe5c389d85c96859a95bdd Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Thu, 19 Dec 2024 14:06:49 +0000
Subject: [PATCH 12/20] always try public APIs if private fails

---
 genomeuploader/ena.py | 67 +++++++++++++++++++++++++++++++++----------
 1 file changed, 52 insertions(+), 15 deletions(-)

diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py
index d968e55..d181d82 100644
--- a/genomeuploader/ena.py
+++ b/genomeuploader/ena.py
@@ -343,31 +343,68 @@ def _get_public_sample(self):
     def build_query(self):
         if self.query_type == "study":
             if self.private:
-                ena_response = self._get_private_study()
-            else:
-                ena_response = self._get_public_study()
+                try:
+                    ena_response = self._get_private_study()
+                except Exception:
+                    logging.info("Private API failed, trying public")
+            ena_response = self._get_public_study()
         elif self.query_type == "run":
             if self.private:
-                ena_response = self._get_private_run()
-            else:
-                ena_response = self._get_public_run()
+                try:
+                    ena_response = self._get_private_run()
+                except Exception:
+                    logging.info("Private API failed, trying public")
+
+            ena_response = self._get_public_run()
         elif self.query_type == "run_assembly":
             if self.private:
-                ena_response = self._get_private_run_from_assembly()
-            else:
-                ena_response = self._get_public_run_from_assembly()
+                try:
+                    ena_response = self._get_private_run_from_assembly()
+                except Exception:
+                    logging.info("Private API failed, trying public")
+            ena_response = self._get_public_run_from_assembly()
         elif self.query_type == "study_runs":
             if self.private:
-                ena_response = self._get_private_study_runs()
-            else:
-                ena_response = self._get_public_study_runs()
+                try:
+                    ena_response = self._get_private_study_runs()
+                except Exception:
+                    logging.info("Private API failed, trying public")
+            ena_response = self._get_public_study_runs()
         elif self.query_type == "sample":
             if self.private:
-                ena_response = self._get_private_sample()
-            else:
-                ena_response = self._get_public_sample()
+                try:    
+                    ena_response = self._get_private_sample()
+                except Exception:
+                    logging.info("Private API failed, trying public")
+            ena_response = self._get_public_sample()
         return ena_response
 
+    def build_query(self):
+        """If the private flag is given, assume private data and try private APIs.
+        ENA also has cases where a run may be private but the sample may be public etc. Hence always try 
+        public if private fails"""
+        api_map = {
+            "study": (self._get_private_study, self._get_public_study),
+            "run": (self._get_private_run, self._get_public_run),
+            "run_assembly": (self._get_private_run_from_assembly, self._get_public_run_from_assembly),
+            "study_runs": (self._get_private_study_runs, self._get_public_study_runs),
+            "sample": (self._get_private_sample, self._get_public_sample)
+        }
+
+        private_api, public_api = api_map.get(self.query_type, (None, None))
+
+        if self.private:
+            try:
+                ena_response = private_api()
+            except Exception:
+                logging.info(f"Private API for {self.query_type} failed, trying public.")
+                ena_response = public_api()
+        else:
+            ena_response = public_api()
+
+        return ena_response
+
+
 
 class EnaSubmit:
     def __init__(self, sample_xml, submission_xml, live=False):

From b136582edd0356275bcd934123ae9a3d84e92a38 Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Thu, 19 Dec 2024 18:43:42 +0000
Subject: [PATCH 13/20] add unit tests for ena queries

---
 genomeuploader/ena.py                         |  62 ++-----
 genomeuploader/genome_upload.py               |   2 +-
 tests/conftest.py                             | 100 +++++++++++
 tests/fixtures/responses/private_run.json     |  17 ++
 .../responses/private_study_runs.json         | 152 +++++++++++++++++
 tests/fixtures/responses/public_run.json      |   7 +
 tests/fixtures/responses/public_sample.json   |   8 +
 tests/fixtures/responses/public_study.json    |   6 +
 .../fixtures/responses/public_study_runs.json |  52 ++++++
 tests/unit/test_ena.py                        | 158 ++++++++++++++++++
 10 files changed, 514 insertions(+), 50 deletions(-)
 create mode 100644 tests/conftest.py
 create mode 100644 tests/fixtures/responses/private_run.json
 create mode 100644 tests/fixtures/responses/private_study_runs.json
 create mode 100644 tests/fixtures/responses/public_run.json
 create mode 100644 tests/fixtures/responses/public_sample.json
 create mode 100644 tests/fixtures/responses/public_study.json
 create mode 100644 tests/fixtures/responses/public_study_runs.json
 create mode 100644 tests/unit/test_ena.py

diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py
index d181d82..f5ae0b9 100644
--- a/genomeuploader/ena.py
+++ b/genomeuploader/ena.py
@@ -43,9 +43,9 @@ class NoDataException(ValueError):
 ASSEMBLY_DEFAULT_FIELDS = "sample_accession"
 
 
-SAMPLE_DEFAULT_FIELDS = ",".join(["sample_accession", "secondary_sample_accession", "collection_date", "country", "location"])
+SAMPLE_DEFAULT_FIELDS = ",".join(["sample_accession", "collection_date", "country", "location"])
 
-STUDY_DEFAULT_FIELDS = "study_description"
+STUDY_DEFAULT_FIELDS = "study_accession,study_description"
 
 RETRY_COUNT = 3
 
@@ -71,9 +71,11 @@ def parse_accession(accession):
     elif "RR" in accession:
         return "run_accession"
     elif "SAM" in accession:
-        return "secondary_sample_accession"
-    elif "RS" in accession:
         return "sample_accession"
+    elif "RS" in accession:
+        return "secondary_sample_accession"
+    elif "RZ" in accession:
+        return "analysis_accession"
     else:
         logging.error(f"{accession} is not a valid accession")
         sys.exit()
@@ -192,7 +194,7 @@ def get_data_or_handle_error(self, response, xml=False, full_json=False):
                     return response.json()
                 else:
                     #   only return the fist element in the list is the default. In these cases there should only be one entry returned
-                    return json.loads(response.txt)[0]
+                    return json.loads(response.text)[0]
         except (IndexError, TypeError, ValueError, KeyError):
             logging.error(f"Failed to fetch {self.accession}, returned error: {response.text}")
 
@@ -226,7 +228,8 @@ def _get_private_run(self):
         run = self.get_data_or_handle_error(response)
         run_data = run["report"]
         reformatted_data = {
-            "secondary_study_accession": run_data["studyID"],
+            "run_accession": run_data["id"],
+            "secondary_study_accession": run_data["studyId"],
             "sample_accession": run_data["sampleId"],
         }
         logging.info(f"{self.accession} private run returned from ENA")
@@ -251,7 +254,7 @@ def _get_private_study(self):
         response = self.retry_or_handle_request_error(self.get_request, url)
         study = self.get_data_or_handle_error(response, xml=True)
         study_desc = study.getElementsByTagName("STUDY_DESCRIPTION")[0].firstChild.nodeValue
-        reformatted_data = {"study_description": study_desc}
+        reformatted_data = {"study_accession": self.accession, "study_description": study_desc}
         logging.info(f"{self.accession} private study returned from ENA")
         return reformatted_data
 
@@ -317,6 +320,7 @@ def _get_private_sample(self):
         url = f"{self.private_url}/samples/xml/{self.accession}"
         reponse = self.retry_or_handle_request_error(self.get_request, url)
         reformatted_data = {}
+        reformatted_data["sample_accession"] = self.accession
         parsed_xml = self.get_data_or_handle_error(reponse, xml=True)
         sample_attributes = parsed_xml.getElementsByTagName("SAMPLE_ATTRIBUTE")
         for attribute in sample_attributes:
@@ -340,55 +344,16 @@ def _get_public_sample(self):
         logging.info(f"{self.accession} public sample returned from ENA")
         return sample
 
-    def build_query(self):
-        if self.query_type == "study":
-            if self.private:
-                try:
-                    ena_response = self._get_private_study()
-                except Exception:
-                    logging.info("Private API failed, trying public")
-            ena_response = self._get_public_study()
-        elif self.query_type == "run":
-            if self.private:
-                try:
-                    ena_response = self._get_private_run()
-                except Exception:
-                    logging.info("Private API failed, trying public")
-
-            ena_response = self._get_public_run()
-        elif self.query_type == "run_assembly":
-            if self.private:
-                try:
-                    ena_response = self._get_private_run_from_assembly()
-                except Exception:
-                    logging.info("Private API failed, trying public")
-            ena_response = self._get_public_run_from_assembly()
-        elif self.query_type == "study_runs":
-            if self.private:
-                try:
-                    ena_response = self._get_private_study_runs()
-                except Exception:
-                    logging.info("Private API failed, trying public")
-            ena_response = self._get_public_study_runs()
-        elif self.query_type == "sample":
-            if self.private:
-                try:    
-                    ena_response = self._get_private_sample()
-                except Exception:
-                    logging.info("Private API failed, trying public")
-            ena_response = self._get_public_sample()
-        return ena_response
-
     def build_query(self):
         """If the private flag is given, assume private data and try private APIs.
-        ENA also has cases where a run may be private but the sample may be public etc. Hence always try 
+        ENA also has cases where a run may be private but the sample may be public etc. Hence always try
         public if private fails"""
         api_map = {
             "study": (self._get_private_study, self._get_public_study),
             "run": (self._get_private_run, self._get_public_run),
             "run_assembly": (self._get_private_run_from_assembly, self._get_public_run_from_assembly),
             "study_runs": (self._get_private_study_runs, self._get_public_study_runs),
-            "sample": (self._get_private_sample, self._get_public_sample)
+            "sample": (self._get_private_sample, self._get_public_sample),
         }
 
         private_api, public_api = api_map.get(self.query_type, (None, None))
@@ -405,7 +370,6 @@ def build_query(self):
         return ena_response
 
 
-
 class EnaSubmit:
     def __init__(self, sample_xml, submission_xml, live=False):
         self.sample_xml = sample_xml
diff --git a/genomeuploader/genome_upload.py b/genomeuploader/genome_upload.py
index 264d57d..4436195 100755
--- a/genomeuploader/genome_upload.py
+++ b/genomeuploader/genome_upload.py
@@ -51,7 +51,7 @@ def parse_args(argv):
     parser.add_argument("-u", "--upload_study", type=str, required=True, help="Study accession for genomes upload")
     parser.add_argument("--genome_info", type=str, required=True, help="Genomes metadata file")
 
-    genomeType = parser.add_mutually_exclusive_group(required=True, default=False)
+    genomeType = parser.add_mutually_exclusive_group(required=True)
     genomeType.add_argument("-m", "--mags", action="store_true", help="Select for MAG upload")
     genomeType.add_argument("-b", "--bins", action="store_true", help="Select for bin upload")
 
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..c08e123
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,100 @@
+from pathlib import Path
+
+import pytest
+
+@pytest.fixture(scope="module")
+def test_file_dir():
+    return Path(__file__).resolve().parent / Path("fixtures/responses")
+
+@pytest.fixture(scope="module")
+def public_study_json(test_file_dir):
+    return test_file_dir / Path("public_study.json")
+
+@pytest.fixture(scope="module")
+def private_study_xml(test_file_dir):
+    return test_file_dir / Path("private_study.xml")
+
+@pytest.fixture(scope="module")
+def public_run_json(test_file_dir):
+    return test_file_dir / Path("public_run.json")
+
+@pytest.fixture(scope="module")
+def private_run_json(test_file_dir):
+    return test_file_dir / Path("private_run.json")
+
+@pytest.fixture(scope="module")
+def public_run_from_assembly_xml(test_file_dir):
+    return test_file_dir / Path("public_run_from_assembly.xml")
+
+@pytest.fixture(scope="module")
+def private_run_from_assembly_xml(test_file_dir):
+    return test_file_dir / Path("private_run_from_assembly.xml")
+
+@pytest.fixture(scope="module")
+def public_study_runs_json(test_file_dir):
+    return test_file_dir / Path("public_study_runs.json")
+
+@pytest.fixture(scope="module")
+def private_study_runs_json(test_file_dir):
+    return test_file_dir / Path("private_study_runs.json")
+
+@pytest.fixture(scope="module")
+def public_sample_json(test_file_dir):
+    return test_file_dir / Path("public_sample.json")
+
+@pytest.fixture(scope="module")
+def private_sample_xml(test_file_dir):
+    return test_file_dir / Path("private_sample.xml")
+
+
+@pytest.fixture
+def public_study_data():
+    return {
+    "study_accession": "PRJEB41657",
+    "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265])."
+    }
+
+#   private XML has no PRJ study ID
+@pytest.fixture
+def private_study_data():
+    return {
+    "study_accession": "ERP125469",
+    "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265])."
+    }
+
+@pytest.fixture
+def public_run_data():
+    return {
+        "run_accession": "ERR4918394",
+        "sample_accession": "SAMEA7687881",
+        "secondary_study_accession": "ERP125469"
+    }
+
+#   private json only has ERS sample IDs
+@pytest.fixture
+def private_run_data():
+    return {
+        "run_accession": "ERR4918394",
+        "sample_accession": "ERS5444411",
+        "secondary_study_accession": "ERP125469"
+    }
+
+@pytest.fixture
+def public_sample_data():
+    return {
+        "sample_accession": "SAMEA7687881",
+        "collection_date": "2019-06-11",
+        "country": "Norway",
+        "location": "66.079905 N 12.587848 E"
+  }
+
+#  private xml can only query with RS accession and lat and long are split
+@pytest.fixture
+def private_sample_data():
+    return {
+        "sample_accession": "ERS5444411",
+        "collection_date": "2019-06-11",
+        "country": "Norway",
+        'latitude': '66.079905',
+        'longitude': '12.587848'
+  }
diff --git a/tests/fixtures/responses/private_run.json b/tests/fixtures/responses/private_run.json
new file mode 100644
index 0000000..09c0a03
--- /dev/null
+++ b/tests/fixtures/responses/private_run.json
@@ -0,0 +1,17 @@
+[
+  {
+    "report": {
+      "id": "ERR4918394",
+      "alias": "webin-reads-SA01_02C1a_metaG",
+      "instrumentModel": "DNBSEQ-G400",
+      "firstCreated": "2020-12-04T12:14:11",
+      "firstPublic": "2022-08-02T17:24:21",
+      "releaseStatus": "PUBLIC",
+      "submissionAccountId": "Webin-51990",
+      "studyId": "ERP125469",
+      "experimentId": "ERX4783303",
+      "sampleId": "ERS5444411"
+    },
+    "links": []
+  }
+]
diff --git a/tests/fixtures/responses/private_study_runs.json b/tests/fixtures/responses/private_study_runs.json
new file mode 100644
index 0000000..b3f7149
--- /dev/null
+++ b/tests/fixtures/responses/private_study_runs.json
@@ -0,0 +1,152 @@
+[
+  {
+    "report": {
+      "id": "ERR4918752",
+      "alias": "webin-reads-SB06_18C1a_metaG",
+      "instrumentModel": "DNBSEQ-G400",
+      "firstCreated": "2020-12-04T12:32:19",
+      "firstPublic": "2022-08-02T17:24:21",
+      "releaseStatus": "PUBLIC",
+      "submissionAccountId": "Webin-51990",
+      "studyId": "ERP125469",
+      "experimentId": "ERX4783661",
+      "sampleId": "ERS5444665"
+    },
+    "links": []
+  },
+  {
+    "report": {
+      "id": "ERR4918751",
+      "alias": "webin-reads-SB06_13C1a_metaG",
+      "instrumentModel": "DNBSEQ-G400",
+      "firstCreated": "2020-12-04T12:31:46",
+      "firstPublic": "2022-08-02T17:24:21",
+      "releaseStatus": "PUBLIC",
+      "submissionAccountId": "Webin-51990",
+      "studyId": "ERP125469",
+      "experimentId": "ERX4783660",
+      "sampleId": "ERS5444660"
+    },
+    "links": []
+  },
+  {
+    "report": {
+      "id": "ERR4918750",
+      "alias": "webin-reads-SB10_16C1a_metaG",
+      "instrumentModel": "DNBSEQ-G400",
+      "firstCreated": "2020-12-04T12:31:17",
+      "firstPublic": "2022-08-02T17:25:05",
+      "releaseStatus": "PUBLIC",
+      "submissionAccountId": "Webin-51990",
+      "studyId": "ERP125469",
+      "experimentId": "ERX4783659",
+      "sampleId": "ERS5444722"
+    },
+    "links": []
+  },
+  {
+    "report": {
+      "id": "ERR4918749",
+      "alias": "webin-reads-SB10_30C1a_metaG",
+      "instrumentModel": "DNBSEQ-G400",
+      "firstCreated": "2020-12-04T12:30:42",
+      "firstPublic": "2022-08-02T17:24:20",
+      "releaseStatus": "PUBLIC",
+      "submissionAccountId": "Webin-51990",
+      "studyId": "ERP125469",
+      "experimentId": "ERX4783658",
+      "sampleId": "ERS5444736"
+    },
+    "links": []
+  },
+  {
+    "report": {
+      "id": "ERR4918748",
+      "alias": "webin-reads-SB11_25C1a_metaG",
+      "instrumentModel": "DNBSEQ-G400",
+      "firstCreated": "2020-12-04T12:30:20",
+      "firstPublic": "2022-08-02T17:25:03",
+      "releaseStatus": "PUBLIC",
+      "submissionAccountId": "Webin-51990",
+      "studyId": "ERP125469",
+      "experimentId": "ERX4783657",
+      "sampleId": "ERS5444756"
+    },
+    "links": []
+  },
+  {
+    "report": {
+      "id": "ERR4918747",
+      "alias": "webin-reads-SB11_13C1a_metaG",
+      "instrumentModel": "DNBSEQ-G400",
+      "firstCreated": "2020-12-04T12:30:12",
+      "firstPublic": "2022-08-02T17:25:04",
+      "releaseStatus": "PUBLIC",
+      "submissionAccountId": "Webin-51990",
+      "studyId": "ERP125469",
+      "experimentId": "ERX4783656",
+      "sampleId": "ERS5444744"
+    },
+    "links": []
+  },
+  {
+    "report": {
+      "id": "ERR4918746",
+      "alias": "webin-reads-SB11_29C1a_metaG",
+      "instrumentModel": "DNBSEQ-G400",
+      "firstCreated": "2020-12-04T12:29:52",
+      "firstPublic": "2022-08-02T17:24:21",
+      "releaseStatus": "PUBLIC",
+      "submissionAccountId": "Webin-51990",
+      "studyId": "ERP125469",
+      "experimentId": "ERX4783655",
+      "sampleId": "ERS5444760"
+    },
+    "links": []
+  },
+  {
+    "report": {
+      "id": "ERR4918745",
+      "alias": "webin-reads-SA09_02C1a_metaG",
+      "instrumentModel": "DNBSEQ-G400",
+      "firstCreated": "2020-12-04T12:29:29",
+      "firstPublic": "2022-08-02T17:24:20",
+      "releaseStatus": "PUBLIC",
+      "submissionAccountId": "Webin-51990",
+      "studyId": "ERP125469",
+      "experimentId": "ERX4783654",
+      "sampleId": "ERS5444529"
+    },
+    "links": []
+  },
+  {
+    "report": {
+      "id": "ERR4918744",
+      "alias": "webin-reads-SB08_05C1a_metaG",
+      "instrumentModel": "DNBSEQ-G400",
+      "firstCreated": "2020-12-04T12:29:27",
+      "firstPublic": "2022-08-02T17:24:21",
+      "releaseStatus": "PUBLIC",
+      "submissionAccountId": "Webin-51990",
+      "studyId": "ERP125469",
+      "experimentId": "ERX4783653",
+      "sampleId": "ERS5444707"
+    },
+    "links": []
+  },
+  {
+    "report": {
+      "id": "ERR4918743",
+      "alias": "webin-reads-SB11_28C1a_metaG",
+      "instrumentModel": "DNBSEQ-G400",
+      "firstCreated": "2020-12-04T12:29:25",
+      "firstPublic": "2022-08-02T17:25:05",
+      "releaseStatus": "PUBLIC",
+      "submissionAccountId": "Webin-51990",
+      "studyId": "ERP125469",
+      "experimentId": "ERX4783652",
+      "sampleId": "ERS5444759"
+    },
+    "links": []
+  }
+]
diff --git a/tests/fixtures/responses/public_run.json b/tests/fixtures/responses/public_run.json
new file mode 100644
index 0000000..cc790e8
--- /dev/null
+++ b/tests/fixtures/responses/public_run.json
@@ -0,0 +1,7 @@
+[
+  {
+    "run_accession": "ERR4918394",
+    "sample_accession": "SAMEA7687881",
+    "secondary_study_accession": "ERP125469"
+  }
+]
diff --git a/tests/fixtures/responses/public_sample.json b/tests/fixtures/responses/public_sample.json
new file mode 100644
index 0000000..d59db96
--- /dev/null
+++ b/tests/fixtures/responses/public_sample.json
@@ -0,0 +1,8 @@
+[
+  {
+    "sample_accession": "SAMEA7687881",
+    "collection_date": "2019-06-11",
+    "country": "Norway",
+    "location": "66.079905 N 12.587848 E"
+  }
+]
diff --git a/tests/fixtures/responses/public_study.json b/tests/fixtures/responses/public_study.json
new file mode 100644
index 0000000..482c952
--- /dev/null
+++ b/tests/fixtures/responses/public_study.json
@@ -0,0 +1,6 @@
+[
+    {
+      "study_accession": "PRJEB41657",
+      "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265])."
+    }
+  ]
\ No newline at end of file
diff --git a/tests/fixtures/responses/public_study_runs.json b/tests/fixtures/responses/public_study_runs.json
new file mode 100644
index 0000000..ae8c790
--- /dev/null
+++ b/tests/fixtures/responses/public_study_runs.json
@@ -0,0 +1,52 @@
+[
+  {
+    "run_accession": "ERR4918394",
+    "sample_accession": "SAMEA7687881",
+    "instrument_model": "DNBSEQ-G400"
+  },
+  {
+    "run_accession": "ERR4918397",
+    "sample_accession": "SAMEA7687882",
+    "instrument_model": "DNBSEQ-G400"
+  },
+  {
+    "run_accession": "ERR4918398",
+    "sample_accession": "SAMEA7678309",
+    "instrument_model": "DNBSEQ-G400"
+  },
+  {
+    "run_accession": "ERR4918404",
+    "sample_accession": "SAMEA7687910",
+    "instrument_model": "DNBSEQ-G400"
+  },
+  {
+    "run_accession": "ERR4918405",
+    "sample_accession": "SAMEA7687902",
+    "instrument_model": "DNBSEQ-G400"
+  },
+  {
+    "run_accession": "ERR4918406",
+    "sample_accession": "SAMEA7687895",
+    "instrument_model": "DNBSEQ-G400"
+  },
+  {
+    "run_accession": "ERR4918408",
+    "sample_accession": "SAMEA7687899",
+    "instrument_model": "DNBSEQ-G400"
+  },
+  {
+    "run_accession": "ERR4918413",
+    "sample_accession": "SAMEA7687903",
+    "instrument_model": "DNBSEQ-G400"
+  },
+  {
+    "run_accession": "ERR4918414",
+    "sample_accession": "SAMEA7687912",
+    "instrument_model": "DNBSEQ-G400"
+  },
+  {
+    "run_accession": "ERR4918416",
+    "sample_accession": "SAMEA7687892",
+    "instrument_model": "DNBSEQ-G400"
+  }
+]
diff --git a/tests/unit/test_ena.py b/tests/unit/test_ena.py
new file mode 100644
index 0000000..8917fca
--- /dev/null
+++ b/tests/unit/test_ena.py
@@ -0,0 +1,158 @@
+import pytest
+import responses
+from requests.exceptions import ConnectionError, HTTPError
+import json
+
+from genomeuploader.ena import EnaQuery
+#from assembly_uploader.webin_utils import ENA_WEBIN, ENA_WEBIN_PASSWORD
+
+
+def read_json(path):
+    with open(path, 'r') as jpath:
+        return json.load(jpath)
+
+def read_xml(path):
+    with open(path, 'r') as xpath:
+        return xpath.read()
+
+
+def test_ena_study(public_study_data, private_study_data, public_study_json, private_study_xml):
+    responses.add(
+        responses.POST,
+        "https://www.ebi.ac.uk/ena/portal/api/search",
+        json= read_json(public_study_json)
+    )
+
+    responses.add(
+        responses.GET,
+        "https://www.ebi.ac.uk/ena/submit/report/studies/xml/ERP125469",
+        body= read_xml(private_study_xml),
+        content_type='application/xml'
+    )
+
+    ena_study_public = EnaQuery(
+        accession="ERP125469",
+        query_type="study",
+        private=False
+    )
+
+    ena_study_private = EnaQuery(
+        accession="ERP125469",
+        query_type="study",
+        private=True,
+    )
+    assert ena_study_public.build_query() == public_study_data
+    assert ena_study_private.build_query() == private_study_data
+
+
+def test_ena_run(public_run_data, private_run_data, public_run_json, private_run_json):
+    responses.add(
+        responses.POST,
+        "https://www.ebi.ac.uk/ena/portal/api/search",
+        json = read_json(public_run_json)
+    )
+
+    responses.add(
+        responses.GET,
+        "https://www.ebi.ac.uk/ena/submit/report/runs/ERR4918394",
+        json = read_json(private_run_json)
+    )
+
+    ena_run_public = EnaQuery(
+        accession="ERR4918394",
+        query_type="run",
+        private=False
+    )
+
+    ena_run_private = EnaQuery(
+        accession="ERR4918394",
+        query_type="run",
+        private=True
+    )
+    assert ena_run_public.build_query() == public_run_data
+    assert ena_run_private.build_query() == private_run_data
+
+def test_ena_run_from_assembly(public_run_from_assembly_xml, private_run_from_assembly_xml):
+    responses.add(
+        responses.GET,
+        "https://www.ebi.ac.uk/ena/browser/api/xml/ERZ2626953",
+        body = read_xml(public_run_from_assembly_xml),
+        content_type='application/xml'
+    )
+
+    responses.add(
+        responses.GET,
+        "https://www.ebi.ac.uk/ena/submit/report/analyses/xml/ERZ2626953",
+        body = read_xml(private_run_from_assembly_xml),
+        content_type='application/xml'
+    )
+
+    ena_run_from_assembly_public = EnaQuery(
+        accession="ERZ2626953",
+        query_type="run_assembly",
+        private=False
+    )
+
+    ena_run_from_assembly_public = EnaQuery(
+        accession="ERZ2626953",
+        query_type="run_assembly",
+        private=True
+    )
+
+    assert ena_run_from_assembly_public.build_query() and ena_run_from_assembly_public.build_query() == "ERR4918394"
+
+def test_ena_study_runs(public_study_runs_json, private_study_runs_json):
+    """response mock limited to 10 runs for the study, need to check if there is page limit"""
+    responses.add(
+        responses.POST,
+        "https://www.ebi.ac.uk/ena/portal/api/search",
+        json = read_json(public_study_runs_json)
+    )
+
+    responses.add(
+        responses.GET,
+        "https://www.ebi.ac.uk/ena/submit/report/runs/ERP125469",
+        json = read_json(private_study_runs_json)
+    )
+
+    ena_study_runs_public = EnaQuery(
+        accession="ERP125469",
+        query_type="study_runs",
+        private=False
+    )
+
+    ena_study_runs_private = EnaQuery(
+        accession="ERP125469",
+        query_type="study_runs",
+        private=True
+    )
+
+    assert len(ena_study_runs_public.build_query()) and len(ena_study_runs_private.build_query()) == 10 
+
+def test_ena_sample(public_sample_data, private_sample_data, public_sample_json, private_sample_xml):
+    responses.add(
+        responses.POST,
+        "https://www.ebi.ac.uk/ena/portal/api/search",
+        json = read_json(public_sample_json)
+    )
+
+    responses.add(
+        responses.GET,
+        "https://www.ebi.ac.uk/ena/submit/report/samples/xml/ERS5444411",
+        body = read_xml(private_sample_xml)
+    )
+
+    ena_sample_public = EnaQuery(
+        accession="SAMEA7687881",
+        query_type="sample",
+        private=False
+    )
+
+    ena_sample_private = EnaQuery(
+        accession="ERS5444411",
+        query_type="sample",
+        private=True
+    )
+
+    assert ena_sample_public.build_query() == public_sample_data
+    assert ena_sample_private.build_query() == private_sample_data
\ No newline at end of file

From d249395c69e899eaa4858102c7fe884a6662fde4 Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Fri, 27 Dec 2024 16:27:41 +0000
Subject: [PATCH 14/20] add exception test

---
 tests/conftest.py      |  49 ++++++------
 tests/unit/test_ena.py | 174 ++++++++++++++++++++---------------------
 2 files changed, 112 insertions(+), 111 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index c08e123..032779d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,46 +2,57 @@
 
 import pytest
 
+
 @pytest.fixture(scope="module")
 def test_file_dir():
     return Path(__file__).resolve().parent / Path("fixtures/responses")
 
+
 @pytest.fixture(scope="module")
 def public_study_json(test_file_dir):
     return test_file_dir / Path("public_study.json")
 
+
 @pytest.fixture(scope="module")
 def private_study_xml(test_file_dir):
     return test_file_dir / Path("private_study.xml")
 
+
 @pytest.fixture(scope="module")
 def public_run_json(test_file_dir):
     return test_file_dir / Path("public_run.json")
 
+
 @pytest.fixture(scope="module")
 def private_run_json(test_file_dir):
     return test_file_dir / Path("private_run.json")
 
+
 @pytest.fixture(scope="module")
 def public_run_from_assembly_xml(test_file_dir):
     return test_file_dir / Path("public_run_from_assembly.xml")
 
+
 @pytest.fixture(scope="module")
 def private_run_from_assembly_xml(test_file_dir):
     return test_file_dir / Path("private_run_from_assembly.xml")
 
+
 @pytest.fixture(scope="module")
 def public_study_runs_json(test_file_dir):
     return test_file_dir / Path("public_study_runs.json")
 
+
 @pytest.fixture(scope="module")
 def private_study_runs_json(test_file_dir):
     return test_file_dir / Path("private_study_runs.json")
 
+
 @pytest.fixture(scope="module")
 def public_sample_json(test_file_dir):
     return test_file_dir / Path("public_sample.json")
 
+
 @pytest.fixture(scope="module")
 def private_sample_xml(test_file_dir):
     return test_file_dir / Path("private_sample.xml")
@@ -50,43 +61,35 @@ def private_sample_xml(test_file_dir):
 @pytest.fixture
 def public_study_data():
     return {
-    "study_accession": "PRJEB41657",
-    "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265])."
+        "study_accession": "PRJEB41657",
+        "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265]).",
     }
 
+
 #   private XML has no PRJ study ID
 @pytest.fixture
 def private_study_data():
     return {
-    "study_accession": "ERP125469",
-    "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265])."
+        "study_accession": "ERP125469",
+        "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265]).",
     }
 
+
 @pytest.fixture
 def public_run_data():
-    return {
-        "run_accession": "ERR4918394",
-        "sample_accession": "SAMEA7687881",
-        "secondary_study_accession": "ERP125469"
-    }
+    return {"run_accession": "ERR4918394", "sample_accession": "SAMEA7687881", "secondary_study_accession": "ERP125469"}
+
 
 #   private json only has ERS sample IDs
 @pytest.fixture
 def private_run_data():
-    return {
-        "run_accession": "ERR4918394",
-        "sample_accession": "ERS5444411",
-        "secondary_study_accession": "ERP125469"
-    }
+    return {"run_accession": "ERR4918394", "sample_accession": "ERS5444411", "secondary_study_accession": "ERP125469"}
+
 
 @pytest.fixture
 def public_sample_data():
-    return {
-        "sample_accession": "SAMEA7687881",
-        "collection_date": "2019-06-11",
-        "country": "Norway",
-        "location": "66.079905 N 12.587848 E"
-  }
+    return {"sample_accession": "SAMEA7687881", "collection_date": "2019-06-11", "country": "Norway", "location": "66.079905 N 12.587848 E"}
+
 
 #  private xml can only query with RS accession and lat and long are split
 @pytest.fixture
@@ -95,6 +98,6 @@ def private_sample_data():
         "sample_accession": "ERS5444411",
         "collection_date": "2019-06-11",
         "country": "Norway",
-        'latitude': '66.079905',
-        'longitude': '12.587848'
-  }
+        "latitude": "66.079905",
+        "longitude": "12.587848",
+    }
diff --git a/tests/unit/test_ena.py b/tests/unit/test_ena.py
index 8917fca..902cf13 100644
--- a/tests/unit/test_ena.py
+++ b/tests/unit/test_ena.py
@@ -1,40 +1,36 @@
+import json
+
 import pytest
 import responses
 from requests.exceptions import ConnectionError, HTTPError
-import json
 
 from genomeuploader.ena import EnaQuery
-#from assembly_uploader.webin_utils import ENA_WEBIN, ENA_WEBIN_PASSWORD
+
+ENA_WEBIN = "ENA_WEBIN"
+ENA_WEBIN_PASSWORD = "ENA_WEBIN_PASSWORD"
 
 
 def read_json(path):
-    with open(path, 'r') as jpath:
+    with open(path, "r") as jpath:
         return json.load(jpath)
 
+
 def read_xml(path):
-    with open(path, 'r') as xpath:
+    with open(path, "r") as xpath:
         return xpath.read()
 
 
 def test_ena_study(public_study_data, private_study_data, public_study_json, private_study_xml):
-    responses.add(
-        responses.POST,
-        "https://www.ebi.ac.uk/ena/portal/api/search",
-        json= read_json(public_study_json)
-    )
+    responses.add(responses.POST, "https://www.ebi.ac.uk/ena/portal/api/search", json=read_json(public_study_json))
 
     responses.add(
         responses.GET,
         "https://www.ebi.ac.uk/ena/submit/report/studies/xml/ERP125469",
-        body= read_xml(private_study_xml),
-        content_type='application/xml'
+        body=read_xml(private_study_xml),
+        content_type="application/xml",
     )
 
-    ena_study_public = EnaQuery(
-        accession="ERP125469",
-        query_type="study",
-        private=False
-    )
+    ena_study_public = EnaQuery(accession="ERP125469", query_type="study", private=False)
 
     ena_study_private = EnaQuery(
         accession="ERP125469",
@@ -46,113 +42,115 @@ def test_ena_study(public_study_data, private_study_data, public_study_json, pri
 
 
 def test_ena_run(public_run_data, private_run_data, public_run_json, private_run_json):
-    responses.add(
-        responses.POST,
-        "https://www.ebi.ac.uk/ena/portal/api/search",
-        json = read_json(public_run_json)
-    )
+    responses.add(responses.POST, "https://www.ebi.ac.uk/ena/portal/api/search", json=read_json(public_run_json))
 
-    responses.add(
-        responses.GET,
-        "https://www.ebi.ac.uk/ena/submit/report/runs/ERR4918394",
-        json = read_json(private_run_json)
-    )
+    responses.add(responses.GET, "https://www.ebi.ac.uk/ena/submit/report/runs/ERR4918394", json=read_json(private_run_json))
 
-    ena_run_public = EnaQuery(
-        accession="ERR4918394",
-        query_type="run",
-        private=False
-    )
+    ena_run_public = EnaQuery(accession="ERR4918394", query_type="run", private=False)
 
-    ena_run_private = EnaQuery(
-        accession="ERR4918394",
-        query_type="run",
-        private=True
-    )
+    ena_run_private = EnaQuery(accession="ERR4918394", query_type="run", private=True)
     assert ena_run_public.build_query() == public_run_data
     assert ena_run_private.build_query() == private_run_data
 
+
 def test_ena_run_from_assembly(public_run_from_assembly_xml, private_run_from_assembly_xml):
     responses.add(
         responses.GET,
         "https://www.ebi.ac.uk/ena/browser/api/xml/ERZ2626953",
-        body = read_xml(public_run_from_assembly_xml),
-        content_type='application/xml'
+        body=read_xml(public_run_from_assembly_xml),
+        content_type="application/xml",
     )
 
     responses.add(
         responses.GET,
         "https://www.ebi.ac.uk/ena/submit/report/analyses/xml/ERZ2626953",
-        body = read_xml(private_run_from_assembly_xml),
-        content_type='application/xml'
+        body=read_xml(private_run_from_assembly_xml),
+        content_type="application/xml",
     )
 
-    ena_run_from_assembly_public = EnaQuery(
-        accession="ERZ2626953",
-        query_type="run_assembly",
-        private=False
-    )
+    ena_run_from_assembly_public = EnaQuery(accession="ERZ2626953", query_type="run_assembly", private=False)
 
-    ena_run_from_assembly_public = EnaQuery(
-        accession="ERZ2626953",
-        query_type="run_assembly",
-        private=True
-    )
+    ena_run_from_assembly_public = EnaQuery(accession="ERZ2626953", query_type="run_assembly", private=True)
 
     assert ena_run_from_assembly_public.build_query() and ena_run_from_assembly_public.build_query() == "ERR4918394"
 
+
 def test_ena_study_runs(public_study_runs_json, private_study_runs_json):
     """response mock limited to 10 runs for the study, need to check if there is page limit"""
-    responses.add(
-        responses.POST,
-        "https://www.ebi.ac.uk/ena/portal/api/search",
-        json = read_json(public_study_runs_json)
-    )
+    responses.add(responses.POST, "https://www.ebi.ac.uk/ena/portal/api/search", json=read_json(public_study_runs_json))
 
-    responses.add(
-        responses.GET,
-        "https://www.ebi.ac.uk/ena/submit/report/runs/ERP125469",
-        json = read_json(private_study_runs_json)
-    )
+    responses.add(responses.GET, "https://www.ebi.ac.uk/ena/submit/report/runs/ERP125469", json=read_json(private_study_runs_json))
 
-    ena_study_runs_public = EnaQuery(
-        accession="ERP125469",
-        query_type="study_runs",
-        private=False
-    )
+    ena_study_runs_public = EnaQuery(accession="ERP125469", query_type="study_runs", private=False)
 
-    ena_study_runs_private = EnaQuery(
-        accession="ERP125469",
-        query_type="study_runs",
-        private=True
-    )
+    ena_study_runs_private = EnaQuery(accession="ERP125469", query_type="study_runs", private=True)
+
+    assert len(ena_study_runs_public.build_query()) and len(ena_study_runs_private.build_query()) == 10
 
-    assert len(ena_study_runs_public.build_query()) and len(ena_study_runs_private.build_query()) == 10 
 
 def test_ena_sample(public_sample_data, private_sample_data, public_sample_json, private_sample_xml):
-    responses.add(
-        responses.POST,
-        "https://www.ebi.ac.uk/ena/portal/api/search",
-        json = read_json(public_sample_json)
+    responses.add(responses.POST, "https://www.ebi.ac.uk/ena/portal/api/search", json=read_json(public_sample_json))
+
+    responses.add(responses.GET, "https://www.ebi.ac.uk/ena/submit/report/samples/xml/ERS5444411", body=read_xml(private_sample_xml))
+
+    ena_sample_public = EnaQuery(accession="SAMEA7687881", query_type="sample", private=False)
+
+    ena_sample_private = EnaQuery(accession="ERS5444411", query_type="sample", private=True)
+
+    assert ena_sample_public.build_query() == public_sample_data
+    assert ena_sample_private.build_query() == private_sample_data
+
+
+def test_ena_exceptions(monkeypatch):
+    monkeypatch.setenv(ENA_WEBIN, "fake-webin-999")
+    monkeypatch.setenv(ENA_WEBIN_PASSWORD, "fakewebinpw")
+
+    ena_query = EnaQuery(
+        accession="ERP125469",
+        query_type="study",
+        private=True,
     )
 
+    public_api_data = {
+        "format": "json",
+        "includeMetagenomes": True,
+        "dataPortal": "ena",
+        "result": "study",
+        "query": "secondary_study_accession=ERP125469",
+        "fields": "study_accession,study_description",
+    }
+
     responses.add(
         responses.GET,
-        "https://www.ebi.ac.uk/ena/submit/report/samples/xml/ERS5444411",
-        body = read_xml(private_sample_xml)
+        "https://www.ebi.ac.uk/ena/submit/report/studies/ERP125469",
+        body=ConnectionError("Test retry private error"),
     )
 
-    ena_sample_public = EnaQuery(
-        accession="SAMEA7687881",
-        query_type="sample",
-        private=False
-    )
+    responses.add(responses.POST, "https://www.ebi.ac.uk/ena/portal/api/search", body=ConnectionError("Test retry public error"))
+
+    with pytest.raises(
+        ValueError,
+        match="Could not find ERP125469 in ENA after 3 attempts. Error: Test retry private error",
+    ):
+        ena_query.retry_or_handle_request_error(
+            request=ena_query.get_request,
+            url="https://www.ebi.ac.uk/ena/submit/report/studies/ERP125469",
+        )
+    assert responses.assert_call_count("https://www.ebi.ac.uk/ena/submit/report/studies/ERP125469", 3)
+
+    with pytest.raises(ValueError, match="Could not find ERP125469 in ENA after 3 attempts. Error: Test retry public error"):
+        ena_query.retry_or_handle_request_error(request=ena_query.post_request, data=public_api_data)
+    assert responses.assert_call_count("https://www.ebi.ac.uk/ena/portal/api/search", 3)
 
-    ena_sample_private = EnaQuery(
-        accession="ERS5444411",
-        query_type="sample",
-        private=True
+    responses.add(
+        responses.GET,
+        "https://www.ebi.ac.uk/ena/submit/report/studies/ERPXYZ",
+        body=HTTPError("Test failure error"),
     )
 
-    assert ena_sample_public.build_query() == public_sample_data
-    assert ena_sample_private.build_query() == private_sample_data
\ No newline at end of file
+    with pytest.raises(HTTPError):
+        ena_query.retry_or_handle_request_error(
+            request=ena_query.get_request,
+            url="https://www.ebi.ac.uk/ena/submit/report/studies/ERPXYZ",
+        )
+    assert responses.assert_call_count("https://www.ebi.ac.uk/ena/submit/report/studies/ERPXYZ", 1)

From 9c7da3d83dee4d2c8c183f4fcd98093881fd1138 Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Fri, 27 Dec 2024 16:29:26 +0000
Subject: [PATCH 15/20] precommit reformatting and removal of dummy test file

---
 .env.example                          |  4 ++--
 .github/workflows/pytest_workflow.yml |  2 +-
 README.md                             | 16 ++++++++--------
 pytest.ini                            |  1 +
 requirements-test.yml                 |  2 +-
 tests/test_dummy.py                   |  5 -----
 6 files changed, 13 insertions(+), 17 deletions(-)
 delete mode 100644 tests/test_dummy.py

diff --git a/.env.example b/.env.example
index 41fba3f..3e69e4e 100644
--- a/.env.example
+++ b/.env.example
@@ -1,2 +1,2 @@
-ENA_WEBIN=""
-ENA_WEBIN_PASSWORD=""
\ No newline at end of file
+ENA_WEBIN="ENA_WEBIN"
+ENA_WEBIN_PASSWORD="ENA_WEBIN_PASSWORD"
diff --git a/.github/workflows/pytest_workflow.yml b/.github/workflows/pytest_workflow.yml
index c0b190f..3c9a0c1 100644
--- a/.github/workflows/pytest_workflow.yml
+++ b/.github/workflows/pytest_workflow.yml
@@ -27,7 +27,7 @@ jobs:
           emoji: false
           verbose: true
           job-summary: true
-          
+
   pytest_39:
     name: Run pytest 3.9
     needs:
diff --git a/README.md b/README.md
index 05a3b8c..05354d6 100755
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # Public bins and MAGs uploader
-Python script to upload bins and MAGs in fasta format to ENA (European Nucleotide Archive). This script generates xmls and manifests necessary for submission with webin-cli. 
+Python script to upload bins and MAGs in fasta format to ENA (European Nucleotide Archive). This script generates xmls and manifests necessary for submission with webin-cli.
 
 It takes as input one tsv (tab-separated values) table in the following format:
 
@@ -26,20 +26,20 @@ With columns indicating:
   * _local_environment_: `string` (explanation following)
   * _environmental_medium_: `string` (explanation following)
 
-According to ENA checklist's guidelines, 'broad_environment' describes the broad ecological context of a sample - desert, taiga, coral reef, ... 'local_environment' is more local - lake, harbour, cliff, ... 'environmental_medium' is either the material displaced by the sample, or the one in which the sample was embedded prior to the sampling event - air, soil, water, ... 
+According to ENA checklist's guidelines, 'broad_environment' describes the broad ecological context of a sample - desert, taiga, coral reef, ... 'local_environment' is more local - lake, harbour, cliff, ... 'environmental_medium' is either the material displaced by the sample, or the one in which the sample was embedded prior to the sampling event - air, soil, water, ...
 For host-associated metagenomic samples, the three variables can be defined similarly to the following example for the chicken gut metagenome: "chicken digestive system", "digestive tube", "caecum". More information can be found at [ERC000050](<https://www.ebi.ac.uk/ena/browser/view/ERC000050>) for bins and [ERC000047](<https://www.ebi.ac.uk/ena/browser/view/ERC000047>) for MAGs under field names "broad-scale environmental context", "local environmental context", "environmental medium"
 
 Another example can be found [here](examples/input_example.tsv)
 
 ### Warnings
 
-Raw-read runs from which genomes were generated should already be available on the INSDC (ENA by EBI, GenBank by NCBI, or DDBJ), hence at least one DRR|ERR|SRR accession should be available for every genome to be uploaded. Assembly accessions (ERZ|SRZ|DRZ) are also supported. 
+Raw-read runs from which genomes were generated should already be available on the INSDC (ENA by EBI, GenBank by NCBI, or DDBJ), hence at least one DRR|ERR|SRR accession should be available for every genome to be uploaded. Assembly accessions (ERZ|SRZ|DRZ) are also supported.
 
-If uploading TPA (Third PArty) genomes, you will need to contact [ENA support](<https://www.ebi.ac.uk/ena/browser/support>) before using the script. They will provide instructions on how to correctly register a TPA project where to submit your genomes. If both TPA and non-TPA genomes need to be uploaded, please divide them in two batches and use the `--tpa` flag only with TPA genomes. 
+If uploading TPA (Third PArty) genomes, you will need to contact [ENA support](<https://www.ebi.ac.uk/ena/browser/support>) before using the script. They will provide instructions on how to correctly register a TPA project where to submit your genomes. If both TPA and non-TPA genomes need to be uploaded, please divide them in two batches and use the `--tpa` flag only with TPA genomes.
 
-Files to be uploaded will need to be compressed (e.g. already in .gz format). 
+Files to be uploaded will need to be compressed (e.g. already in .gz format).
 
-No more than 5000 genomes can be submitted at the same time. 
+No more than 5000 genomes can be submitted at the same time.
 
 ## Register samples and generate pre-upload files
 The script needs `python`, `pandas`, `requests`, and `ena-webin-cli` to run. We provide a yaml file for the generation of a conda environment:
@@ -68,9 +68,9 @@ where
   * `--centre_name CENTRE_NAME`: name of the centre generating and uploading genomes
   * `--tpa`: if uploading TPA (Third PArty) generated genomes
 
-It is recommended to validate your genomes in test mode (i.e. without `--live` in the registration step and with `-test` during the upload) before attempting the final upload. Launching the registration in test mode will add a timestamp to the genome name to allow multiple executions of the test process. 
+It is recommended to validate your genomes in test mode (i.e. without `--live` in the registration step and with `-test` during the upload) before attempting the final upload. Launching the registration in test mode will add a timestamp to the genome name to allow multiple executions of the test process.
 
-Sample xmls won't be regenerated automatically if a previous xml already exists. If any metadata or value in the tsv table changes, `--force` will allow xml regeneration. 
+Sample xmls won't be regenerated automatically if a previous xml already exists. If any metadata or value in the tsv table changes, `--force` will allow xml regeneration.
 
 ### Produced files:
 The script produces the following files and folders:
diff --git a/pytest.ini b/pytest.ini
index 8e78522..654f24a 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,3 +1,4 @@
 [pytest]
 python_files = tests/*.py
 testpaths = tests
+required_plugins = pytest-workflow
diff --git a/requirements-test.yml b/requirements-test.yml
index 333f06c..914434e 100644
--- a/requirements-test.yml
+++ b/requirements-test.yml
@@ -1 +1 @@
--r requirements.yml
\ No newline at end of file
+-r requirements.yml
diff --git a/tests/test_dummy.py b/tests/test_dummy.py
deleted file mode 100644
index 9095527..0000000
--- a/tests/test_dummy.py
+++ /dev/null
@@ -1,5 +0,0 @@
-
-class TestDummy:
-
-    def test_dummy(self):
-        assert 1 == 1
\ No newline at end of file

From d47e0414d8c9f208d126d11d244c44991a6673bc Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Fri, 27 Dec 2024 16:39:47 +0000
Subject: [PATCH 16/20] run test on PR to dev

---
 .github/workflows/pytest_workflow.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/pytest_workflow.yml b/.github/workflows/pytest_workflow.yml
index 3c9a0c1..4effa1e 100644
--- a/.github/workflows/pytest_workflow.yml
+++ b/.github/workflows/pytest_workflow.yml
@@ -4,7 +4,9 @@ on:
   push:
     branches: [main]
   pull_request:
-    branches: [main]
+    branches: 
+      - main
+      - dev
 
 jobs:
   pytest_38:

From 35751c41490c58326f2bbada63f90e7d9dfedee2 Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Fri, 27 Dec 2024 16:42:14 +0000
Subject: [PATCH 17/20] add responses to test requirements

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 82527f3..293445a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,7 @@ test = [
     "pytest-md==0.2.0",
     "pytest-workflow==2.0.1",
     "pytest-cov==3.0.0",
+    "pytest-responses==0.5.1"
 ]
 
 [build-system]

From 4e8d1081c45830eb55a802df3a18d69905257b24 Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Fri, 27 Dec 2024 16:42:20 +0000
Subject: [PATCH 18/20] add responses to test requirements

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 293445a..968ffa2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,7 +52,7 @@ test = [
     "pytest-md==0.2.0",
     "pytest-workflow==2.0.1",
     "pytest-cov==3.0.0",
-    "pytest-responses==0.5.1"
+    "pytest-responses==0.5.1",
 ]
 
 [build-system]

From 677001c198fd492db319d6a6f2a10054b0b30f4c Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Fri, 27 Dec 2024 16:45:30 +0000
Subject: [PATCH 19/20] add responses for private data

---
 .../responses/private_run_from_assembly.xml   |  40 +++++
 tests/fixtures/responses/private_sample.xml   | 159 ++++++++++++++++++
 tests/fixtures/responses/private_study.xml    |  16 ++
 .../responses/public_run_from_assembly.xml    |  25 +++
 tests/fixtures/responses/public_study.json    |   2 +-
 5 files changed, 241 insertions(+), 1 deletion(-)
 create mode 100644 tests/fixtures/responses/private_run_from_assembly.xml
 create mode 100644 tests/fixtures/responses/private_sample.xml
 create mode 100644 tests/fixtures/responses/private_study.xml
 create mode 100644 tests/fixtures/responses/public_run_from_assembly.xml

diff --git a/tests/fixtures/responses/private_run_from_assembly.xml b/tests/fixtures/responses/private_run_from_assembly.xml
new file mode 100644
index 0000000..a387933
--- /dev/null
+++ b/tests/fixtures/responses/private_run_from_assembly.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="UTF-8"?>
+  <ANALYSIS_SET>
+    <ANALYSIS alias="webin-genome-ERR4918394_62952e961c26f0859b4ebdd83d93e4e7" center_name="UNIVERSITY OF COPENHAGEN" accession="ERZ2626953">
+      <IDENTIFIERS>
+        <PRIMARY_ID>ERZ2626953</PRIMARY_ID>
+        <SUBMITTER_ID namespace="UNIVERSITY OF COPENHAGEN">webin-genome-ERR4918394_62952e961c26f0859b4ebdd83d93e4e7</SUBMITTER_ID>
+      </IDENTIFIERS>
+      <TITLE>Genome assembly: ERR4918394_62952e961c26f0859b4ebdd83d93e4e7</TITLE>
+      <STUDY_REF accession="ERP125469">
+        <IDENTIFIERS>
+          <PRIMARY_ID>ERP125469</PRIMARY_ID>
+          <SECONDARY_ID>PRJEB41657</SECONDARY_ID>
+        </IDENTIFIERS>
+      </STUDY_REF>
+      <SAMPLE_REF accession="ERS5444411">
+        <IDENTIFIERS>
+          <PRIMARY_ID>ERS5444411</PRIMARY_ID>
+        </IDENTIFIERS>
+      </SAMPLE_REF>
+      <RUN_REF accession="ERR4918394">
+        <IDENTIFIERS>
+          <PRIMARY_ID>ERR4918394</PRIMARY_ID>
+        </IDENTIFIERS>
+      </RUN_REF>
+      <ANALYSIS_TYPE>
+        <SEQUENCE_ASSEMBLY>
+          <NAME>ERR4918394_62952e961c26f0859b4ebdd83d93e4e7</NAME>
+          <TYPE>primary metagenome</TYPE>
+          <PARTIAL>false</PARTIAL>
+          <COVERAGE>17.0</COVERAGE>
+          <PROGRAM>metaSPAdes v3.14.1</PROGRAM>
+          <PLATFORM>DNBSEQ-G400</PLATFORM>
+          <MOL_TYPE>genomic DNA</MOL_TYPE>
+        </SEQUENCE_ASSEMBLY>
+      </ANALYSIS_TYPE>
+      <FILES>
+        <FILE checksum="62952e961c26f0859b4ebdd83d93e4e7" checksum_method="MD5" filename="ERZ262/ERZ2626953/ERR4918394.fasta.gz" filetype="fasta"/>
+      </FILES>
+    </ANALYSIS>
+  </ANALYSIS_SET>
diff --git a/tests/fixtures/responses/private_sample.xml b/tests/fixtures/responses/private_sample.xml
new file mode 100644
index 0000000..e0a50bd
--- /dev/null
+++ b/tests/fixtures/responses/private_sample.xml
@@ -0,0 +1,159 @@
+<?xml version="1.0" encoding="UTF-8"?>
+  <SAMPLE_SET>
+    <SAMPLE alias="SA01_02C1a_metaG" center_name="UNIVERSITY OF COPENHAGEN" accession="ERS5444411">
+      <IDENTIFIERS>
+        <PRIMARY_ID>ERS5444411</PRIMARY_ID>
+        <SUBMITTER_ID namespace="UNIVERSITY OF COPENHAGEN">SA01_02C1a_metaG</SUBMITTER_ID>
+      </IDENTIFIERS>
+      <TITLE>SA01.02C1a</TITLE>
+      <SAMPLE_NAME>
+        <TAXON_ID>1602388</TAXON_ID>
+        <SCIENTIFIC_NAME>fish gut metagenome</SCIENTIFIC_NAME>
+      </SAMPLE_NAME>
+      <DESCRIPTION>Salmon metagenomic DNA data from Distal gut content; animal SA01.02 from tank SA01 with treatment Tiger</DESCRIPTION>
+      <SAMPLE_ATTRIBUTES>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>adapters</TAG>
+          <VALUE>AAGTCGGAGGCCAAGCGGTCTTAGGAAGACAA;GAACGACATGGCTACGATCCGACTT</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>collection date</TAG>
+          <VALUE>2019-06-11</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>geographic location (country and/or sea)</TAG>
+          <VALUE>Norway</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>geographic location (latitude)</TAG>
+          <VALUE>66.079905</VALUE>
+          <UNITS>DD</UNITS>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>geographic location (longitude)</TAG>
+          <VALUE>12.587848</VALUE>
+          <UNITS>DD</UNITS>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>geographic location (region and locality)</TAG>
+          <VALUE>Dønna; Nordland county</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>host body site</TAG>
+          <VALUE>Distal gut content</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>host common name</TAG>
+          <VALUE>Atlantic salmon</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>host diet</TAG>
+          <VALUE>Fermented algae meal (added in in oil coating)</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>host diet treatment</TAG>
+          <VALUE>Tiger: SA Control</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>host disease status</TAG>
+          <VALUE>Wounded:-</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>host scientific name</TAG>
+          <VALUE>Salmo salar</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>host subject id</TAG>
+          <VALUE>SA01.02</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>host taxid</TAG>
+          <VALUE>8030</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>host total mass</TAG>
+          <VALUE>290.4</VALUE>
+          <UNITS>g</UNITS>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>nucleic acid extraction</TAG>
+          <VALUE>D-rex protocol</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>pcr primers</TAG>
+          <VALUE>N/A</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>project name</TAG>
+          <VALUE>HoloFood Salmon - Metagenomic DNA</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>reference host genome for decontamination</TAG>
+          <VALUE>GCA_000000000.0</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>sample storage buffer</TAG>
+          <VALUE>Shield</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>sample storage container</TAG>
+          <VALUE>2ml E-matrix</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>sample storage location</TAG>
+          <VALUE>UCPH</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>sample storage temperature</TAG>
+          <VALUE>-20</VALUE>
+          <UNITS>°C</UNITS>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>sample volume or weight for DNA extraction</TAG>
+          <VALUE>0.2</VALUE>
+          <UNITS>mL</UNITS>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>sequencing method</TAG>
+          <VALUE>MGISEQ-2000</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>trial timepoint</TAG>
+          <VALUE>0</VALUE>
+          <UNITS>days</UNITS>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>host storage container temperature</TAG>
+          <VALUE>12.76</VALUE>
+          <UNITS>°C</UNITS>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>host storage container pH</TAG>
+          <VALUE>7.05</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>host gutted mass</TAG>
+          <VALUE>246.6</VALUE>
+          <UNITS>g</UNITS>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>host length</TAG>
+          <VALUE>29.5</VALUE>
+          <UNITS>cm</UNITS>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>host diet treatment concentration</TAG>
+          <VALUE>0</VALUE>
+          <UNITS>% mass</UNITS>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>host storage container</TAG>
+          <VALUE>LetSea Tank: Tiger</VALUE>
+        </SAMPLE_ATTRIBUTE>
+        <SAMPLE_ATTRIBUTE>
+          <TAG>ENA-CHECKLIST</TAG>
+          <VALUE>ERC000052</VALUE>
+        </SAMPLE_ATTRIBUTE>
+      </SAMPLE_ATTRIBUTES>
+    </SAMPLE>
+  </SAMPLE_SET>
diff --git a/tests/fixtures/responses/private_study.xml b/tests/fixtures/responses/private_study.xml
new file mode 100644
index 0000000..32ef2c1
--- /dev/null
+++ b/tests/fixtures/responses/private_study.xml
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?>
+  <STUDY_SET>
+    <STUDY accession="ERP125469" alias="ena-STUDY-UNIVERSITY OF COPENHAGEN-02-12-2020-08:09:52:762-111" center_name="UNIVERSITY OF COPENHAGEN">
+      <IDENTIFIERS>
+        <PRIMARY_ID>ERP125469</PRIMARY_ID>
+        <SUBMITTER_ID namespace="UNIVERSITY OF COPENHAGEN">ena-STUDY-UNIVERSITY OF COPENHAGEN-02-12-2020-08:09:52:762-111</SUBMITTER_ID>
+      </IDENTIFIERS>
+      <DESCRIPTOR>
+        <STUDY_TITLE>HoloFood Salmon Trial A+B Gut Metagenome</STUDY_TITLE>
+        <STUDY_ABSTRACT>Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265]).</STUDY_ABSTRACT>
+        <STUDY_DESCRIPTION>Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265]).</STUDY_DESCRIPTION>
+        <CENTER_PROJECT_NAME>HoloFood Salmon Trial A+B Gut Metagenome</CENTER_PROJECT_NAME>
+        <STUDY_TYPE existing_study_type="Other"/>
+      </DESCRIPTOR>
+    </STUDY>
+  </STUDY_SET>
\ No newline at end of file
diff --git a/tests/fixtures/responses/public_run_from_assembly.xml b/tests/fixtures/responses/public_run_from_assembly.xml
new file mode 100644
index 0000000..7d5b620
--- /dev/null
+++ b/tests/fixtures/responses/public_run_from_assembly.xml
@@ -0,0 +1,25 @@
+<ANALYSIS_SET>
+  <ANALYSIS accession="ERZ2626953" alias="webin-genome-ERR4918394_62952e961c26f0859b4ebdd83d93e4e7" center_name="UNIVERSITY OF COPENHAGEN">
+    <IDENTIFIERS>
+      <PRIMARY_ID>ERZ2626953</PRIMARY_ID>
+      <SUBMITTER_ID namespace="UNIVERSITY OF COPENHAGEN">webin-genome-ERR4918394_62952e961c26f0859b4ebdd83d93e4e7</SUBMITTER_ID>
+    </IDENTIFIERS>
+    <TITLE>Genome assembly: ERR4918394_62952e961c26f0859b4ebdd83d93e4e7</TITLE>
+    <STUDY_REF accession="ERP125469">
+      <IDENTIFIERS>
+        <PRIMARY_ID>ERP125469</PRIMARY_ID>
+        <SECONDARY_ID>PRJEB41657</SECONDARY_ID>
+      </IDENTIFIERS>
+    </STUDY_REF>
+    <SAMPLE_REF accession="ERS5444411">
+      <IDENTIFIERS>
+        <PRIMARY_ID>ERS5444411</PRIMARY_ID>
+        <EXTERNAL_ID namespace="BioSample">SAMEA7687881</EXTERNAL_ID>
+      </IDENTIFIERS>
+    </SAMPLE_REF>
+    <RUN_REF accession="ERR4918394">
+      <IDENTIFIERS>
+        <PRIMARY_ID>ERR4918394</PRIMARY_ID>
+      </IDENTIFIERS>
+    </RUN_REF>
+    <ANALYSIS_TYPE>
diff --git a/tests/fixtures/responses/public_study.json b/tests/fixtures/responses/public_study.json
index 482c952..eb360c6 100644
--- a/tests/fixtures/responses/public_study.json
+++ b/tests/fixtures/responses/public_study.json
@@ -3,4 +3,4 @@
       "study_accession": "PRJEB41657",
       "study_description": "Metagenomic raw reads, assemblies, and bins derived from HoloFood salmon gut samples from trial A and B. The samples in this project contributed to the salmon MAG catalogue (project: PRJEB55376 [ERP140265])."
     }
-  ]
\ No newline at end of file
+  ]

From b601790e6feaa1a917496881085b8019097cf489 Mon Sep 17 00:00:00 2001
From: Varsha <vkale@ebi.ac.uk>
Date: Fri, 27 Dec 2024 17:38:33 +0000
Subject: [PATCH 20/20] max private study runs set to 2000

---
 genomeuploader/ena.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/genomeuploader/ena.py b/genomeuploader/ena.py
index f5ae0b9..0f3c5b6 100644
--- a/genomeuploader/ena.py
+++ b/genomeuploader/ena.py
@@ -186,6 +186,7 @@ def get_data_or_handle_error(self, response, xml=False, full_json=False):
                     logging.error(f"{self.accession} private data is not present in the specified Webin account")
                 else:
                     logging.error(f"{self.accession} public data does not exist")
+                return None
             else:
                 if xml:
                     return minidom.parseString(data_txt)
@@ -194,7 +195,7 @@ def get_data_or_handle_error(self, response, xml=False, full_json=False):
                     return response.json()
                 else:
                     #   only return the fist element in the list is the default. In these cases there should only be one entry returned
-                    return json.loads(response.text)[0]
+                    return json.loads(response.text)[0] 
         except (IndexError, TypeError, ValueError, KeyError):
             logging.error(f"Failed to fetch {self.accession}, returned error: {response.text}")
 
@@ -290,9 +291,10 @@ def _get_public_run_from_assembly(self):
         logging.info(f"public run from the assembly {self.accession} returned from ENA")
 
         return run
-
+    
     def _get_private_study_runs(self):
-        url = f"{self.private_url}/runs/{self.accession}"
+        #   Pagination documentation unclear - offest not working. Using hardcoded 2000 default. TO MODIFY
+        url = f"{self.private_url}/runs/{self.accession}?format=json&max-results=2000"
         response = self.retry_or_handle_request_error(self.get_request, url)
         runs = self.get_data_or_handle_error(response, full_json=True)
         reformatted_data = []
@@ -305,6 +307,7 @@ def _get_private_study_runs(self):
             if "instrumentModel" in run_data:
                 run_data["instrument_model"] = run_data.pop("instrumentModel")
             reformatted_data.append(run_data)
+        logging.info(f"found {len(reformatted_data)} runs for study {self.accession}")
         logging.info(f"private runs from study {self.accession}, returned from ENA")
         return reformatted_data