Merge pull request #39 from EBI-Metagenomics/build_gff_fgs

supporting FGS protein predictions in GFF building
EBI-Metagenomics · Jul 12, 2021 · 29518e0 · 29518e0
2 parents a9379fd + e3fba11
commit 29518e0
Showing 1 changed file with 55 additions and 15 deletions.
diff --git a/tools/Assembly/GFF/build_assembly_gff.py b/tools/Assembly/GFF/build_assembly_gff.py
@@ -18,6 +18,7 @@
 import re
 import subprocess
 import sys
+import os
 from urllib import parse
 
 
@@ -41,7 +42,18 @@ def merge(cls, annotations):
 
     @classmethod
     def clean_seq_name(cls, name):
-        return re.sub(r'_\d+$', '', name)
+        # prodigal clean up
+        prodigal_match = re.match('_\d+$', name)
+        if prodigal_match:
+            return re.sub(r'_\d+$', '', name)
+        # fgs clean_up
+        fgs_match = re.match('^(?P<contig>.+?)_\d+_\d+\_.', name)
+        if fgs_match:
+            groups = fgs_match.groupdict()
+            return groups['contig']
+
+        # no fgs or prodigal
+        return name
 
     def _split_line(self, line):
         return line.replace('\n', ' ').replace('\r', '').split('\t')
@@ -108,12 +120,26 @@ def parse_fasta_header_mags(header):
 def parse_fasta_header(header):
     # FIXME: add sanity check
     # FIXME: what should we do with cases like :
-    #        ERZ477576.1085103-NODE-1085103-length-126-cov-0.633803_1_126_+ 
-    splitted = header.replace('>', '').split(' # ')
-    if len(splitted) < 4:
-        return None, None, None, None
-    return splitted[:4]
-
+    #        ERZ477576.1085103-NODE-1085103-length-126-cov-0.633803_1_126_+
+
+    # Prodigal header example: >NODE-3-length-2984-cov-4.247866_3 # 1439 # 1894 # 1 # ID=3_3;partial=00;start_type=TTG;rbs_motif=TAA;rbs_spacer=8bp;gc_cont=0.340  
+    prodigal_match = re.match('^>(?P<contig>.+?)\s#\s(?P<start>\d+)\s#\s(?P<end>\d+)\s#\s(?P<strand>.+?)\s#', header)
+    if prodigal_match:
+        groups = prodigal_match.groupdict()
+        return groups['contig'], groups['start'], groups['end'], groups['strand']
+
+    # FGS header example: >ERZ1759872.3-contig-100_3188_4599_-
+    fgs_match = re.match('^>.+?_(?P<start>\d+)_(?P<end>\d+)_(?P<strand>.)', header)
+    if fgs_match:
+        groups = fgs_match.groupdict()
+        strand = '1'
+        if groups['strand'] == '-':
+            strand = "-1"
+        return header.rstrip().replace(">", ""), groups['start'], groups['end'], strand
+
+    # unable to parse fasta header
+    return None, None, None, None
+
 
 def load_annotation(file, klass, annotations):
     """Load the annotations of a TSV by `query_name` (contig name at the moment)
@@ -175,7 +201,7 @@ def build_gff(annotations, faa):
             clean_name = Annotation.clean_seq_name(contig_name)
 
             row_annotations = Annotation.merge([ann.get() for ann in annotations.get(contig_name, [])])
-
+            
             ann_string = ';'.join(['{}={}'.format(k, ','.join(v).strip()) for k, v in row_annotations.items()])
 
             eggNOGScore = ''.join(row_annotations.get('eggNOG_score', []))
@@ -193,6 +219,13 @@ def build_gff(annotations, faa):
                     'ID=' + clean_name + ';' + ann_string
                 ]
 
+def error_exit(gff_file):
+    if os.path.exists(gff_file):
+        os.remove(gff_file)
+    open("no_antismash", "w").close()
+    sys.exit(0)
+
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
@@ -213,20 +246,27 @@ def build_gff(annotations, faa):
         '-o', dest='out', help='Ouput GFF file name', required=True)
     args = parser.parse_args()
 
-    with open(args.out, 'w', buffering=1) as out_handle:
-
-        print('##gff-version 3', file=out_handle)
+    annotations = {}
+    load_annotation(args.egg, EggResult, annotations)
+    load_annotation(args.interpro, InterProResult, annotations)
 
-        annotations = {}
-        load_annotation(args.egg, EggResult, annotations)
-        load_annotation(args.interpro, InterProResult, annotations)
+    if len(annotations) < 1:
+        print("No annotations loaded, aborting")
+        error_exit(args.out)
 
+    records = 0
+    with open(args.out, 'w', buffering=1) as out_handle:
+        print('##gff-version 3', file=out_handle)
         for row in build_gff(annotations, args.faa):
             print('\t'.join(row), file=out_handle)
+            records += 1
+    if records == 0:
+        print("No annotations in GFF, aborting")
+        error_exit(args.out)
 
     print('Sorting...')
     subprocess.call('(grep ^"#" {0}; grep -v ^"#" {0} | sort -k1,1 -k4,4n)'.format(args.out) +
                     ' | bgzip > {0}.bgz'.format(args.out), shell=True)
     print('Building the index...')
     subprocess.call(['tabix', '-p', 'gff', '{}.bgz'.format(args.out)])
-    print('Bye')
+    print('Bye')