Merge pull request #64 from pachterlab/dev

Dev
pachterlab · Mar 11, 2023 · bdff6b8 · bdff6b8
2 parents 09c374d + f6bec83
commit bdff6b8
Show file tree

Hide file tree

Showing 12 changed files with 427 additions and 52 deletions.
diff --git a/docs/src/info.md b/docs/src/info.md
@@ -13,6 +13,9 @@ Path to the file the results will be saved in, e.g. path/to/directory/results.cs
 Python: `save=True` will save the output in the current working directory.
 
 **Flags**  
+`-pdb` `--pdb`  
+Include PDB IDs in output (might increase runtime).  
+
 `-eo` `--ensembl_only`  
 Only return results from Ensembl (excludes results from UniProt, NCBI and PDB, but runs significantly faster).  
 

diff --git a/docs/src/updates.md b/docs/src/updates.md
@@ -1,4 +1,7 @@
 ## ✨ What's new  
+**Version ≥ 0.27.3:**  
+- [`gget info`](./info.md) excludes PDB IDs by default to increase speed (PDB results can be included using flag `--pdb` / `pdb=True`).
+
 **Version ≥ 0.27.2:**  
 - Updated [`gget alphafold`](./alphafold.md) to [DeepMind's AlphaFold v2.3.0](https://github.com/deepmind/alphafold/releases/tag/v2.3.0) (including new arguments `multimer_for_monomer` and `multimer_recycles`)
 

diff --git a/gget/__init__.py b/gget/__init__.py
@@ -20,6 +20,6 @@
 # Mute numexpr threads info
 logging.getLogger("numexpr").setLevel(logging.WARNING)
 
-__version__ = "0.27.2"
+__version__ = "0.27.3"
 __author__ = "Laura Luebbert"
 __email__ = "[email protected]"
diff --git a/gget/constants.py b/gget/constants.py
@@ -16,7 +16,7 @@
 RCSB_PDB_API = "https://data.rcsb.org/rest/v1/core/"
 
 # API to get PDB entries from Ensembl IDs
-ENS_TO_PDB_API = "https://wwwdev.ebi.ac.uk/pdbe/aggregated-api/mappings/ensembl_to_pdb/"
+ENS_TO_PDB_API = "https://www.ebi.ac.uk/pdbe/aggregated-api/mappings/ensembl_to_pdb/"
 
 # BLAST API endpoints
 BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"

diff --git a/gget/gget_info.py b/gget/gget_info.py
@@ -23,15 +23,16 @@
 from .constants import ENSEMBL_REST_API, UNIPROT_REST_API, NCBI_URL
 
 ## gget info
-def info(ens_ids, wrap_text=False, ensembl_only=False, json=False, verbose=True, save=False, expand=False):
+def info(ens_ids, wrap_text=False, pdb=False, ensembl_only=False, json=False, verbose=True, save=False, expand=False):
     """
     Fetch gene and transcript metadata using Ensembl IDs.
 
     Args:
     - ens_ids       One or more Ensembl IDs to look up (string or list of strings).
                     Also supports WormBase and Flybase IDs.
     - wrap_text     If True, displays data frame with wrapped text for easy reading. Default: False.
-    - ensembl_only  Only return results from Ensembl. Default: False. 
+    - pdb           If True, also returns PDB IDs (might increase run time). Default: False.
+    - ensembl_only  If True, only returns results from Ensembl (excludes PDB, UniProt, and NCBI results). Default: False.
     - json          If True, returns results in json/dictionary format instead of data frame. Default: False.
     - verbose       True/False whether to print progress information. Default True.
     - save          True/False whether to save csv with query results in current working directory. Default: False.
@@ -46,6 +47,9 @@ def info(ens_ids, wrap_text=False, ensembl_only=False, json=False, verbose=True,
             "'expand' argument deprecated! gget info now always returns all of the available information."
         )
 
+    # Rename pdb argument
+    fetch_pdb = pdb
+
     # Define Ensembl REST API server
     server = ENSEMBL_REST_API
     # Define type of returned content from REST
@@ -234,7 +238,7 @@ def info(ens_ids, wrap_text=False, ensembl_only=False, json=False, verbose=True,
             try:
                 synonyms = sorted(synonyms)
             except:
-                None
+                pass
 
             # Save NCBI info to data frame
             df_ncbi = pd.DataFrame(
@@ -245,13 +249,14 @@ def info(ens_ids, wrap_text=False, ensembl_only=False, json=False, verbose=True,
                 },
             )
 
-            ## Get PDB IDs from Ensembl ID
-            # Add pdb_ids to NCBI data frame
-            pdb_ids = get_pdb_ids(ens_id)
-            if pdb_ids:
-                df_ncbi["pdb_id"] = [pdb_ids]
-            else:
-                df_ncbi["pdb_id"] = np.NaN
+            if fetch_pdb:
+                ## Get PDB IDs from Ensembl ID
+                # Add pdb_ids to NCBI data frame
+                pdb_ids = get_pdb_ids(ens_id)
+                if pdb_ids:
+                    df_ncbi["pdb_id"] = [pdb_ids]
+                else:
+                    df_ncbi["pdb_id"] = np.NaN
 
             # Transpose NCBI df and add Ensembl ID as column name
             df_ncbi = df_ncbi.T

diff --git a/gget/gget_search.py b/gget/gget_search.py
@@ -255,7 +255,7 @@ def search(
 
     # Add URL to gene summary on Ensembl
     df["url"] = (
-        "https://uswest.ensembl.org/"
+        "https://useast.ensembl.org/"
         + "_".join(db.split("_")[:2])
         + "/Gene/Summary?g="
         + df["ensembl_id"]

diff --git a/gget/main.py b/gget/main.py
@@ -259,15 +259,23 @@ def main():
         type=str,
         nargs="*",
         default=None,
-        help="One or more Ensembl, WormBase or FlyBase IDs).",
+        help="One or more Ensembl, WormBase or FlyBase IDs.",
+    )
+    parser_info.add_argument(
+        "-pdb",
+        "--pdb",
+        default=False,
+        action="store_true",
+        required=False,
+        help="Also returns PDB IDs (might increase run time).",
     )
     parser_info.add_argument(
         "-eo",
         "--ensembl_only",
         default=False,
         action="store_true",
         required=False,
-        help="Only return results from Ensembl.",
+        help="Only returns results from Ensembl (excludes PDB, UniProt, and NCBI results).",
     )
     parser_info.add_argument(
         "-csv",
@@ -283,7 +291,7 @@ def main():
         default=True,
         action="store_false",
         required=False,
-        help="Do not print progress information.",
+        help="Does not print progress information.",
     )
     parser_info.add_argument(
         "-o",
@@ -1451,6 +1459,7 @@ def main():
         # Look up requested Ensembl IDs
         info_results = info(
             ids_clean_final,
+            pdb=args.pdb,
             ensembl_only=args.ensembl_only,
             expand=args.expand,
             json=args.csv,

diff --git a/gget/utils.py b/gget/utils.py
@@ -490,6 +490,9 @@ def get_pdb_ids(ens_id):
     """
     Function to fetch all PDB IDs linked to an Ensembl ID
     using the PDBe API https://www.ebi.ac.uk/pdbe/aggregated-api/mappings/ensembl_to_pdb/[ens_id]
+    
+    API documentation:
+    https://www.ebi.ac.uk/pdbe/aggregated-api/#/SIFTS/get_ensembl_to_pdb_mappings_api_mappings_ensembl_to_pdb__gene_id__get
     """
 
     res = requests.get(ENS_TO_PDB_API + ens_id)
@@ -508,7 +511,7 @@ def get_pdb_ids(ens_id):
     for entry in pdb_dict:
         pdb_ids.append(entry["pdb_id"])
 
-    return list(set(pdb_ids))
+    return sorted(list(set(pdb_ids)))
 
 
 def wrap_cols_func(df, cols):

diff --git a/requirements.txt b/requirements.txt
@@ -8,3 +8,4 @@ beautifulsoup4>=4.10.0
 ipywidgets
 tqdm
 py3Dmol>=1.8.0
+lxml
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,3 +8,4 @@ beautifulsoup4>=4.10.0 @@
     ipywidgets
     tqdm
     py3Dmol>=1.8.0
+    lxml