Skip to content

Commit

Permalink
Merge pull request #64 from pachterlab/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
lauraluebbert authored Mar 11, 2023
2 parents 09c374d + f6bec83 commit bdff6b8
Show file tree
Hide file tree
Showing 12 changed files with 427 additions and 52 deletions.
3 changes: 3 additions & 0 deletions docs/src/info.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ Path to the file the results will be saved in, e.g. path/to/directory/results.cs
Python: `save=True` will save the output in the current working directory.

**Flags**
`-pdb` `--pdb`
Include PDB IDs in output (might increase runtime).

`-eo` `--ensembl_only`
Only return results from Ensembl (excludes results from UniProt, NCBI and PDB, but runs significantly faster).

Expand Down
3 changes: 3 additions & 0 deletions docs/src/updates.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
## ✨ What's new
**Version ≥ 0.27.3:**
- [`gget info`](./info.md) excludes PDB IDs by default to increase speed (PDB results can be included using flag `--pdb` / `pdb=True`).

**Version ≥ 0.27.2:**
- Updated [`gget alphafold`](./alphafold.md) to [DeepMind's AlphaFold v2.3.0](https://github.com/deepmind/alphafold/releases/tag/v2.3.0) (including new arguments `multimer_for_monomer` and `multimer_recycles`)

Expand Down
2 changes: 1 addition & 1 deletion gget/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,6 @@
# Mute numexpr threads info
logging.getLogger("numexpr").setLevel(logging.WARNING)

__version__ = "0.27.2"
__version__ = "0.27.3"
__author__ = "Laura Luebbert"
__email__ = "[email protected]"
2 changes: 1 addition & 1 deletion gget/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
RCSB_PDB_API = "https://data.rcsb.org/rest/v1/core/"

# API to get PDB entries from Ensembl IDs
ENS_TO_PDB_API = "https://wwwdev.ebi.ac.uk/pdbe/aggregated-api/mappings/ensembl_to_pdb/"
ENS_TO_PDB_API = "https://www.ebi.ac.uk/pdbe/aggregated-api/mappings/ensembl_to_pdb/"

# BLAST API endpoints
BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"
Expand Down
25 changes: 15 additions & 10 deletions gget/gget_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,16 @@
from .constants import ENSEMBL_REST_API, UNIPROT_REST_API, NCBI_URL

## gget info
def info(ens_ids, wrap_text=False, ensembl_only=False, json=False, verbose=True, save=False, expand=False):
def info(ens_ids, wrap_text=False, pdb=False, ensembl_only=False, json=False, verbose=True, save=False, expand=False):
"""
Fetch gene and transcript metadata using Ensembl IDs.
Args:
- ens_ids One or more Ensembl IDs to look up (string or list of strings).
Also supports WormBase and FlyBase IDs.
- wrap_text If True, displays data frame with wrapped text for easy reading. Default: False.
- ensembl_only Only return results from Ensembl. Default: False.
- pdb If True, also returns PDB IDs (might increase run time). Default: False.
- ensembl_only If True, only returns results from Ensembl (excludes PDB, UniProt, and NCBI results). Default: False.
- json If True, returns results in json/dictionary format instead of data frame. Default: False.
- verbose True/False whether to print progress information. Default: True.
- save True/False whether to save csv with query results in current working directory. Default: False.
Expand All @@ -46,6 +47,9 @@ def info(ens_ids, wrap_text=False, ensembl_only=False, json=False, verbose=True,
"'expand' argument deprecated! gget info now always returns all of the available information."
)

# Rename pdb argument
fetch_pdb = pdb

# Define Ensembl REST API server
server = ENSEMBL_REST_API
# Define type of returned content from REST
Expand Down Expand Up @@ -234,7 +238,7 @@ def info(ens_ids, wrap_text=False, ensembl_only=False, json=False, verbose=True,
try:
synonyms = sorted(synonyms)
except:
None
pass

# Save NCBI info to data frame
df_ncbi = pd.DataFrame(
Expand All @@ -245,13 +249,14 @@ def info(ens_ids, wrap_text=False, ensembl_only=False, json=False, verbose=True,
},
)

## Get PDB IDs from Ensembl ID
# Add pdb_ids to NCBI data frame
pdb_ids = get_pdb_ids(ens_id)
if pdb_ids:
df_ncbi["pdb_id"] = [pdb_ids]
else:
df_ncbi["pdb_id"] = np.NaN
if fetch_pdb:
## Get PDB IDs from Ensembl ID
# Add pdb_ids to NCBI data frame
pdb_ids = get_pdb_ids(ens_id)
if pdb_ids:
df_ncbi["pdb_id"] = [pdb_ids]
else:
df_ncbi["pdb_id"] = np.NaN

# Transpose NCBI df and add Ensembl ID as column name
df_ncbi = df_ncbi.T
Expand Down
2 changes: 1 addition & 1 deletion gget/gget_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ def search(

# Add URL to gene summary on Ensembl
df["url"] = (
"https://uswest.ensembl.org/"
"https://useast.ensembl.org/"
+ "_".join(db.split("_")[:2])
+ "/Gene/Summary?g="
+ df["ensembl_id"]
Expand Down
15 changes: 12 additions & 3 deletions gget/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,15 +259,23 @@ def main():
type=str,
nargs="*",
default=None,
help="One or more Ensembl, WormBase or FlyBase IDs).",
help="One or more Ensembl, WormBase or FlyBase IDs.",
)
parser_info.add_argument(
"-pdb",
"--pdb",
default=False,
action="store_true",
required=False,
help="Also returns PDB IDs (might increase run time).",
)
parser_info.add_argument(
"-eo",
"--ensembl_only",
default=False,
action="store_true",
required=False,
help="Only return results from Ensembl.",
help="Only returns results from Ensembl (excludes PDB, UniProt, and NCBI results).",
)
parser_info.add_argument(
"-csv",
Expand All @@ -283,7 +291,7 @@ def main():
default=True,
action="store_false",
required=False,
help="Do not print progress information.",
help="Does not print progress information.",
)
parser_info.add_argument(
"-o",
Expand Down Expand Up @@ -1451,6 +1459,7 @@ def main():
# Look up requested Ensembl IDs
info_results = info(
ids_clean_final,
pdb=args.pdb,
ensembl_only=args.ensembl_only,
expand=args.expand,
json=args.csv,
Expand Down
5 changes: 4 additions & 1 deletion gget/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,9 @@ def get_pdb_ids(ens_id):
"""
Function to fetch all PDB IDs linked to an Ensembl ID
using the PDBe API https://www.ebi.ac.uk/pdbe/aggregated-api/mappings/ensembl_to_pdb/[ens_id]
API documentation:
https://www.ebi.ac.uk/pdbe/aggregated-api/#/SIFTS/get_ensembl_to_pdb_mappings_api_mappings_ensembl_to_pdb__gene_id__get
"""

res = requests.get(ENS_TO_PDB_API + ens_id)
Expand All @@ -508,7 +511,7 @@ def get_pdb_ids(ens_id):
for entry in pdb_dict:
pdb_ids.append(entry["pdb_id"])

return list(set(pdb_ids))
return sorted(list(set(pdb_ids)))


def wrap_cols_func(df, cols):
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ beautifulsoup4>=4.10.0
ipywidgets
tqdm
py3Dmol>=1.8.0
lxml
Loading

0 comments on commit bdff6b8

Please sign in to comment.