Skip to content

Commit 2516af1

Browse files
authored
Crawlers (#165)
* fix: correct pubchem crawling from smiles with multiple entries * pin: numpy * chore: remove Zinc API * ci: expand tests to python 3.7-3.11 * doc: Update README * ci: Span matrix * ci: remove windows tests * ci: remove trailing zero * chore: add support for torch 2 and python 3.11 * ci: test 3.7-3.12 * ci: enable 3.12
1 parent 78ad7c9 commit 2516af1

File tree

8 files changed

+39
-134
lines changed

8 files changed

+39
-134
lines changed

.github/workflows/build.yml

+6-7
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ on:
77
- 'gh_pages'
88
jobs:
99
conda-tests:
10-
name: Test with conda (${{ matrix.os }})
10+
name: Test with conda (${{ matrix.os }}, Python ${{ matrix.python-version }})
1111
runs-on: ${{ matrix.os }}
1212
continue-on-error: ${{ matrix.experimental }}
1313
strategy:
@@ -17,9 +17,7 @@ jobs:
1717
- os: ubuntu-latest
1818
pip_cache_path: ~/.cache/pip
1919
experimental: false
20-
- os: windows-latest
21-
pip_cache_path: ~\AppData\Local\pip\Cache
22-
experimental: true
20+
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
2321
defaults:
2422
run:
2523
shell: bash -l {0} # For conda
@@ -35,19 +33,20 @@ jobs:
3533
uses: actions/cache@v2
3634
with:
3735
path: ~/conda_pkgs_dir # from: conda-incubator/setup-miniconda@v2
38-
key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{
36+
key: ${{ runner.os }}-conda-${{ matrix.python-version }}-${{ env.CACHE_NUMBER }}-${{
3937
hashFiles('conda.yml') }}
4038

4139
- name: Cache pip
4240
uses: actions/cache@v2
4341
with:
4442
path: ${{ matrix.pip_cache_path }}
45-
key: ${{ runner.os }}-pip--${{ env.CACHE_NUMBER }}-${{
43+
key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ env.CACHE_NUMBER }}-${{
4644
hashFiles('requirements.txt') }}
4745

4846
- name: Conda environment setup
4947
uses: conda-incubator/setup-miniconda@v2
5048
with:
49+
python-version: ${{ matrix.python-version }}
5150
activate-environment: pytoda
5251
environment-file: conda.yml
5352
auto-activate-base: false
@@ -72,7 +71,7 @@ jobs:
7271
if: always()
7372
with:
7473
status: ${{ job.status }}
75-
text: "CI Build ${{ matrix.os }}"
74+
text: "CI Build ${{ matrix.os }} Python ${{ matrix.python-version }}"
7675
author_name: ${{ github.actor }}
7776
env:
7877
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

README.md

+9-36
Original file line numberDiff line numberDiff line change
@@ -57,41 +57,14 @@ For some examples on how to use `pytoda` see [here](./examples)
5757
If you use `pytoda` in your projects, please cite the following:
5858

5959
```bib
60-
@article{born2021datadriven,
61-
author = {
62-
Born, Jannis and Manica, Matteo and Cadow, Joris and Markert, Greta and
63-
Mill,Nil Adell and Filipavicius, Modestas and Janakarajan, Nikita and
64-
Cardinale, Antonio and Laino, Teodoro and
65-
{Rodr{\'{i}}guez Mart{\'{i}}nez}, Mar{\'{i}}a
66-
},
67-
doi = {10.1088/2632-2153/abe808},
68-
issn = {2632-2153},
69-
journal = {Machine Learning: Science and Technology},
70-
number = {2},
71-
pages = {025024},
72-
title = {{
73-
Data-driven molecular design for discovery and synthesis of novel ligands:
74-
a case study on SARS-CoV-2
75-
}},
76-
url = {https://iopscience.iop.org/article/10.1088/2632-2153/abe808},
77-
volume = {2},
78-
year = {2021}
79-
}
80-
@article{born2021paccmannrl,
81-
title = {
82-
PaccMann$^{RL}$: De novo generation of hit-like anticancer molecules from
83-
transcriptomic data via reinforcement learning
84-
},
85-
journal = {iScience},
86-
volume = {24},
87-
number = {4},
88-
year = {2021},
89-
issn = {2589-0042},
90-
doi = {https://doi.org/10.1016/j.isci.2021.102269},
91-
url = {https://www.cell.com/iscience/fulltext/S2589-0042(21)00237-6},
92-
author = {
93-
Jannis Born and Matteo Manica and Ali Oskooei and Joris Cadow and Greta Markert
94-
and Mar{\'\i}a Rodr{\'\i}guez Mart{\'\i}nez}
95-
}
60+
@article{born2021data,
61+
title={Data-driven molecular design for discovery and synthesis of novel ligands: a case study on SARS-CoV-2},
62+
author={Born, Jannis and Manica, Matteo and Cadow, Joris and Markert, Greta and Mill, Nil Adell and Filipavicius, Modestas and Janakarajan, Nikita and Cardinale, Antonio and Laino, Teodoro and {Rodr{\'{i}}guez Mart{\'{i}}nez}, Mar{\'{i}}a},
63+
journal={Machine Learning: Science and Technology},
64+
volume={2},
65+
number={2},
66+
pages={025024},
67+
year={2021},
68+
publisher={IOP Publishing}
9669
}
9770
```

conda.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name: pytoda
22
dependencies:
3-
- python>=3.8
4-
- pip>=19.1,<20.3
3+
- python>=3.7,<3.13
4+
- pip
55
- pip:
6-
- -r file:requirements.txt
6+
- -r requirements.txt
77

pytoda/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
name = 'pytoda'
2-
__version__ = '1.1.5'
2+
__version__ = '1.1.6'

pytoda/preprocessing/crawlers.py

+2-60
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import logging
2-
import urllib
32
import urllib.request as urllib_request
43
from itertools import filterfalse
54
from typing import Iterable, List, Tuple, Union
@@ -13,66 +12,11 @@
1312

1413
logger = logging.getLogger(__name__)
1514

16-
ZINC_DRUG_SEARCH_ROOT = 'http://zinc.docking.org/substances/search/?q='
17-
ZINC_ID_SEARCH_ROOT = 'http://zinc.docking.org/substances/'
18-
1915
PUBCHEM_START = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound'
2016
PUBCHEM_MID = 'property'
2117
PUBCHEM_END = 'TXT'
2218

2319

24-
def get_smiles_from_zinc(drug: Union[str, int]) -> str:
25-
"""
26-
Uses the ZINC databases to retrieve the SMILES of a ZINC ID (int) or a drug
27-
name (str).
28-
29-
Args:
30-
drug (Union[str, int]): a string with a drug name or an int of a ZINC
31-
ID.
32-
Returns:
33-
smiles (str): The SMILES string of the drug name or ZINC ID.
34-
"""
35-
36-
if type(drug) != str and type(drug) != int:
37-
raise TypeError(
38-
f'Please insert drug of type {{str, int}}, given was {type(drug)}'
39-
f'({drug}).'
40-
)
41-
42-
if type(drug) == str:
43-
44-
# Parse name, then retrieve ZINC ID from it
45-
stripped_drug = unidecode(drug).strip().replace(' ', '%20')
46-
zinc_ids = []
47-
try:
48-
drug_url = urllib_request.pathname2url(stripped_drug)
49-
path = '{}{}'.format(ZINC_DRUG_SEARCH_ROOT, drug_url)
50-
response = urllib.request.urlopen(path)
51-
52-
for line in response:
53-
line = line.decode(encoding='UTF-8').strip()
54-
if 'href="/substances/ZINC' in line:
55-
zinc_ids.append(line.split('/')[-2])
56-
zinc_id = zinc_ids[0]
57-
58-
except HTTPError:
59-
logger.warning(f'Did not find any result for drug: {drug}')
60-
return ''
61-
62-
elif type(drug) == int:
63-
zinc_id = str(drug)
64-
65-
zinc_id_url = ZINC_ID_SEARCH_ROOT + zinc_id
66-
id_response = urllib_request.urlopen(zinc_id_url)
67-
68-
for id_line in id_response:
69-
id_line = id_line.decode(encoding='UTF-8').strip()
70-
if 'id="substance-smiles-field" readonly value=' in id_line:
71-
smiles = id_line.split('"')[-2]
72-
73-
return smiles
74-
75-
7620
def get_smiles_from_pubchem(
7721
drug: Union[str, int],
7822
query_type: str = 'name',
@@ -122,15 +66,13 @@ def get_smiles_from_pubchem(
12266
if isinstance(drug, str):
12367
drug = unidecode(drug).strip().replace(' ', '%20')
12468

125-
# Search ZINC for compound name
69+
# Search in PubChem for compound name
12670
for option in options:
12771
try:
12872
path = '{}/{}/{}/{}/{}/{}'.format(
12973
PUBCHEM_START, query_type, drug, PUBCHEM_MID, option, PUBCHEM_END
13074
)
131-
smiles = (
132-
urllib_request.urlopen(path).read().decode('UTF-8').replace('\n', '')
133-
)
75+
smiles = urllib_request.urlopen(path).read().decode('UTF-8').split()[0]
13476
if not kekulize:
13577
smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles, sanitize=sanitize))
13678
return smiles

pytoda/preprocessing/tests/test_crawlers.py

+11-26
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
"""Testing Crawlers."""
2+
23
import unittest
34

4-
from pytoda.preprocessing.crawlers import ( # query_pubchem,; remove_pubchem_smiles,
5+
from pytoda.preprocessing.crawlers import (
56
get_smiles_from_pubchem,
6-
get_smiles_from_zinc,
77
query_pubchem,
88
remove_pubchem_smiles,
99
)
@@ -12,26 +12,8 @@
1212
class TestCrawlers(unittest.TestCase):
1313
"""Testing Crawlsers."""
1414

15-
def test_get_smiles_from_zinc(self) -> None:
16-
"""Test get_smiles_from_zinc"""
17-
18-
# # ZINC is down since quite some time, hence we skip these tests
19-
return True
20-
21-
# Test text mode
22-
drug = 'Aspirin'
23-
ground_truth = 'CC(=O)Oc1ccccc1C(=O)O'
24-
smiles = get_smiles_from_zinc(drug)
25-
self.assertEqual(smiles, ground_truth)
26-
27-
# Test ZINC ID mode
28-
zinc_id = 53
29-
ground_truth = 'CC(=O)Oc1ccccc1C(=O)O'
30-
smiles = get_smiles_from_zinc(zinc_id)
31-
self.assertEqual(smiles, ground_truth)
32-
3315
def test_get_smiles_from_pubchem(self) -> None:
34-
"""Test get_smiles_from_zinc"""
16+
"""Test get_smiles_from_pubchem"""
3517

3618
for sanitize in [True, False]:
3719

@@ -83,10 +65,16 @@ def test_get_smiles_from_pubchem(self) -> None:
8365
)
8466
self.assertEqual(smiles, ground_truth)
8567

68+
# Test molecule where landing page has several entries
69+
gt_smiles = (
70+
'CC12C(C(CC(O1)N3C4=CC=CC=C4C5=C6C(=C7C8=CC=CC=C8N2C7=C53)CNC6=O)NC)OC'
71+
)
72+
drug = 'Staurosporine'
73+
smiles = get_smiles_from_pubchem(drug, use_isomeric=False, kekulize=True)
74+
self.assertEqual(smiles, gt_smiles)
75+
8676
def test_query_pubchem(self) -> None:
8777
"""Test query_pubchem"""
88-
# pass
89-
# Disabled due to bug in pubchem api
9078
smiles_list = [
9179
'O1C=CC=NC(=O)C1=O',
9280
'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1',
@@ -98,9 +86,6 @@ def test_query_pubchem(self) -> None:
9886

9987
def test_remove_pubchem_smiles(self) -> None:
10088
"""Test remove_pubchem_smiles"""
101-
# pass
102-
103-
# Disabled due to bug in pubchem api
10489
smiles_list = [
10590
'O1C=CC=NC(=O)C1=O',
10691
'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1',

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
numpy>=1.19.0
22
scikit-learn>=0.23.0
33
pandas>=1.0.0
4-
torch>=1.4.0,<1.9
4+
torch>=1.9
55
diskcache>=5.0.3
66
dill>=0.3.3
77
selfies>=2.1.1

setup.py

+6
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Package installer."""
2+
23
import codecs
34
import os
45

@@ -59,6 +60,11 @@ def get_version(rel_path):
5960
'License :: OSI Approved :: MIT License',
6061
'Programming Language :: Python :: 3',
6162
'Programming Language :: Python :: 3.7',
63+
'Programming Language :: Python :: 3.8',
64+
'Programming Language :: Python :: 3.9',
65+
'Programming Language :: Python :: 3.10',
66+
'Programming Language :: Python :: 3.11',
67+
'Programming Language :: Python :: 3.12',
6268
'Topic :: Software Development :: Libraries :: Python Modules',
6369
],
6470
packages=find_packages(),

0 commit comments

Comments
 (0)