Crawlers (#165)

jannisborn · web-flow · commit 2516af1b9c24 · 2024-10-04T08:34:06.000+02:00
* fix: correct pubchem crawling from smiles with multiple entries

* pin: numpy

* chore: remove Zinc API

* ci: expand tests to python 3.7-3.11

* doc: Update README

* ci: Span matrix

* ci: remove windows tests

* ci: remove trailing zero

* chore: add support for torch 2 and python 3.11

* ci: test 3.7-3.12

* ci: enable 3.12
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -7,7 +7,7 @@ on:
       - 'gh_pages'
 jobs:
   conda-tests:
-    name: Test with conda (${{ matrix.os }})
+    name: Test with conda (${{ matrix.os }}) Python ${{ matrix.python-version }}
     runs-on: ${{ matrix.os }}
     continue-on-error: ${{ matrix.experimental }}
     strategy:
@@ -17,9 +17,7 @@ jobs:
           - os: ubuntu-latest
             pip_cache_path: ~/.cache/pip
             experimental: false
-          - os: windows-latest
-            pip_cache_path: ~\AppData\Local\pip\Cache
-            experimental: true
+        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
     defaults:
       run:
         shell: bash -l {0}  # For conda
@@ -35,19 +33,20 @@ jobs:
         uses: actions/cache@v2
         with:
           path: ~/conda_pkgs_dir  # from: conda-incubator/setup-miniconda@v2
-          key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{
+          key: ${{ runner.os }}-conda-${{ matrix.python-version }}-${{ env.CACHE_NUMBER }}-${{
             hashFiles('conda.yml') }}
 
       - name: Cache pip
         uses: actions/cache@v2
         with:
           path: ${{ matrix.pip_cache_path }}
-          key: ${{ runner.os }}-pip--${{ env.CACHE_NUMBER }}-${{
+          key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ env.CACHE_NUMBER }}-${{
             hashFiles('requirements.txt') }}
 
       - name: Conda environment setup
         uses: conda-incubator/setup-miniconda@v2
         with:
+          python-version: ${{ matrix.python-version }}
           activate-environment: pytoda
           environment-file: conda.yml
           auto-activate-base: false
@@ -72,7 +71,7 @@ jobs:
         if: always()
         with:
           status: ${{ job.status }}
-          text: "CI Build ${{ matrix.os }}"
+          text: "CI Build ${{ matrix.os }} Python ${{ matrix.python-version }}"
           author_name: ${{ github.actor }}
         env:
           SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
diff --git a/README.md b/README.md
@@ -57,41 +57,14 @@ For some examples on how to use `pytoda` see [here](./examples)
 If you use `pytoda` in your projects, please cite the following:
 
 ```bib
-@article{born2021datadriven,
-  author = {
-    Born, Jannis and Manica, Matteo and Cadow, Joris and Markert, Greta and
-    Mill,Nil Adell and Filipavicius, Modestas and Janakarajan, Nikita and
-    Cardinale, Antonio and Laino, Teodoro and 
-    {Rodr{\'{i}}guez Mart{\'{i}}nez}, Mar{\'{i}}a
-  },
-  doi = {10.1088/2632-2153/abe808},
-  issn = {2632-2153},
-  journal = {Machine Learning: Science and Technology},
-  number = {2},
-  pages = {025024},
-  title = {{
-    Data-driven molecular design for discovery and synthesis of novel ligands: 
-    a case study on SARS-CoV-2
-  }},
-  url = {https://iopscience.iop.org/article/10.1088/2632-2153/abe808},
-  volume = {2},
-  year = {2021}
-}
-@article{born2021paccmannrl,
-    title = {
-      PaccMann$^{RL}$: De novo generation of hit-like anticancer molecules from
-      transcriptomic data via reinforcement learning
-    },
-    journal = {iScience},
-    volume = {24},
-    number = {4},
-    year = {2021},
-    issn = {2589-0042},
-    doi = {https://doi.org/10.1016/j.isci.2021.102269},
-    url = {https://www.cell.com/iscience/fulltext/S2589-0042(21)00237-6},
-    author = {
-      Jannis Born and Matteo Manica and Ali Oskooei and Joris Cadow and Greta Markert
-      and Mar{\'\i}a Rodr{\'\i}guez Mart{\'\i}nez}
-    }
+@article{born2021data,
+  title={Data-driven molecular design for discovery and synthesis of novel ligands: a case study on SARS-CoV-2},
+  author={Born, Jannis and Manica, Matteo and Cadow, Joris and Markert, Greta and Mill, Nil Adell and Filipavicius, Modestas and Janakarajan, Nikita and Cardinale, Antonio and Laino, Teodoro and Martinez, Maria Rodriguez},
+  journal={Machine Learning: Science and Technology},
+  volume={2},
+  number={2},
+  pages={025024},
+  year={2021},
+  publisher={IOP Publishing}
 }
 ```
diff --git a/conda.yml b/conda.yml
@@ -1,7 +1,7 @@
 name: pytoda
 dependencies:
-  - python>=3.8
-  - pip>=19.1,<20.3
+  - python>=3.7,<3.13
+  - pip
   - pip:
-    - -r file:requirements.txt
+    - -r requirements.txt
     
diff --git a/pytoda/__init__.py b/pytoda/__init__.py
@@ -1,2 +1,2 @@
 name = 'pytoda'
-__version__ = '1.1.5'
+__version__ = '1.1.6'
diff --git a/pytoda/preprocessing/crawlers.py b/pytoda/preprocessing/crawlers.py
@@ -1,5 +1,4 @@
 import logging
-import urllib
 import urllib.request as urllib_request
 from itertools import filterfalse
 from typing import Iterable, List, Tuple, Union
@@ -13,66 +12,11 @@
 
 logger = logging.getLogger(__name__)
 
-ZINC_DRUG_SEARCH_ROOT = 'http://zinc.docking.org/substances/search/?q='
-ZINC_ID_SEARCH_ROOT = 'http://zinc.docking.org/substances/'
-
 PUBCHEM_START = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound'
 PUBCHEM_MID = 'property'
 PUBCHEM_END = 'TXT'
 
 
-def get_smiles_from_zinc(drug: Union[str, int]) -> str:
-    """
-    Uses the ZINC databases to retrieve the SMILES of a ZINC ID (int) or a drug
-    name (str).
-
-    Args:
-        drug (Union[str, int]): a string with a drug name or an int of a ZINC
-            ID.
-    Returns:
-        smiles (str): The SMILES string of the drug name or ZINC ID.
-    """
-
-    if type(drug) != str and type(drug) != int:
-        raise TypeError(
-            f'Please insert drug of type {{str, int}}, given was {type(drug)}'
-            f'({drug}).'
-        )
-
-    if type(drug) == str:
-
-        # Parse name, then retrieve ZINC ID from it
-        stripped_drug = unidecode(drug).strip().replace(' ', '%20')
-        zinc_ids = []
-        try:
-            drug_url = urllib_request.pathname2url(stripped_drug)
-            path = '{}{}'.format(ZINC_DRUG_SEARCH_ROOT, drug_url)
-            response = urllib.request.urlopen(path)
-
-            for line in response:
-                line = line.decode(encoding='UTF-8').strip()
-                if 'href="/substances/ZINC' in line:
-                    zinc_ids.append(line.split('/')[-2])
-            zinc_id = zinc_ids[0]
-
-        except HTTPError:
-            logger.warning(f'Did not find any result for drug: {drug}')
-            return ''
-
-    elif type(drug) == int:
-        zinc_id = str(drug)
-
-    zinc_id_url = ZINC_ID_SEARCH_ROOT + zinc_id
-    id_response = urllib_request.urlopen(zinc_id_url)
-
-    for id_line in id_response:
-        id_line = id_line.decode(encoding='UTF-8').strip()
-        if 'id="substance-smiles-field" readonly value=' in id_line:
-            smiles = id_line.split('"')[-2]
-
-    return smiles
-
-
 def get_smiles_from_pubchem(
     drug: Union[str, int],
     query_type: str = 'name',
@@ -122,15 +66,13 @@ def get_smiles_from_pubchem(
     if isinstance(drug, str):
         drug = unidecode(drug).strip().replace(' ', '%20')
 
-    # Search ZINC for compound name
+    # Search in PubChem for compound name
     for option in options:
         try:
             path = '{}/{}/{}/{}/{}/{}'.format(
                 PUBCHEM_START, query_type, drug, PUBCHEM_MID, option, PUBCHEM_END
             )
-            smiles = (
-                urllib_request.urlopen(path).read().decode('UTF-8').replace('\n', '')
-            )
+            smiles = urllib_request.urlopen(path).read().decode('UTF-8').split()[0]
             if not kekulize:
                 smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles, sanitize=sanitize))
             return smiles
diff --git a/pytoda/preprocessing/tests/test_crawlers.py b/pytoda/preprocessing/tests/test_crawlers.py
@@ -1,9 +1,9 @@
 """Testing Crawlers."""
+
 import unittest
 
-from pytoda.preprocessing.crawlers import (  # query_pubchem,; remove_pubchem_smiles,
+from pytoda.preprocessing.crawlers import (
     get_smiles_from_pubchem,
-    get_smiles_from_zinc,
     query_pubchem,
     remove_pubchem_smiles,
 )
@@ -12,26 +12,8 @@
 class TestCrawlers(unittest.TestCase):
     """Testing Crawlers."""
 
-    def test_get_smiles_from_zinc(self) -> None:
-        """Test get_smiles_from_zinc"""
-
-        # # ZINC is down since quite some time, hence we skip these tests
-        return True
-
-        # Test text mode
-        drug = 'Aspirin'
-        ground_truth = 'CC(=O)Oc1ccccc1C(=O)O'
-        smiles = get_smiles_from_zinc(drug)
-        self.assertEqual(smiles, ground_truth)
-
-        # Test ZINC ID mode
-        zinc_id = 53
-        ground_truth = 'CC(=O)Oc1ccccc1C(=O)O'
-        smiles = get_smiles_from_zinc(zinc_id)
-        self.assertEqual(smiles, ground_truth)
-
     def test_get_smiles_from_pubchem(self) -> None:
-        """Test get_smiles_from_zinc"""
+        """Test get_smiles_from_pubchem"""
 
         for sanitize in [True, False]:
 
@@ -83,10 +65,16 @@ def test_get_smiles_from_pubchem(self) -> None:
                 )
                 self.assertEqual(smiles, ground_truth)
 
+        # Test molecule where landing page has several entries
+        gt_smiles = (
+            'CC12C(C(CC(O1)N3C4=CC=CC=C4C5=C6C(=C7C8=CC=CC=C8N2C7=C53)CNC6=O)NC)OC'
+        )
+        drug = 'Staurosporine'
+        smiles = get_smiles_from_pubchem(drug, use_isomeric=False, kekulize=True)
+        self.assertEqual(smiles, gt_smiles)
+
     def test_query_pubchem(self) -> None:
         """Test query_pubchem"""
-        # pass
-        # Disabled due to bug in pubchem api
         smiles_list = [
             'O1C=CC=NC(=O)C1=O',
             'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1',
@@ -98,9 +86,6 @@ def test_query_pubchem(self) -> None:
 
     def test_remove_pubchem_smiles(self) -> None:
         """Test remove_pubchem_smiles"""
-        # pass
-
-        # Disabled due to bug in pubchem api
         smiles_list = [
             'O1C=CC=NC(=O)C1=O',
             'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1',
diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
 numpy>=1.19.0
 scikit-learn>=0.23.0
 pandas>=1.0.0
-torch>=1.4.0,<1.9
+torch>=1.9
 diskcache>=5.0.3
 dill>=0.3.3
 selfies>=2.1.1
diff --git a/setup.py b/setup.py
@@ -1,4 +1,5 @@
 """Package installer."""
+
 import codecs
 import os
 
@@ -59,6 +60,11 @@ def get_version(rel_path):
         'License :: OSI Approved :: MIT License',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: 3.12',
         'Topic :: Software Development :: Libraries :: Python Modules',
     ],
     packages=find_packages(),

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`name = 'pytoda'`
`2`		`-__version__ = '1.1.5'`
	`2`	`+__version__ = '1.1.6'`