Skip to content

Commit e33d5f8

Browse files
authored
[MRG] Add Figshare content provider (#788)
[MRG] Add Figshare content provider
2 parents 57919b9 + a40e179 commit e33d5f8

File tree

10 files changed

+439
-156
lines changed

10 files changed

+439
-156
lines changed

docs/source/contributing/roadmap.md

-1
Original file line numberDiff line numberDiff line change
@@ -83,4 +83,3 @@ time there is no active plan for an item. The project would like to find the
8383
resources and time to discuss and then execute these ideas.
8484
* support execution on a remote host (with more resources than available locally) via the command-line
8585
* add support for using ZIP files as the repo (`repo2docker https://example.com/an-archive.zip`) this will give us access to several archives (like Zenodo) that expose things as ZIP files.
86-
* add support for Zenodo (`repo2docker 10.5281/zenodo.1476680`) so Zenodo software archives can be used as the source in addition to a git repository

docs/source/usage.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ Using ``repo2docker``
1212

1313
``repo2docker`` can build a reproducible computational environment for any repository that
1414
follows :ref:`specification`. repo2docker is called with the URL of a Git repository,
15-
a Zenodo DOI or a path to a local directory. It then
15+
a DOI from Zenodo or Figshare, or a path to a local directory. It then
1616
performs these steps:
1717

1818
1. Inspects the repository for :ref:`configuration files <config-files>`. These will be used to build

repo2docker/app.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,12 @@ def _default_log_level(self):
142142
# detecting if something will successfully `git clone` is very hard if all
143143
# you can do is look at the path/URL to it.
144144
content_providers = List(
145-
[contentproviders.Local, contentproviders.Zenodo, contentproviders.Git],
145+
[
146+
contentproviders.Local,
147+
contentproviders.Zenodo,
148+
contentproviders.Figshare,
149+
contentproviders.Git,
150+
],
146151
config=True,
147152
help="""
148153
Ordered list by priority of ContentProviders to try in turn to fetch
+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
from .git import Git
22
from .base import Local
33
from .zenodo import Zenodo
4+
from .figshare import Figshare

repo2docker/contentproviders/doi.py

+90
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import os
2+
import json
3+
import shutil
4+
import logging
5+
6+
from os import makedirs
7+
from os import path
8+
from urllib import request # urlopen, Request
9+
from urllib.error import HTTPError
10+
from zipfile import ZipFile, is_zipfile
11+
12+
from .base import ContentProvider
13+
from ..utils import copytree, deep_get
14+
from ..utils import normalize_doi, is_doi
15+
from .. import __version__
16+
17+
18+
class DoiProvider(ContentProvider):
    """Provide contents of a repository identified by a DOI and some helper functions."""

    def urlopen(self, req, headers=None):
        """A urlopen() helper.

        `req` may be a `urllib.request.Request` or a plain URL string.
        A repo2docker User-Agent header is always attached; any entries in
        `headers` (a dict) are added on top of it.
        """
        # someone passed a string, not a request
        if not isinstance(req, request.Request):
            req = request.Request(req)

        req.add_header("User-Agent", "repo2docker {}".format(__version__))
        if headers is not None:
            for key, value in headers.items():
                req.add_header(key, value)

        return request.urlopen(req)

    def doi2url(self, doi):
        """Transform a DOI to the URL it resolves to.

        If `doi` is not a DOI, assume it is already a URL and return it
        unchanged. If the DOI does not resolve (HTTP error), return the
        normalized DOI string instead of a URL.
        """
        # If not a doi, assume we have a URL and return
        if is_doi(doi):
            doi = normalize_doi(doi)

            try:
                resp = self.urlopen("https://doi.org/{}".format(doi))
            # If the DOI doesn't resolve, just return URL
            except HTTPError:
                return doi
            return resp.url
        else:
            # Just return what is actually just a URL
            return doi

    def fetch_file(self, file_ref, host, output_dir, unzip=False):
        """Download one file described by `file_ref` into `output_dir`.

        `host` supplies the metadata paths used to look up the download URL
        (`host["download"]`) and the file name (`host["filename"]`) inside
        `file_ref`.

        The assumption is that `unzip=True` means that this is the only file
        related to a record: the downloaded archive is extracted in place,
        the archive itself is removed, and a single top-level directory (if
        any) is flattened into `output_dir`.

        Yields progress messages (this is a generator).
        """
        file_url = deep_get(file_ref, host["download"])
        fname = deep_get(file_ref, host["filename"])
        logging.debug("Downloading file {} as {}\n".format(file_url, fname))
        with self.urlopen(file_url) as src:
            if path.dirname(fname):
                sub_dir = path.join(output_dir, path.dirname(fname))
                if not path.exists(sub_dir):
                    yield "Creating {}\n".format(sub_dir)
                    makedirs(sub_dir, exist_ok=True)

            dst_fname = path.join(output_dir, fname)
            with open(dst_fname, "wb") as dst:
                yield "Fetching {}\n".format(fname)
                shutil.copyfileobj(src, dst)
            # first close the newly written file, then continue
            # processing it
            if unzip and is_zipfile(dst_fname):
                yield "Extracting {}\n".format(fname)
                # use a context manager so the archive handle is closed
                # even if extractall() raises
                with ZipFile(dst_fname) as zfile:
                    zfile.extractall(path=output_dir)

                # delete downloaded file ...
                os.remove(dst_fname)
                # ... and any directories we might have created,
                # in which case sub_dir will be defined
                if path.dirname(fname):
                    shutil.rmtree(sub_dir)

                new_subdirs = os.listdir(output_dir)
                # if there is only one new subdirectory move its contents
                # to the top level directory
                if len(new_subdirs) == 1:
                    d = new_subdirs[0]
                    copytree(path.join(output_dir, d), output_dir)
                    shutil.rmtree(path.join(output_dir, d))

            yield "Fetched files: {}\n".format(os.listdir(output_dir))
+97
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import os
2+
import re
3+
import json
4+
import shutil
5+
6+
from os import makedirs
7+
from os import path
8+
from urllib.request import Request
9+
from urllib.error import HTTPError
10+
from zipfile import is_zipfile
11+
12+
from .doi import DoiProvider
13+
from ..utils import copytree, deep_get
14+
15+
16+
class Figshare(DoiProvider):
    """Provide contents of a Figshare article.

    See https://docs.figshare.com/#public_article for API docs.

    Examples:
      - https://doi.org/10.6084/m9.figshare.9782777
      - https://doi.org/10.6084/m9.figshare.9782777.v2
      - https://figshare.com/articles/binder-examples_requirements/9784088 (only one zipfile, no DOI)
    """

    def __init__(self):
        # We need the hostname (url where records are), api url (for metadata),
        # filepath (path to files in metadata), filename (path to filename in
        # metadata), and download (path to file download URL in metadata)
        self.hosts = [
            {
                "hostname": [
                    "https://figshare.com/articles/",
                    "http://figshare.com/articles/",
                    "https://figshare.com/account/articles/",
                ],
                "api": "https://api.figshare.com/v2/articles/",
                "filepath": "files",
                "filename": "name",
                "download": "download_url",
            }
        ]

    # matches .../articles/<title>/<article_id>[/<version>]
    url_regex = re.compile(r"(.*)/articles/([^/]+)/(\d+)(/)?(\d+)?")

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Figshare article"""
        url = self.doi2url(doi)

        for host in self.hosts:
            if any([url.startswith(s) for s in host["hostname"]]):
                match = self.url_regex.match(url)
                if match:
                    self.article_id = match.groups()[2]
                    self.article_version = match.groups()[4]
                    # an unversioned URL refers to version 1
                    if not self.article_version:
                        self.article_version = "1"
                    return {
                        "article": self.article_id,
                        "host": host,
                        "version": self.article_version,
                    }
                else:
                    # a figshare URL we cannot parse -- give up
                    return None

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Figshare article"""
        article_id = spec["article"]
        article_version = spec["version"]
        host = spec["host"]

        yield "Fetching Figshare article {} in version {}.\n".format(
            article_id, article_version
        )
        req = Request(
            "{}{}/versions/{}".format(host["api"], article_id, article_version),
            headers={"accept": "application/json"},
        )
        # close the response once the body is read instead of leaking
        # the connection
        with self.urlopen(req) as resp:
            article = json.loads(resp.read().decode("utf-8"))

        files = deep_get(article, host["filepath"])
        # only fetch files where is_link_only: False
        files = [file for file in files if not file["is_link_only"]]
        only_one_file = len(files) == 1
        for file_ref in files:
            # a lone zipfile stands in for the whole repo -> unpack it
            unzip = file_ref["name"].endswith(".zip") and only_one_file
            for line in self.fetch_file(file_ref, host, output_dir, unzip):
                yield line

    @property
    def content_id(self):
        """The Figshare article ID (with version), e.g. "9782777.v2"."""
        return "{}.v{}".format(self.article_id, self.article_version)

repo2docker/contentproviders/zenodo.py

+13-80
Original file line numberDiff line numberDiff line change
@@ -4,54 +4,21 @@
44

55
from os import makedirs
66
from os import path
7-
from urllib.request import urlopen, Request
7+
from urllib.request import Request
88
from urllib.error import HTTPError
9-
from zipfile import ZipFile, is_zipfile
109

11-
from .base import ContentProvider
10+
from .doi import DoiProvider
1211
from ..utils import copytree, deep_get
13-
from ..utils import normalize_doi, is_doi
14-
from .. import __version__
1512

1613

17-
class Zenodo(ContentProvider):
14+
class Zenodo(DoiProvider):
1815
"""Provide contents of a Zenodo deposit."""
1916

20-
def _urlopen(self, req, headers=None):
21-
"""A urlopen() helper"""
22-
# someone passed a string, not a request
23-
if not isinstance(req, Request):
24-
req = Request(req)
25-
26-
req.add_header("User-Agent", "repo2docker {}".format(__version__))
27-
if headers is not None:
28-
for key, value in headers.items():
29-
req.add_header(key, value)
30-
31-
return urlopen(req)
32-
33-
def _doi2url(self, doi):
34-
# Transform a DOI to a URL
35-
# If not a doi, assume we have a URL and return
36-
if is_doi(doi):
37-
doi = normalize_doi(doi)
38-
39-
try:
40-
resp = self._urlopen("https://doi.org/{}".format(doi))
41-
# If the DOI doesn't resolve, just return URL
42-
except HTTPError:
43-
return doi
44-
return resp.url
45-
else:
46-
# Just return what is actually just a URL
47-
return doi
48-
49-
def detect(self, doi, ref=None, extra_args=None):
50-
"""Trigger this provider for things that resolve to a Zenodo/Invenio record"""
17+
def __init__(self):
5118
# We need the hostname (url where records are), api url (for metadata),
5219
# filepath (path to files in metadata), filename (path to filename in
5320
# metadata), download (path to file download URL), and type (path to item type in metadata)
54-
hosts = [
21+
self.hosts = [
5522
{
5623
"hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
5724
"api": "https://zenodo.org/api/records/",
@@ -73,9 +40,11 @@ def detect(self, doi, ref=None, extra_args=None):
7340
},
7441
]
7542

76-
url = self._doi2url(doi)
43+
def detect(self, doi, ref=None, extra_args=None):
44+
"""Trigger this provider for things that resolve to a Zenodo/Invenio record"""
45+
url = self.doi2url(doi)
7746

78-
for host in hosts:
47+
for host in self.hosts:
7948
if any([url.startswith(s) for s in host["hostname"]]):
8049
self.record_id = url.rsplit("/", maxsplit=1)[1]
8150
return {"record": self.record_id, "host": host}
@@ -90,53 +59,17 @@ def fetch(self, spec, output_dir, yield_output=False):
9059
"{}{}".format(host["api"], record_id),
9160
headers={"accept": "application/json"},
9261
)
93-
resp = self._urlopen(req)
62+
resp = self.urlopen(req)
9463

9564
record = json.loads(resp.read().decode("utf-8"))
9665

97-
def _fetch(file_ref, unzip=False):
98-
# the assumption is that `unzip=True` means that this is the only
99-
# file related to the zenodo record
100-
with self._urlopen(deep_get(file_ref, host["download"])) as src:
101-
fname = deep_get(file_ref, host["filename"])
102-
if path.dirname(fname):
103-
sub_dir = path.join(output_dir, path.dirname(fname))
104-
if not path.exists(sub_dir):
105-
yield "Creating {}\n".format(sub_dir)
106-
makedirs(sub_dir, exist_ok=True)
107-
108-
dst_fname = path.join(output_dir, fname)
109-
with open(dst_fname, "wb") as dst:
110-
yield "Fetching {}\n".format(fname)
111-
shutil.copyfileobj(src, dst)
112-
# first close the newly written file, then continue
113-
# processing it
114-
if unzip and is_zipfile(dst_fname):
115-
yield "Extracting {}\n".format(fname)
116-
zfile = ZipFile(dst_fname)
117-
zfile.extractall(path=output_dir)
118-
zfile.close()
119-
120-
# delete downloaded file ...
121-
os.remove(dst_fname)
122-
# ... and any directories we might have created,
123-
# in which case sub_dir will be defined
124-
if path.dirname(fname):
125-
shutil.rmtree(sub_dir)
126-
127-
new_subdirs = os.listdir(output_dir)
128-
# if there is only one new subdirectory move its contents
129-
# to the top level directory
130-
if len(new_subdirs) == 1:
131-
d = new_subdirs[0]
132-
copytree(path.join(output_dir, d), output_dir)
133-
shutil.rmtree(path.join(output_dir, d))
134-
13566
is_software = deep_get(record, host["type"]).lower() == "software"
13667
files = deep_get(record, host["filepath"])
13768
only_one_file = len(files) == 1
13869
for file_ref in files:
139-
for line in _fetch(file_ref, unzip=is_software and only_one_file):
70+
for line in self.fetch_file(
71+
file_ref, host, output_dir, is_software and only_one_file
72+
):
14073
yield line
14174

14275
@property
+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import json
2+
import os
3+
import re
4+
import urllib
5+
import pytest
6+
import tempfile
7+
import logging
8+
9+
from unittest.mock import patch, MagicMock, mock_open
10+
from zipfile import ZipFile
11+
12+
from repo2docker.contentproviders.doi import DoiProvider
13+
from repo2docker.contentproviders.base import ContentProviderException
14+
15+
16+
def test_content_id():
    """A bare DoiProvider exposes no content ID."""
    provider = DoiProvider()
    assert provider.content_id is None
19+
20+
21+
def fake_urlopen(req):
    """Stand-in for urllib.request.urlopen that just echoes the request headers."""
    print(req)
    return req.headers
24+
25+
26+
@patch("urllib.request.urlopen", fake_urlopen)
def test_url_headers():
    """urlopen() forwards caller-supplied headers and adds a User-Agent."""
    doi = DoiProvider()

    headers = {"test1": "value1", "Test2": "value2"}
    result = doi.urlopen("https://mybinder.org", headers=headers)
    # urllib capitalizes header names via add_header()
    assert "Test1" in result
    assert "Test2" in result
    # use ==, not `is`: identity comparison on an int only passes by
    # CPython small-int caching accident (and SyntaxWarning on 3.8+)
    assert len(result) == 3  # User-agent is also set
35+
36+
37+
def test_unresolving_doi():
38+
doi = DoiProvider()
39+
40+
fakedoi = "10.1/1234"
41+
assert doi.doi2url(fakedoi) is fakedoi

0 commit comments

Comments
 (0)