Skip to content

Commit e33d5f8

Browse files
authored
[MRG] Add Figshare content provider (#788)
[MRG] Add Figshare content provider
2 parents 57919b9 + a40e179 commit e33d5f8

File tree

10 files changed

+439
-156
lines changed

10 files changed

+439
-156
lines changed

docs/source/contributing/roadmap.md

-1
Original file line numberDiff line numberDiff line change
@@ -83,4 +83,3 @@ time there is no active plan for an item. The project would like to find the
8383
resources and time to discuss and then execute these ideas.
8484
* support execution on a remote host (with more resources than available locally) via the command-line
8585
* add support for using ZIP files as the repo (`repo2docker https://example.com/an-archive.zip`) this will give us access to several archives (like Zenodo) that expose things as ZIP files.
86-
* add support for Zenodo (`repo2docker 10.5281/zenodo.1476680`) so Zenodo software archives can be used as the source in addition to a git repository

docs/source/usage.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ Using ``repo2docker``
1212

1313
``repo2docker`` can build a reproducible computational environment for any repository that
1414
follows :ref:`specification`. repo2docker is called with the URL of a Git repository,
15-
a Zenodo DOI or a path to a local directory. It then
15+
a DOI from Zenodo or Figshare, or a path to a local directory. It then
1616
performs these steps:
1717

1818
1. Inspects the repository for :ref:`configuration files <config-files>`. These will be used to build

repo2docker/app.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,12 @@ def _default_log_level(self):
142142
# detecting if something will successfully `git clone` is very hard if all
143143
# you can do is look at the path/URL to it.
144144
content_providers = List(
145-
[contentproviders.Local, contentproviders.Zenodo, contentproviders.Git],
145+
[
146+
contentproviders.Local,
147+
contentproviders.Zenodo,
148+
contentproviders.Figshare,
149+
contentproviders.Git,
150+
],
146151
config=True,
147152
help="""
148153
Ordered list by priority of ContentProviders to try in turn to fetch
+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
from .git import Git
22
from .base import Local
33
from .zenodo import Zenodo
4+
from .figshare import Figshare

repo2docker/contentproviders/doi.py

+90
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import os
2+
import json
3+
import shutil
4+
import logging
5+
6+
from os import makedirs
7+
from os import path
8+
from urllib import request # urlopen, Request
9+
from urllib.error import HTTPError
10+
from zipfile import ZipFile, is_zipfile
11+
12+
from .base import ContentProvider
13+
from ..utils import copytree, deep_get
14+
from ..utils import normalize_doi, is_doi
15+
from .. import __version__
16+
17+
18+
class DoiProvider(ContentProvider):
    """Provide contents of a repository identified by a DOI and some helper functions."""

    def urlopen(self, req, headers=None):
        """A urlopen() helper.

        `req` may be a `urllib.request.Request` or a plain URL string.
        A repo2docker User-Agent header is always attached; any entries in
        `headers` (a dict) are added on top of it.
        """
        # someone passed a string, not a request
        if not isinstance(req, request.Request):
            req = request.Request(req)

        req.add_header("User-Agent", "repo2docker {}".format(__version__))
        if headers is not None:
            for key, value in headers.items():
                req.add_header(key, value)

        return request.urlopen(req)

    def doi2url(self, doi):
        """Transform a DOI to the URL it resolves to.

        If `doi` is not a DOI, assume it is already a URL and return it
        unchanged. If the DOI does not resolve (HTTP error), return the
        normalized DOI string instead of a URL.
        """
        # If not a doi, assume we have a URL and return
        if is_doi(doi):
            doi = normalize_doi(doi)

            try:
                resp = self.urlopen("https://doi.org/{}".format(doi))
            # If the DOI doesn't resolve, just return URL
            except HTTPError:
                return doi
            return resp.url
        else:
            # Just return what is actually just a URL
            return doi

    def fetch_file(self, file_ref, host, output_dir, unzip=False):
        """Download one file described by `file_ref` into `output_dir`.

        `host` supplies the metadata paths used to look up the download URL
        (`host["download"]`) and the file name (`host["filename"]`) inside
        `file_ref`.

        The assumption is that `unzip=True` means that this is the only file
        related to a record: the downloaded archive is extracted in place,
        the archive itself is removed, and a single top-level directory (if
        any) is flattened into `output_dir`.

        Yields progress messages (this is a generator).
        """
        file_url = deep_get(file_ref, host["download"])
        fname = deep_get(file_ref, host["filename"])
        logging.debug("Downloading file {} as {}\n".format(file_url, fname))
        with self.urlopen(file_url) as src:
            if path.dirname(fname):
                sub_dir = path.join(output_dir, path.dirname(fname))
                if not path.exists(sub_dir):
                    yield "Creating {}\n".format(sub_dir)
                    makedirs(sub_dir, exist_ok=True)

            dst_fname = path.join(output_dir, fname)
            with open(dst_fname, "wb") as dst:
                yield "Fetching {}\n".format(fname)
                shutil.copyfileobj(src, dst)
            # first close the newly written file, then continue
            # processing it
            if unzip and is_zipfile(dst_fname):
                yield "Extracting {}\n".format(fname)
                # use a context manager so the archive handle is closed
                # even if extractall() raises
                with ZipFile(dst_fname) as zfile:
                    zfile.extractall(path=output_dir)

                # delete downloaded file ...
                os.remove(dst_fname)
                # ... and any directories we might have created,
                # in which case sub_dir will be defined
                if path.dirname(fname):
                    shutil.rmtree(sub_dir)

                new_subdirs = os.listdir(output_dir)
                # if there is only one new subdirectory move its contents
                # to the top level directory
                if len(new_subdirs) == 1:
                    d = new_subdirs[0]
                    copytree(path.join(output_dir, d), output_dir)
                    shutil.rmtree(path.join(output_dir, d))

            yield "Fetched files: {}\n".format(os.listdir(output_dir))
+97
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import os
2+
import re
3+
import json
4+
import shutil
5+
6+
from os import makedirs
7+
from os import path
8+
from urllib.request import Request
9+
from urllib.error import HTTPError
10+
from zipfile import is_zipfile
11+
12+
from .doi import DoiProvider
13+
from ..utils import copytree, deep_get
14+
15+
16+
class Figshare(DoiProvider):
    """Provide contents of a Figshare article.

    See https://docs.figshare.com/#public_article for API docs.

    Examples:
      - https://doi.org/10.6084/m9.figshare.9782777
      - https://doi.org/10.6084/m9.figshare.9782777.v2
      - https://figshare.com/articles/binder-examples_requirements/9784088 (only one zipfile, no DOI)
    """

    def __init__(self):
        # We need the hostname (url where records are), api url (for metadata),
        # filepath (path to files in metadata), filename (path to filename in
        # metadata), and download (path to file download URL in metadata)
        self.hosts = [
            {
                "hostname": [
                    "https://figshare.com/articles/",
                    "http://figshare.com/articles/",
                    "https://figshare.com/account/articles/",
                ],
                "api": "https://api.figshare.com/v2/articles/",
                "filepath": "files",
                "filename": "name",
                "download": "download_url",
            }
        ]

    # matches .../articles/<title>/<article_id>[/<version>]
    url_regex = re.compile(r"(.*)/articles/([^/]+)/(\d+)(/)?(\d+)?")

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Figshare article"""
        url = self.doi2url(doi)

        for host in self.hosts:
            if any([url.startswith(s) for s in host["hostname"]]):
                match = self.url_regex.match(url)
                if match:
                    self.article_id = match.groups()[2]
                    self.article_version = match.groups()[4]
                    # an unversioned URL refers to version 1
                    if not self.article_version:
                        self.article_version = "1"
                    return {
                        "article": self.article_id,
                        "host": host,
                        "version": self.article_version,
                    }
                else:
                    # a figshare URL we cannot parse -- give up
                    return None

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Figshare article"""
        article_id = spec["article"]
        article_version = spec["version"]
        host = spec["host"]

        yield "Fetching Figshare article {} in version {}.\n".format(
            article_id, article_version
        )
        req = Request(
            "{}{}/versions/{}".format(host["api"], article_id, article_version),
            headers={"accept": "application/json"},
        )
        # close the response once the body is read instead of leaking
        # the connection
        with self.urlopen(req) as resp:
            article = json.loads(resp.read().decode("utf-8"))

        files = deep_get(article, host["filepath"])
        # only fetch files where is_link_only: False
        files = [file for file in files if not file["is_link_only"]]
        only_one_file = len(files) == 1
        for file_ref in files:
            # a lone zipfile stands in for the whole repo -> unpack it
            unzip = file_ref["name"].endswith(".zip") and only_one_file
            for line in self.fetch_file(file_ref, host, output_dir, unzip):
                yield line

    @property
    def content_id(self):
        """The Figshare article ID (with version), e.g. "9782777.v2"."""
        return "{}.v{}".format(self.article_id, self.article_version)

repo2docker/contentproviders/zenodo.py

+13-80
Original file line numberDiff line numberDiff line change
@@ -4,54 +4,21 @@
44

55
from os import makedirs
66
from os import path
7-
from urllib.request import urlopen, Request
7+
from urllib.request import Request
88
from urllib.error import HTTPError
9-
from zipfile import ZipFile, is_zipfile
109

11-
from .base import ContentProvider
10+
from .doi import DoiProvider
1211
from ..utils import copytree, deep_get
13-
from ..utils import normalize_doi, is_doi
14-
from .. import __version__
1512

1613

17-
class Zenodo(ContentProvider):
14+
class Zenodo(DoiProvider):
1815
"""Provide contents of a Zenodo deposit."""
1916

20-
def _urlopen(self, req, headers=None):
21-
"""A urlopen() helper"""
22-
# someone passed a string, not a request
23-
if not isinstance(req, Request):
24-
req = Request(req)
25-
26-
req.add_header("User-Agent", "repo2docker {}".format(__version__))
27-
if headers is not None:
28-
for key, value in headers.items():
29-
req.add_header(key, value)
30-
31-
return urlopen(req)
32-
33-
def _doi2url(self, doi):
34-
# Transform a DOI to a URL
35-
# If not a doi, assume we have a URL and return
36-
if is_doi(doi):
37-
doi = normalize_doi(doi)
38-
39-
try:
40-
resp = self._urlopen("https://doi.org/{}".format(doi))
41-
# If the DOI doesn't resolve, just return URL
42-
except HTTPError:
43-
return doi
44-
return resp.url
45-
else:
46-
# Just return what is actually just a URL
47-
return doi
48-
49-
def detect(self, doi, ref=None, extra_args=None):
50-
"""Trigger this provider for things that resolve to a Zenodo/Invenio record"""
17+
def __init__(self):
5118
# We need the hostname (url where records are), api url (for metadata),
5219
# filepath (path to files in metadata), filename (path to filename in
5320
# metadata), download (path to file download URL), and type (path to item type in metadata)
54-
hosts = [
21+
self.hosts = [
5522
{
5623
"hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
5724
"api": "https://zenodo.org/api/records/",
@@ -73,9 +40,11 @@ def detect(self, doi, ref=None, extra_args=None):
7340
},
7441
]
7542

76-
url = self._doi2url(doi)
43+
def detect(self, doi, ref=None, extra_args=None):
44+
"""Trigger this provider for things that resolve to a Zenodo/Invenio record"""
45+
url = self.doi2url(doi)
7746

78-
for host in hosts:
47+
for host in self.hosts:
7948
if any([url.startswith(s) for s in host["hostname"]]):
8049
self.record_id = url.rsplit("/", maxsplit=1)[1]
8150
return {"record": self.record_id, "host": host}
@@ -90,53 +59,17 @@ def fetch(self, spec, output_dir, yield_output=False):
9059
"{}{}".format(host["api"], record_id),
9160
headers={"accept": "application/json"},
9261
)
93-
resp = self._urlopen(req)
62+
resp = self.urlopen(req)
9463

9564
record = json.loads(resp.read().decode("utf-8"))
9665

97-
def _fetch(file_ref, unzip=False):
98-
# the assumption is that `unzip=True` means that this is the only
99-
# file related to the zenodo record
100-
with self._urlopen(deep_get(file_ref, host["download"])) as src:
101-
fname = deep_get(file_ref, host["filename"])
102-
if path.dirname(fname):
103-
sub_dir = path.join(output_dir, path.dirname(fname))
104-
if not path.exists(sub_dir):
105-
yield "Creating {}\n".format(sub_dir)
106-
makedirs(sub_dir, exist_ok=True)
107-
108-
dst_fname = path.join(output_dir, fname)
109-
with open(dst_fname, "wb") as dst:
110-
yield "Fetching {}\n".format(fname)
111-
shutil.copyfileobj(src, dst)
112-
# first close the newly written file, then continue
113-
# processing it
114-
if unzip and is_zipfile(dst_fname):
115-
yield "Extracting {}\n".format(fname)
116-
zfile = ZipFile(dst_fname)
117-
zfile.extractall(path=output_dir)
118-
zfile.close()
119-
120-
# delete downloaded file ...
121-
os.remove(dst_fname)
122-
# ... and any directories we might have created,
123-
# in which case sub_dir will be defined
124-
if path.dirname(fname):
125-
shutil.rmtree(sub_dir)
126-
127-
new_subdirs = os.listdir(output_dir)
128-
# if there is only one new subdirectory move its contents
129-
# to the top level directory
130-
if len(new_subdirs) == 1:
131-
d = new_subdirs[0]
132-
copytree(path.join(output_dir, d), output_dir)
133-
shutil.rmtree(path.join(output_dir, d))
134-
13566
is_software = deep_get(record, host["type"]).lower() == "software"
13667
files = deep_get(record, host["filepath"])
13768
only_one_file = len(files) == 1
13869
for file_ref in files:
139-
for line in _fetch(file_ref, unzip=is_software and only_one_file):
70+
for line in self.fetch_file(
71+
file_ref, host, output_dir, is_software and only_one_file
72+
):
14073
yield line
14174

14275
@property
+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import json
2+
import os
3+
import re
4+
import urllib
5+
import pytest
6+
import tempfile
7+
import logging
8+
9+
from unittest.mock import patch, MagicMock, mock_open
10+
from zipfile import ZipFile
11+
12+
from repo2docker.contentproviders.doi import DoiProvider
13+
from repo2docker.contentproviders.base import ContentProviderException
14+
15+
16+
def test_content_id():
    """A bare DoiProvider exposes no content ID."""
    provider = DoiProvider()
    assert provider.content_id is None
19+
20+
21+
def fake_urlopen(req):
    """Stand-in for urllib.request.urlopen that just echoes the request headers."""
    print(req)
    return req.headers
24+
25+
26+
@patch("urllib.request.urlopen", fake_urlopen)
def test_url_headers():
    """urlopen() forwards caller-supplied headers and adds a User-Agent."""
    doi = DoiProvider()

    headers = {"test1": "value1", "Test2": "value2"}
    result = doi.urlopen("https://mybinder.org", headers=headers)
    # urllib capitalizes header names via add_header()
    assert "Test1" in result
    assert "Test2" in result
    # use ==, not `is`: identity comparison on an int only passes by
    # CPython small-int caching accident (and SyntaxWarning on 3.8+)
    assert len(result) == 3  # User-agent is also set
35+
36+
37+
def test_unresolving_doi():
38+
doi = DoiProvider()
39+
40+
fakedoi = "10.1/1234"
41+
assert doi.doi2url(fakedoi) is fakedoi

0 commit comments

Comments
 (0)