Add a contentprovider for Software Heritage persistent ID (SWHID) #988

Merged: 3 commits, Jan 26, 2021
Changes from 1 commit
Add support for the SWHID content provider
This content provider allows retrieving the content referenced by a
Software Heritage (SWH) persistent identifier (SWHID).
Typical usage:

  repo2docker swh:1:rev:94dca98c006b80309704c717b5d83dff3c1fa3a0

It uses the SWH public vault API to retrieve the content of the given
directory.
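
In outline, the calls made against the SWH API are (these are the
endpoints used by repo2docker/contentproviders/swhid.py below):

  GET  /api/1/revision/<rev-hash>/         # for rev SWHIDs, resolve the
                                           # revision to its root directory
  POST /api/1/vault/directory/<dir-hash>/  # ask the vault to cook the
                                           # directory into a tarball
  GET  /api/1/vault/directory/<dir-hash>/  # poll until status is "done"
  GET  <fetch_url>                         # download and extract the tarball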

Most of the time, no authentication token is needed to bypass the
rate limiting of the SWH API: without authentication, one should be
allowed to retrieve one directory per minute.

If this is not enough, the user must make authenticated calls to the
SWH API.

For this, a new `swh_token` config item has been added to the Repo2Docker
application class.

To use authentication:

  repo2docker --config cfg.json swh:1:rev:94dca98c006b80309704c717b5d83dff3c1fa3a0

with the swh_token config option defined in the cfg.json config file.
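
For example, cfg.json could look something like this (a minimal sketch,
assuming the usual traitlets JSON config layout keyed by the application
class; the token value is a placeholder):

  {
    "Repo2Docker": {
      "swh_token": "<your SWH API token>"
    }
  }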
douardda committed Jan 19, 2021
commit e54c24ce2ee540805d6e4f74a355da995679fa3c
17 changes: 17 additions & 0 deletions repo2docker/app.py
100644 → 100755
@@ -148,6 +148,7 @@ def _default_log_level(self):
            contentproviders.Figshare,
            contentproviders.Dataverse,
            contentproviders.Hydroshare,
            contentproviders.Swhid,
            contentproviders.Mercurial,
            contentproviders.Git,
        ],
@@ -269,6 +270,18 @@ def _user_name_default(self):
        allow_none=True,
    )

    swh_token = Unicode(
        None,
        help="""
        Token to use authenticated SWH API access.
        If unset, default to unauthenticated (limited) usage of the Software
        Heritage API.
        """,
        config=True,
        allow_none=True,
    )

    cleanup_checkout = Bool(
        False,
        help="""
@@ -395,6 +408,10 @@ def fetch(self, url, ref, checkout_path):
                "No matching content provider found for " "{url}.".format(url=url)
            )

        swh_token = self.config.get("swh_token", self.swh_token)
        if swh_token and isinstance(picked_content_provider, contentproviders.Swhid):
            picked_content_provider.set_auth_token(swh_token)

        for log_line in picked_content_provider.fetch(
            spec, checkout_path, yield_output=self.json_logs
        ):
1 change: 1 addition & 0 deletions repo2docker/contentproviders/__init__.py
@@ -5,3 +5,4 @@
from .dataverse import Dataverse
from .hydroshare import Hydroshare
from .mercurial import Mercurial
from .swhid import Swhid
113 changes: 113 additions & 0 deletions repo2docker/contentproviders/swhid.py
@@ -0,0 +1,113 @@
import io
import os
import shutil
import tarfile
import time
import re

from os import path

import requests
Review comment from a project member:
We currently don't depend on requests and have used the standard library urllib to make HTTP requests. We should take a moment to review whether we want to take on the additional maintenance cost of a new dependency or stick with urllib.

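For reference, a rough sketch of what the same kind of call could look like with only the standard library, as the comment suggests (hypothetical helper, not part of this PR):

import json
import urllib.request


def swh_api_json(url, method="GET", token=None):
    # Hypothetical urllib-based equivalent of the requests calls used below.
    headers = {"User-Agent": "repo2docker"}
    if token:
        headers["Authorization"] = "Bearer {}".format(token)
    req = urllib.request.Request(url, headers=headers, method=method)
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode("utf-8"))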

from .base import ContentProvider
from ..utils import copytree
from .. import __version__


def parse_swhid(swhid):
    swhid_regexp = r"^swh:(?P<version>\d+):(?P<type>ori|cnt|rev|dir|snp|rel):(?P<hash>[0-9a-f]{40})$"
    # only parse/check the <identifier_core> of the swhid
    # see https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
    m = re.match(swhid_regexp, swhid.split(";")[0])
    if m:
        return m.groupdict()


class Swhid(ContentProvider):
    """Provide contents of a repository identified by a SWHID."""

    retry_delay = 5

    def __init__(self):
        self.swhid = None
        self.base_url = "https://archive.softwareheritage.org/api/1"
        self.session = requests.Session()
        self.session.headers.update(
            {
                "user-agent": "repo2docker {}".format(__version__),
            }
        )

    def set_auth_token(self, token):
        header = {"Authorization": "Bearer {}".format(token)}
        self.session.headers.update(header)

    def _request(self, url, method="GET"):
        if not url.endswith("/"):
            url = url + "/"

        for retries in range(3):
            try:
                resp = self.session.request(method, url)
                if resp.ok:
                    break
            except requests.ConnectionError:
                time.sleep(self.retry_delay)

        return resp

    @property
    def content_id(self):
        """The SWHID record ID used for content retrieval"""
        return self.swhid

    def detect(self, swhid, ref=None, extra_args=None):
        swhid_dict = parse_swhid(swhid)

        if (
            swhid_dict
            and swhid_dict["type"] in ("dir", "rev")
            and swhid_dict["version"] == "1"
        ):
            return {"swhid": swhid, "swhid_obj": swhid_dict}

    def fetch_directory(self, dir_hash, output_dir):
        url = "{}/vault/directory/{}/".format(self.base_url, dir_hash)
        yield "Fetching directory {} from {}\n".format(dir_hash, url)
        resp = self._request(url, "POST")
        receipt = resp.json()
        status = receipt["status"]
        assert status != "failed", receipt
        while status not in ("failed", "done"):
            time.sleep(self.retry_delay)
            resp = self._request(url)
            status = resp.json()["status"]
        if status == "failed":
            yield "Error preparing the directory for download"
            raise Exception()
        resp = self._request(resp.json()["fetch_url"])
        archive = tarfile.open(fileobj=io.BytesIO(resp.content))
        archive.extractall(path=output_dir)
        # the output_dir should have only one subdir named after the dir_hash
        # move its content one level up
        copytree(path.join(output_dir, dir_hash), output_dir)
        shutil.rmtree(path.join(output_dir, dir_hash))
        yield "Fetched files: {}\n".format(os.listdir(output_dir))

    def fetch(self, spec, output_dir, yield_output=False):
        swhid = spec["swhid"]
        swhid_obj = spec["swhid_obj"]

        if swhid_obj["type"] == "rev":
            # need to get the directory for this revision
            sha1git = swhid_obj["hash"]
            url = "{}/revision/{}/".format(self.base_url, sha1git)
            yield "Fetching revision {} from {}\n".format(sha1git, url)
            resp = self._request(url)
            assert resp.ok, (resp.content, self.session.headers)
            directory = resp.json()["directory"]
            self.swhid = "swh:1:dir:{}".format(directory)
            yield from self.fetch_directory(directory, output_dir)
        elif swhid_obj["type"] == "dir":
            self.swhid = swhid
            yield from self.fetch_directory(swhid_obj["hash"], output_dir)
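
For illustration, detect() on the revision SWHID used in the commit message returns a spec of this shape (the fields are the named groups of the parse_swhid regexp):

provider = Swhid()
spec = provider.detect("swh:1:rev:94dca98c006b80309704c717b5d83dff3c1fa3a0")
# spec == {
#     "swhid": "swh:1:rev:94dca98c006b80309704c717b5d83dff3c1fa3a0",
#     "swhid_obj": {
#         "version": "1",
#         "type": "rev",
#         "hash": "94dca98c006b80309704c717b5d83dff3c1fa3a0",
#     },
# }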
1 change: 1 addition & 0 deletions setup.py
@@ -56,6 +57,7 @@ def get_identifier(json):
        "ruamel.yaml>=0.15",
        "toml",
        "semver",
        "requests",
    ],
    python_requires=">=3.6",
    author="Project Jupyter Contributors",
157 changes: 157 additions & 0 deletions tests/unit/contentproviders/test_swhid.py
@@ -0,0 +1,157 @@
import json
import os
import io
import tarfile
import shutil
import re
import urllib
import pytest
import tempfile
import logging
import requests_mock

from os import makedirs
from os.path import join
from unittest.mock import patch, MagicMock, mock_open
from zipfile import ZipFile

from repo2docker.contentproviders.swhid import Swhid, parse_swhid
from repo2docker.contentproviders.base import ContentProviderException


# this is a slightly stripped down copy of swh.model.cli.swhid_of_dir().
# We do not import it from swh.model to avoid having to depend on swh.model[cli]
def swhid_of_dir(path):
    object = Directory.from_disk(path=path).get_data()
    return swhid(DIRECTORY, object)


def test_content_id():
    swhid = Swhid()
    assert swhid.content_id is None


swhids_ok = [
    "swh:1:dir:" + "0" * 40,
    "swh:1:rev:" + "0" * 40,
]
swhids_invalid = [
    "swh:1:dir:" + "0" * 39,
    "swh:2:dir:" + "0" * 40,
    "swh:1:rev:" + "0" * 41,
    "swh:1:cnt:" + "0" * 40,
    "swh:1:ori:" + "0" * 40,
    "swh:1:rel:" + "0" * 40,
    "swh:1:snp:" + "0" * 40,
]

detect_values = [
    (swhid, {"swhid": swhid, "swhid_obj": parse_swhid(swhid)}) for swhid in swhids_ok
] + [(swhid, None) for swhid in swhids_invalid]


@pytest.mark.parametrize("swhid, expected", detect_values)
def test_detect(swhid, expected):
    provider = Swhid()
    assert provider.detect(swhid) == expected


def fake_urlopen(req):
    print(req)
    return req.headers


def test_unresolving_swhid():
    provider = Swhid()

    # swhid = "0" * 40
    # assert provider.swhid2url(swhid) is swhid


NULLID = "0" * 40


@pytest.fixture
def gen_tarfile(tmpdir):
    rootdir = join(tmpdir, "tmp")
    makedirs(rootdir)
    with open(join(rootdir, "file1.txt"), "wb") as fobj:
        fobj.write(b"Some content\n")

    # this directory hash can be computed using the swh.model package, but we
    # do not want to depend on it here, to limit dependencies and because it
    # does not support python 3.6
    dirhash = "89a3bd29a2c5ae0b1465febbe5df09730a8576fe"
    buf = io.BytesIO()
    tarf = tarfile.open(name=dirhash, fileobj=buf, mode="w")
    tarf.add(rootdir, arcname=dirhash)
    tarf.close()
    shutil.rmtree(rootdir)
    return dirhash, buf.getvalue()


def mocked_provider(tmpdir, dirhash, tarfile_buf):
    provider = Swhid()
    adapter = requests_mock.Adapter()
    provider.base_url = "mock://api/1"
    provider.retry_delay = 0.1
    provider.session.mount("mock://", adapter)

    adapter.register_uri(
        "GET",
        "mock://api/1/revision/{}/".format(NULLID),
        json={
            "author": {"fullname": "John Doe <[email protected]>"},
            "directory": dirhash,
        },
    )
    adapter.register_uri(
        "POST",
        "mock://api/1/vault/directory/{}/".format(dirhash),
        json={
            "fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
            "status": "new",
        },
    )
    adapter.register_uri(
        "GET",
        "mock://api/1/vault/directory/{}/".format(dirhash),
        [
            {
                "json": {
                    "fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
                    "status": "pending",
                }
            },
            {
                "json": {
                    "fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
                    "status": "done",
                }
            },
        ],
    )
    adapter.register_uri(
        "GET",
        "mock://api/1/vault/directory/{}/raw/".format(dirhash),
        content=tarfile_buf,
    )
    return provider


def test_fetch_revision(tmpdir, gen_tarfile):
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    swhid = "swh:1:rev:" + NULLID
    for log in provider.fetch(provider.detect(swhid), tmpdir):
        print(log)
    assert provider.content_id == "swh:1:dir:" + dir_id


def test_fetch_directory(tmpdir, gen_tarfile):
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    swhid = "swh:1:dir:" + dir_id
    for log in provider.fetch(provider.detect(swhid), tmpdir):
        print(log)
    assert provider.content_id == swhid
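
Assuming a standard pytest setup with the requests_mock package used above installed, these tests should run on their own with something like:

  python -m pytest tests/unit/contentproviders/test_swhid.py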