Add a contentprovider for Software Heritage persistent ID (SWHID) #988

Merged: 3 commits, Jan 26, 2021
Changes from 1 commit
Add support for the SWHID content provider
This content provider allows retrieving the content referenced by a
Software Heritage (SWH) persistent identifier (SWHID).
Typical usage:

  repo2docker swh:1:rev:94dca98c006b80309704c717b5d83dff3c1fa3a0

It uses the SWH public vault API to retrieve the content of the given
directory.
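
In outline, the calls made against the SWH API are (these are the
endpoints used by repo2docker/contentproviders/swhid.py below):

  GET  /api/1/revision/<rev-hash>/         # for rev SWHIDs, resolve the
                                           # revision to its root directory
  POST /api/1/vault/directory/<dir-hash>/  # ask the vault to cook the
                                           # directory into a tarball
  GET  /api/1/vault/directory/<dir-hash>/  # poll until status is "done"
  GET  <fetch_url>                         # download and extract the tarball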

Most of the time, no authentication token is needed to bypass the
rate limiting of the SWH API: without authentication, one should be
allowed to retrieve one directory per minute.

If this is not enough, the user must make authenticated calls to the
SWH API.

For this, a new `swh_token` config item has been added to the Repo2Docker
application class.

To use authentication:

  repo2docker --config cfg.json swh:1:rev:94dca98c006b80309704c717b5d83dff3c1fa3a0

with the swh_token config option defined in the cfg.json config file.
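
For example, cfg.json could look something like this (a minimal sketch,
assuming the usual traitlets JSON config layout keyed by the application
class; the token value is a placeholder):

  {
    "Repo2Docker": {
      "swh_token": "<your SWH API token>"
    }
  }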
douardda committed Jan 19, 2021
commit e54c24ce2ee540805d6e4f74a355da995679fa3c
17 changes: 17 additions & 0 deletions repo2docker/app.py
100644 → 100755
@@ -148,6 +148,7 @@ def _default_log_level(self):
            contentproviders.Figshare,
            contentproviders.Dataverse,
            contentproviders.Hydroshare,
            contentproviders.Swhid,
            contentproviders.Mercurial,
            contentproviders.Git,
        ],
@@ -269,6 +270,18 @@ def _user_name_default(self):
        allow_none=True,
    )

    swh_token = Unicode(
        None,
        help="""
        Token to use authenticated SWH API access.
        If unset, default to unauthenticated (limited) usage of the Software
        Heritage API.
        """,
        config=True,
        allow_none=True,
    )

    cleanup_checkout = Bool(
        False,
        help="""
@@ -395,6 +408,10 @@ def fetch(self, url, ref, checkout_path):
                "No matching content provider found for " "{url}.".format(url=url)
            )

        swh_token = self.config.get("swh_token", self.swh_token)
        if swh_token and isinstance(picked_content_provider, contentproviders.Swhid):
            picked_content_provider.set_auth_token(swh_token)

        for log_line in picked_content_provider.fetch(
            spec, checkout_path, yield_output=self.json_logs
        ):
1 change: 1 addition & 0 deletions repo2docker/contentproviders/__init__.py
@@ -5,3 +5,4 @@
from .dataverse import Dataverse
from .hydroshare import Hydroshare
from .mercurial import Mercurial
from .swhid import Swhid
113 changes: 113 additions & 0 deletions repo2docker/contentproviders/swhid.py
@@ -0,0 +1,113 @@
import io
import os
import shutil
import tarfile
import time
import re

from os import path

import requests
Review comment from a project member:
We currently don't depend on requests and have used the standard library urllib to make HTTP requests. We should take a moment to review whether we want to take on the additional maintenance cost of a new dependency or stick with urllib.

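For reference, a rough sketch of what the same kind of call could look like with only the standard library, as the comment suggests (hypothetical helper, not part of this PR):

import json
import urllib.request


def swh_api_json(url, method="GET", token=None):
    # Hypothetical urllib-based equivalent of the requests calls used below.
    headers = {"User-Agent": "repo2docker"}
    if token:
        headers["Authorization"] = "Bearer {}".format(token)
    req = urllib.request.Request(url, headers=headers, method=method)
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode("utf-8"))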

from .base import ContentProvider
from ..utils import copytree
from .. import __version__


def parse_swhid(swhid):
    swhid_regexp = r"^swh:(?P<version>\d+):(?P<type>ori|cnt|rev|dir|snp|rel):(?P<hash>[0-9a-f]{40})$"
    # only parse/check the <identifier_core> of the swhid
    # see https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
    m = re.match(swhid_regexp, swhid.split(";")[0])
    if m:
        return m.groupdict()


class Swhid(ContentProvider):
    """Provide contents of a repository identified by a SWHID."""

    retry_delay = 5

    def __init__(self):
        self.swhid = None
        self.base_url = "https://archive.softwareheritage.org/api/1"
        self.session = requests.Session()
        self.session.headers.update(
            {
                "user-agent": "repo2docker {}".format(__version__),
            }
        )

    def set_auth_token(self, token):
        header = {"Authorization": "Bearer {}".format(token)}
        self.session.headers.update(header)

    def _request(self, url, method="GET"):
        if not url.endswith("/"):
            url = url + "/"

        for retries in range(3):
            try:
                resp = self.session.request(method, url)
                if resp.ok:
                    break
            except requests.ConnectionError:
                time.sleep(self.retry_delay)

        return resp

    @property
    def content_id(self):
        """The SWHID record ID used for content retrieval"""
        return self.swhid

    def detect(self, swhid, ref=None, extra_args=None):
        swhid_dict = parse_swhid(swhid)

        if (
            swhid_dict
            and swhid_dict["type"] in ("dir", "rev")
            and swhid_dict["version"] == "1"
        ):
            return {"swhid": swhid, "swhid_obj": swhid_dict}

    def fetch_directory(self, dir_hash, output_dir):
        url = "{}/vault/directory/{}/".format(self.base_url, dir_hash)
        yield "Fetching directory {} from {}\n".format(dir_hash, url)
        resp = self._request(url, "POST")
        receipt = resp.json()
        status = receipt["status"]
        assert status != "failed", receipt
        while status not in ("failed", "done"):
            time.sleep(self.retry_delay)
            resp = self._request(url)
            status = resp.json()["status"]
        if status == "failed":
            yield "Error preparing the directory for download"
            raise Exception()
        resp = self._request(resp.json()["fetch_url"])
        archive = tarfile.open(fileobj=io.BytesIO(resp.content))
        archive.extractall(path=output_dir)
        # the output_dir should have only one subdir named after the dir_hash
        # move its content one level up
        copytree(path.join(output_dir, dir_hash), output_dir)
        shutil.rmtree(path.join(output_dir, dir_hash))
        yield "Fetched files: {}\n".format(os.listdir(output_dir))

    def fetch(self, spec, output_dir, yield_output=False):
        swhid = spec["swhid"]
        swhid_obj = spec["swhid_obj"]

        if swhid_obj["type"] == "rev":
            # need to get the directory for this revision
            sha1git = swhid_obj["hash"]
            url = "{}/revision/{}/".format(self.base_url, sha1git)
            yield "Fetching revision {} from {}\n".format(sha1git, url)
            resp = self._request(url)
            assert resp.ok, (resp.content, self.session.headers)
            directory = resp.json()["directory"]
            self.swhid = "swh:1:dir:{}".format(directory)
            yield from self.fetch_directory(directory, output_dir)
        elif swhid_obj["type"] == "dir":
            self.swhid = swhid
            yield from self.fetch_directory(swhid_obj["hash"], output_dir)
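
For illustration, detect() on the revision SWHID used in the commit message returns a spec of this shape (the fields are the named groups of the parse_swhid regexp):

provider = Swhid()
spec = provider.detect("swh:1:rev:94dca98c006b80309704c717b5d83dff3c1fa3a0")
# spec == {
#     "swhid": "swh:1:rev:94dca98c006b80309704c717b5d83dff3c1fa3a0",
#     "swhid_obj": {
#         "version": "1",
#         "type": "rev",
#         "hash": "94dca98c006b80309704c717b5d83dff3c1fa3a0",
#     },
# }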
1 change: 1 addition & 0 deletions setup.py
@@ -56,6 +57,7 @@ def get_identifier(json):
        "ruamel.yaml>=0.15",
        "toml",
        "semver",
        "requests",
    ],
    python_requires=">=3.6",
    author="Project Jupyter Contributors",
157 changes: 157 additions & 0 deletions tests/unit/contentproviders/test_swhid.py
@@ -0,0 +1,157 @@
import json
import os
import io
import tarfile
import shutil
import re
import urllib
import pytest
import tempfile
import logging
import requests_mock

from os import makedirs
from os.path import join
from unittest.mock import patch, MagicMock, mock_open
from zipfile import ZipFile

from repo2docker.contentproviders.swhid import Swhid, parse_swhid
from repo2docker.contentproviders.base import ContentProviderException


# this is a slightly stripped down copy of swh.model.cli.swhid_of_dir().
# We do not import it from swh.model to avoid having to depend on swh.model[cli]
def swhid_of_dir(path):
    object = Directory.from_disk(path=path).get_data()
    return swhid(DIRECTORY, object)


def test_content_id():
    swhid = Swhid()
    assert swhid.content_id is None


swhids_ok = [
    "swh:1:dir:" + "0" * 40,
    "swh:1:rev:" + "0" * 40,
]
swhids_invalid = [
    "swh:1:dir:" + "0" * 39,
    "swh:2:dir:" + "0" * 40,
    "swh:1:rev:" + "0" * 41,
    "swh:1:cnt:" + "0" * 40,
    "swh:1:ori:" + "0" * 40,
    "swh:1:rel:" + "0" * 40,
    "swh:1:snp:" + "0" * 40,
]

detect_values = [
    (swhid, {"swhid": swhid, "swhid_obj": parse_swhid(swhid)}) for swhid in swhids_ok
] + [(swhid, None) for swhid in swhids_invalid]


@pytest.mark.parametrize("swhid, expected", detect_values)
def test_detect(swhid, expected):
    provider = Swhid()
    assert provider.detect(swhid) == expected


def fake_urlopen(req):
    print(req)
    return req.headers


def test_unresolving_swhid():
    provider = Swhid()

    # swhid = "0" * 40
    # assert provider.swhid2url(swhid) is swhid


NULLID = "0" * 40


@pytest.fixture
def gen_tarfile(tmpdir):
    rootdir = join(tmpdir, "tmp")
    makedirs(rootdir)
    with open(join(rootdir, "file1.txt"), "wb") as fobj:
        fobj.write(b"Some content\n")

    # this directory hash can be computed using the swh.model package, but we
    # do not want to depend on it here, to limit dependencies and because it
    # does not support python 3.6
    dirhash = "89a3bd29a2c5ae0b1465febbe5df09730a8576fe"
    buf = io.BytesIO()
    tarf = tarfile.open(name=dirhash, fileobj=buf, mode="w")
    tarf.add(rootdir, arcname=dirhash)
    tarf.close()
    shutil.rmtree(rootdir)
    return dirhash, buf.getvalue()


def mocked_provider(tmpdir, dirhash, tarfile_buf):
    provider = Swhid()
    adapter = requests_mock.Adapter()
    provider.base_url = "mock://api/1"
    provider.retry_delay = 0.1
    provider.session.mount("mock://", adapter)

    adapter.register_uri(
        "GET",
        "mock://api/1/revision/{}/".format(NULLID),
        json={
            "author": {"fullname": "John Doe <[email protected]>"},
            "directory": dirhash,
        },
    )
    adapter.register_uri(
        "POST",
        "mock://api/1/vault/directory/{}/".format(dirhash),
        json={
            "fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
            "status": "new",
        },
    )
    adapter.register_uri(
        "GET",
        "mock://api/1/vault/directory/{}/".format(dirhash),
        [
            {
                "json": {
                    "fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
                    "status": "pending",
                }
            },
            {
                "json": {
                    "fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
                    "status": "done",
                }
            },
        ],
    )
    adapter.register_uri(
        "GET",
        "mock://api/1/vault/directory/{}/raw/".format(dirhash),
        content=tarfile_buf,
    )
    return provider


def test_fetch_revision(tmpdir, gen_tarfile):
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    swhid = "swh:1:rev:" + NULLID
    for log in provider.fetch(provider.detect(swhid), tmpdir):
        print(log)
    assert provider.content_id == "swh:1:dir:" + dir_id


def test_fetch_directory(tmpdir, gen_tarfile):
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    swhid = "swh:1:dir:" + dir_id
    for log in provider.fetch(provider.detect(swhid), tmpdir):
        print(log)
    assert provider.content_id == swhid
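
Assuming a standard pytest setup with the requests_mock package used above installed, these tests should run on their own with something like:

  python -m pytest tests/unit/contentproviders/test_swhid.py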