Skip to content

Commit 94e7279

Browse files
committed
Merge remote-tracking branch 'origin/main' into process-isa-json-after-biosamples
2 parents 6a91ee0 + 4135a56 commit 94e7279

File tree

8 files changed

+5531
-8
lines changed

8 files changed

+5531
-8
lines changed

mars-cli/generate_config.py

+2
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,10 @@ def create_settings_file(settings_dir):
3232
config["ena"] = {
3333
"development-url": "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2/",
3434
"development-submission-url": "https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit/?auth=ENA",
35+
"development-data-submission-url": "webin2.ebi.ac.uk",
3536
"production-url": "https://www.ebi.ac.uk/ena/submit/webin-v2/",
3637
"production-submission-url": "https://www.ebi.ac.uk/ena/submit/drop-box/submit/?auth=ENA",
38+
"production-data-submission-url": "webin2.ebi.ac.uk",
3739
}
3840

3941
config["biosamples"] = {

mars-cli/mars_cli.py

+34
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,11 @@
6363
"development-submission-url",
6464
fallback="https://wwwdev.ebi.ac.uk/biosamples/samples/submit",
6565
),
66+
"DATA-SUBMISSION": config.get(
67+
"ena",
68+
"development-data-submission-url",
69+
fallback="webin2.ebi.ac.uk",
70+
),
6671
},
6772
"WEBIN": {
6873
"SERVICE": config.get(
@@ -101,6 +106,11 @@
101106
"production-submission-url",
102107
fallback="https://www.ebi.ac.uk/ena/submit/drop-box/submit/?auth=ENA",
103108
),
109+
"DATA-SUBMISSION": config.get(
110+
"ena",
111+
"development-data-submission-url",
112+
fallback="webin2.ebi.ac.uk",
113+
),
104114
},
105115
"WEBIN": {
106116
"SERVICE": config.get(
@@ -173,6 +183,23 @@ def cli(ctx, development):
173183
help="Submit to BioSamples.",
174184
)
175185
@click.option("--submit-to-ena", type=click.BOOL, default=True, help="Submit to ENA.")
186+
@click.option(
187+
"--file-transfer",
188+
type=click.STRING,
189+
help="provide the name of a file transfer solution, like ftp or aspera",
190+
)
191+
@click.option(
192+
"--data-files",
193+
type=click.File("r"),
194+
multiple=True,
195+
help="Path of files to upload",
196+
)
197+
# @click.option(
198+
# "--data-submit-to-ena",
199+
# type=click.BOOL,
200+
# default=False,
201+
# help="Submit data files to ENA.",
202+
# )
176203
@click.option(
177204
"--submit-to-metabolights",
178205
type=click.BOOL,
@@ -196,6 +223,8 @@ def submit(
196223
submit_to_ena,
197224
submit_to_metabolights,
198225
investigation_is_root,
226+
file_transfer,
227+
data_files,
199228
):
200229
"""Start a submission to the target repositories."""
201230
target_repositories = []
@@ -214,6 +243,9 @@ def submit(
214243
)
215244

216245
urls_dict = ctx.obj["FILTERED_URLS"]
246+
247+
data_file_paths = [f.name for f in data_files] if file_transfer else []
248+
217249
try:
218250
submission(
219251
credential_service_name,
@@ -223,6 +255,8 @@ def submit(
223255
target_repositories,
224256
investigation_is_root,
225257
urls_dict,
258+
file_transfer,
259+
data_file_paths,
226260
)
227261
except requests.RequestException as err:
228262
tb = sys.exc_info()[2] # Traceback value

mars-cli/mars_lib/ftp_upload.py

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import ftplib
2+
import os
3+
from pathlib import Path
4+
from typing import List
5+
6+
from retry import retry
7+
from mars_lib.logging import print_and_log
8+
9+
10+
class PatchFTP_TLS(ftplib.FTP_TLS):
    """
    Explicit FTPS client whose data connections share the control channel's TLS session.

    Adapted from https://stackoverflow.com/questions/14659154/ftpes-session-reuse-required
    as a workaround for a bug in the Python standard library:
    https://bugs.python.org/issue19500
    """

    def ntransfercmd(self, cmd, rest=None):
        """Open the data connection for ``cmd`` and, if protected mode is on, wrap it in TLS.

        Returns the same ``(socket, expected_size)`` pair as ``ftplib.FTP.ntransfercmd``.
        """
        # Deliberately call the plain FTP implementation (not FTP_TLS) so the
        # TLS wrapping below is the only one applied.
        data_sock, expected_size = ftplib.FTP.ntransfercmd(self, cmd, rest)
        if not self._prot_p:
            # Data channel protection is off: hand back the raw socket.
            return data_sock, expected_size

        # The fix: pass the control connection's session so it is reused.
        secured_sock = self.context.wrap_socket(
            data_sock,
            server_hostname=self.host,
            session=self.sock.session,
        )
        return secured_sock, expected_size
24+
25+
26+
class FTPUploader:
    """Upload local files to an FTP server over explicit FTPS.

    The host and credentials given at construction time are used for every
    call to :meth:`upload`; a fresh connection is opened per attempt.
    """

    # Timeout heuristic: assume roughly 10 MB/s (10,000,000 bytes per second)
    # for the largest file, clamped to [30 s, 1 h].
    _ASSUMED_BYTES_PER_SECOND = 10_000_000
    _MIN_TIMEOUT_SECONDS = 30
    _MAX_TIMEOUT_SECONDS = 3600

    def __init__(self, ftp_host: str, username: str, password: str):
        self.ftp_host = ftp_host
        self.username = username
        self.password = password

    def upload(self, file_paths: List[Path], target_location: str = "/") -> bool:
        """Upload ``file_paths`` into ``target_location`` on the FTP server.

        Files whose name already exists remotely with the same size are skipped.

        Args:
            file_paths: Local files to upload (``Path`` objects or path strings).
            target_location: Remote directory to change into before uploading.

        Returns:
            True once every file has been uploaded or skipped. An empty
            ``file_paths`` is a no-op that also returns True.

        Raises:
            OSError: If a local file cannot be stat'ed (e.g. it does not exist).
                This is raised immediately, without any retry.
            ftplib.all_errors: If the transfer still fails after the retries.
        """
        local_paths = [Path(path) for path in file_paths]
        if not local_paths:
            # Nothing to do; avoids max() failing on an empty sequence.
            return True

        # Sizes are read once, outside the retried section: a missing local
        # file will not fix itself, so retrying it only wastes time.
        local_sizes = [path.stat().st_size for path in local_paths]
        estimated_seconds = max(local_sizes) // self._ASSUMED_BYTES_PER_SECOND
        timeout = min(
            max(estimated_seconds, self._MIN_TIMEOUT_SECONDS),
            self._MAX_TIMEOUT_SECONDS,
        )
        return self._transfer(
            list(zip(local_paths, local_sizes)), target_location, timeout
        )

    @retry(exceptions=ftplib.all_errors, tries=3, delay=2, backoff=1.2, jitter=(1, 3))
    def _transfer(self, files, target_location: str, timeout: int) -> bool:
        """Connect, log in and upload ``files``; retried on any ftplib error.

        ``files`` is a list of ``(local_path, local_size)`` pairs and
        ``timeout`` is the socket timeout in seconds passed to ``connect``.
        """
        with PatchFTP_TLS() as ftps:
            ftps.context.set_ciphers("HIGH:!DH:!aNULL")
            ftps.connect(self.ftp_host, port=21, timeout=timeout)
            ftps.login(self.username, self.password)
            ftps.prot_p()

            ftps.cwd(target_location)
            # Some servers refuse SIZE in ASCII mode; switch to binary first
            # (storbinary issues the same command before STOR anyway).
            ftps.voidcmd("TYPE I")
            previous_content = set(ftps.nlst())
            for local_path, local_size in files:
                file_name = local_path.name
                if file_name in previous_content and ftps.size(file_name) == local_size:
                    print_and_log(
                        f"{file_name} already exists and has the same size on the FTP, skipping"
                    )
                    continue
                print_and_log(f"Uploading {file_name} to FTP")
                with open(local_path, "rb") as open_file:
                    ftps.storbinary(f"STOR {file_name}", open_file)

        return True

mars-cli/mars_lib/submit.py

+37-1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
from mars_lib.logging import print_and_log
2525
from pydantic import ValidationError
2626

27+
from mars_lib.ftp_upload import FTPUploader
28+
from pathlib import Path
29+
from typing import List
30+
2731

2832
def save_step_to_file(time_stamp: float, filename: str, isa_json: IsaJson):
2933
dir_path = f"tmp/{str(time_stamp)}"
@@ -44,6 +48,8 @@ def submission(
4448
target_repositories: list[str],
4549
investigation_is_root: bool,
4650
urls: dict[str, Any],
51+
file_transfer: str,
52+
data_file_paths=None,
4753
):
4854
# If credential manager info found:
4955
# Get password from the credential manager
@@ -82,7 +88,17 @@ def submission(
8288
):
8389
raise ValueError("No target repository selected.")
8490

85-
if TargetRepository.BIOSAMPLES in target_repositories:
91+
if (
92+
TargetRepository.ENA in target_repositories
93+
and data_file_paths
94+
and file_transfer
95+
):
96+
upload_to_ena(
97+
file_paths=data_file_paths,
98+
user_credentials=user_credentials,
99+
submission_url=urls["ENA"]["DATA-SUBMISSION"],
100+
file_transfer=file_transfer,
101+
)
86102
# Submit to Biosamples
87103
biosamples_result = submit_to_biosamples(
88104
isa_json=isa_json,
@@ -202,6 +218,26 @@ def submit_to_ena(
202218
return result
203219

204220

221+
def upload_to_ena(
    file_paths: List[Path],
    user_credentials: dict[str, str],
    submission_url: str,
    file_transfer: str,
):
    """Upload data files to ENA with the requested file transfer solution.

    Args:
        file_paths: Local data files to upload.
        user_credentials: Mapping providing the "username" and "password" keys.
        submission_url: Host of the ENA data submission server.
        file_transfer: Name of the transfer solution (case-insensitive).
            "ftp" and "aspera" are accepted, but only "ftp" performs an upload.

    Raises:
        ValueError: If ``file_transfer`` is not an accepted transfer solution.
    """
    ALLOWED_FILE_TRANSFER_SOLUTIONS = {"ftp", "aspera"}
    file_transfer = file_transfer.lower()

    if file_transfer not in ALLOWED_FILE_TRANSFER_SOLUTIONS:
        raise ValueError(f"Unsupported transfer protocol: {file_transfer}")

    if file_transfer == "ftp":
        uploader = FTPUploader(
            submission_url,
            user_credentials["username"],
            user_credentials["password"],
        )
        uploader.upload(file_paths)
    else:
        # Accepted but not implemented yet: say so instead of silently
        # returning as if the files had been transferred.
        print_and_log(
            f"File transfer via {file_transfer} is not implemented yet; "
            "no data files were uploaded"
        )
239+
240+
205241
def create_external_references(
206242
biosamples_credentials: dict[str, str],
207243
biosamples_externalReferences: dict[str, Any],

mars-cli/requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ jsonschema
33
keyring
44
pydantic
55
click
6+
retry

mars-cli/tests/test_ftp_upload.py

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import json
2+
3+
import pytest
4+
from pathlib import Path
5+
import ftplib
6+
7+
from mars_lib.ftp_upload import FTPUploader
8+
9+
10+
def test_upload_login_failure():
    """Uploading with bogus credentials must raise the FTP "530 Login incorrect." error.

    NOTE: this connects to the host named below, so it needs network access.
    """
    sample_file = Path("./tests/fixtures/not_a_json_file.txt")
    ftp_client = FTPUploader("webin2.ebi.ac.uk", "junk", "more junk")
    with pytest.raises(ftplib.error_perm, match="530 Login incorrect."):
        ftp_client.upload([sample_file])
14+
15+
16+
@pytest.mark.skip(reason="Relies on real ENA credentials in test_credentials_example.json")
def test_upload_success():
    """Upload two fixture files with real credentials (skipped by default)."""
    # For local testing, add ENA username/password to test_credentials_example.json
    with open("./tests/test_credentials_example.json") as credentials_file:
        credentials = json.load(credentials_file)

    files_to_send = [
        Path("../test-data/ENA_TEST2.R1.fastq.gz"),
        Path("./tests/fixtures/not_a_json_file.txt"),
    ]
    ftp_client = FTPUploader(
        "webin2.ebi.ac.uk", credentials["username"], credentials["password"]
    )
    ftp_client.upload(files_to_send)

repository-services/repository-api.md

+52-7
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
This document is to define the interface between the broker and the target repository services.
33
This applies to all repositories, including BioSamples.
44

5-
At present, only a single API endpoint is required, `submit`. Authentication and data transfer are not covered in this document, but some assumptions are laid out below.
5+
There is one required endpoint, `submit`, as well as a submission status endpoint recommended for long-running submission processing. Authentication and data transfer are not covered in this document, but some assumptions are laid out below.
66

77
## Authentication
88
If the repository requires authentication to submit data, the submit endpoint must allow authentication via an authorization header.
@@ -35,24 +35,33 @@ The response must be JSON in the following format:
3535
"errors": [
3636
// error objects
3737
],
38+
"status": {
39+
// status object
40+
},
3841
"info": [
3942
// info objects
4043
]
4144
}
4245
```
4346
where:
4447
* `targetRepository` is the identifier used to annotate the ISA-JSON and should take values from [identifiers.org](http://identifiers.org/)
45-
* Either [`accessions`](#accession-object) OR [`errors`](#error-object), but not both, must be present as a list of objects of the form described below. Presence of this field indicates whether the submission was a success or a failure.
46-
* (optional) [`info`](#info-object) is a list of objects of the form described below. This allows additional repository-specific information to be returned in the response.
48+
* Exactly one of the following:
49+
* `accessions`: list of objects defined [here](#accession-object)
50+
* `errors`: list of objects defined [here](#error-object)
51+
* `status`: object defined [here](#status-object)
52+
* Presence of `accessions`, `errors`, or `status` indicates whether the submission was a success, was a failure, or is still pending (asynchronous response), respectively.
53+
* (optional) `info` is a list of objects of the form described [below](#info-object). This allows additional repository-specific information to be returned in the response.
54+
55+
This object is frequently referred to as the "receipt" or the "MARS receipt".
4756

4857
#### Accession object
4958
The accession object looks like the following:
5059
```jsonc
5160
{
5261
"path": [
53-
{"key": "studies", "where": {"key": "X", "value": "Y"}},
54-
{"key": "materials"}
55-
// further path objects as needed
62+
{"key": "studies", "where": {"key": "X", "value": "Y"}},
63+
{"key": "materials"}
64+
// further path objects as needed
5665
],
5766
"value": "REPO_123"
5867
}
@@ -88,6 +97,20 @@ The error objects being returned by the repository may be used by developers to
8897

8998
Besides this error reporting, the service should employ other HTTP error codes as usual (e.g. 401).
9099

100+
#### Status object
101+
The status object looks like the following:
102+
```jsonc
103+
{
104+
"statusUrl": "...",
105+
"id": "...",
106+
"percentComplete": 0.25
107+
}
108+
```
109+
where:
110+
* `statusUrl` is a URL that can be queried to determine the completion status of the submission (see [status endpoint](#submission-status-endpoint) section below)
111+
* (optional) `id` is an identifier for the submission
112+
* (optional) `percentComplete` is a number between 0 and 1 indicating the approximate fraction of the repository's processing that is complete (e.g. 0.25 means 25% complete)
113+
91114
#### Info object
92115
The info object looks like the following:
93116
```jsonc
@@ -96,10 +119,21 @@ The info object looks like the following:
96119
"message": "..."
97120
}
98121
```
99-
where `name` and `message` are strings at the repository’s discretion.
122+
where `name` (optional) and `message` are strings at the repository’s discretion.
100123

101124
This can be used to provide any additional information back to the user, not relating to accessions or errors. For example, it could include the submission date and when the data will be made public. This will not be processed further by the broker but will only be presented to the user.
102125

126+
## Submission status endpoint
127+
`GET /{submission_id}/status`
128+
129+
(The endpoint path is only a suggestion; the actual path can differ as long as it is accurately returned in the `statusUrl` field of the receipt's `status` object.)
130+
131+
This endpoint is used to poll for the status of a previous submission. It should be used whenever the time from data and metadata submission until the issuing of accessions exceeds a reasonable duration, and its URL must be returned in the `statusUrl` field of the receipt's `status` object.
132+
133+
### Response
134+
135+
The response must be in the same format as for the submit endpoint (i.e. the [MARS receipt](#response)), again indicating whether the submission is complete and successful, complete with errors, or still pending.
136+
103137
## Examples
104138

105139
### Submission request
@@ -219,3 +253,14 @@ For illustration only.
219253
]
220254
}
221255
```
256+
257+
### Status response
258+
```json
259+
{
260+
"targetRepository": "eva",
261+
"status": {
262+
"id": "123-456",
263+
"statusUrl": "https://ebi.ac.uk/eva/submission/123-456/status"
264+
}
265+
}
266+
```

0 commit comments

Comments
 (0)