Skip to content

Commit fa183a4

Browse files
committed
add script for pruning stale source archives
The script will iterate over stored source archives and attempt to locate the most recent request for them via the API. Archives determined to be stale will be deleted. STONEBLD-1990 Signed-off-by: Taylor Madore <[email protected]>
1 parent d2cd999 commit fa183a4

File tree

3 files changed

+560
-0
lines changed

3 files changed

+560
-0
lines changed

cachito/workers/prune_archives.py

+217
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
import logging
2+
import re
3+
from dataclasses import dataclass
4+
from datetime import datetime, timedelta, timezone
5+
from itertools import islice
6+
from pathlib import Path
7+
from typing import Annotated, Any, Generator, NamedTuple, Optional
8+
9+
import requests
10+
import typer
11+
from ratelimit import limits, sleep_and_retry
12+
13+
from cachito.errors import NetworkError
14+
from cachito.workers.config import get_worker_config
15+
from cachito.workers.requests import get_requests_session
16+
17+
# Typer application object; the callback and commands below register themselves on it.
app = typer.Typer()
config = get_worker_config()
log = logging.getLogger(__name__)
session = get_requests_session()

# Root directory that is searched for source archives.
ARCHIVE_DIR = Path(config.cachito_sources_dir)
# Archive file names: 40 lowercase hex characters, an optional "-with-submodules"
# suffix, then ".tar.gz".
ARCHIVE_PATTERN = re.compile(r"^[a-f0-9]{40}(-with-submodules)?\.tar\.gz$")
# Both cut-off datetimes are timezone-aware (UTC) and are computed once, when the
# module is imported.
DEFAULT_AGE_DATETIME = datetime.now(timezone.utc) - timedelta(
    days=config.cachito_archives_default_age_days
)
MINIMUM_AGE_DATETIME = datetime.now(timezone.utc) - timedelta(
    days=config.cachito_archives_minimum_age_days
)
# Format applied to the stream handler that configure_logging() installs.
LOG_FORMAT = "%(asctime)s %(levelname)s %(message)s"
31+
32+
33+
@dataclass(frozen=True)
class _ParsedArchive:
    """
    A source archive discovered on the filesystem.

    The repository name and the ref are derived purely from where the archive is
    stored and how the file is named.
    """

    # Location of the archive file.
    path: Path
    # Directory containing the archive, relative to ARCHIVE_DIR, in POSIX form.
    repo_name: str
    # First 40 characters of the archive file name.
    ref: str

    @classmethod
    def from_path(cls, path: Path) -> "_ParsedArchive":
        """
        Build a _ParsedArchive from the location of an archive file.

        :param path: path of an archive file stored below ARCHIVE_DIR
        :return: the parsed archive
        :raises ValueError: if the parent directory of path is not located inside ARCHIVE_DIR
        """
        containing_dir = path.parent
        return cls(
            path=path,
            repo_name=containing_dir.relative_to(ARCHIVE_DIR).as_posix(),
            ref=path.name[:40],
        )
46+
47+
48+
class _ResolvedArchive(NamedTuple):
49+
"""A source archive matched to the most recent request for it."""
50+
51+
path: Path
52+
created: datetime
53+
latest_request_id: int
54+
55+
56+
def _get_parsed_source_archives(archive_dir: Path) -> Generator[_ParsedArchive, None, None]:
    """
    Yield a _ParsedArchive for each source archive found below archive_dir.

    The directory is searched recursively for ``*.tar.gz`` entries. An archive file
    name should match <sha1 hash>-<(optional)with-submodules>.tar.gz; an entry that is
    not a file, or whose name does not match, is skipped with a debug message.

    :param archive_dir: the directory to search
    """
    for candidate in archive_dir.rglob("*.tar.gz"):
        if not candidate.is_file() or ARCHIVE_PATTERN.match(candidate.name) is None:
            log.debug("%s does not appear to be a source archive.", candidate)
            continue
        yield _ParsedArchive.from_path(candidate)
68+
69+
70+
def _resolve_source_archive(parsed_archive: _ParsedArchive) -> Optional[_ResolvedArchive]:
    """
    Return a _ResolvedArchive if a matching request is found via the API.

    :param parsed_archive: the archive to look up
    :return: the archive together with the creation time and ID of its latest request,
        or None if no matching request was found
    :raises ValueError: if the "created" value of the request cannot be parsed
    """
    latest_request = _get_latest_request(parsed_archive)
    if latest_request is None:
        log.debug("Archive %s could not be resolved via the API.", parsed_archive.path)
        return None

    created_text = latest_request["created"]
    try:
        created = datetime.strptime(created_text, "%Y-%m-%dT%H:%M:%S.%f")
    except ValueError:
        # An ISO 8601 timestamp may carry no fractional seconds (datetime.isoformat()
        # omits them when microsecond == 0). Retry without "%f" rather than failing;
        # anything else that is malformed still raises ValueError from this call.
        created = datetime.strptime(created_text, "%Y-%m-%dT%H:%M:%S")

    # The parsed value is naive and is labelled as UTC without shifting it.
    # NOTE(review): presumably the API reports "created" in UTC - confirm.
    return _ResolvedArchive(
        parsed_archive.path,
        created.replace(tzinfo=timezone.utc),
        latest_request["id"],
    )
84+
85+
86+
def _get_stale_archives(
    older_than: datetime, api_calls_per_second: int
) -> Generator[_ResolvedArchive, None, None]:
    """
    Yield every resolved source archive whose latest request predates older_than.

    Each archive found below ARCHIVE_DIR is resolved through the API. The lookups are
    ratelimited so that this background maintenance task does not overwhelm the API.
    Archives that cannot be resolved are skipped.

    :param older_than: archives whose latest request was created before this are stale
    :param api_calls_per_second: the maximum number of lookups per one-second period
    """

    @sleep_and_retry
    @limits(calls=api_calls_per_second, period=1)
    def resolve_throttled(candidate: _ParsedArchive) -> Optional[_ResolvedArchive]:
        return _resolve_source_archive(candidate)

    for candidate in _get_parsed_source_archives(ARCHIVE_DIR):
        resolved = resolve_throttled(candidate)
        if not resolved:
            continue
        if resolved.created < older_than:
            yield resolved
105+
106+
107+
def _get_latest_request(archive: _ParsedArchive) -> Optional[dict[str, Any]]:
    """
    Find the latest request matching the _ParsedArchive via the API.

    :param archive: the archive whose repo_name and ref are sent as query parameters
    :return: the decoded JSON body of the response, or None if the API answers with
        a 404 (no matching request is found)
    :raises NetworkError: if the API answers with any other error status or the
        request fails
    """
    url = f"{config.cachito_api_url.rstrip('/')}/requests/latest"
    params = {
        "repo_name": archive.repo_name,
        "ref": archive.ref,
    }

    try:
        response = session.get(url, params=params, timeout=config.cachito_api_timeout)
        response.raise_for_status()
    except requests.HTTPError as error:
        # A 404 is the expected answer for an archive without a matching request.
        if response.status_code == 404:
            return None
        log.error(
            "The request to %s failed with the status code %d and the following text: %s",
            url,
            response.status_code,
            response.text,
        )
        # Chain the original error so the traceback shows what actually failed.
        raise NetworkError("Failed to query the cachito API") from error
    except requests.RequestException as error:
        msg = f"The connection failed when querying {url}"
        log.exception(msg)
        raise NetworkError(msg) from error

    return response.json()
138+
139+
140+
@app.callback()
def configure_logging(verbose: bool = False):
    """Configure logging for the app."""
    # The docstring above is left untouched: Typer presumably surfaces it as help text.
    # Only this module's logger is configured: it gets a stream handler using
    # LOG_FORMAT and a level of DEBUG when verbose is set, INFO otherwise.
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter(LOG_FORMAT))
    log.addHandler(stream_handler)
    log.setLevel(logging.DEBUG if verbose else logging.INFO)
148+
149+
150+
def _validate_older_than(older_than: datetime) -> datetime:
    """
    Ensure that the value of the --older-than CLI option is not more recent than the minimum.

    :param older_than: the option value; its tzinfo is set to UTC without shifting the
        clock time
    :return: the value as a UTC datetime
    :raises typer.BadParameter: if the value is more recent than MINIMUM_AGE_DATETIME
    """
    as_utc = older_than.replace(tzinfo=timezone.utc)
    if as_utc <= MINIMUM_AGE_DATETIME:
        return as_utc
    raise typer.BadParameter(f"cannot be more recent than {MINIMUM_AGE_DATETIME}")
156+
157+
158+
@app.command()
def delete(
    older_than: Annotated[
        datetime,
        typer.Option(
            callback=_validate_older_than,
            formats=["%Y-%m-%d"],
            help="Deletes archives that are older than the specified date (UTC). YYYY-MM-DD",
        ),
    ] = DEFAULT_AGE_DATETIME,
    api_calls_per_second: Annotated[
        int, typer.Option(min=1, max=5, help="The API requests-per-second limit.")
    ] = 1,
    limit: Annotated[
        Optional[int], typer.Option(min=1, help="The maximum number of stale archives to process.")
    ] = None,
    execute: Annotated[bool, typer.Option(help="Actual deletion will only occur if True.")] = False,
):
    """
    List and delete stale source archives.

    Actual deletion will not occur unless the --execute option is included.
    """
    # islice() with a stop of None imposes no cap, so every stale archive is
    # processed when --limit is not given.
    for archive in islice(_get_stale_archives(older_than, api_calls_per_second), limit):
        log.info(
            f"Archive {archive.path} is stale. The most recent request_id="
            f"{archive.latest_request_id} at {archive.created}"
        )
        if execute:
            # Deleting is the expected outcome of this command, so it is reported at
            # INFO like the message above rather than as an error.
            log.info(f"Deleting {archive.path}")
            archive.path.unlink()
189+
190+
191+
@app.command()
def list(
    older_than: Annotated[
        datetime,
        typer.Option(
            callback=_validate_older_than,
            formats=["%Y-%m-%d"],
            help="Lists archives that are older than the specified date (UTC). YYYY-MM-DD",
        ),
    ] = DEFAULT_AGE_DATETIME,
    api_calls_per_second: Annotated[
        int, typer.Option(min=1, max=5, help="The API requests-per-second limit.")
    ] = 1,
    limit: Annotated[
        Optional[int], typer.Option(min=1, help="The maximum number of stale archives to process.")
    ] = None,
):
    """List stale source archives."""
    # NOTE(review): this function shadows the built-in ``list`` for the rest of the
    # module; the name is kept because the CLI command name is presumably derived
    # from it - confirm before renaming.
    stale_archives = _get_stale_archives(older_than, api_calls_per_second)
    # A limit of None means islice() does not cap the number of archives reported.
    for archive in islice(stale_archives, limit):
        log.info(
            f"Archive {archive.path} is stale. The most recent request_id="
            f"{archive.latest_request_id} at {archive.created}"
        )
214+
215+
216+
# Invoke the Typer application when this module is executed as a script.
if __name__ == "__main__":
    app()

setup.py

+1
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
"console_scripts": [
4545
"cachito=cachito.web.manage:cli",
4646
"cachito-cleanup=cachito.workers.cleanup_job:main",
47+
"cachito-prune-archives=cachito.workers.prune_archives:app",
4748
"cachito-update-nexus-scripts=cachito.workers.nexus:create_or_update_scripts",
4849
]
4950
},

0 commit comments

Comments
 (0)