
Commit c7aabe4

Implement storage- and relationship-aware cleanup (#973)
1 parent 96774a8 commit c7aabe4

23 files changed: +1945 -690 lines
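The heart of the change is the new `services/cleanup` package: given a seed `QuerySet`, it builds the graph of related models, deletes rows in topological order, and removes any storage files those rows reference. As a rough sketch of the intended call shape (the `Repository` seed below is an illustrative assumption; this commit itself only wires up owner cleanup):

# Hypothetical driver code, not part of this commit:
from shared.django_apps.core.models import Repository

from services.cleanup.cleanup import run_cleanup

# Deletes every row reachable from this repository (commits, reports,
# uploads, ...) and removes their files from archive storage.
summary = run_cleanup(Repository.objects.filter(repoid=1234))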

conftest.py (+9 -5)

@@ -278,13 +278,17 @@ def mock_storage(mocker):
 
 @pytest.fixture
 def mock_archive_storage(mocker):
-    m = mocker.patch("shared.api_archive.archive.StorageService")
-    use_archive = mocker.patch(
-        "shared.django_apps.core.models.should_write_data_to_storage_config_check"
+    mocker.patch(
+        "shared.django_apps.core.models.should_write_data_to_storage_config_check",
+        return_value=True,
     )
-    use_archive.return_value = True
     storage_server = MemoryStorageService({})
-    m.return_value = storage_server
+    mocker.patch(
+        "shared.api_archive.archive.StorageService", return_value=storage_server
+    )
+    mocker.patch(
+        "shared.storage.get_appropriate_storage_service", return_value=storage_server
+    )
     return storage_server
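The fixture now also patches `get_appropriate_storage_service`, so the archive layer and the new cleanup code share one in-memory storage. A sketch of a test leaning on that (the test body and the `repository` fixture are hypothetical, and `write_file` is assumed to exist on `ArchiveService`):

# Hypothetical test, not part of this commit:
def test_cleanup_sees_archive_files(mock_archive_storage, repository):
    archive = ArchiveService(repository)
    archive.write_file("some/path.txt", b"data")

    # The same MemoryStorageService backs both patched entry points,
    # so storage-aware cleanup will find and delete this file.
    assert mock_archive_storage.read_file(archive.root, "some/path.txt") == b"data"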

requirements.in (+1)

@@ -52,6 +52,7 @@ sentry-sdk>=2.13.0
 sentry-sdk[celery]
 SQLAlchemy
 SQLAlchemy-Utils
+sqlparse
 statsd
 stripe>=11.4.1
 time-machine

requirements.txt (+3 -1)

@@ -400,7 +400,9 @@ sqlalchemy-utils==0.36.8
     # -r requirements.in
     # pytest-sqlalchemy
 sqlparse==0.5.0
-    # via django
+    # via
+    #   -r requirements.in
+    #   django
 statsd==3.3.0
     # via -r requirements.in
 stripe==11.4.1

services/archive.py (-16)

@@ -229,22 +229,6 @@ def delete_file(self, path) -> None:
         """
         self.storage.delete_file(self.root, path)
 
-    @sentry_sdk.trace()
-    def delete_files(self, paths: list[str]) -> list[bool]:
-        """
-        Batch-deletes the gives list of files.
-        """
-        return self.storage.delete_files(self.root, paths)
-
-    def delete_repo_files(self) -> int:
-        """
-        Deletes an entire repository's contents
-        """
-        path = "v4/repos/{}".format(self.storage_hash)
-        objects = self.storage.list_folder_contents(self.root, path)
-        results = self.storage.delete_files(self.root, [obj["name"] for obj in objects])
-        return len(results)
-
     def read_chunks(self, commit_sha, report_code=None) -> str:
         """
         Convenience method to read a chunks file from the archive.

services/cleanup/__init__.py (whitespace-only changes)

services/cleanup/cleanup.py (+56, new file)

import logging

from django.db.models.query import QuerySet

from services.cleanup.models import MANUAL_CLEANUP
from services.cleanup.relations import build_relation_graph
from services.cleanup.utils import CleanupResult, CleanupSummary, cleanup_context

log = logging.getLogger(__name__)


def run_cleanup(
    query: QuerySet,
) -> CleanupSummary:
    """
    Cleans up all the models and storage files reachable from the given `QuerySet`.

    This deletes all database models in topological sort order, and also removes
    all the files in storage for any of the models in the relationship graph.

    Returns the number of models and files being cleaned up in total, and per-Model.
    """
    models_to_cleanup = build_relation_graph(query)

    summary = {}
    cleaned_models = 0
    cleaned_files = 0

    with cleanup_context() as context:
        for model, query in models_to_cleanup:
            # This is needed so that the correct connection is chosen for the
            # `_raw_delete` queries, as otherwise it might choose a readonly connection.
            query._for_write = True

            manual_cleanup = MANUAL_CLEANUP.get(model)
            if manual_cleanup is not None:
                result = manual_cleanup(context, query)
            else:
                result = CleanupResult(query._raw_delete(query.db))

            if result.cleaned_models > 0 or result.cleaned_files > 0:
                summary[model] = result

            log.info(
                f"Finished cleaning up {model.__name__}",
                extra={
                    "cleaned_models": result.cleaned_models,
                    "cleaned_files": result.cleaned_files,
                },
            )

            cleaned_models += result.cleaned_models
            cleaned_files += result.cleaned_files

    totals = CleanupResult(cleaned_models, cleaned_files)
    return CleanupSummary(totals, summary)
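The caller-facing contract is small: pass in a queryset, get a `CleanupSummary` back. A minimal usage sketch, assuming `CleanupSummary` exposes the `totals` and `summary` values it is constructed from here:

# Hypothetical usage; the attribute names are assumptions:
summary = run_cleanup(Commit.objects.filter(repository_id=1234))
print(summary.totals.cleaned_models, summary.totals.cleaned_files)
for model, result in summary.summary.items():
    print(model.__name__, result.cleaned_models, result.cleaned_files)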

services/cleanup/models.py (+193, new file)

import dataclasses
from collections import defaultdict
from collections.abc import Callable
from functools import partial

from django.db.models import Model
from django.db.models.query import QuerySet
from shared.bundle_analysis import StoragePaths
from shared.django_apps.compare.models import CommitComparison
from shared.django_apps.core.models import Commit, Pull
from shared.django_apps.profiling.models import ProfilingUpload
from shared.django_apps.reports.models import CommitReport, ReportDetails
from shared.django_apps.reports.models import ReportSession as Upload
from shared.django_apps.staticanalysis.models import StaticAnalysisSingleFileSnapshot

from services.archive import ArchiveService, MinioEndpoints
from services.cleanup.utils import CleanupContext, CleanupResult

MANUAL_QUERY_CHUNKSIZE = 5_000
DELETE_FILES_BATCHSIZE = 50


def cleanup_files_batched(
    context: CleanupContext, buckets_paths: dict[str, list[str]]
) -> int:
    def delete_file(bucket_path: tuple[str, str]) -> bool:
        return context.storage.delete_file(bucket_path[0], bucket_path[1])

    iter = ((bucket, path) for bucket, paths in buckets_paths.items() for path in paths)
    results = context.threadpool.map(delete_file, iter)
    return sum(results)


def cleanup_with_storage_field(
    path_field: str,
    context: CleanupContext,
    query: QuerySet,
) -> CleanupResult:
    cleaned_files = 0

    # delete `None` `path_field`s right away
    cleaned_models = query.filter(**{f"{path_field}__isnull": True})._raw_delete(
        query.db
    )

    # delete all those files from storage, using chunks based on the `id` column
    storage_query = query.filter(**{f"{path_field}__isnull": False}).order_by("id")

    while True:
        storage_paths = storage_query.values_list(path_field, flat=True)[
            :MANUAL_QUERY_CHUNKSIZE
        ]
        if len(storage_paths) == 0:
            break

        cleaned_files += cleanup_files_batched(
            context, {context.default_bucket: storage_paths}
        )
        cleaned_models += query.filter(
            id__in=storage_query[:MANUAL_QUERY_CHUNKSIZE]
        )._raw_delete(query.db)

    return CleanupResult(cleaned_models, cleaned_files)


def cleanup_archivefield(
    field_name: str, context: CleanupContext, query: QuerySet
) -> CleanupResult:
    model_field_name = f"_{field_name}_storage_path"

    return cleanup_with_storage_field(model_field_name, context, query)


# This has all the `Repository` fields needed by `get_archive_hash`
@dataclasses.dataclass
class FakeRepository:
    repoid: int
    service: str
    service_id: str


def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> CleanupResult:
    coverage_reports = query.values_list(
        "report_type",
        "code",
        "external_id",
        "commit__commitid",
        "commit__repository__repoid",
        "commit__repository__author__service",
        "commit__repository__service_id",
    ).order_by("id")

    cleaned_models = 0
    cleaned_files = 0
    repo_hashes: dict[int, str] = {}

    while True:
        reports = coverage_reports[:MANUAL_QUERY_CHUNKSIZE]
        if len(reports) == 0:
            break

        buckets_paths: dict[str, list[str]] = defaultdict(list)
        for (
            report_type,
            report_code,
            external_id,
            commit_sha,
            repoid,
            repo_service,
            repo_service_id,
        ) in reports:
            if repoid not in repo_hashes:
                fake_repo = FakeRepository(
                    repoid=repoid, service=repo_service, service_id=repo_service_id
                )
                repo_hashes[repoid] = ArchiveService.get_archive_hash(fake_repo)
            repo_hash = repo_hashes[repoid]

            # depending on the `report_type`, we have:
            # - a `chunks` file for coverage
            # - a `bundle_report.sqlite` for BA
            if report_type == "bundle_analysis":
                path = StoragePaths.bundle_report.path(
                    repo_key=repo_hash, report_key=external_id
                )
                buckets_paths[context.bundleanalysis_bucket].append(path)
            elif report_type == "test_results":
                # TA has cached rollups, but those are based on `Branch`
                pass
            else:
                chunks_file_name = report_code if report_code is not None else "chunks"
                path = MinioEndpoints.chunks.get_path(
                    version="v4",
                    repo_hash=repo_hash,
                    commitid=commit_sha,
                    chunks_file_name=chunks_file_name,
                )
                buckets_paths[context.default_bucket].append(path)

        cleaned_files += cleanup_files_batched(context, buckets_paths)
        cleaned_models += query.filter(
            id__in=query.order_by("id")[:MANUAL_QUERY_CHUNKSIZE]
        )._raw_delete(query.db)

    return CleanupResult(cleaned_models, cleaned_files)


def cleanup_upload(context: CleanupContext, query: QuerySet) -> CleanupResult:
    cleaned_files = 0

    # delete `None` `storage_path`s right away
    cleaned_models = query.filter(storage_path__isnull=True)._raw_delete(query.db)

    # delete all those files from storage, using chunks based on the `id` column
    storage_query = query.filter(storage_path__isnull=False).order_by("id")

    while True:
        uploads = storage_query.values_list("report__report_type", "storage_path")[
            :MANUAL_QUERY_CHUNKSIZE
        ]
        if len(uploads) == 0:
            break

        buckets_paths: dict[str, list[str]] = defaultdict(list)
        for report_type, storage_path in uploads:
            if report_type == "bundle_analysis":
                buckets_paths[context.bundleanalysis_bucket].append(storage_path)
            else:
                buckets_paths[context.default_bucket].append(storage_path)

        cleaned_files += cleanup_files_batched(context, buckets_paths)
        cleaned_models += query.filter(
            id__in=storage_query[:MANUAL_QUERY_CHUNKSIZE]
        )._raw_delete(query.db)

    return CleanupResult(cleaned_models, cleaned_files)


# All the models that need custom python code for deletion, as a plain bulk `DELETE` query is not enough.
MANUAL_CLEANUP: dict[
    type[Model], Callable[[CleanupContext, QuerySet], CleanupResult]
] = {
    Commit: partial(cleanup_archivefield, "report"),
    Pull: partial(cleanup_archivefield, "flare"),
    ReportDetails: partial(cleanup_archivefield, "files_array"),
    CommitReport: cleanup_commitreport,
    Upload: cleanup_upload,
    CommitComparison: partial(cleanup_with_storage_field, "report_storage_path"),
    ProfilingUpload: partial(cleanup_with_storage_field, "raw_upload_location"),
    StaticAnalysisSingleFileSnapshot: partial(
        cleanup_with_storage_field, "content_location"
    ),
}
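`cleanup_files_batched` fans individual `delete_file` calls out over the context's threadpool and returns the number of successful deletes. A sketch of the input shape it expects — the concrete paths below are made up:

# Hypothetical input; real paths are derived from the model rows:
buckets_paths = {
    context.default_bucket: ["v4/repos/<hash>/commits/<sha>/chunks.txt"],
    context.bundleanalysis_bucket: ["v1/repos/<hash>/<report>/bundle_report.sqlite"],
}
cleaned = cleanup_files_batched(context, buckets_paths)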

services/cleanup/owner.py (+54, new file)

import logging

from django.db import transaction
from django.db.models import Q
from shared.django_apps.codecov_auth.models import Owner, OwnerProfile
from shared.django_apps.core.models import Commit, Pull, Repository

from services.cleanup.cleanup import run_cleanup
from services.cleanup.utils import CleanupSummary

log = logging.getLogger(__name__)

CLEAR_ARRAY_FIELDS = ["plan_activated_users", "organizations", "admins"]


def cleanup_owner(owner_id: int) -> CleanupSummary:
    log.info("Started/Continuing Owner cleanup", extra={"owner_id": owner_id})

    clear_owner_references(owner_id)
    owner_query = Owner.objects.filter(ownerid=owner_id)
    summary = run_cleanup(owner_query)

    log.info("Owner cleanup finished", extra={"owner_id": owner_id, "summary": summary})
    return summary


# TODO: maybe turn this into a `MANUAL_CLEANUP`?
def clear_owner_references(owner_id: int):
    """
    This clears the `ownerid` from various DB arrays where it is being referenced.
    """

    OwnerProfile.objects.filter(default_org=owner_id).update(default_org=None)
    Owner.objects.filter(bot=owner_id).update(bot=None)
    Repository.objects.filter(bot=owner_id).update(bot=None)
    Commit.objects.filter(author=owner_id).update(author=None)
    Pull.objects.filter(author=owner_id).update(author=None)

    # This uses a transaction / `select_for_update` to ensure consistency when
    # modifying these `ArrayField`s in python.
    # I don't think we have such consistency anyplace else in the codebase, so
    # if this is causing lock contention issues, it's also fair to avoid this.
    with transaction.atomic():
        filter = Q()
        for field in CLEAR_ARRAY_FIELDS:
            filter = filter | Q(**{f"{field}__contains": [owner_id]})

        owners_with_reference = Owner.objects.select_for_update().filter(filter)
        for owner in owners_with_reference:
            for field in CLEAR_ARRAY_FIELDS:
                array = getattr(owner, field)
                setattr(owner, field, [x for x in array if x != owner_id])

            owner.save(update_fields=CLEAR_ARRAY_FIELDS)
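A sketch of driving the new entry point from a background job; the Celery wrapper below is an assumption, only `cleanup_owner` itself ships in this commit:

# Hypothetical task wrapper:
from celery import shared_task

from services.cleanup.owner import cleanup_owner


@shared_task
def delete_owner(owner_id: int):
    # The "Started/Continuing" log suggests this is safe to re-run after an
    # interruption, since each chunk of rows and files is deleted independently.
    cleanup_owner(owner_id)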
