Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement storage- and relationship-aware cleanup #973

Merged
merged 22 commits into from
Jan 22, 2025
Merged
Changes from 1 commit
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
bf0841b
Implement storage- and relationship-aware cleanup
Swatinem Dec 18, 2024
3b4cbaa
start working on batched/chunked deletes
Swatinem Jan 7, 2025
e43b561
use `_raw_delete`
Swatinem Jan 7, 2025
7b5b295
add cleanup of `CompareCommit` and BA `CommitReport`s
Swatinem Jan 9, 2025
e6226f7
rename to upload
Swatinem Jan 9, 2025
6031478
start rewriting FlushRepo in terms of cleanup
Swatinem Jan 14, 2025
9635aba
finish up migrating `FlushRepo`
Swatinem Jan 14, 2025
98edb6c
Reimplement `DeleteOwner` in terms of `cleanup`
Swatinem Jan 14, 2025
14cc6da
test a snapshot of the delete queries
Swatinem Jan 15, 2025
d83bbf6
rewrite relation builder to generate complete queries, and prefer sho…
Swatinem Jan 15, 2025
152e704
Move repository to shadow Owner at the start of cleanup task
Swatinem Jan 16, 2025
0d70211
also dump owner deletion queries, ignore certain relations
Swatinem Jan 16, 2025
2eb6a1d
add undocumented relations
Swatinem Jan 16, 2025
8ded8b5
Fix various typos and assumptions
Swatinem Jan 17, 2025
1d61cc3
support cleaning BA files which are stored in a different bucket
Swatinem Jan 17, 2025
90bf1e2
add more undocumented relations
Swatinem Jan 20, 2025
a726394
remove unused `delete_repo_files`
Swatinem Jan 21, 2025
15ef7d2
delete files one-by-one, and reuse the threadpool
Swatinem Jan 21, 2025
6693116
use new file cleanup within `FlareCleanup`
Swatinem Jan 21, 2025
5b6206c
fix `FlareCleanup` tests along with removing more unused `Archive/Sto…
Swatinem Jan 21, 2025
15aea8b
make sure to pick the correct connection for raw deletes
Swatinem Jan 22, 2025
0a22ed4
only re-parent non-`deleted` repos based on `service_id` in `SyncRepos`
Swatinem Jan 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Move repository to shadow Owner at the start of cleanup task
Swatinem committed Jan 17, 2025

Verified

This commit was signed with the committer’s verified signature.
Swatinem Arpad Borsos
commit 152e704ab7df8ad20ac2925377d51178e4b9e144
13 changes: 13 additions & 0 deletions services/cleanup/cleanup.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import logging

from django.db.models.query import QuerySet

from services.cleanup.models import MANUAL_CLEANUP
from services.cleanup.relations import build_relation_graph
from services.cleanup.utils import CleanupContext, CleanupResult, CleanupSummary

log = logging.getLogger(__name__)


def run_cleanup(
query: QuerySet,
@@ -32,6 +36,15 @@ def run_cleanup(

if result.cleaned_models > 0 or result.cleaned_files > 0:
summary[model] = result

log.info(
f"Finished cleaning up {model.__name__}",
extra={
"cleaned_models": result.cleaned_models,
"cleaned_files": result.cleaned_files,
},
)

cleaned_models += result.cleaned_models
cleaned_files += result.cleaned_files

2 changes: 1 addition & 1 deletion services/cleanup/models.py
Original file line number Diff line number Diff line change
@@ -15,7 +15,7 @@
from services.archive import ArchiveService, MinioEndpoints
from services.cleanup.utils import CleanupContext, CleanupResult

MANUAL_QUERY_CHUNKSIZE = 2_500
MANUAL_QUERY_CHUNKSIZE = 5_000
DELETE_FILES_BATCHSIZE = 50


24 changes: 23 additions & 1 deletion services/cleanup/owner.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,30 @@
import logging

from django.db import transaction
from django.db.models import Q
from shared.django_apps.codecov_auth.models import Owner
from shared.django_apps.core.models import Commit, Pull, Repository

from services.cleanup.cleanup import run_cleanup
from services.cleanup.utils import CleanupSummary

log = logging.getLogger(__name__)

CLEAR_ARRAY_FIELDS = ["plan_activated_users", "organizations", "admins"]


def clear_owner(owner_id: int):
def cleanup_owner(owner_id: int) -> CleanupSummary:
    """
    Clean up an `Owner` and all of their associated data.

    First clears dangling references to the owner from other rows/arrays,
    then runs the generic relationship-aware cleanup over the owner itself.
    Returns the resulting `CleanupSummary`.
    """
    log.info("Started/Continuing Owner cleanup", extra={"owner_id": owner_id})

    clear_owner_references(owner_id)
    summary = run_cleanup(Owner.objects.filter(ownerid=owner_id))

    log.info("Owner cleanup finished", extra={"owner_id": owner_id, "summary": summary})
    return summary


def clear_owner_references(owner_id: int):
"""
This clears the `ownerid` from various DB arrays where it is being referenced.
"""
@@ -16,6 +34,10 @@ def clear_owner(owner_id: int):
Commit.objects.filter(author=owner_id).update(author=None)
Pull.objects.filter(author=owner_id).update(author=None)

# This uses a transaction / `select_for_update` to ensure consistency when
# modifying these `ArrayField`s in python.
# I don’t think we have such consistency anyplace else in the codebase, so
# if this is causing lock contention issues, it's also fair to avoid this.
with transaction.atomic():
filter = Q()
for field in CLEAR_ARRAY_FIELDS:
83 changes: 83 additions & 0 deletions services/cleanup/repository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import logging
from uuid import uuid4

from django.db import transaction
from shared.django_apps.codecov_auth.models import Owner
from shared.django_apps.core.models import Repository

from services.cleanup.cleanup import run_cleanup
from services.cleanup.utils import CleanupSummary

log = logging.getLogger(__name__)


def cleanup_repo(repo_id: int) -> CleanupSummary:
    """
    Clean up a `Repository` and all of its associated data.

    The repo is first moved to a freshly created "shadow Owner"
    (see `start_repo_cleanup`), then the generic cleanup runs over the
    repository, and finally the shadow owner itself is deleted.
    Returns the resulting `CleanupSummary`.
    """
    cleanup_started, owner_id = start_repo_cleanup(repo_id)

    if cleanup_started:
        log.info("Started Repository cleanup", extra={"repo_id": repo_id})
    else:
        log.info("Continuing Repository cleanup", extra={"repo_id": repo_id})

    repo_query = Repository.objects.filter(repoid=repo_id)
    summary = run_cleanup(repo_query)
    # BUG FIX: model classes have no `filter` attribute — queries must go
    # through the default manager (`objects`), otherwise this raises
    # `AttributeError` and the shadow owner is never deleted.
    Owner.objects.filter(ownerid=owner_id).delete()

    log.info(
        "Repository cleanup finished", extra={"repoid": repo_id, "summary": summary}
    )
    return summary


def start_repo_cleanup(repo_id: int) -> tuple[bool, int]:
    """
    Starts Repository deletion by marking the repository as `deleted`, and moving
    it to a newly created "shadow Owner".
    This newly created `Owner` only has a valid `service` and `service_id`,
    which are the only required non-NULL fields without defaults, and is otherwise
    completely empty.
    The `ownerid` of this newly created owner is being returned along with a flag
    indicating whether the repo cleanup was just started, or whether it is already
    marked for deletion, and this function is being retried.
    It is expected that repo cleanup is a slow process and might be done in more steps.
    """
    # Runs in a transaction as we do not want to leave leftover shadow owners in
    # case anything goes wrong here.
    with transaction.atomic():
        (
            repo_deleted,
            owner_id,
            owner_name,
            owner_username,
            owner_service,
            owner_service_id,
        ) = Repository.objects.values_list(
            "deleted",
            # BUG FIX: the tuple above unpacks six values, so six fields are
            # needed here; `"author__owneridauthor__name"` was a fusion of
            # these two distinct fields:
            "author__ownerid",
            "author__name",
            "author__username",
            "author__service",
            "author__service_id",
        ).get(repoid=repo_id)

        # A repo already attached to a "shadow owner" (no name / username) and
        # flagged `deleted` means cleanup was already started earlier; resume it.
        if repo_deleted and not owner_name and not owner_username:
            return (False, owner_id)

        # We mark the repository as "scheduled for deletion" by setting the `deleted`
        # flag, moving it to a new shadow owner, and clearing some tokens.
        shadow_owner = Owner.objects.create(
            service=owner_service, service_id=owner_service_id
        )
        # Rotate the tokens so any leaked upload/image tokens stop working
        # the moment deletion is scheduled.
        new_token = uuid4().hex
        Repository.objects.filter(repoid=repo_id).update(
            deleted=True,
            author=shadow_owner,
            upload_token=new_token,
            image_token=new_token,
        )

        # The equivalent of `SET NULL`:
        Repository.objects.filter(forkid=repo_id).update(forkid=None)

        return (True, shadow_owner.ownerid)
21 changes: 4 additions & 17 deletions tasks/delete_owner.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,21 @@
import logging

from shared.celery_config import delete_owner_task_name
from shared.django_apps.codecov_auth.models import Owner

from app import celery_app
from services.cleanup.cleanup import run_cleanup
from services.cleanup.owner import clear_owner
from services.cleanup.owner import cleanup_owner
from services.cleanup.utils import CleanupSummary
from tasks.base import BaseCodecovTask

log = logging.getLogger(__name__)


class DeleteOwnerTask(BaseCodecovTask, name=delete_owner_task_name):
"""
Delete an owner and their data:
- Repo archive data for each of their owned repos
- Owner entry from db
- Cascading deletes of repos, pulls, and branches for the owner
"""
acks_late = True # retry the task when the worker dies for whatever reason
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It won't retry if the task ends with an Exception... just saying, if that is the expectation in "whatever"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have more details about the execution model of celery? I feel like after all this time I still don’t understand how it works and behaves under various circumstances.

For these tasks, I would want them to "resume" until they are completed.
This means "resuming" on timeouts (not sure if you can configure them to not have a timeout at all?).
It means resuming when the worker is killed for whatever reason. It also means resuming on any kind of random infrastructure exception.

Even for a fixable programmer error, I would like these tasks to always run to "successful" completion.

I’m not entirely sure how to achieve that with celery 🤷🏻‍♂️

max_retries = None # aka, no limit on retries

def run_impl(self, _db_session, ownerid: int) -> CleanupSummary:
log.info("Delete owner", extra={"ownerid": ownerid})

clear_owner(ownerid)
owner_query = Owner.objects.filter(ownerid=ownerid)

summary = run_cleanup(owner_query)
log.info("Deletion finished", extra={"ownerid": ownerid, "summary": summary})
return summary
return cleanup_owner(ownerid)


RegisteredDeleteOwnerTask = celery_app.register_task(DeleteOwnerTask())
19 changes: 5 additions & 14 deletions tasks/flush_repo.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,19 @@
import logging

import sentry_sdk
from shared.django_apps.core.models import Repository

from app import celery_app
from database.engine import Session
from services.cleanup.cleanup import run_cleanup
from services.cleanup.repository import cleanup_repo
from services.cleanup.utils import CleanupSummary
from tasks.base import BaseCodecovTask

log = logging.getLogger(__name__)


class FlushRepoTask(BaseCodecovTask, name="app.tasks.flush_repo.FlushRepo"):
@sentry_sdk.trace
def run_impl(
self, _db_session: Session, *, repoid: int, **kwargs
) -> CleanupSummary:
log.info("Deleting repo contents", extra={"repoid": repoid})
repo_query = Repository.objects.filter(repoid=repoid)
acks_late = True # retry the task when the worker dies for whatever reason
max_retries = None # aka, no limit on retries

summary = run_cleanup(repo_query)
log.info("Deletion finished", extra={"repoid": repoid, "summary": summary})
return summary
def run_impl(self, _db_session, repoid: int) -> CleanupSummary:
return cleanup_repo(repoid)


FlushRepo = celery_app.register_task(FlushRepoTask())