Skip to content

Commit

Permalink
fix(grouping): Fix missing date_added in grouphash metadata (#86078)
Browse files Browse the repository at this point in the history
Normally, when storing Seer results in grouphash metadata, we use the metadata's `date_added` value as the value for `seer_date_sent`. Because of race conditions, though, it's possible for `date_added` to be `None` (see below). Rather than use `None` for both, this fixes it so that the equivalence can go in reverse, in other words, so that we use the `seer_date_sent` timestamp for `date_aded`. (The difference between the two should be a matter of milliseconds, so there's no loss of accuracy.)

(The race condition goes like this: Events A and B, which have the same new hash, hit our servers nearly simultaneously. A gets to `get_or_create_grouphashes` first, so from the `GroupHash.objects.get_or_create` call, it gets `created = True`, while B gets `created = False`. Now they're both passed to `create_or_update_grouphash_metadata_if_needed`, and both hit a call to `GroupHashMetadata.objects.get_or_create`. This time, however, it's B which wins the race, and gets `created = True`. It's therefore the one which makes it past the `if not created` check, but because it lost the first race, we mistake it for an existing grouphash and null out `date_added`. (The code for this is here[1] and here[2].))


[1] https://github.com/getsentry/sentry/blob/0339d3a90475b8682362b9fc3830b19b0d759070/src/sentry/grouping/ingest/hashing.py#L229-L237
[2] https://github.com/getsentry/sentry/blob/ea02541f2a5562c51acba855655dcdb6005f7fe9/src/sentry/grouping/ingest/grouphash_metadata.py#L140-L163
  • Loading branch information
lobsterkatie authored Feb 28, 2025
1 parent 39cb3ce commit 0cfd300
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 1 deletion.
10 changes: 9 additions & 1 deletion src/sentry/grouping/ingest/seer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import sentry_sdk
from django.conf import settings
from django.utils import timezone

from sentry import features, options
from sentry import ratelimits as ratelimiter
Expand Down Expand Up @@ -419,13 +420,20 @@ def maybe_check_seer_for_matching_grouphash(
)
return seer_matched_grouphash

timestamp = timezone.now()

gh_metadata.update(
# Technically the time of the metadata record creation and the time of the Seer
# request will be some milliseconds apart, but a) the difference isn't meaningful
# for us, and b) forcing them to be the same (rather than just close) lets us use
# their equality as a signal that the Seer call happened during ingest rather than
# during a backfill, without having to store that information separately.
seer_date_sent=gh_metadata.date_added,
#
# In rare race condition cases, `date_added` will be None (if different events win
# the race to create the relevant `GroupHash` and `GroupHashMetadata` records), so
# we set that if necessary here as well.
date_added=gh_metadata.date_added or timestamp,
seer_date_sent=gh_metadata.date_added or timestamp,
seer_event_sent=event.event_id,
seer_model=seer_response_data["similarity_model_version"],
seer_matched_grouphash=seer_matched_grouphash,
Expand Down
41 changes: 41 additions & 0 deletions tests/sentry/event_manager/grouping/test_seer_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from unittest.mock import MagicMock, patch

from sentry.conf.server import SEER_SIMILARITY_MODEL_VERSION
from sentry.grouping.ingest.grouphash_metadata import create_or_update_grouphash_metadata_if_needed
from sentry.grouping.ingest.seer import get_seer_similar_issues, should_call_seer_for_grouping
from sentry.models.grouphash import GroupHash
from sentry.seer.similarity.types import SeerSimilarIssueData
Expand Down Expand Up @@ -277,3 +278,43 @@ def test_event_not_sent_to_seer(self):

assert event_grouphash and event_grouphash.metadata
self.assert_correct_seer_metadata(event_grouphash, None, None, None, None, None)

@with_feature("organizations:grouphash-metadata-creation")
@patch("sentry.grouping.ingest.seer.should_call_seer_for_grouping", return_value=True)
def test_fills_in_missing_date_added(self, _):

# Mimic the effects of the race condition wherein two events with the same new hash race to
# create `GroupHash` and `GroupHashMetadata` records, and each event wins one of the races,
# which results in the metadata not having a `date_added` value
def race_condition_create_or_update_grouphash_metadata(
event, project, grouphash, created, grouping_config, variants
):
create_or_update_grouphash_metadata_if_needed(
event, project, grouphash, created, grouping_config, variants
)
assert grouphash.metadata
grouphash.metadata.update(date_added=None)
assert not grouphash.metadata.date_added

with (
patch(
"sentry.grouping.ingest.seer.get_similarity_data_from_seer",
return_value=[],
),
patch(
"sentry.grouping.ingest.hashing.create_or_update_grouphash_metadata_if_needed",
wraps=race_condition_create_or_update_grouphash_metadata,
) as mock_create_or_update_grouphash_metadata,
):
event = save_new_event(get_event_data(), self.project)
event_grouphash = GroupHash.objects.filter(
hash=event.get_primary_hash(), project_id=self.project.id
).first()
assert event_grouphash and event_grouphash.metadata

# Our mock was called, so we know that going into the Seer call, the grouphash had no
# `date_added` in its metadata, because that's asserted in the mock
mock_create_or_update_grouphash_metadata.assert_called()
# Now, however, it does, and it's the same as the Seer timestamp
assert event_grouphash.metadata.date_added
assert event_grouphash.metadata.seer_date_sent == event_grouphash.metadata.date_added

0 comments on commit 0cfd300

Please sign in to comment.