diff --git a/src/hope_dedup_engine/apps/api/models/deduplication.py b/src/hope_dedup_engine/apps/api/models/deduplication.py index 29add69..258e2ec 100644 --- a/src/hope_dedup_engine/apps/api/models/deduplication.py +++ b/src/hope_dedup_engine/apps/api/models/deduplication.py @@ -1,4 +1,3 @@ -from itertools import chain from typing import Any, Final, override from uuid import uuid4 @@ -7,18 +6,7 @@ from django.db import models, transaction from hope_dedup_engine.apps.security.models import ExternalSystem -from hope_dedup_engine.types import ( - Embedding, - EntityEmbedding, - EntityEmbeddingError, - Filename, - Finding, - IgnoredPair, - ImageEmbedding, - ImageEmbeddingError, - Score, - SortedTuple, -) +from hope_dedup_engine.types import ImageEmbedding, ImageEmbeddingError REFERENCE_PK_LENGTH: Final[int] = 100 @@ -79,31 +67,6 @@ class State(models.IntegerChoices): def __str__(self) -> str: return self.name or f"ID: {self.pk}" - def get_encodings(self) -> dict[Filename, Embedding]: - return self.encodings - - def get_findings(self) -> list[Finding]: - return list( - self.finding_set.values_list( - "first_reference_pk", "second_reference_pk", "score" - ) - ) - - def get_ignored_pairs(self) -> set[IgnoredPair]: - return set( - chain( - map( - SortedTuple, - self.ignoredreferencepkpair_set.values_list("first", "second"), - ), - map( - SortedTuple, - list(self.ignoredfilenamepair_set.values_list("first", "second")), - ), - ) - ) - - @transaction.atomic def update_encodings(self, encodings: list[ImageEmbedding]) -> None: with transaction.atomic(): fresh_self: DeduplicationSet = ( @@ -112,7 +75,6 @@ def update_encodings(self, encodings: list[ImageEmbedding]) -> None: fresh_self.encodings.update(encodings) fresh_self.save() - @transaction.atomic def update_encoding_errors(self, errors: list[ImageEmbeddingError]) -> None: with transaction.atomic(): fresh_self: DeduplicationSet = ( @@ -121,43 +83,6 @@ def update_encoding_errors(self, errors: list[ImageEmbeddingError]) -> None: fresh_self.encoding_errors.update(errors) fresh_self.save() - def update_findings( - self, findings: list[tuple[EntityEmbedding, EntityEmbedding, Score]] - ) -> None: - images = Image.objects.filter(deduplication_set=self).values( - "filename", "reference_pk" - ) - filename_to_reference_pk = { - img["filename"]: img["reference_pk"] for img in images - } | {"": ""} - findings_to_create = [ - Finding( - deduplication_set=self, - first_reference_pk=filename_to_reference_pk.get(first_filename), - first_filename=first_filename, - second_reference_pk=filename_to_reference_pk.get(second_filename), - second_filename=second_filename, - score=score, - ) - for first_filename, second_filename, score in findings - ] - Finding.objects.bulk_create(findings_to_create, ignore_conflicts=True) - - def update_finding_errors( - self, encoding_errors: list[EntityEmbeddingError] - ) -> None: - - errors_to_create = [ - Finding( - deduplication_set=self, - first_reference_pk=reference_pk, - first_filename=filename, - status_code=Finding.StatusCode[error].value, - ) - for reference_pk, filename, error in encoding_errors - ] - Finding.objects.bulk_create(errors_to_create, ignore_conflicts=True) - class Image(models.Model): """ diff --git a/src/hope_dedup_engine/apps/faces/celery/tasks/deduplication.py b/src/hope_dedup_engine/apps/faces/celery/tasks/deduplication.py index 9b07027..ea3044d 100644 --- a/src/hope_dedup_engine/apps/faces/celery/tasks/deduplication.py +++ b/src/hope_dedup_engine/apps/faces/celery/tasks/deduplication.py @@ -5,10 +5,11 @@ from hope_dedup_engine.apps.faces.services.facial import ( encode_faces, find_similar_faces, + get_ignored_pairs, + update_finding_errors, + update_findings, ) from hope_dedup_engine.config.celery import app - -# from hope_dedup_engine.constants import FacialError from hope_dedup_engine.types import EntityEmbedding, Filename, SortedTuple from hope_dedup_engine.utils import compact_pairs from hope_dedup_engine.utils.celery.task_result import wrapped @@ -46,7 +47,7 @@ def filter_ignored_pairs( embedding_pairs: Iterable[tuple[EntityEmbedding, EntityEmbedding]], deduplication_set: DeduplicationSet, ) -> Generator[tuple[EntityEmbedding, EntityEmbedding], None, None]: - ignored_pairs = deduplication_set.get_ignored_pairs() + ignored_pairs = get_ignored_pairs(deduplication_set) for embedding_pair in embedding_pairs: first, second = embedding_pair first_reference_pk, _ = first @@ -87,7 +88,7 @@ def find_duplicates( dedupe_threshold=deduplicate_config.get("threshold"), options=deduplicate_config, ) - deduplication_set.update_findings(findings) + update_findings(deduplication_set, findings) @app.task @@ -97,11 +98,10 @@ def save_encoding_errors_in_findings(deduplication_set_id: str) -> None: pk=deduplication_set_id ) embedding_errors = [ - # (reference_pk, FacialError(deduplication_set.encoding_errors[filename])) (reference_pk, filename, deduplication_set.encoding_errors[filename]) for reference_pk, filename in deduplication_set.image_set.values_list( "reference_pk", "filename" ) if filename in deduplication_set.encoding_errors ] - deduplication_set.update_finding_errors(embedding_errors) + update_finding_errors(deduplication_set, embedding_errors) diff --git a/src/hope_dedup_engine/apps/faces/services/facial.py b/src/hope_dedup_engine/apps/faces/services/facial.py index 1706289..e2e77ce 100644 --- a/src/hope_dedup_engine/apps/faces/services/facial.py +++ b/src/hope_dedup_engine/apps/faces/services/facial.py @@ -1,20 +1,24 @@ import logging from collections.abc import Generator, Iterable +from itertools import chain from typing import Any, cast -# from hope_dedup_engine.types import EncodingType, FindingType, IgnoredPairType +from django.db import transaction + from deepface import DeepFace -from hope_dedup_engine.apps.api.models import Finding +from hope_dedup_engine.apps.api.models import DeduplicationSet, Finding, Image from hope_dedup_engine.apps.faces.managers import ImagesStorageManager - -# from hope_dedup_engine.constants import FacialError from hope_dedup_engine.types import ( Embedding, EntityEmbedding, + EntityEmbeddingError, + EntityIgnoredPair, Filename, ImageEmbedding, ImageEmbeddingError, + Score, + SortedTuple, ) logger = logging.getLogger(__name__) @@ -71,3 +75,71 @@ def find_similar_faces( similarity = face_similarity(first_embedding, second_embedding, **options) if similarity >= dedupe_threshold: yield first_filename, second_filename, similarity + + +def update_findings( + deduplication_set: DeduplicationSet, + findings: list[tuple[EntityEmbedding, EntityEmbedding, Score]], +) -> None: + + filename_to_reference_pk = dict( + Image.objects.filter(deduplication_set=deduplication_set).values_list( + "filename", "reference_pk" + ) + ) + findings_to_create = ( + Finding( + deduplication_set=deduplication_set, + first_reference_pk=filename_to_reference_pk.get(first_filename), + first_filename=first_filename, + second_reference_pk=filename_to_reference_pk.get(second_filename), + second_filename=second_filename, + score=score, + ) + for first_filename, second_filename, score in findings + ) + bulk_create_findings(findings_to_create) + + +def update_finding_errors( + deduplication_set: DeduplicationSet, encoding_errors: list[EntityEmbeddingError] +): + errors_to_create = ( + Finding( + deduplication_set=deduplication_set, + first_reference_pk=reference_pk, + first_filename=filename, + status_code=Finding.StatusCode[error].value, + ) + for reference_pk, filename, error in encoding_errors + ) + bulk_create_findings(errors_to_create) + + +def bulk_create_findings(findings: Iterable[Finding]) -> None: + findings_list = list(findings) + if findings_list: + with transaction.atomic(): + Finding.objects.bulk_create(findings_list, ignore_conflicts=True) + logger.info(f"Created {len(findings_list)} findings.") + + +def get_ignored_pairs(deduplication_set: DeduplicationSet) -> set[EntityIgnoredPair]: + return set( + chain( + map( + SortedTuple, + deduplication_set.ignoredreferencepkpair_set.values_list( + "first", "second" + ), + ), + map( + SortedTuple, + list( + deduplication_set.ignoredfilenamepair_set.values_list( + "first", "second" + ) + ), + ), + ) + ) diff --git a/src/hope_dedup_engine/types.py b/src/hope_dedup_engine/types.py index 7395e97..b622373 100644 --- a/src/hope_dedup_engine/types.py +++ b/src/hope_dedup_engine/types.py @@ -1,12 +1,11 @@ from collections.abc import Iterable -from enum import Enum -from typing import Self +from typing import TYPE_CHECKING, Self -# from hope_dedup_engine.constants import FacialError -# TODO: -# from hope_dedup_engine.apps.api.models.deduplication import Finding as FindingModel +if TYPE_CHECKING: + from hope_dedup_engine.apps.api.models.deduplication import ( # noqa: F401 + Finding as FindingModel, + ) -# .Image import StatusCode type ReferencePK = str type Filename = str @@ -14,19 +13,11 @@ type Score = float -class FacialError(Enum): - GENERIC_ERROR = 999 - NO_FACE_DETECTED = 998 - MULTIPLE_FACES_DETECTED = 997 - NO_FILE_FOUND = 996 - - EntityImage = tuple[ReferencePK, Filename] EntityEmbedding = tuple[ReferencePK, Embedding] -EntityEmbeddingError = tuple[ReferencePK, FacialError] +EntityEmbeddingError = tuple[ReferencePK, "FindingModel.StatusCode"] ImageEmbedding = tuple[Filename, Embedding] -ImageEmbeddingError = tuple[Filename, FacialError] -Finding = tuple[ReferencePK, ReferencePK, Score] +ImageEmbeddingError = tuple[Filename, "FindingModel.StatusCode"] class SortedTuple(tuple): @@ -34,4 +25,4 @@ def __new__(cls, iterable: Iterable) -> Self: return tuple.__new__(cls, sorted(iterable)) -IgnoredPair = SortedTuple[ReferencePK, ReferencePK] +EntityIgnoredPair = SortedTuple[ReferencePK, ReferencePK]