From 7aa072178bd58e2381f1b46b7446191cfb417320 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 9 Jan 2025 23:01:19 +0530 Subject: [PATCH 01/10] Update recognizer_registry.py --- .../recognizer_registry.py | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 97c64b307..4ec19d56d 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -156,11 +156,19 @@ def get_recognizers( if entities is None and all_fields is False: raise ValueError("No entities provided") + all_entity_recognizers = dict() all_possible_recognizers = copy.copy(self.recognizers) if ad_hoc_recognizers: all_possible_recognizers.extend(ad_hoc_recognizers) + for rec in all_possible_recognizers: + print(rec.supported_entities) + if type(rec.supported_entities) == list and len(rec.supported_entities) > 0: + all_entity_recognizers.update(dict.fromkeys(rec.supported_entities, rec)) + elif type(rec.supported_entities) == str: + all_entity_recognizers[rec.supported_entities] = rec + # filter out unwanted recognizers to_return = set() if all_fields: @@ -170,23 +178,21 @@ def get_recognizers( if language == rec.supported_language ] else: - for entity in entities: - subset = [ - rec - for rec in all_possible_recognizers - if entity in rec.supported_entities - and language == rec.supported_language - ] - - if not subset: - logger.warning( - "Entity %s doesn't have the corresponding" - " recognizer in language : %s", - entity, - language, - ) - else: - to_return.update(set(subset)) + subset = [ + all_entity_recognizers[entity] + for entity in entities + if entity in all_entity_recognizers + and language == all_entity_recognizers[entity].supported_language + ] + if not subset: + logger.warning( + "Entity %s doesn't have the corresponding" + " recognizer in language : %s", + entity, + language, + ) + else: + to_return.update(set(subset)) logger.debug( "Returning a total of %s recognizers", From 4fc942aebdfdd47b535a5160d8c0873f4083ee01 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 9 Jan 2025 23:06:55 +0530 Subject: [PATCH 02/10] Update recognizer_registry.py --- .../presidio_analyzer/recognizer_registry/recognizer_registry.py | 1 - 1 file changed, 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 4ec19d56d..9812bd2bb 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -163,7 +163,6 @@ def get_recognizers( all_possible_recognizers.extend(ad_hoc_recognizers) for rec in all_possible_recognizers: - print(rec.supported_entities) if type(rec.supported_entities) == list and len(rec.supported_entities) > 0: all_entity_recognizers.update(dict.fromkeys(rec.supported_entities, rec)) elif type(rec.supported_entities) == str: From 35cc00f757988edd52432921e4308f20a92520fb Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Mon, 13 Jan 2025 18:46:10 +0530 Subject: [PATCH 03/10] Update recognizer_registry.py --- .../recognizer_registry/recognizer_registry.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 9812bd2bb..34cd5058e 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -162,12 +162,6 @@ def get_recognizers( if ad_hoc_recognizers: all_possible_recognizers.extend(ad_hoc_recognizers) - for rec in all_possible_recognizers: - if type(rec.supported_entities) == list and len(rec.supported_entities) > 0: - all_entity_recognizers.update(dict.fromkeys(rec.supported_entities, rec)) - elif type(rec.supported_entities) == str: - all_entity_recognizers[rec.supported_entities] = rec - # filter out unwanted recognizers to_return = set() if all_fields: @@ -177,11 +171,12 @@ def get_recognizers( if language == rec.supported_language ] else: + entities = set(entities) subset = [ - all_entity_recognizers[entity] - for entity in entities - if entity in all_entity_recognizers - and language == all_entity_recognizers[entity].supported_language + rec + for rec in all_possible_recognizers + if bool(set(rec.supported_entities).intersection(entities)) + and language == rec.supported_language ] if not subset: logger.warning( From f9641a77174dd3c11fb9950ce0c74b6394439519 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Mon, 13 Jan 2025 18:51:38 +0530 Subject: [PATCH 04/10] Update recognizer_registry.py --- .../presidio_analyzer/recognizer_registry/recognizer_registry.py | 1 - 1 file changed, 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 34cd5058e..6c2c095c3 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -156,7 +156,6 @@ def get_recognizers( if entities is None and all_fields is False: raise ValueError("No entities provided") - all_entity_recognizers = dict() all_possible_recognizers = copy.copy(self.recognizers) if ad_hoc_recognizers: From 38a3d65adda2c74392541f4f04522c2000639f8d Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 5 Feb 2025 16:42:14 +0530 Subject: [PATCH 05/10] Update recognizer_registry.py --- .../recognizer_registry.py | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 6c2c095c3..06a83a00d 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -160,8 +160,6 @@ def get_recognizers( all_possible_recognizers = copy.copy(self.recognizers) if ad_hoc_recognizers: all_possible_recognizers.extend(ad_hoc_recognizers) - - # filter out unwanted recognizers to_return = set() if all_fields: to_return = [ @@ -170,23 +168,26 @@ def get_recognizers( if language == rec.supported_language ] else: - entities = set(entities) - subset = [ - rec - for rec in all_possible_recognizers - if bool(set(rec.supported_entities).intersection(entities)) - and language == rec.supported_language - ] - if not subset: - logger.warning( - "Entity %s doesn't have the corresponding" - " recognizer in language : %s", - entity, - language, - ) - else: - to_return.update(set(subset)) - + # filter out unwanted recognizers + all_entity_recognizers = dict() + for rec in all_possible_recognizers: + if type(rec.supported_entities) == list and len(rec.supported_entities) > 0: + for supported_entity in rec.supported_entities: + all_entity_recognizers[supported_entity] = all_entity_recognizers[supported_entity].add( + rec) if supported_entity in all_entity_recognizers else {rec} + elif type(rec.supported_entities) == str: + all_entity_recognizers[rec.supported_entities] = all_entity_recognizers[rec.supported_entities].add( + rec) if rec.supported_entities in all_entity_recognizers else {rec} + for entity in entities: + if entity in all_entity_recognizers: + to_return.update(all_entity_recognizers[entity]) + else: + logger.warning( + "Entity %s doesn't have the corresponding" + " recognizer in language : %s", + entity, + language, + ) logger.debug( "Returning a total of %s recognizers", str(len(to_return)), From d03f6c08cbe98b6de8efbaa8761de0b12360d2da Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 5 Feb 2025 17:14:02 +0530 Subject: [PATCH 06/10] Update recognizer_registry.py --- .../recognizer_registry/recognizer_registry.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 06a83a00d..fedc137ff 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -160,6 +160,7 @@ def get_recognizers( all_possible_recognizers = copy.copy(self.recognizers) if ad_hoc_recognizers: all_possible_recognizers.extend(ad_hoc_recognizers) + to_return = set() if all_fields: to_return = [ @@ -168,16 +169,16 @@ def get_recognizers( if language == rec.supported_language ] else: - # filter out unwanted recognizers all_entity_recognizers = dict() for rec in all_possible_recognizers: - if type(rec.supported_entities) == list and len(rec.supported_entities) > 0: - for supported_entity in rec.supported_entities: - all_entity_recognizers[supported_entity] = all_entity_recognizers[supported_entity].add( - rec) if supported_entity in all_entity_recognizers else {rec} - elif type(rec.supported_entities) == str: - all_entity_recognizers[rec.supported_entities] = all_entity_recognizers[rec.supported_entities].add( - rec) if rec.supported_entities in all_entity_recognizers else {rec} + if language == rec.supported_language: + if type(rec.supported_entities) == list and len(rec.supported_entities) > 0: + for supported_entity in rec.supported_entities: + all_entity_recognizers[supported_entity] = all_entity_recognizers[supported_entity].add( + rec) if supported_entity in all_entity_recognizers else {rec} + elif type(rec.supported_entities) == str: + all_entity_recognizers[rec.supported_entities] = all_entity_recognizers[rec.supported_entities].add( + rec) if rec.supported_entities in all_entity_recognizers else {rec} for entity in entities: if entity in all_entity_recognizers: to_return.update(all_entity_recognizers[entity]) From a06ebb30af7447830dd7bff17764e070b74890e8 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 5 Feb 2025 20:43:29 +0530 Subject: [PATCH 07/10] Update recognizer_registry.py --- .../recognizer_registry/recognizer_registry.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index fedc137ff..1489be075 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -174,11 +174,9 @@ def get_recognizers( if language == rec.supported_language: if type(rec.supported_entities) == list and len(rec.supported_entities) > 0: for supported_entity in rec.supported_entities: - all_entity_recognizers[supported_entity] = all_entity_recognizers[supported_entity].add( - rec) if supported_entity in all_entity_recognizers else {rec} + self.add_recognizer_map(all_entity_recognizers, supported_entity, rec) elif type(rec.supported_entities) == str: - all_entity_recognizers[rec.supported_entities] = all_entity_recognizers[rec.supported_entities].add( - rec) if rec.supported_entities in all_entity_recognizers else {rec} + self.add_recognizer_map(all_entity_recognizers, supported_entity, rec) for entity in entities: if entity in all_entity_recognizers: to_return.update(all_entity_recognizers[entity]) @@ -199,6 +197,12 @@ def get_recognizers( return list(to_return) + def add_recognizer_map(self, all_entity_recognizers, supported_entity, rec): + if supported_entity in all_entity_recognizers: + all_entity_recognizers[supported_entity].add(rec) + else: + all_entity_recognizers[supported_entity] = {rec} + def add_recognizer(self, recognizer: EntityRecognizer) -> None: """ Add a new recognizer to the list of recognizers. From 0b59b40d0ad36da079808d0cc7e5db56b4fa328f Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 5 Feb 2025 21:03:45 +0530 Subject: [PATCH 08/10] Update recognizer_registry.py --- .../recognizer_registry/recognizer_registry.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 1489be075..82b656255 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -178,15 +178,15 @@ def get_recognizers( elif type(rec.supported_entities) == str: self.add_recognizer_map(all_entity_recognizers, supported_entity, rec) for entity in entities: - if entity in all_entity_recognizers: - to_return.update(all_entity_recognizers[entity]) - else: + if entity not in all_entity_recognizers: logger.warning( "Entity %s doesn't have the corresponding" " recognizer in language : %s", entity, language, ) + else: + to_return.update(all_entity_recognizers[entity]) logger.debug( "Returning a total of %s recognizers", str(len(to_return)), From be62a2dc8ab139c4d22e77f1d4c5c236d67a1dc1 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 5 Feb 2025 21:04:24 +0530 Subject: [PATCH 09/10] Update recognizer_registry.py --- .../presidio_analyzer/recognizer_registry/recognizer_registry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 82b656255..72d43d920 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -187,6 +187,7 @@ def get_recognizers( ) else: to_return.update(all_entity_recognizers[entity]) + logger.debug( "Returning a total of %s recognizers", str(len(to_return)), From fa16ffd20721d8da7e2b50ce824c99a165465632 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 5 Feb 2025 21:07:34 +0530 Subject: [PATCH 10/10] Update recognizer_registry.py --- .../recognizer_registry/recognizer_registry.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 72d43d920..909dc3d75 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -161,6 +161,7 @@ def get_recognizers( if ad_hoc_recognizers: all_possible_recognizers.extend(ad_hoc_recognizers) + # filter out unwanted recognizers to_return = set() if all_fields: to_return = [ @@ -177,6 +178,7 @@ def get_recognizers( self.add_recognizer_map(all_entity_recognizers, supported_entity, rec) elif type(rec.supported_entities) == str: self.add_recognizer_map(all_entity_recognizers, supported_entity, rec) + for entity in entities: if entity not in all_entity_recognizers: logger.warning(