From 4a78883ce45ae5ba10dd663be79c0caa56fecdc6 Mon Sep 17 00:00:00 2001 From: Stefano Braghin <527806+stefano81@users.noreply.github.com> Date: Sat, 16 May 2026 22:19:49 +0100 Subject: [PATCH 1/5] fix: clean up more Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com> --- .../classification/identifiers/us_postal_address.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/risk_assessment/classification/identifiers/us_postal_address.py b/src/risk_assessment/classification/identifiers/us_postal_address.py index e829c4e..d36cc03 100644 --- a/src/risk_assessment/classification/identifiers/us_postal_address.py +++ b/src/risk_assessment/classification/identifiers/us_postal_address.py @@ -1,4 +1,4 @@ -from re import Pattern +import re import re2 @@ -650,7 +650,7 @@ class USPostalAddress(Identifier): def __init__(self) -> None: options = re2.Options() options.case_sensitive = False - self.patterns: list[Pattern[str]] = [ + self.patterns = [ re2.compile( r"^(?:" + r"(?:" @@ -767,8 +767,6 @@ def _check_that_case_is_consistent(text: str) -> bool: bool: True if all alphabetic initial letters are consistently upper or lower case, False if mixed case is detected """ - import re - # Split on whitespace and commas using proper regex tokens = re.split(r"[\s,]+", text) From 2c801ce21812f48d8f5ea26bbe96ed46d3f184b6 Mon Sep 17 00:00:00 2001 From: Stefano Braghin <527806+stefano81@users.noreply.github.com> Date: Sun, 17 May 2026 10:49:42 +0100 Subject: [PATCH 2/5] fix: clean up more Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com> --- src/risk_assessment/classification/identifiers/age.py | 4 ++-- tests/classification/unstructured/test_aggregator.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/risk_assessment/classification/identifiers/age.py b/src/risk_assessment/classification/identifiers/age.py index 26a3f91..7c37b40 100644 --- a/src/risk_assessment/classification/identifiers/age.py +++ b/src/risk_assessment/classification/identifiers/age.py @@ -140,8 +140,8 @@ class AgeImproved(Identifier): compile(r"^deceased\s+([0-9]+)$", I | U), compile(r"^died\s+at\s+([0-9]+)$", I | U), compile(r"^died\s+([0-9]+)-old\s+age$", I | U), - compile(r"^died\s+of\s+([\w|'|-]+\s+){1,3}at\s+([0-9]+)$", I | U), - compile(r"^died\s+of\s+([\w|'|-]+\s+){1,3}at\s+age\s+(of\s+)?([0-9]+)$", I | U), + compile(r"^died\s+of\s+([\w'|-]+\s+){1,3}at\s+([0-9]+)$", I | U), + compile(r"^died\s+of\s+([\w'|-]+\s+){1,3}at\s+age\s+(of\s+)?([0-9]+)$", I | U), compile(r"^passed\s+away\s+at\s+age\s+([0-9]+)$", I | U), ] diff --git a/tests/classification/unstructured/test_aggregator.py b/tests/classification/unstructured/test_aggregator.py index fe8d310..988f842 100644 --- a/tests/classification/unstructured/test_aggregator.py +++ b/tests/classification/unstructured/test_aggregator.py @@ -333,7 +333,6 @@ def test_aggregation_different_tokenizers(): entities1 = [Entity(len("my_email is: "), len(data), "Email", frozenset(["DRL"]))] entities2 = [Entity(len("my_email is: "), len(data), "URI", frozenset(["STANZA"]))] - entities3 = [Entity(len("my_email is: "), len("my_email is: john"), "NAME", frozenset(["DRL2"]))] entities3 = [ Entity(len("my_email is: john"), len("my_email is: john.doe"), "NAME", frozenset(["SPACY"])), Entity(len("my_email is: john.doe"), len(data), "URI", frozenset(["Spacy"])), From 0e9cb1bcdc8e0748240be466cfe396b851d780dc Mon Sep 17 00:00:00 2001 From: Stefano Braghin <527806+stefano81@users.noreply.github.com> Date: Sun, 17 May 2026 10:56:46 +0100 Subject: [PATCH 3/5] fix: typo Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com> --- src/risk_assessment/classification/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/risk_assessment/classification/__init__.py b/src/risk_assessment/classification/__init__.py index 60ebdc0..503d4e6 100644 --- a/src/risk_assessment/classification/__init__.py +++ b/src/risk_assessment/classification/__init__.py @@ -130,7 +130,7 @@ class DatasetClassificationConfiguration: >>> config = DatasetClassificationConfiguration( ... identifiers=[Email(), Phone()], ... mark_unknown=True, - ... unknonw_type="UNKNOWN" + ... unknown_type="UNKNOWN" ... ) """ @@ -139,7 +139,7 @@ def __init__( identifiers: list[Identifier | str], strategy: DatasetClassificationStrategy = FrequencyBasedDatasetClassificationStrategy(), mark_unknown: bool = True, - unknonw_type: str = "UNKNOWN", + unknown_type: str = "UNKNOWN", ) -> None: """Initialize the classification configuration. @@ -147,12 +147,12 @@ def __init__( identifiers: List of Identifier instances or fully qualified name strings. strategy: Classification strategy to use (default: frequency-based). mark_unknown: Whether to mark unidentified values as unknown (default: True). - unknonw_type: Label for unknown values (default: "UNKNOWN"). + unknown_type: Label for unknown values (default: "UNKNOWN"). """ self.identifiers = build_identifiers(identifiers) self.strategy = strategy self.mark_unknown = mark_unknown - self.unknown_type = unknonw_type + self.unknown_type = unknown_type @dataclass From 21a0df4e4747fe77dc913fea15af9b931693597e Mon Sep 17 00:00:00 2001 From: Stefano Braghin <527806+stefano81@users.noreply.github.com> Date: Sun, 17 May 2026 10:57:38 +0100 Subject: [PATCH 4/5] fix: typo Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com> --- .../classification/identifiers/geography.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/src/risk_assessment/classification/identifiers/geography.py b/src/risk_assessment/classification/identifiers/geography.py index 7f844a8..53a6182 100644 --- a/src/risk_assessment/classification/identifiers/geography.py +++ b/src/risk_assessment/classification/identifiers/geography.py @@ -16,19 +16,6 @@ logger = logging.getLogger(__name__) -def _extract_all_langugage_city_names(file: str) -> list[str]: - """Extract city names from a multi-language file. - - Args: - file: Path to the file containing city names. - - Returns: - List of city names. - """ - with (Path(__file__).parent / file).open("r") as stream: - return [line.strip() for line in stream.readlines()] - - def _extract_city_names(file: str) -> list[str]: """Extract city names from a CSV file. @@ -683,7 +670,7 @@ def __init__(self) -> None: "GU", "Guam", "VI", - "Vigin Islands", + "Virgin Islands", "PR", "Puerto Rico", "FM", From ec09e6cb04da1e47b2fdd8d7b68c2403a6ab233c Mon Sep 17 00:00:00 2001 From: Stefano Braghin <527806+stefano81@users.noreply.github.com> Date: Sun, 17 May 2026 11:03:22 +0100 Subject: [PATCH 5/5] fix: typos Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com> --- .../classification/identifiers/geography.py | 13 +++++++++++++ .../classification/identifiers/us_postal_address.py | 8 +++----- tests/classification/identifiers/test_geography.py | 4 ++-- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/src/risk_assessment/classification/identifiers/geography.py b/src/risk_assessment/classification/identifiers/geography.py index 53a6182..0caea36 100644 --- a/src/risk_assessment/classification/identifiers/geography.py +++ b/src/risk_assessment/classification/identifiers/geography.py @@ -16,6 +16,19 @@ logger = logging.getLogger(__name__) +def _extract_all_language_city_names(file: str) -> list[str]: + """Extract city names from a multi-language file. + + Args: + file: Path to the file containing city names. + + Returns: + List of city names. + """ + with (Path(__file__).parent / file).open("r") as stream: + return [line.strip() for line in stream.readlines()] + + def _extract_city_names(file: str) -> list[str]: """Extract city names from a CSV file. diff --git a/src/risk_assessment/classification/identifiers/us_postal_address.py b/src/risk_assessment/classification/identifiers/us_postal_address.py index d36cc03..ce16807 100644 --- a/src/risk_assessment/classification/identifiers/us_postal_address.py +++ b/src/risk_assessment/classification/identifiers/us_postal_address.py @@ -1,5 +1,3 @@ -import re - import re2 from risk_assessment.classification.identifiers import Identifier @@ -633,7 +631,7 @@ r"|TX|Texas" r"|UT|Utah" r"|VT|Vermont" - r"|VA|Virginia[H]" + r"|VA|Virginia" r"|WA|Washington" r"|WV|West Virginia" r"|WI|Wisconsin" @@ -705,7 +703,7 @@ def __init__(self) -> None: + r",?(?:\s+\w{3,})+,?(?:\s+\w{3,})+,?\s+" + STATE_AND_POSSESSIONS + r")" - r"|(?:\w{3,}(:?\s+\w{3,})*,?\s+" + STATE_AND_POSSESSIONS + r",?\s+" + ZIP_CODE + r"(?:\s+\w{2,})+)" + r"|(?:\w{3,}(?:\s+\w{3,})*,?\s+" + STATE_AND_POSSESSIONS + r",?\s+" + ZIP_CODE + r"(?:\s+\w{2,})+)" r"|(?:\d+(?:\s+\w{3,})*(?:\s+" + SUFFIX + r")?,?\s+" @@ -768,7 +766,7 @@ def _check_that_case_is_consistent(text: str) -> bool: False if mixed case is detected """ # Split on whitespace and commas using proper regex - tokens = re.split(r"[\s,]+", text) + tokens = re2.split(r"[\s,]+", text) upper_count = 0 lower_count = 0 diff --git a/tests/classification/identifiers/test_geography.py b/tests/classification/identifiers/test_geography.py index 4375cff..8567190 100644 --- a/tests/classification/identifiers/test_geography.py +++ b/tests/classification/identifiers/test_geography.py @@ -4,7 +4,7 @@ from risk_assessment.classification.identifiers.geography import ( UKPostCode, UnitedStateState, - _extract_all_langugage_city_names, + _extract_all_language_city_names, ) @@ -182,7 +182,7 @@ def test_uk_postcode_suppors_for_known_formats(): def test_all_city_names(): - identifier = City("data/all_language_city_names.txt", _extract_all_langugage_city_names) + identifier = City("data/all_language_city_names.txt", _extract_all_language_city_names) assert len(identifier.data) == 930425, len(identifier.data)