From 4a78883ce45ae5ba10dd663be79c0caa56fecdc6 Mon Sep 17 00:00:00 2001
From: Stefano Braghin <527806+stefano81@users.noreply.github.com>
Date: Sat, 16 May 2026 22:19:49 +0100
Subject: [PATCH 1/5] fix: hoist re import in us_postal_address to module level

Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com>
---
 .../classification/identifiers/us_postal_address.py         | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/risk_assessment/classification/identifiers/us_postal_address.py b/src/risk_assessment/classification/identifiers/us_postal_address.py
index e829c4e..d36cc03 100644
--- a/src/risk_assessment/classification/identifiers/us_postal_address.py
+++ b/src/risk_assessment/classification/identifiers/us_postal_address.py
@@ -1,4 +1,4 @@
-from re import Pattern
+import re
 
 import re2
 
@@ -650,7 +650,7 @@ class USPostalAddress(Identifier):
     def __init__(self) -> None:
         options = re2.Options()
         options.case_sensitive = False
-        self.patterns: list[Pattern[str]] = [
+        self.patterns = [
             re2.compile(
                 r"^(?:"
                 + r"(?:"
@@ -767,8 +767,6 @@ def _check_that_case_is_consistent(text: str) -> bool:
         bool: True if all alphabetic initial letters are consistently upper or lower case,
               False if mixed case is detected
     """
-    import re
-
     # Split on whitespace and commas using proper regex
     tokens = re.split(r"[\s,]+", text)
 

From 2c801ce21812f48d8f5ea26bbe96ed46d3f184b6 Mon Sep 17 00:00:00 2001
From: Stefano Braghin <527806+stefano81@users.noreply.github.com>
Date: Sun, 17 May 2026 10:49:42 +0100
Subject: [PATCH 2/5] fix: dedupe age regex char class, drop dead test assignment

Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com>
---
 src/risk_assessment/classification/identifiers/age.py | 4 ++--
 tests/classification/unstructured/test_aggregator.py  | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/risk_assessment/classification/identifiers/age.py b/src/risk_assessment/classification/identifiers/age.py
index 26a3f91..7c37b40 100644
--- a/src/risk_assessment/classification/identifiers/age.py
+++ b/src/risk_assessment/classification/identifiers/age.py
@@ -140,8 +140,8 @@ class AgeImproved(Identifier):
         compile(r"^deceased\s+([0-9]+)$", I | U),
         compile(r"^died\s+at\s+([0-9]+)$", I | U),
         compile(r"^died\s+([0-9]+)-old\s+age$", I | U),
-        compile(r"^died\s+of\s+([\w|'|-]+\s+){1,3}at\s+([0-9]+)$", I | U),
-        compile(r"^died\s+of\s+([\w|'|-]+\s+){1,3}at\s+age\s+(of\s+)?([0-9]+)$", I | U),
+        compile(r"^died\s+of\s+([\w'|-]+\s+){1,3}at\s+([0-9]+)$", I | U),
+        compile(r"^died\s+of\s+([\w'|-]+\s+){1,3}at\s+age\s+(of\s+)?([0-9]+)$", I | U),
         compile(r"^passed\s+away\s+at\s+age\s+([0-9]+)$", I | U),
     ]
 
diff --git a/tests/classification/unstructured/test_aggregator.py b/tests/classification/unstructured/test_aggregator.py
index fe8d310..988f842 100644
--- a/tests/classification/unstructured/test_aggregator.py
+++ b/tests/classification/unstructured/test_aggregator.py
@@ -333,7 +333,6 @@ def test_aggregation_different_tokenizers():
 
     entities1 = [Entity(len("my_email is: "), len(data), "Email", frozenset(["DRL"]))]
     entities2 = [Entity(len("my_email is: "), len(data), "URI", frozenset(["STANZA"]))]
-    entities3 = [Entity(len("my_email is: "), len("my_email is: john"), "NAME", frozenset(["DRL2"]))]
     entities3 = [
         Entity(len("my_email is: john"), len("my_email is: john.doe"), "NAME", frozenset(["SPACY"])),
         Entity(len("my_email is: john.doe"), len(data), "URI", frozenset(["Spacy"])),

From 0e9cb1bcdc8e0748240be466cfe396b851d780dc Mon Sep 17 00:00:00 2001
From: Stefano Braghin <527806+stefano81@users.noreply.github.com>
Date: Sun, 17 May 2026 10:56:46 +0100
Subject: [PATCH 3/5] fix: rename unknonw_type parameter to unknown_type

Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com>
---
 src/risk_assessment/classification/__init__.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/risk_assessment/classification/__init__.py b/src/risk_assessment/classification/__init__.py
index 60ebdc0..503d4e6 100644
--- a/src/risk_assessment/classification/__init__.py
+++ b/src/risk_assessment/classification/__init__.py
@@ -130,7 +130,7 @@ class DatasetClassificationConfiguration:
         >>> config = DatasetClassificationConfiguration(
         ...     identifiers=[Email(), Phone()],
         ...     mark_unknown=True,
-        ...     unknonw_type="UNKNOWN"
+        ...     unknown_type="UNKNOWN"
         ... )
     """
 
@@ -139,7 +139,7 @@ def __init__(
         identifiers: list[Identifier | str],
         strategy: DatasetClassificationStrategy = FrequencyBasedDatasetClassificationStrategy(),
         mark_unknown: bool = True,
-        unknonw_type: str = "UNKNOWN",
+        unknown_type: str = "UNKNOWN",
     ) -> None:
         """Initialize the classification configuration.
 
@@ -147,12 +147,12 @@ def __init__(
             identifiers: List of Identifier instances or fully qualified name strings.
             strategy: Classification strategy to use (default: frequency-based).
             mark_unknown: Whether to mark unidentified values as unknown (default: True).
-            unknonw_type: Label for unknown values (default: "UNKNOWN").
+            unknown_type: Label for unknown values (default: "UNKNOWN").
         """
         self.identifiers = build_identifiers(identifiers)
         self.strategy = strategy
         self.mark_unknown = mark_unknown
-        self.unknown_type = unknonw_type
+        self.unknown_type = unknown_type
 
 
 @dataclass

From 21a0df4e4747fe77dc913fea15af9b931693597e Mon Sep 17 00:00:00 2001
From: Stefano Braghin <527806+stefano81@users.noreply.github.com>
Date: Sun, 17 May 2026 10:57:38 +0100
Subject: [PATCH 4/5] fix: correct "Virgin Islands" spelling, drop misspelled helper

Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com>
---
 .../classification/identifiers/geography.py       | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/src/risk_assessment/classification/identifiers/geography.py b/src/risk_assessment/classification/identifiers/geography.py
index 7f844a8..53a6182 100644
--- a/src/risk_assessment/classification/identifiers/geography.py
+++ b/src/risk_assessment/classification/identifiers/geography.py
@@ -16,19 +16,6 @@
 logger = logging.getLogger(__name__)
 
 
-def _extract_all_langugage_city_names(file: str) -> list[str]:
-    """Extract city names from a multi-language file.
-
-    Args:
-        file: Path to the file containing city names.
-
-    Returns:
-        List of city names.
-    """
-    with (Path(__file__).parent / file).open("r") as stream:
-        return [line.strip() for line in stream.readlines()]
-
-
 def _extract_city_names(file: str) -> list[str]:
     """Extract city names from a CSV file.
 
@@ -683,7 +670,7 @@ def __init__(self) -> None:
                 "GU",
                 "Guam",
                 "VI",
-                "Vigin Islands",
+                "Virgin Islands",
                 "PR",
                 "Puerto Rico",
                 "FM",

From ec09e6cb04da1e47b2fdd8d7b68c2403a6ab233c Mon Sep 17 00:00:00 2001
From: Stefano Braghin <527806+stefano81@users.noreply.github.com>
Date: Sun, 17 May 2026 11:03:22 +0100
Subject: [PATCH 5/5] fix: typos in city-name helper and US postal address patterns

Signed-off-by: Stefano Braghin <527806+stefano81@users.noreply.github.com>
---
 .../classification/identifiers/geography.py         | 13 +++++++++++++
 .../classification/identifiers/us_postal_address.py |  8 +++-----
 tests/classification/identifiers/test_geography.py  |  4 ++--
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/risk_assessment/classification/identifiers/geography.py b/src/risk_assessment/classification/identifiers/geography.py
index 53a6182..0caea36 100644
--- a/src/risk_assessment/classification/identifiers/geography.py
+++ b/src/risk_assessment/classification/identifiers/geography.py
@@ -16,6 +16,19 @@
 logger = logging.getLogger(__name__)
 
 
+def _extract_all_language_city_names(file: str) -> list[str]:
+    """Read city names, one per line, from a multi-language file.
+
+    Args:
+        file: Path of the file, relative to this module's directory.
+
+    Returns:
+        Every line of the file, stripped of surrounding whitespace.
+    """
+    with (Path(__file__).parent / file).open("r") as stream:
+        return [line.strip() for line in stream.readlines()]
+
+
 def _extract_city_names(file: str) -> list[str]:
     """Extract city names from a CSV file.
 
diff --git a/src/risk_assessment/classification/identifiers/us_postal_address.py b/src/risk_assessment/classification/identifiers/us_postal_address.py
index d36cc03..ce16807 100644
--- a/src/risk_assessment/classification/identifiers/us_postal_address.py
+++ b/src/risk_assessment/classification/identifiers/us_postal_address.py
@@ -1,5 +1,3 @@
-import re
-
 import re2
 
 from risk_assessment.classification.identifiers import Identifier
@@ -633,7 +631,7 @@
     r"|TX|Texas"
     r"|UT|Utah"
     r"|VT|Vermont"
-    r"|VA|Virginia[H]"
+    r"|VA|Virginia"
     r"|WA|Washington"
     r"|WV|West Virginia"
     r"|WI|Wisconsin"
@@ -705,7 +703,7 @@ def __init__(self) -> None:
                 + r",?(?:\s+\w{3,})+,?(?:\s+\w{3,})+,?\s+"
                 + STATE_AND_POSSESSIONS
                 + r")"
-                r"|(?:\w{3,}(:?\s+\w{3,})*,?\s+" + STATE_AND_POSSESSIONS + r",?\s+" + ZIP_CODE + r"(?:\s+\w{2,})+)"
+                r"|(?:\w{3,}(?:\s+\w{3,})*,?\s+" + STATE_AND_POSSESSIONS + r",?\s+" + ZIP_CODE + r"(?:\s+\w{2,})+)"
                 r"|(?:\d+(?:\s+\w{3,})*(?:\s+"
                 + SUFFIX
                 + r")?,?\s+"
@@ -768,7 +766,7 @@ def _check_that_case_is_consistent(text: str) -> bool:
               False if mixed case is detected
     """
     # Split on whitespace and commas using proper regex
-    tokens = re.split(r"[\s,]+", text)
+    tokens = re2.split(r"[\s,]+", text)
 
     upper_count = 0
     lower_count = 0
diff --git a/tests/classification/identifiers/test_geography.py b/tests/classification/identifiers/test_geography.py
index 4375cff..8567190 100644
--- a/tests/classification/identifiers/test_geography.py
+++ b/tests/classification/identifiers/test_geography.py
@@ -4,7 +4,7 @@
 from risk_assessment.classification.identifiers.geography import (
     UKPostCode,
     UnitedStateState,
-    _extract_all_langugage_city_names,
+    _extract_all_language_city_names,
 )
 
 
@@ -182,7 +182,7 @@ def test_uk_postcode_suppors_for_known_formats():
 
 
 def test_all_city_names():
-    identifier = City("data/all_language_city_names.txt", _extract_all_langugage_city_names)
+    identifier = City("data/all_language_city_names.txt", _extract_all_language_city_names)
 
     assert len(identifier.data) == 930425, len(identifier.data)