From 662b7feefb87f7049e44ddd71aa10418c90618da Mon Sep 17 00:00:00 2001
From: Jeff Rose
Date: Wed, 4 Feb 2026 14:46:30 -0500
Subject: [PATCH 1/4] Add an address parsing method using usaddress

---
 src/stac_utils/address.py | 69 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 src/stac_utils/address.py

diff --git a/src/stac_utils/address.py b/src/stac_utils/address.py
new file mode 100644
index 0000000..0a1d225
--- /dev/null
+++ b/src/stac_utils/address.py
@@ -0,0 +1,69 @@
+"""Split a free-form US address string into street/city/state/zip parts."""
+
+import logging
+
+import usaddress
+
+logger = logging.getLogger(__name__)
+
+# usaddress labels that belong on the street line, in the order they are
+# emitted. PlaceName/StateName/ZipCode are mapped separately below.
+_STREET_LABELS = (
+    "AddressNumberPrefix",
+    "AddressNumber",
+    "AddressNumberSuffix",
+    "StreetNamePreDirectional",
+    "StreetNamePreModifier",
+    "StreetNamePreType",
+    "StreetName",
+    "StreetNamePostType",
+    "StreetNamePostDirectional",
+    "SubaddressType",
+    "SubaddressIdentifier",
+    "OccupancyType",
+    "OccupancyIdentifier",
+    # PO Box style addresses carry no StreetName at all.
+    "USPSBoxType",
+    "USPSBoxID",
+    "USPSBoxGroupType",
+    "USPSBoxGroupID",
+)
+
+# usaddress label -> key used in the returned dict.
+_LOCALITY_LABELS = {
+    "PlaceName": "city",
+    "StateName": "state",
+    "ZipCode": "zip",
+}
+
+
+def parse_address(address: str) -> dict[str, str]:
+    """
+    Parse a full address string into components.
+
+    Args:
+        address: A full address string (e.g., "123 Internet St, City, ST 12345")
+
+    Returns:
+        dict: Contains keys like 'street_address', 'city', 'state', 'zip' as
+            available. Empty when usaddress cannot tag the input.
+    """
+    try:
+        parsed, _ = usaddress.tag(address)
+    except Exception as e:
+        # Best effort by design: callers treat {} as "could not parse", so any
+        # tagging failure (e.g. usaddress.RepeatedLabelError) is logged, not raised.
+        logger.error("Failed to parse address '%s': %s", address, e)
+        return {}
+
+    result = {}
+
+    street_parts = [parsed[label] for label in _STREET_LABELS if label in parsed]
+    if street_parts:
+        result["street_address"] = " ".join(street_parts)
+
+    for label, key in _LOCALITY_LABELS.items():
+        if label in parsed:
+            result[key] = parsed[label]
+
+    return result
From 4d066d8249bb37fe91320ac09c5373e027bee981 Mon Sep 17 00:00:00 2001
From: Jeff Rose
Date: Wed, 4 Feb 2026 14:46:41 -0500
Subject: [PATCH 2/4] add usaddress to dependencies

---
 requirements.txt | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index fd0bf5f..b222733 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ psycopg selenium webdriver-manager sphinx -pre-commit \ No newline at end of file +pre-commit +usaddress \ No newline at end of file From dcaf7b512bd26a58b941f9789f6c59b92970cb9b Mon Sep 17 00:00:00 2001 From: Jeff Rose Date: Wed, 4 Feb 2026 14:47:00 -0500 Subject: [PATCH 3/4] add support for new fields --- src/stac_utils/convert.py | 28 ++++++++++ src/stac_utils/ngpvan.py | 104 +++++++++++++++++++++++++++++++------- 2 files changed, 114 insertions(+), 18 deletions(-) diff --git a/src/stac_utils/convert.py b/src/stac_utils/convert.py index 368ec3e..12d433e 100644 --- a/src/stac_utils/convert.py +++ b/src/stac_utils/convert.py @@ -1,5 +1,6 @@ import re +from typing import Any, Optional def _convert(camel_input: str) -> str: # from https://stackoverflow.com/a/46493824 @@ -35,3 +36,30 @@ def strip_dict(full_dict: dict): :param full_dict: dict to clean up :return: dict without None values""" return {k: v for k, v in full_dict.items() if v is not None} + +def get_first_value(row: dict, keys: list[str]) -> tuple[Optional[Any], Optional[str]]: + """ + Get the first non-None/non-empty value from a list of possible keys. + + Returns: + tuple: (value, key_used) or (None, None) if no value found. + """ + for key in keys: + value = row.get(key) + if value is not None and value != "": + return value, key + return None, None + + +def get_all_values(row: dict, keys: list[str]) -> dict[str, Any]: + """ + Get all non-None/non-empty values from a list of possible keys. + + Returns: + dict: {key: value} for all keys that have values. 
+    """
+    return {
+        key: value
+        for key in keys
+        if (value := row.get(key)) is not None and value != ""
+    }
diff --git a/src/stac_utils/ngpvan.py b/src/stac_utils/ngpvan.py
index 1c0179d..97fac5c 100644
--- a/src/stac_utils/ngpvan.py
+++ b/src/stac_utils/ngpvan.py
@@ -3,7 +3,10 @@ import os
 
 import requests
 
-from .convert import convert_to_snake_case, strip_dict
+from .listify import listify
+from .address import parse_address
+
+from .convert import convert_to_snake_case, strip_dict, get_first_value, get_all_values
 from .http import HTTPClient
 
 logger = logging.getLogger(__name__)
@@ -178,36 +181,101 @@ def format_person_json(row: dict, id_key: str, has_identifier: bool) -> dict:
         else:
             print("No ID key used")
 
-        if row.get("email"):
-            formatted_json["emails"] = [{"email": row.get("email").strip()}]
+        emails = None
+
+        # str() must wrap the value before .strip(); a non-str value has no .strip().
+        if row.get("emails"):
+            emails = listify(str(row.get("emails")).strip())
+        else:
+            email_keys = ["email", "email_address"]
+            email_value, _ = get_first_value(row, email_keys)
+
+            if email_value:
+                emails = listify(str(email_value).strip())
+
+        if emails:
+            formatted_json["emails"] = [{"email": e.strip()} for e in emails]
+
+        phones = None
 
-        if row.get("phone"):
+        if row.get("phones"):
+            phones = listify(str(row.get("phones")).strip())
+        else:
+            phone_keys = ["phone", "phone_number"]
+            phone_value, _ = get_first_value(row, phone_keys)
+
+            if phone_value:
+                phones = listify(str(phone_value).strip())
+
+        # Drop only a trailing ".0"; replace() would also eat ".0" inside a number.
+        if phones:
             formatted_json["phones"] = [
-                {"phoneNumber": str(row.get("phone")).replace(".0", "")}
+                {"phoneNumber": p[:-2] if p.endswith(".0") else p} for p in phones
             ]
 
         if row.get("middle_name"):
             formatted_json["middleName"] = row.get("middle_name")
 
+        if row.get("suffix"):
+            formatted_json["suffix"] = row.get("suffix")
+
         address = {}
 
-        if row.get("street_address"):
-            address["addressLine1"] = row.get("street_address")
+        street_address_keys = ["street_address", "address", "address1", "address_1"]
+        street_values = get_all_values(row, street_address_keys)
 
-        if row.get("city"):
-            address["city"] = row.get("city")
+        if len(street_values) > 1:
+            logger.warning(
+                f"Multiple street address fields provided: {list(street_values.keys())}. "
+                f"Using first found value."
+            )
 
-        if row.get("state") or row.get("stateOrProvince"):
-            if row.get("state"):
-                address["stateOrProvince"] = row.get("state")
-            else:
-                address["stateOrProvince"] = row.get("stateOrProvince")
+        street_address, _ = get_first_value(row, street_address_keys)
+
+        has_city = row.get("city")
+        has_state = row.get("state") or row.get("stateOrProvince")
+        has_zip = row.get("zip") or row.get("zipOrPostalCode")
+
+        # A lone street field is parsed; explicit city/state/zip are used as given.
+        if street_address and not (has_city or has_state or has_zip):
+            parsed = parse_address(str(street_address))
+
+            # Fall back to the raw value so a failed parse never drops line 1.
+            address["addressLine1"] = parsed.get('street_address') or street_address
+
+            if parsed.get('city'):
+                address["city"] = parsed['city']
+
+            if parsed.get('state'):
+                address["stateOrProvince"] = parsed['state']
+
+            if parsed.get('zip'):
+                address["zipOrPostalCode"] = parsed['zip']
+        else:
+            if street_address:
+                address["addressLine1"] = street_address
+
+            if row.get("city"):
+                address["city"] = row.get("city")
+
+            if row.get("state") or row.get("stateOrProvince"):
+                address["stateOrProvince"] = row.get("state") or row.get("stateOrProvince")
+
+            if row.get("zip") or row.get("zipOrPostalCode"):
+                address["zipOrPostalCode"] = row.get("zip") or row.get("zipOrPostalCode")
+
+        # Handle addressLine2 (multiple aliases supported)
+        address2_keys = ["address2", "address_2", "street_address_2"]
+        address2_value, address2_key = get_first_value(row, address2_keys)
 
-        if row.get("zip") or row.get("zipOrPostalCode"):
-            if row.get("zip"):
-                address["zipOrPostalCode"] = row.get("zip")
+        if address2_value:
+            if not address.get("addressLine1"):
+                logger.error(
+                    f"addressLine2 field '{address2_key}' provided without addressLine1. 
" + f"Value: '{address2_value}'" + ) else: - address["zipOrPostalCode"] = row.get("zipOrPostalCode") + address["addressLine2"] = address2_value if address: formatted_json["addresses"] = [address] From 27e8ba34cf1fe706c092126865f9783188402c46 Mon Sep 17 00:00:00 2001 From: Jeff Rose Date: Wed, 4 Feb 2026 14:47:12 -0500 Subject: [PATCH 4/4] tests from new fields --- src/tests/test_ngpvan.py | 568 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 568 insertions(+) diff --git a/src/tests/test_ngpvan.py b/src/tests/test_ngpvan.py index dc52749..d9681bd 100644 --- a/src/tests/test_ngpvan.py +++ b/src/tests/test_ngpvan.py @@ -380,6 +380,574 @@ def test_format_person_json(self): }, ) + """ + Tests for expanded format_person_json functionality. + Copy these methods into the existing TestNGPVAN class. + """ + + # ========================================================================= + # Address Field Alias Tests + # ========================================================================= + + def test_format_person_json_address_alias(self): + """Test that 'address' field is treated as street_address""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "address": "123 Main St", + "city": "Clinton", + "state": "IA", + "zip": "12345", + }, + None, + False, + ) + self.assertEqual( + result["addresses"], + [ + { + "addressLine1": "123 Main St", + "city": "Clinton", + "stateOrProvince": "IA", + "zipOrPostalCode": "12345", + } + ], + ) + + def test_format_person_json_address1_alias(self): + """Test that 'address1' field is treated as street_address""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "address1": "456 Oak Ave", + "city": "Clinton", + "state": "IA", + "zip": "12345", + }, + None, + False, + ) + self.assertEqual(result["addresses"][0]["addressLine1"], "456 Oak 
Ave") + + def test_format_person_json_address_1_alias(self): + """Test that 'address_1' field is treated as street_address""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "address_1": "789 Pine Rd", + "city": "Clinton", + "state": "IA", + "zip": "12345", + }, + None, + False, + ) + self.assertEqual(result["addresses"][0]["addressLine1"], "789 Pine Rd") + + def test_format_person_json_multiple_address_fields_logs_warning(self): + """Test that providing multiple address fields logs a warning""" + from src.stac_utils.ngpvan import NGPVANClient + + with patch("src.stac_utils.ngpvan.logger") as mock_logger: + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "street_address": "123 Main St", + "address": "456 Other St", # Duplicate - should trigger warning + "city": "Clinton", + "state": "IA", + "zip": "12345", + }, + None, + False, + ) + mock_logger.warning.assert_called() + # Should still use the first valid value found + self.assertIn("addressLine1", result["addresses"][0]) + + # ========================================================================= + # Address Parsing Tests + # ========================================================================= + + def test_format_person_json_address_parsing_full_address(self): + """Test that full address is parsed when no city/state/zip provided""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "address": "123 Main St, Boston, MA 02101", + }, + None, + False, + ) + self.assertIn("addresses", result) + address = result["addresses"][0] + # The parsed address should contain these components + self.assertIn("addressLine1", address) + self.assertIn("city", address) + self.assertIn("stateOrProvince", address) + self.assertIn("zipOrPostalCode", address) + + def 
test_format_person_json_address_not_parsed_when_city_present(self): + """Test that address is not parsed when city is provided""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "address": "123 Main St, Boston, MA 02101", + "city": "Springfield", # City provided, so don't parse + }, + None, + False, + ) + # Address should be used as-is, not parsed + self.assertEqual( + result["addresses"][0]["addressLine1"], "123 Main St, Boston, MA 02101" + ) + self.assertEqual(result["addresses"][0]["city"], "Springfield") + + def test_format_person_json_address_not_parsed_when_state_present(self): + """Test that address is not parsed when state is provided""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "address": "123 Main St, Boston, MA 02101", + "state": "IL", + }, + None, + False, + ) + self.assertEqual( + result["addresses"][0]["addressLine1"], "123 Main St, Boston, MA 02101" + ) + self.assertEqual(result["addresses"][0]["stateOrProvince"], "IL") + + def test_format_person_json_address_not_parsed_when_zip_present(self): + """Test that address is not parsed when zip is provided""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "address": "123 Main St, Boston, MA 02101", + "zip": "99999", + }, + None, + False, + ) + self.assertEqual( + result["addresses"][0]["addressLine1"], "123 Main St, Boston, MA 02101" + ) + self.assertEqual(result["addresses"][0]["zipOrPostalCode"], "99999") + + # ========================================================================= + # Phone Field Alias and Comma-Delimited Tests + # ========================================================================= + + def test_format_person_json_phone_number_alias(self): + """Test that 'phone_number' field 
is treated as phone""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "phone_number": "555-123-4567", + }, + None, + False, + ) + self.assertEqual(result["phones"], [{"phoneNumber": "555-123-4567"}]) + + def test_format_person_json_multiple_phones_comma_delimited(self): + """Test that comma-delimited phones are parsed into multiple entries""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "phone": "555-111-1111, 555-222-2222, 555-333-3333", + }, + None, + False, + ) + self.assertEqual( + result["phones"], + [ + {"phoneNumber": "555-111-1111"}, + {"phoneNumber": "555-222-2222"}, + {"phoneNumber": "555-333-3333"}, + ], + ) + + def test_format_person_json_multiple_phones_with_spaces(self): + """Test that comma-delimited phones handle extra whitespace""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "phone": " 555-111-1111 , 555-222-2222 ", + }, + None, + False, + ) + self.assertEqual( + result["phones"], + [ + {"phoneNumber": "555-111-1111"}, + {"phoneNumber": "555-222-2222"}, + ], + ) + + def test_format_person_json_empty_phone_values_filtered(self): + """Test that empty values in comma-delimited phones are filtered out""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "phone": "555-111-1111, , , 555-222-2222", + }, + None, + False, + ) + self.assertEqual( + result["phones"], + [ + {"phoneNumber": "555-111-1111"}, + {"phoneNumber": "555-222-2222"}, + ], + ) + + # ========================================================================= + # Email Field Alias and Comma-Delimited Tests + # ========================================================================= + + 
def test_format_person_json_email_address_alias(self): + """Test that 'email_address' field is treated as email""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "email_address": "john@example.com", + }, + None, + False, + ) + self.assertEqual(result["emails"], [{"email": "john@example.com"}]) + + def test_format_person_json_multiple_emails_comma_delimited(self): + """Test that comma-delimited emails are parsed into multiple entries""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "email": "john@work.com, john@home.com, john@other.com", + }, + None, + False, + ) + self.assertEqual( + result["emails"], + [ + {"email": "john@work.com"}, + {"email": "john@home.com"}, + {"email": "john@other.com"}, + ], + ) + + def test_format_person_json_multiple_emails_with_spaces(self): + """Test that comma-delimited emails handle extra whitespace""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "email": " john@work.com , john@home.com ", + }, + None, + False, + ) + self.assertEqual( + result["emails"], + [ + {"email": "john@work.com"}, + {"email": "john@home.com"}, + ], + ) + + def test_format_person_json_empty_email_values_filtered(self): + """Test that empty values in comma-delimited emails are filtered out""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "email": "john@work.com, , , john@home.com", + }, + None, + False, + ) + self.assertEqual( + result["emails"], + [ + {"email": "john@work.com"}, + {"email": "john@home.com"}, + ], + ) + + # ========================================================================= + # Address Line 2 Tests + # 
========================================================================= + + def test_format_person_json_address2(self): + """Test that 'address2' field is treated as addressLine2""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "street_address": "123 Main St", + "address2": "Apt 4B", + "city": "Clinton", + "state": "IA", + "zip": "12345", + }, + None, + False, + ) + self.assertEqual(result["addresses"][0]["addressLine2"], "Apt 4B") + + def test_format_person_json_address_2_alias(self): + """Test that 'address_2' field is treated as addressLine2""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "street_address": "123 Main St", + "address_2": "Suite 100", + "city": "Clinton", + "state": "IA", + "zip": "12345", + }, + None, + False, + ) + self.assertEqual(result["addresses"][0]["addressLine2"], "Suite 100") + + def test_format_person_json_street_address_2_alias(self): + """Test that 'street_address_2' field is treated as addressLine2""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "street_address": "123 Main St", + "street_address_2": "Floor 5", + "city": "Clinton", + "state": "IA", + "zip": "12345", + }, + None, + False, + ) + self.assertEqual(result["addresses"][0]["addressLine2"], "Floor 5") + + def test_format_person_json_address2_without_address1_logs_error(self): + """Test that providing address2 without address1 logs an error""" + from src.stac_utils.ngpvan import NGPVANClient + + with patch("src.stac_utils.ngpvan.logger") as mock_logger: + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "address_2": "Apt 4B", # No address1 provided + "city": "Clinton", + "state": "IA", + "zip": "12345", + }, + None, + False, 
+ ) + mock_logger.error.assert_called() + # addressLine2 should NOT be included when addressLine1 is missing + self.assertNotIn("addressLine2", result["addresses"][0]) + + # ========================================================================= + # Suffix Tests + # ========================================================================= + + def test_format_person_json_suffix(self): + """Test that suffix field is included in output""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "suffix": "Jr.", + }, + None, + False, + ) + self.assertEqual(result["suffix"], "Jr.") + + def test_format_person_json_suffix_various_values(self): + """Test suffix with various common values""" + from src.stac_utils.ngpvan import NGPVANClient + + for suffix in ["Jr.", "Sr.", "III", "IV", "PhD", "MD", "Esq."]: + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "suffix": suffix, + }, + None, + False, + ) + self.assertEqual(result["suffix"], suffix) + + def test_format_person_json_no_suffix_when_empty(self): + """Test that suffix is not included when empty or None""" + from src.stac_utils.ngpvan import NGPVANClient + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "suffix": "", + }, + None, + False, + ) + self.assertNotIn("suffix", result) + + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "suffix": None, + }, + None, + False, + ) + self.assertNotIn("suffix", result) + + # ========================================================================= + # Combined/Integration Tests + # ========================================================================= + + def test_format_person_json_all_new_features_combined(self): + """Test using all new features together""" + from src.stac_utils.ngpvan import NGPVANClient + + result = 
NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "date_of_birth": "1984-01-01", + "middle_name": "Jacob", + "suffix": "Jr.", + "email_address": "john@work.com, john@home.com", + "phone_number": "555-111-1111, 555-222-2222", + "address1": "123 Main St", + "address_2": "Suite 500", + "city": "Clinton", + "state": "IA", + "zip": "12345", + }, + None, + False, + ) + + self.assertEqual(result["firstName"], "John") + self.assertEqual(result["lastName"], "Smith") + self.assertEqual(result["middleName"], "Jacob") + self.assertEqual(result["suffix"], "Jr.") + self.assertEqual( + result["emails"], + [{"email": "john@work.com"}, {"email": "john@home.com"}], + ) + self.assertEqual( + result["phones"], + [{"phoneNumber": "555-111-1111"}, {"phoneNumber": "555-222-2222"}], + ) + self.assertEqual( + result["addresses"], + [ + { + "addressLine1": "123 Main St", + "addressLine2": "Suite 500", + "city": "Clinton", + "stateOrProvince": "IA", + "zipOrPostalCode": "12345", + } + ], + ) + + def test_format_person_json_backwards_compatibility(self): + """Test that original functionality still works with original field names""" + from src.stac_utils.ngpvan import NGPVANClient + + # This is essentially the same as the existing test, ensuring backwards compatibility + result = NGPVANClient.format_person_json( + { + "first_name": "John", + "last_name": "Smith", + "date_of_birth": "1984-01-01", + "email": "foo@bar.com", + "phone": "817-555-1234", + "middle_name": "Jacob", + "street_address": "123 Main", + "city": "Clinton", + "state": "IA", + "zip": "12345", + }, + "van_id", + True, + ) + + self.assertEqual(result["firstName"], "John") + self.assertEqual(result["lastName"], "Smith") + self.assertEqual(result["emails"], [{"email": "foo@bar.com"}]) + self.assertEqual(result["phones"], [{"phoneNumber": "817-555-1234"}]) + self.assertEqual(result["addresses"][0]["addressLine1"], "123 Main") + + def test_validate_phone(self): self.test_client.post = 
MagicMock(return_value={"findbyphone": "555-123-4567"}) self.assertEqual(