Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions model/countries/FR/FR-formatting-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@ formatting-rules:
- separator: "\n"
- locality2
- separator: "\n"
- postal-code
- separator: " "
- locality1
- postal-code-and-city
- separator: "\n"
- admin-area1
- separator: "\n"
Expand All @@ -24,6 +22,11 @@ formatting-rules:
- separator: " "
- street

postal-code-and-city:
- postal-code
- separator: " "
- locality1

examples:
- id: name
comment: |
Expand Down
15 changes: 14 additions & 1 deletion model/countries/FR/FR-model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,19 @@ cut-off-tokens:
- locality4

extra-definitions:
address:
- street-address
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Drive-by comment:

What's the reasoning for making both street-address and street-address-alternative-1 children of address, and then skipping street-address in the formatting rules?

Could we simply omit street-address altogether instead?

- street-address-alternative-1
- locality2
- postal-code-and-city
- admin-area1
- country
- country-name

street-address-alternative-1:
- building-location
- address-overflow
- address-overflow

postal-code-and-city:
- postal-code
- locality1
78 changes: 77 additions & 1 deletion model/countries/FR/FR-parsing-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ regex_definitions:
kCommaOrNewlineSeparator:
regex_fragment: '(?:, |\n|\r|,)+'

# Matches exactly five consecutive digits delimited by word boundaries.
# Referenced by the postal-code captures in ParsePostalCodeThenCity and
# ParseCityThenPostalCode.
kZipValueRe:
regex_fragment: '\b\d{5}\b'

capture_definitions:
ParseBuildingLocation:
capture:
Expand Down Expand Up @@ -47,13 +50,44 @@ capture_definitions:
parts: [ {regex_fragment: '(?:[^\r\n]+)'} ]
quantifier: MATCH_OPTIONAL

# Decomposes a postal-code-and-city value written postal code first:
# a kZipValueRe match captured as postal-code, then a
# kCommaOrWhitespaceSeparator separator, then a kMultipleWordsRe match
# captured as locality1.
# NOTE(review): kCommaOrWhitespaceSeparator and kMultipleWordsRe are defined
# outside the visible hunk - confirm they behave as their names suggest.
ParsePostalCodeThenCity:
capture:
output: postal-code-and-city
parts:
- capture:
output: postal-code
parts: [ {regex_reference: kZipValueRe} ]
- separator: {regex_reference: kCommaOrWhitespaceSeparator}
- capture:
output: locality1
parts: [ {regex_reference: kMultipleWordsRe} ]

# Decomposes a postal-code-and-city value written city first:
# a kMultipleWordsRe match captured as locality1, then a
# kCommaOrWhitespaceSeparator separator, then a kZipValueRe match captured
# as postal-code. Same parts as ParsePostalCodeThenCity, in reverse order.
ParseCityThenPostalCode:
capture:
output: postal-code-and-city
parts:
- capture:
output: locality1
parts: [ {regex_reference: kMultipleWordsRe} ]
- separator: {regex_reference: kCommaOrWhitespaceSeparator}
- capture:
output: postal-code
parts: [ {regex_reference: kZipValueRe} ]

parsing_definitions:
building-location:
decomposition:
capture_reference: ParseBuildingLocation
street-address-alternative-1:
decomposition:
capture_reference: StreetAddressDecomposition
# Lists two alternative decompositions for postal-code-and-city: the
# postal-code-first capture, then the city-first capture.
# NOTE(review): presumably the cascade tries the alternatives in listed order
# and keeps the first one that matches - confirm against the parser.
postal-code-and-city:
decomposition_cascade:
alternatives:
- decomposition:
capture_reference: ParsePostalCodeThenCity
- decomposition:
capture_reference: ParseCityThenPostalCode

test_parsing_definitions:
- id: "Test 1"
Expand Down Expand Up @@ -155,4 +189,46 @@ test_parsing_definitions:
building-location: "1661 Place Charles de Gaulle"
street: "Place Charles de Gaulle"
building: "1661"
address-overflow: "Floor 5, Apartment 2"
address-overflow: "Floor 5, Apartment 2"
# Postal code first, hyphenated city name containing an apostrophe.
- id: "Test 13"
type: postal-code-and-city
input: "59491 Villeneuve-d'Ascq"
output:
postal-code-and-city: "59491 Villeneuve-d'Ascq"
postal-code: "59491"
locality1: "Villeneuve-d'Ascq"
# City first, single-word city name.
# NOTE(review): the expected postal-code-and-city here is reordered to
# "75002 Paris", whereas Tests 16 and 17 (also city-first inputs) expect the
# value to keep the input order. Confirm which expectation is intended.
- id: "Test 14"
type: postal-code-and-city
input: "Paris 75002"
output:
postal-code-and-city: "75002 Paris"
locality1: "Paris"
postal-code: "75002"
# Postal code first, hyphenated city name.
- id: "Test 15"
type: postal-code-and-city
input: "69120 Vaulx-en-Velin"
output:
postal-code-and-city: "69120 Vaulx-en-Velin"
postal-code: "69120"
locality1: "Vaulx-en-Velin"
# City first, hyphenated city name; expected value keeps the input order.
- id: "Test 16"
type: postal-code-and-city
input: "Aix-en-Provence 13100"
output:
postal-code-and-city: "Aix-en-Provence 13100"
locality1: "Aix-en-Provence"
postal-code: "13100"
# City first, city name made of several space-separated words.
- id: "Test 17"
type: postal-code-and-city # Matches the key in parsing_definitions
input: "Val de Moder 67350"
output:
postal-code-and-city: "Val de Moder 67350"
locality1: "Val de Moder"
postal-code: "67350"
# Postal code first, abbreviated city name with a period and an accented
# letter.
- id: "Test 18"
type: postal-code-and-city # Matches the key in parsing_definitions
input: "42000 St. Étienne"
output:
postal-code-and-city: "42000 St. Étienne"
locality1: "St. Étienne"
postal-code: "42000"