From fa197fc0d1875cba9fd2fc2271039daeb2bc689e Mon Sep 17 00:00:00 2001 From: Salsi Shekiladze Date: Tue, 14 May 2024 16:18:52 +0200 Subject: [PATCH] Add changes to PL model according to the new findings in Data Mining by Raters --- docs/model/PL.html | 510 +++++++++++++++++++- model/countries/PL/PL-formatting-rules.yaml | 6 + model/countries/PL/PL-model.yaml | 3 +- model/countries/PL/PL-parsing-rules.yaml | 63 ++- 4 files changed, 572 insertions(+), 10 deletions(-) diff --git a/docs/model/PL.html b/docs/model/PL.html index 3c5f1db..9044bfa 100644 --- a/docs/model/PL.html +++ b/docs/model/PL.html @@ -709,6 +709,23 @@

Overview of concepts

+ +
+
+ + + address-overflow + - An overflow field for information that is not captured differently in a form + + + + + +
+ +
+ + @@ -1388,6 +1405,8 @@

Example addresses

+ + @@ -1479,6 +1498,12 @@

Example addresses

+ + address-overflow
Nějaké doplňující informace
+ + + + locality1
Warsaw
@@ -1556,6 +1581,7 @@

Example addresses

Output for "address":
ul. Warsaw 9/10
+Nějaké doplňující informace
 01-001 Warsaw
 Polska
@@ -1636,6 +1662,12 @@

Example addresses

+ + address-overflow
floor 5 apt 7
+ + + + locality1
Warsaw
@@ -1713,6 +1745,7 @@

Example addresses

Output for "address":
ul. Warsaw 9
+floor 5 apt 7
 01-001 Warsaw
 Polska
@@ -3715,7 +3748,7 @@

Formatting:

Flattened formatting:
address =
-streetbuilding/unit-typeunit-name
postal-codelocality1
country-name +streetbuilding/unit-typeunit-name
address-overflow
postal-codelocality1
country-name

@@ -3980,12 +4013,26 @@

Parsing:

+
+ Decomposition Cascade + + + + Cascade:
+
    + +
  1. + +
    Decomposition + (ParseStreetAddressWithOverflow) + Anchor beginning: True
    + Capture Reference: ParseStreetAddressWithOverflow
    @@ -4324,6 +4371,46 @@

    Parsing:

    +
    + + +
  2. + +
  3. + + +Regex Fragment: \n + + +
  4. + +
  5. + + +
    + + + Capture address-overflow + + (MATCH_REQUIRED) + + + + Parts:
    +
      + +
    • + + +Regex Fragment: [\s\S]+ + + +
    • + +
    + + +
    @@ -4342,6 +4429,382 @@

    Parsing:

+ + +
  • + + +
    + Decomposition + + (ParseStreetAddressWithoutOverflow) + + + Anchor beginning: True
    + + Capture Reference: ParseStreetAddressWithoutOverflow
    + + + +
    + + + Capture street-address-alternative-1 + + (MATCH_REQUIRED) + + + + Parts:
    +
      + +
    • + + +
      + Capture Reference: ParseBuildingLocation + + + + +
      + + + Capture building-location + + (MATCH_REQUIRED) + + + + Parts:
      +
        + +
      • + + +
        + + + Capture street + + (MATCH_REQUIRED) + + + Prefix: + + + + + Regex Reference: kStreetOptionalPrefixRe => + (?:(?:ulica|ul\.?|aleja|al\.?|plac|pl\.?|skwer|rondo|osiedle|boczna|bulwar|droga|rynek|szosa|zaulek)\s*)? + + + +
        + + + Parts:
        +
          + +
        • + + + + Regex Reference: kMultipleLazyWordsRe => + (?:[^\s,]+(?:[^\S\r\n]+[^\s,]+)*?) + + + +
        • + +
        + + + +
        + + +
      • + +
      • + + + +Separator: Regex Reference kWhitespaceSeparator => + (?:^|\s+) + + + +
      • + +
      • + + +
        + + + Capture building-and-unit + + (MATCH_REQUIRED) + + + + Parts:
        +
          + +
        • + + +
          + + + Capture building + + (MATCH_REQUIRED) + + + + Parts:
          +
            + +
          • + + + +
            + Regex Reference: kBuildingValueRe + + + + Regex concatenation +
              + +
            1. + + +Regex Fragment: \d+ + + +
            2. + +
            3. + + +Regex Fragment: (?: + + +
            4. + +
            5. + + +Regex Fragment: \s*[[:alpha:]]\b + + +
            6. + +
            7. + + +Regex Fragment: )? + + +
            8. + +
            + Wrap as non-capture group: False + + + +
            + + + +
          • + +
          + + + +
          + + +
        • + +
        • + + + +Separator: Regex Reference kHouseNumberAndUnitSeparator => + (?:^|[/\s]+) + + + +
        • + +
        • + + +
          + Capture Reference: ParseUnitWithOptionalPrefix + + + + +
          + + + Capture unit + + (MATCH_OPTIONAL) + + + + Parts:
          +
            + +
          • + + +
            + + + Capture unit-type + + (MATCH_OPTIONAL) + + + + Parts:
            +
              + +
            • + + + + Regex Reference: kUnitTypeLiteralRe => + (?:mieszkanie|m\.?|lokal|lok\.?|apartment|apt\.?)? + + + +
            • + +
            + + + +
            + + +
          • + +
          • + + + +Separator: + + +Regex Fragment: \s* + + + + + + +
          • + +
          • + + +
            + + + Capture unit-name + + (MATCH_REQUIRED) + + + + Parts:
            +
              + +
            • + + + + Regex Reference: kUnitNameValueRe => + (?:\d+\w?\b|\w\b) + + + +
            • + +
            + + + +
            + + +
          • + +
          + + + +
          + + + + +
          + + +
        • + +
        + + + +
        + + +
      • + +
      + + + +
      + + + + +
      + + +
    • + +
    + + + +
    + + + + + Anchor end: True
    +
    + + +
  • + + + + + @@ -4355,6 +4818,11 @@

    Children:

    building-location +
  • + + address-overflow +
  • + @@ -4367,13 +4835,13 @@

    Children:

    Formatting:

    street-address-alternative-1 = -building-location +building-locationaddress-overflow
    Flattened formatting:
    street-address-alternative-1 =
    -streetbuilding/unit-typeunit-name +streetbuilding/unit-typeunit-name
    address-overflow

    @@ -5446,6 +5914,42 @@

    Flattened formatting:
    + + + + + + + + +

    + # + + address-overflow +

    +
    + +An overflow field for information that is not captured differently in a form + + + + + + + + + + + + + + + + + + + + diff --git a/model/countries/PL/PL-formatting-rules.yaml b/model/countries/PL/PL-formatting-rules.yaml index 8d276c4..7c289f7 100644 --- a/model/countries/PL/PL-formatting-rules.yaml +++ b/model/countries/PL/PL-formatting-rules.yaml @@ -13,6 +13,8 @@ formatting-rules: street-address-alternative-1: - building-location + - separator: "\n" + - address-overflow building-location: - street @@ -44,6 +46,7 @@ examples: building: 9 unit: 10 building-and-unit: 9/10 + address-overflow: Nějaké doplňující informace locality1: Warsaw postal-code: 01-001 country: PL @@ -54,6 +57,7 @@ examples: show: true text: | ul. Warsaw 9/10 + Nějaké doplňující informace 01-001 Warsaw Polska @@ -64,6 +68,7 @@ examples: street: ul. Warsaw building: 9 building-and-unit: 9 + address-overflow: floor 5 apt 7 locality1: Warsaw postal-code: 01-001 country: PL @@ -74,5 +79,6 @@ examples: show: true text: | ul. 
Warsaw 9 + floor 5 apt 7 01-001 Warsaw Polska diff --git a/model/countries/PL/PL-model.yaml b/model/countries/PL/PL-model.yaml index 6648a0e..962f991 100644 --- a/model/countries/PL/PL-model.yaml +++ b/model/countries/PL/PL-model.yaml @@ -21,4 +21,5 @@ extra-definitions: - street - building-and-unit street-address-alternative-1: - - building-location \ No newline at end of file + - building-location + - address-overflow \ No newline at end of file diff --git a/model/countries/PL/PL-parsing-rules.yaml b/model/countries/PL/PL-parsing-rules.yaml index da58b1b..7c618a8 100644 --- a/model/countries/PL/PL-parsing-rules.yaml +++ b/model/countries/PL/PL-parsing-rules.yaml @@ -26,6 +26,25 @@ regex_definitions: regex_fragment: (?:^|[/\s]+) capture_definitions: + ParseStreetAddressWithOverflow: + capture: + output: street-address-alternative-1 + parts: + - capture_reference: ParseBuildingLocation + - regex_fragment: '\n' + - capture: + output: address-overflow + # Matches any non-empty string, including new-line characters. 
+ parts: [ {regex_fragment: '[\s\S]+'} ] + quantifier: MATCH_REQUIRED + + ParseStreetAddressWithoutOverflow: + capture: + output: street-address-alternative-1 + parts: + - capture_reference: ParseBuildingLocation + + ParseBuildingLocation: capture: output: building-location @@ -44,6 +63,7 @@ capture_definitions: - separator: {regex_reference: kHouseNumberAndUnitSeparator} - capture_reference: ParseUnitWithOptionalPrefix + ParseUnitWithOptionalPrefix: capture: output: unit @@ -65,11 +85,10 @@ parsing_definitions: capture_reference: ParseBuildingLocation street-address-alternative-1: - decomposition: - capture: - output: street-address-alternative-1 - parts: - - capture_reference: ParseBuildingLocation + decomposition_cascade: + alternatives: + - decomposition: {capture_reference: ParseStreetAddressWithOverflow} + - decomposition: {capture_reference: ParseStreetAddressWithoutOverflow} building-and-unit: decomposition: @@ -222,4 +241,36 @@ test_parsing_definitions: building: "9A" unit: "m.10" unit-type: "m." - unit-name: "10" \ No newline at end of file + unit-name: "10" +- id: "Test 15" + type: street-address-alternative-1 + input: "ul. Warsaw 9A\nthird entrance of building" + output: + street-address-alternative-1: "ul. Warsaw 9A\nthird entrance of building" + building-location: "ul. Warsaw 9A" + street: "Warsaw" + building-and-unit: "9A" + building: "9A" + address-overflow: "third entrance of building" +- id: "Test 16" + type: street-address-alternative-1 + input: "ul. Warsaw 9\nthird entrance of building" + output: + street-address-alternative-1: "ul. Warsaw 9\nthird entrance of building" + building-location: "ul. Warsaw 9" + street: "Warsaw" + building-and-unit: "9" + building: "9" + address-overflow: "third entrance of building" +- id: "Test 17" + type: street-address-alternative-1 + input: "ul. Warsaw 9/10\nNějaké doplňující informace" + output: + street-address-alternative-1: "ul. Warsaw 9/10\nNějaké doplňující informace" + building-location: "ul. 
Warsaw 9/10" + street: "Warsaw" + building-and-unit: "9/10" + building: "9" + unit: "10" + unit-name: "10" + address-overflow: "Nějaké doplňující informace" \ No newline at end of file