Skip to content

Commit 0514445

Browse files
authored
Fix: only include file pages that are referenced in the content of a page (#58)
* Moved RegExPatternExtended to osw.utils.strings; some example RegExPatternExtended usage is given.
* Fixed WtSite.create_page_package() to include all file pages that are referenced in the slot content of a page (that is packaged into a page package).
* Fix: fixed a failing test caused by the moved module.
1 parent fd4ea02 commit 0514445

9 files changed

Lines changed: 538 additions & 404 deletions

File tree

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from osw.utils.regex_pattern import REGEX_PATTERN_LIB
2+
3+
test_string = """
4+
{{Template:Viewer/Media
5+
| image_size = 300
6+
| mode = default
7+
| textdata = File:OSW420279d1be9640ad96e6685277a3f29b.png{{!}}Navigation menu;
8+
File:OSW0024d7c0a0d64642bf51f5facfe85d42{{!}}Search bar;
9+
File:OSW841b61b5996340fa81c3163bcea87482.png{{!}}Three points menu;
10+
File:OSWe8c6b659eab14cca927835ccd6baef15.png{{!}}Wiki preferences;
11+
File:OSWc0df6b35cc964307bbf3d18c78e5cb3e.png{{!}}Alerts;
12+
File:OSWd173625534d04fe6aab90a7bee4008e2.png{{!}}Notices;
13+
File:OSWc34ced55461949a59f950283e905b5fc.drawio.png{{!}}Personal menu;
14+
}}
15+
"""
16+
17+
second_test_string = r"""
18+
{
19+
"type": [
20+
"Category:OSW92cc6b1a2e6b4bb7bad470dfdcfdaf26"
21+
],
22+
"author": [],
23+
"uuid": "d00e7453-f1f3-4a6e-94e6-3c2664fb7776",
24+
"label": [
25+
{
26+
"text": "Full knowledge graph",
27+
"lang": "en"
28+
}
29+
],
30+
"description": [
31+
{
32+
"text": "Large graph displaying the full knowledge base",
33+
"lang": "en"
34+
}
35+
],
36+
"name": "FullKnowledgeGraph",
37+
"image": "File:OSWc34ced55461949a59f950283e905b5fc.drawio.png"
38+
}
39+
"""
40+
41+
my_dict = {
42+
"type": ["Category:OSW92cc6b1a2e6b4bb7bad470dfdcfdaf26"],
43+
"author": [],
44+
"uuid": "d00e7453-f1f3-4a6e-94e6-3c2664fb7776",
45+
"label": [{"text": "Full knowledge graph", "lang": "en"}],
46+
"description": [
47+
{"text": "Large graph displaying the full knowledge base", "lang": "en"}
48+
],
49+
"name": "FullKnowledgeGraph",
50+
"image": "File:OSWc34ced55461949a59f950283e905b5fc.drawio.png",
51+
}
52+
53+
my_dict_as_str = str(my_dict)
54+
55+
56+
# Test the regex pattern
57+
# Run the following code in a Python console, or run this script in an interactive console
58+
my_pattern = REGEX_PATTERN_LIB["File page strings from any text"]
59+
60+
search_result = my_pattern.search(test_string)
61+
62+
findall_result = my_pattern.findall(test_string)
63+
64+
full_page_names = my_pattern.findall_by_group_key(test_string, "Full page name")
65+
full_page_names_2 = my_pattern.findall_by_group_key(
66+
second_test_string, "Full page name"
67+
)

scripts/migration/file_page_migration.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@
77
from typing import Dict, List
88

99
import mwclient
10-
from regex_pattern import REGEX_PATTERN_LIB, REGEX_PATTERN_LIST
1110

1211
import osw.data.import_utility as iu
1312
import osw.wiki_tools as wt
1413
from osw.core import OSW
15-
from osw.data.mining import match_first_regex_pattern, test_regex_pattern
1614
from osw.model.entity import Label, WikiFile
15+
from osw.utils.regex_pattern import REGEX_PATTERN_LIB, REGEX_PATTERN_LIST
16+
from osw.utils.strings import match_first_regex_pattern, test_regex_pattern
1717
from osw.utils.util import parallelize
1818
from osw.wtsite import WtPage, WtSite
1919

src/osw/controller/page_package.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@
1111

1212
import osw.model.page_package as model
1313
from osw.auth import CredentialManager
14-
from osw.data.mining import RegExPatternExtended
1514
from osw.model import page_package as package
1615
from osw.model.page_package import NAMESPACE_CONST_TO_NAMESPACE_MAPPING
1716
from osw.model.static import OswBaseModel
17+
from osw.utils.strings import RegExPatternExtended
1818
from osw.wtsite import WtSite
1919

2020
# Definition of constants

src/osw/data/import_utility.py

Lines changed: 31 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -10,72 +10,20 @@
1010
from geopy import Nominatim
1111
from jsonpath_ng import ext as jp
1212

13-
import osw.data.mining as dm
13+
import osw.utils.strings as strutil
1414
from osw import wiki_tools as wt
1515
from osw.auth import CredentialManager
1616
from osw.core import OSW
17-
from osw.data.mining import RegExPatternExtended
1817
from osw.model import entity as model
18+
from osw.utils.regex_pattern import REGEX_PATTERN_LIB, REGEX_PATTERN_LIST
1919
from osw.wtsite import WtSite
2020

2121
# Constants
2222
PACKAGE_ROOT_PATH = Path(__file__).parents[2]
2323
CREDENTIALS_FILE_PATH_DEFAULT = PACKAGE_ROOT_PATH / "examples" / "accounts.pwd.yaml"
2424
ENABLE_SORTING = True
25-
REGEX_PATTERN: Dict[str, Union[str, Dict[str, str]]] = {
26-
"SAP OU number and name from DN": {
27-
"Pattern": r"CN=(.+)([0-9]{10})-(.+),OU=Abteilungen",
28-
"Groups": {2: "SAP OU number", 3: "SAP OU name"},
29-
},
30-
"Location name from DN": {
31-
"Pattern": r"CN=[A-Za-z]+-(\d+)_L_([^_]+),OU=Standorte",
32-
"Groups": {1: "SAP institute number", 2: "Location name"},
33-
},
34-
"Location/Site parts from DN": {
35-
"Pattern": r"CN=[A-Za-z]+-(\d+)_L_(([^_^ ^-]+)-([^_^ ]+) (\d+)),OU=Standorte",
36-
"Groups": {
37-
1: "SAP institute number",
38-
2: "Site name",
39-
3: "City",
40-
4: "Street",
41-
5: "House number",
42-
},
43-
},
44-
"UUID from full page title": {
45-
"Pattern": r"([A-Za-z]+):([A-Z]+)([a-z\d\-]+)",
46-
"Groups": {1: "Namespace", 2: "Prefix", 3: "UUID"},
47-
},
48-
}
49-
REGEX_PATTERN_LIST = [
50-
RegExPatternExtended(
51-
description="SAP OU number and name from DN",
52-
pattern=r"CN=(.+)([0-9]{10})-(.+),OU=Abteilungen",
53-
group_keys=["Something", "SAP OU number", "SAP OU name"],
54-
),
55-
RegExPatternExtended(
56-
description="Location name from DN",
57-
pattern=r"CN=[A-Za-z]+\-(\d+)_L_([^_]+),OU=Standorte",
58-
group_keys=["SAP institute number", "Location name"],
59-
),
60-
RegExPatternExtended(
61-
description="Location/Site parts from DN",
62-
pattern=r"CN=[A-Za-z]+\-(\d+)_L_(([^_^ ^-]+)-([^_^ ]+) (\d+))," r"OU=Standorte",
63-
group_keys=[
64-
"SAP institute number",
65-
"Site name",
66-
"City",
67-
"Street",
68-
"House number",
69-
],
70-
),
71-
RegExPatternExtended(
72-
description="UUID from full page title",
73-
pattern=r"([A-Za-z]+):([A-Z]+)([a-z\d\-]+)",
74-
group_keys=["Namespace", "Prefix", "UUID"],
75-
),
76-
]
25+
# For compatibility with the old version of the module
7726
REGEX_PATTERN = {rep.description: rep.dict() for rep in REGEX_PATTERN_LIST}
78-
REGEX_PATTERN_LIB = {rep.description: rep for rep in REGEX_PATTERN_LIST}
7927

8028

8129
# Classes
@@ -203,7 +151,7 @@ def get_uuid_from_object_via_type(obj: Any) -> Union[uuid_module.UUID, None]:
203151
else:
204152
type_str = str(type_)
205153
match = re.match(
206-
pattern=REGEX_PATTERN["UUID from full page title"]["Pattern"],
154+
pattern=REGEX_PATTERN_LIB["UUID from full page title"].pattern,
207155
string=type_str,
208156
)
209157
uuid_str = match.group(3)
@@ -473,8 +421,8 @@ def nan_empty_or_none(inp: Any) -> bool:
473421

474422

475423
def regex_match_list(
476-
pattern: Union[str, dm.RegExPatternExtended], list_of_strings: List[str]
477-
) -> List[Union[str, dm.MatchResult]]:
424+
pattern: Union[str, strutil.RegExPatternExtended], list_of_strings: List[str]
425+
) -> List[Union[str, strutil.MatchResult]]:
478426
"""Returns a subset of the 'list_of_strings' that matched the regex 'pattern'.
479427
480428
Parameters
@@ -493,7 +441,7 @@ def regex_match_list(
493441
if re.match(pattern=pattern, string=string):
494442
matches.append(string)
495443
return matches
496-
elif isinstance(pattern, dm.RegExPatternExtended):
444+
elif isinstance(pattern, strutil.RegExPatternExtended):
497445
matches = []
498446
for string in list_of_strings:
499447
match_result_obj = pattern.match(string)
@@ -780,6 +728,30 @@ def get_entities_from_osw(
780728
return entities_from_osw
781729

782730

731+
def full_page_title_to_uuid(full_page_title: str) -> uuid_module.UUID:
    """Extracts a UUID from a full page title.

    Parameters
    ----------
    full_page_title:
        Title that is matched (anchored at the start of the string) against
        the "UUID from full page title" pattern of REGEX_PATTERN_LIB.

    Returns
    -------
    result:
        UUID built from the third capture group of the match.

    Raises
    ------
    ValueError
        If the title does not match the pattern, or if the captured string is
        not a valid UUID.
    """
    match = re.match(
        pattern=REGEX_PATTERN_LIB["UUID from full page title"].pattern,
        string=full_page_title,
    )
    # re.match returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on 'None.group'.
    if match is None:
        raise ValueError(
            f"Could not extract a UUID from full page title {full_page_title!r}"
        )
    # NOTE(review): group 3 is presumably the UUID part (after namespace and
    # prefix) - confirm against the pattern in osw.utils.regex_pattern.
    return uuid_module.UUID(match.group(3))
739+
740+
741+
def osw_id_to_uuid(osw_id: str) -> uuid_module.UUID:
    """Extracts a UUID from an OSW ID.

    Parameters
    ----------
    osw_id:
        String that is matched (anchored at the start of the string) against
        the "UUID from OSW ID" pattern of REGEX_PATTERN_LIB.

    Returns
    -------
    result:
        UUID built from the second capture group of the match.

    Raises
    ------
    ValueError
        If the OSW ID does not match the pattern, or if the captured string
        is not a valid UUID.
    """
    match = re.match(
        pattern=REGEX_PATTERN_LIB["UUID from OSW ID"].pattern, string=osw_id
    )
    # re.match returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on 'None.group'.
    if match is None:
        raise ValueError(f"Could not extract a UUID from OSW ID {osw_id!r}")
    # NOTE(review): group 2 is presumably the UUID part following the prefix -
    # confirm against the pattern in osw.utils.regex_pattern.
    return uuid_module.UUID(match.group(2))
748+
749+
750+
def uuid_to_osw_id(uuid: uuid_module.UUID, prefix: str = "OSW") -> str:
    """Creates an OSW ID from a UUID.

    Parameters
    ----------
    uuid:
        UUID whose string form is used with all hyphens removed.
    prefix:
        String placed in front of the hyphen-free UUID string.

    Returns
    -------
    result:
        The prefix followed by the string form of 'uuid' without hyphens.
    """
    # Splitting on '-' and re-joining drops every hyphen from the string form
    hyphen_free = "".join(str(uuid).split("-"))
    return prefix + hyphen_free
753+
754+
783755
def uuid_to_full_page_title(
784756
uuid: Union[uuid_module.UUID, str],
785757
wiki_ns: str = "Item",

0 commit comments

Comments
 (0)