Skip to content

Commit 349b1bd

Browse files
committed
feat: implement offline page package
1 parent f273c3c commit 349b1bd

3 files changed

Lines changed: 206 additions & 26 deletions

File tree

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
import os
2+
from pathlib import Path
3+
4+
from osw import wiki_tools
5+
from osw.auth import CredentialManager
6+
from osw.core import OSW
7+
from osw.model.page_package import PagePackage, PagePackageBundle, PagePackageConfig
8+
from osw.params import CreatePagePackageParam, PageDumpConfig
9+
from osw.wtsite import WtSite
10+
11+
# Create/update the password file under examples/accounts.pwd.yaml
12+
pwd_file_path = os.path.join(
13+
os.path.dirname(os.path.abspath(__file__)), "accounts.pwd.yaml"
14+
)
15+
16+
# login to demo.open-semantic-lab.org
17+
osw_obj = OSW(
18+
site=WtSite(
19+
WtSite.WtSiteConfig(
20+
iri="demo.open-semantic-lab.org",
21+
cred_mngr=CredentialManager(cred_filepath=pwd_file_path),
22+
)
23+
)
24+
)
25+
26+
# download all pages in a PagePackage
27+
# pages = wiki_tools.semantic_search(
28+
# osw_obj.site._site,
29+
# "[[Category:Entity]] OR [[:Category:+]]"
30+
# )
31+
# only classes
32+
pages = wiki_tools.semantic_search(osw_obj.site._site, "[[HasType::Category:Category]]")
33+
print(f"Found {len(pages)} Entity pages.")
34+
target_dir = Path("osw_files") / "packages" / "local.offline"
35+
package = osw_obj.site.create_page_package(
36+
CreatePagePackageParam(
37+
config=PagePackageConfig(
38+
name="local.offline",
39+
config_path=target_dir / "package.json",
40+
content_path=target_dir, # / "content",
41+
titles=pages,
42+
bundle=PagePackageBundle(
43+
packages={
44+
"local.offline": PagePackage(
45+
globalID="local.offline",
46+
description="Offline OSW package example",
47+
version="0.1.0",
48+
baseURL="http://local.offline",
49+
)
50+
}
51+
),
52+
),
53+
dump_config=PageDumpConfig(target_dir=target_dir),
54+
)
55+
)
56+
57+
# load pages from local offline package
58+
offline_pages = {}
59+
result = osw_obj.site.read_page_package(
60+
WtSite.ReadPagePackageParam(
61+
storage_path=target_dir,
62+
)
63+
)
64+
offline_pages = {page.title: page for page in result.pages}
65+
print(offline_pages["Category:Category"])
66+
res = osw_obj.load_entity(
67+
OSW.LoadEntityParam(titles=pages, offline_pages=offline_pages)
68+
)
69+
70+
result = osw_obj.export_jsonld(
71+
params=OSW.ExportJsonLdParams(
72+
entities=res.entities,
73+
mode=OSW.JsonLdMode.expand,
74+
build_rdf_graph=True,
75+
context_loader_config=WtSite.JsonLdContextLoaderParams(
76+
prefer_external_vocal=False, offline_pages=offline_pages
77+
),
78+
)
79+
)
80+
81+
graph = result.graph
82+
# all triples in the graph
83+
qres = graph.query(
84+
"""
85+
SELECT ?s ?p ?o
86+
WHERE {
87+
?s ?p ?o .
88+
}
89+
"""
90+
)
91+
# Count triples
92+
print(f"\nTotal triples in the graph: {len(qres)}")
93+
94+
# query all properties of Category:Item
95+
qres = graph.query(
96+
"""
97+
SELECT ?property ?object
98+
WHERE {
99+
Category:Item ?property ?object .
100+
}
101+
"""
102+
)
103+
print("\nProperties of Category:Item:")
104+
for row in qres:
105+
print(row.property, row.object)

src/osw/core.py

Lines changed: 57 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,14 @@ class FetchSchemaParam(BaseModel):
348348
)
349349
legacy_generator: Optional[bool] = False
350350
"""uses legacy command line for code generation if true"""
351+
offline_pages: Optional[Dict[str, WtPage]] = None
352+
"""pages to be used offline instead of fetching them from the OSW instance"""
353+
result_model_path: Optional[Union[str, pathlib.Path]] = None
354+
"""path to the generated model file, if None,
355+
the default path ./model/entity.py is used"""
356+
357+
class Config:
358+
arbitrary_types_allowed = True
351359

352360
def fetch_schema(self, fetchSchemaParam: FetchSchemaParam = None) -> None:
353361
"""Loads the given schemas from the OSW instance and auto-generates python
@@ -370,6 +378,8 @@ def fetch_schema(self, fetchSchemaParam: FetchSchemaParam = None) -> None:
370378
schema_title=schema_title,
371379
mode=mode,
372380
legacy_generator=fetchSchemaParam.legacy_generator,
381+
offline_pages=fetchSchemaParam.offline_pages,
382+
result_model_path=fetchSchemaParam.result_model_path,
373383
)
374384
)
375385
first = False
@@ -396,6 +406,14 @@ class _FetchSchemaParam(BaseModel):
396406
)
397407
legacy_generator: Optional[bool] = False
398408
"""uses legacy command line for code generation if true"""
409+
offline_pages: Optional[Dict[str, WtPage]] = None
410+
"""pages to be used offline instead of fetching them from the OSW instance"""
411+
result_model_path: Optional[Union[str, pathlib.Path]] = None
412+
"""path to the generated model file, if None,
413+
the default path ./model/entity.py is used"""
414+
415+
class Config:
416+
arbitrary_types_allowed = True
399417

400418
def _fetch_schema(self, fetchSchemaParam: _FetchSchemaParam = None) -> None:
401419
"""Loads the given schema from the OSW instance and autogenerates python
@@ -413,11 +431,21 @@ def _fetch_schema(self, fetchSchemaParam: _FetchSchemaParam = None) -> None:
413431
schema_title = fetchSchemaParam.schema_title
414432
root = fetchSchemaParam.root
415433
schema_name = schema_title.split(":")[-1]
416-
page = self.site.get_page(WtSite.GetPageParam(titles=[schema_title])).pages[0]
417-
if not page.exists:
418-
print(f"Error: Page {schema_title} does not exist")
419-
return
420-
# not only in the JsonSchema namespace the schema is located in the main sot
434+
if (
435+
fetchSchemaParam.offline_pages is not None
436+
and schema_title in fetchSchemaParam.offline_pages
437+
):
438+
print(f"Fetch {schema_title} from offline pages")
439+
page = fetchSchemaParam.offline_pages[schema_title]
440+
else:
441+
print(f"Fetch {schema_title} from online pages")
442+
page = self.site.get_page(WtSite.GetPageParam(titles=[schema_title])).pages[
443+
0
444+
]
445+
if not page.exists:
446+
print(f"Error: Page {schema_title} does not exist")
447+
return
448+
# only in the JsonSchema namespace is the schema located in the main slot;
421449
# in all other namespaces, the json_schema slot is used
422450
if schema_title.startswith("JsonSchema:"):
423451
schema_str = ""
@@ -441,7 +469,6 @@ def _fetch_schema(self, fetchSchemaParam: _FetchSchemaParam = None) -> None:
441469
)
442470
# fix https://github.com/koxudaxi/datamodel-code-generator/issues/1910
443471
)
444-
print(f"Fetch {schema_title}")
445472

446473
jsonpath_expr = parse("$..dollarref")
447474
for match in jsonpath_expr.find(schema):
@@ -462,9 +489,10 @@ def _fetch_schema(self, fetchSchemaParam: _FetchSchemaParam = None) -> None:
462489
if (
463490
ref_schema_title != schema_title
464491
): # prevent recursion in case of self references
465-
self._fetch_schema(
466-
OSW._FetchSchemaParam(schema_title=ref_schema_title, root=False)
467-
) # resolve references recursive
492+
_param = fetchSchemaParam.copy()
493+
_param.root = False
494+
_param.schema_title = ref_schema_title
495+
self._fetch_schema(_param) # resolve references recursive
468496

469497
model_dir_path = os.path.join(
470498
os.path.dirname(os.path.abspath(__file__)), "model"
@@ -730,6 +758,11 @@ class LoadEntityParam(BaseModel):
730758
from the jsondata."""
731759
disable_cache: bool = False
732760
"""If true, disable the cache for the loading process"""
761+
offline_pages: Optional[Dict[str, WtPage]] = None
762+
"""pages to be used offline instead of fetching them from the OSW instance"""
763+
764+
class Config:
765+
arbitrary_types_allowed = True # allow any class as type
733766

734767
def __init__(self, **data):
735768
super().__init__(**data)
@@ -795,7 +828,9 @@ def load_entity(
795828
self.site.enable_cache()
796829

797830
entities = []
798-
pages = self.site.get_page(WtSite.GetPageParam(titles=param.titles)).pages
831+
pages = self.site.get_page(
832+
WtSite.GetPageParam(titles=param.titles, offline_pages=param.offline_pages)
833+
).pages
799834
for page in pages:
800835
entity = None
801836
schemas = []
@@ -806,7 +841,11 @@ def load_entity(
806841
if jsondata:
807842
for category in jsondata["type"]:
808843
schema = (
809-
self.site.get_page(WtSite.GetPageParam(titles=[category]))
844+
self.site.get_page(
845+
WtSite.GetPageParam(
846+
titles=[category], offline_pages=param.offline_pages
847+
)
848+
)
810849
.pages[0]
811850
.get_slot_content("jsonschema")
812851
)
@@ -820,7 +859,9 @@ def load_entity(
820859
if param.autofetch_schema:
821860
self.fetch_schema(
822861
OSW.FetchSchemaParam(
823-
schema_title=category, mode="append"
862+
schema_title=category,
863+
mode="append",
864+
offline_pages=param.offline_pages,
824865
)
825866
)
826867
if not hasattr(model, cls_name):
@@ -1586,6 +1627,10 @@ class ExportJsonLdParams(OswBaseModel):
15861627
build_rdf_graph: Optional[bool] = False
15871628
"""If True, the output is a graph."""
15881629
debug: Optional[bool] = False
1630+
"""If True, debug information is printed."""
1631+
1632+
class Config:
1633+
arbitrary_types_allowed = True
15891634

15901635
def __init__(self, **data):
15911636
super().__init__(**data)

src/osw/wtsite.py

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,11 @@ class GetPageParam(OswBaseModel):
251251
"""Whether to raise an exception if an error occurs"""
252252
raise_warning: Optional[bool] = True
253253
"""Whether to raise a warning if a page does not exist occurs"""
254+
offline_pages: Optional[Dict[str, "WtPage"]] = None
255+
"""pages to be used offline instead of fetching them from the OSW instance"""
256+
257+
class Config:
258+
arbitrary_types_allowed = True # allows to use WtPage in type hints
254259

255260
def __init__(self, **data):
256261
super().__init__(**data)
@@ -292,7 +297,11 @@ def get_page_(title: str, index: int = None):
292297
if index is not None:
293298
msg = f"({index + 1}/{max_index}) "
294299
try:
295-
if self._cache_enabled and title in self._page_cache:
300+
if param.offline_pages and title in param.offline_pages:
301+
wtpage = param.offline_pages[title]
302+
wtpage.exists = True
303+
msg += "Page loaded from offline pages. "
304+
elif self._cache_enabled and title in self._page_cache:
296305
wtpage = self._page_cache[title]
297306
msg += "Page loaded from cache. "
298307
else:
@@ -954,18 +963,25 @@ def read_page_package(self, param: ReadPagePackageParam) -> ReadPagePackageResul
954963
search_path=storage_path, recursive=True
955964
)
956965
sub_dirs = top_level_content["directories"]
957-
if len(top_level_content["directories"]) == 0:
966+
if len(sub_dirs) == 0:
958967
# No subdirectories found, assume that the page files are located in the
959968
# top level
960969
sub_dirs = [storage_path]
961970

971+
# if subdirs are namespaces, skip them as well
972+
if any(
973+
d.name in ["Category", "Item", "File", "Module"]
974+
for d in storage_path_content["directories"]
975+
):
976+
sub_dirs = [storage_path]
977+
962978
def get_slot_content(
963979
parent_dir: List[Union[str, Path]],
964980
url_path: str,
965981
files_in_storage_path: List[Path],
966982
) -> Union[str, Dict]:
967983
for pdir in parent_dir:
968-
slot_path = storage_path / pdir / url_path
984+
slot_path = pdir / url_path
969985
if slot_path in files_in_storage_path:
970986
with open(slot_path, "r", encoding="utf-8") as f:
971987
file_content = f.read()
@@ -986,8 +1002,18 @@ def get_slot_content(
9861002
namespace = page["namespace"].split("_")[-1].capitalize()
9871003
name = page["name"]
9881004
# Create the WtPage object
989-
page_obj = WtPage(wtSite=self, title=f"{namespace}:{name}")
990-
if "main" in selected_slots:
1005+
page_obj = WtPage(
1006+
wtSite=self, title=f"{namespace}:{name}", do_init=False
1007+
)
1008+
if selected_slots is None:
1009+
_selected_slots = page["slots"]
1010+
else:
1011+
_selected_slots = {
1012+
slot_name: slot_dict
1013+
for slot_name, slot_dict in page["slots"].items()
1014+
if slot_name in selected_slots
1015+
}
1016+
if "main" in _selected_slots:
9911017
# Main slot is special
9921018
slot_content = get_slot_content(
9931019
parent_dir=sub_dirs,
@@ -999,14 +1025,6 @@ def get_slot_content(
9991025
slot_key="main",
10001026
content=slot_content,
10011027
)
1002-
if selected_slots is None:
1003-
_selected_slots = page["slots"]
1004-
else:
1005-
_selected_slots = {
1006-
slot_name: slot_dict
1007-
for slot_name, slot_dict in page["slots"].items()
1008-
if slot_name in selected_slots
1009-
}
10101028
for slot_name, slot_dict in _selected_slots.items():
10111029
slot_content = get_slot_content(
10121030
parent_dir=sub_dirs,
@@ -1125,6 +1143,12 @@ class JsonLdContextLoaderParams(OswBaseModel):
11251143
prefer_external_vocal: Optional[bool] = True
11261144
"""Whether to prefer external vocabularies (e.g. skos, schema, etc.)
11271145
over the local properties (Namespace 'Property:')"""
1146+
offline_pages: Optional[Dict[str, "WtPage"]] = None
1147+
"""A dictionary of pages that are already loaded. Pages in this dictionary
1148+
will not be fetched again."""
1149+
1150+
class Config:
1151+
arbitrary_types_allowed = True
11281152

11291153
def _replace_jsonld_context_mapping(
11301154
self, context: Union[str, list, dict], config: JsonLdContextLoaderParams
@@ -1202,7 +1226,11 @@ def loader(url, options=None):
12021226
# print("Requesting", url)
12031227
if "/wiki/" in url:
12041228
title = url.split("/wiki/")[-1].split("?")[0]
1205-
page = self.get_page(WtSite.GetPageParam(titles=[title])).pages[0]
1229+
page = self.get_page(
1230+
WtSite.GetPageParam(
1231+
titles=[title], offline_pages=params.offline_pages
1232+
)
1233+
).pages[0]
12061234
if "JsonSchema:" in title:
12071235
schema = page.get_slot_content("main")
12081236
else:
@@ -2177,8 +2205,10 @@ def import_xml(self, config: ImportConfig) -> ImportResult:
21772205
WtPage.PageCopyResult.update_forward_refs()
21782206
WtSite.CopyPagesParam.update_forward_refs()
21792207
WtSite.UploadPageParam.update_forward_refs()
2208+
WtSite.GetPageParam.update_forward_refs()
21802209
WtSite.GetPageResult.update_forward_refs()
21812210
WtSite.CreatePagePackageParam.update_forward_refs()
21822211
WtSite.UploadPagePackageParam.update_forward_refs()
21832212
WtSite.ReadPagePackageResult.update_forward_refs()
21842213
WtSite.DeletePageParam.update_forward_refs()
2214+
WtSite.JsonLdContextLoaderParams.update_forward_refs()

0 commit comments

Comments
 (0)