From 5638118f68791399e021c34ebf6cc97772324793 Mon Sep 17 00:00:00 2001 From: Magnus Bakken <10287813+magbak@users.noreply.github.com> Date: Sun, 14 Jun 2026 17:28:04 +0200 Subject: [PATCH 1/2] Begin support for facade-x style mapping of dataframes --- Cargo.lock | 1 + lib/maplib/src/model.rs | 8 ++ lib/representation/src/constants.rs | 2 +- lib/triplestore/Cargo.toml | 1 + lib/triplestore/src/lib.rs | 1 + lib/triplestore/src/map_df.rs | 132 ++++++++++++++++++++++++++++ py_maplib/maplib/__init__.pyi | 18 ++++ py_maplib/src/lib.rs | 21 +++++ py_maplib/tests/test_map_df.py | 35 ++++++++ 9 files changed, 218 insertions(+), 1 deletion(-) create mode 100644 lib/triplestore/src/map_df.rs create mode 100644 py_maplib/tests/test_map_df.py diff --git a/Cargo.lock b/Cargo.lock index 6e767be9..bb6139f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4584,6 +4584,7 @@ dependencies = [ "sprs", "thiserror", "tracing", + "uri_encode", "utils", "uuid", ] diff --git a/lib/maplib/src/model.rs b/lib/maplib/src/model.rs index 2a38adef..110a10cf 100644 --- a/lib/maplib/src/model.rs +++ b/lib/maplib/src/model.rs @@ -229,6 +229,14 @@ impl Model { .map_err(MaplibError::TriplestoreError) } + #[instrument(skip_all)] + pub fn map_df(&mut self, df: &DataFrame, graph: &NamedGraph) -> Result<(), MaplibError> { + self.add_fx_prefixes(); + self.triplestore + .map_df(df, graph) + .map_err(MaplibError::TriplestoreError) + } + #[instrument(skip_all)] pub fn map_xml_path( &mut self, diff --git a/lib/representation/src/constants.rs b/lib/representation/src/constants.rs index db456422..97b653bc 100644 --- a/lib/representation/src/constants.rs +++ b/lib/representation/src/constants.rs @@ -33,4 +33,4 @@ pub const FOAF_PREFIX: &str = "foaf"; pub const FOAF_PREFIX_IRI: &str = "http://xmlns.com/foaf/0.1/"; pub const MAPLIB_PREFIX: &str = "maplib"; -pub const MAPLIB_PREFIX_IRI: &str = "https://github.com/DataTreehouse/maplib#"; \ No newline at end of file +pub const MAPLIB_PREFIX_IRI: &str = "https://github.com/DataTreehouse/maplib#"; diff --git a/lib/triplestore/Cargo.toml b/lib/triplestore/Cargo.toml index 3c0c0410..665ff4de 100644 --- a/lib/triplestore/Cargo.toml +++ b/lib/triplestore/Cargo.toml @@ -37,6 +37,7 @@ simd-json.workspace = true serde_json.workspace = true ordered-float.workspace = true quick-xml.workspace = true +uri_encode.workspace = true pyo3 = { workspace = true, optional = true } diff --git a/lib/triplestore/src/lib.rs b/lib/triplestore/src/lib.rs index 8c02fbbb..44e48697 100644 --- a/lib/triplestore/src/lib.rs +++ b/lib/triplestore/src/lib.rs @@ -4,6 +4,7 @@ extern crate core; pub mod cats; mod dblf; pub mod errors; +mod map_df; mod map_json; mod map_xml; pub mod native_parquet_write; diff --git a/lib/triplestore/src/map_df.rs b/lib/triplestore/src/map_df.rs new file mode 100644 index 00000000..9b248bdd --- /dev/null +++ b/lib/triplestore/src/map_df.rs @@ -0,0 +1,132 @@ +use crate::errors::TriplestoreError; +use crate::{TriplesToAdd, Triplestore}; +use oxrdf::vocab::{rdf, xsd}; +use oxrdf::NamedNode; +use polars::prelude::{col, lit, DataFrame, IntoLazy}; +use polars_core::prelude::{Column, IntoColumn, NamedFrom, Series}; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use representation::constants::MAPLIB_PREFIX_IRI; +use representation::dataset::NamedGraph; +use representation::polars_to_rdf::polars_type_to_literal_type; +use representation::{BaseRDFNodeType, OBJECT_COL_NAME, SUBJECT_COL_NAME}; +use std::collections::HashMap; + +const FACADE_X_ROOT: &str = "http://sparql.xyz/facade-x/ns/root"; +const FACADE_X_CHILD: &str = "http://sparql.xyz/facade-x/ns/child"; +const FACADE_X_CHILD_NUMBER: &str = "http://sparql.xyz/facade-x/ns/childNumber"; +const DEFAULT_FACADE_X_DATA_PREFIX: &str = "http://sparql.xyz/facade-x/data/"; + +impl Triplestore { + pub fn map_df( + &mut self, + df: &DataFrame, + named_graph: &NamedGraph, + ) -> Result<(), TriplestoreError> { + let mut column_types = HashMap::new(); + let col_names: Vec<_> = df.columns().iter().map(|x| x.name().to_string()).collect(); + for c in df.columns() { + //Todo handle + let dt = polars_type_to_literal_type(c.dtype()).unwrap(); + column_types.insert(c.name().to_string(), dt); + } + let id_col = uuid::Uuid::new_v4().to_string(); + let mut df = df.clone(); + let root_node_uuri = format!("{}{}", MAPLIB_PREFIX_IRI, uuid::Uuid::new_v4().to_string()); + let uuids: Vec<_> = (0..df.height()) + .into_par_iter() + .map(|_| format!("{}{}", MAPLIB_PREFIX_IRI, uuid::Uuid::new_v4().to_string())) + .collect(); + df.with_column(Series::new(id_col.as_str().into(), uuids).into_column()) + .unwrap(); + let mut triples_to_add = Vec::new(); + for c in &col_names { + let subj_type = BaseRDFNodeType::IRI; + let obj_type = column_types.get(c).unwrap().clone(); + let subj_state = subj_type.default_input_cat_state(); + + if obj_type.is_multi() { + todo!() + } else { + let base_obj_type = obj_type.get_base_type().unwrap(); + let base_obj_state = obj_type.get_base_state().unwrap(); + let tta = TriplesToAdd { + df: df + .clone() + .lazy() + .select([ + col(id_col.clone()).alias(SUBJECT_COL_NAME), + col(c).alias(OBJECT_COL_NAME), + ]) + .collect() + .unwrap(), + subject_type: subj_type, + object_type: base_obj_type.clone(), + predicate: Some(NamedNode::new_unchecked(format!( + "{}{}", + DEFAULT_FACADE_X_DATA_PREFIX, + uri_encode::encode_uri(c) + ))), + graph: named_graph.clone(), + subject_cat_state: subj_state, + object_cat_state: base_obj_state.clone(), + predicate_cat_state: None, + }; + triples_to_add.push(tta); + } + } + let tta_children = TriplesToAdd { + df: df + .clone() + .lazy() + .select([ + lit(root_node_uuri.as_str()).alias(SUBJECT_COL_NAME), + col(id_col.clone()).alias(OBJECT_COL_NAME), + ]) + .collect() + .unwrap(), + subject_type: BaseRDFNodeType::IRI, + object_type: BaseRDFNodeType::IRI, + predicate: Some(NamedNode::new_unchecked(FACADE_X_CHILD.to_string())), + graph: named_graph.clone(), + subject_cat_state: BaseRDFNodeType::IRI.default_input_cat_state(), + object_cat_state: BaseRDFNodeType::IRI.default_input_cat_state(), + predicate_cat_state: None, + }; + let num_dt = BaseRDFNodeType::Literal(xsd::UNSIGNED_INT.into_owned()); + let tta_child_num = TriplesToAdd { + df: df + .clone() + .lazy() + .select([col(id_col).alias(SUBJECT_COL_NAME)]) + .with_row_index(OBJECT_COL_NAME.to_string(), None) + .collect() + .unwrap(), + subject_type: BaseRDFNodeType::IRI, + object_type: num_dt.clone(), + predicate: Some(NamedNode::new_unchecked(FACADE_X_CHILD_NUMBER.to_string())), + graph: named_graph.clone(), + subject_cat_state: BaseRDFNodeType::IRI.default_input_cat_state(), + object_cat_state: num_dt.default_input_cat_state(), + predicate_cat_state: None, + }; + + let mut root_cols = Vec::new(); + root_cols.push(Column::new(SUBJECT_COL_NAME.into(), vec![root_node_uuri])); + root_cols.push(Column::new(OBJECT_COL_NAME.into(), vec![FACADE_X_ROOT])); + let root = TriplesToAdd { + df: DataFrame::new(1, root_cols).unwrap(), + subject_type: BaseRDFNodeType::IRI, + object_type: BaseRDFNodeType::IRI, + predicate: Some(rdf::TYPE.into_owned()), + graph: named_graph.clone(), + subject_cat_state: BaseRDFNodeType::IRI.default_input_cat_state(), + object_cat_state: BaseRDFNodeType::IRI.default_input_cat_state(), + predicate_cat_state: None, + }; + triples_to_add.push(tta_children); + triples_to_add.push(tta_child_num); + triples_to_add.push(root); + self.add_triples_vec(triples_to_add, false)?; + Ok(()) + } +} diff --git a/py_maplib/maplib/__init__.pyi b/py_maplib/maplib/__init__.pyi index b3260b0f..827ac8b6 100644 --- a/py_maplib/maplib/__init__.pyi +++ b/py_maplib/maplib/__init__.pyi @@ -645,6 +645,24 @@ class Model: :return: The generated template """ + + def map_df( + self, + df: DataFrame, + graph: str = None, + ): + """ + Create a default template and map it based on a dataframe. + Usage: + + >>> df = pl.read_csv("my_csv.csv") + >>> m.map_df(df) + + :param df: DataFrame to map using Facade-X (using approximately the CSV-mapping) + :param graph: The IRI of the graph to add triples to. + :return: None + """ + def explore( self, host: str = "localhost", diff --git a/py_maplib/src/lib.rs b/py_maplib/src/lib.rs index ad01c3b4..291782fe 100644 --- a/py_maplib/src/lib.rs +++ b/py_maplib/src/lib.rs @@ -372,6 +372,18 @@ impl PyModel { }) } + #[pyo3(signature = (df, graph=None))] + #[instrument(skip_all)] + fn map_df(&self, py: Python<'_>, df: &Bound<'_, PyAny>, graph: Option) -> PyResult<()> { + let (df, _) = data_to_mappings_types(df, py)?; + py.detach(move || -> PyResult<()> { + let mut inner = self.inner.lock().unwrap(); + let graph = parse_optional_named_node(graph)?; + let named_graph = NamedGraph::from_maybe_named_node(graph.as_ref()); + map_df_mutex(&mut inner, df, named_graph) + }) + } + /// Starts a graph explorer session. /// /// To run from Jupyter Notebook use: @@ -1295,6 +1307,15 @@ fn map_default_mutex( Ok(format!("{tmpl}")) } +fn map_df_mutex( + inner: &mut MutexGuard, + df: DataFrame, + graph: NamedGraph, +) -> PyResult<()> { + inner.map_df(&df, &graph).map_err(PyMaplibError::from)?; + Ok(()) +} + fn query_mutex( inner: &mut MutexGuard, query: String, diff --git a/py_maplib/tests/test_map_df.py b/py_maplib/tests/test_map_df.py new file mode 100644 index 00000000..4683ff3c --- /dev/null +++ b/py_maplib/tests/test_map_df.py @@ -0,0 +1,35 @@ +import os +from typing import List, Optional + +import pytest + +from .disk import disk_params +from maplib import Model +from polars.testing import assert_frame_equal +import polars as pl +import pathlib +import time + +pl.Config.set_fmt_str_lengths(300) + +PATH_HERE = pathlib.Path(__file__).parent +TESTDATA_PATH = PATH_HERE / "testdata" / "jsons" + +@pytest.mark.parametrize("disk", disk_params()) +def test_map_df_easy(disk): + df = pl.DataFrame({ + "a": ["abc", "def"], + "b": [1,2] + }) + m = Model() + m.map_df(df) + out_df = m.query(""" + SELECT * WHERE { + ?root a fx:root . + ?root fx:child ?row . + ?row xyz:a ?a . + ?row fx:childNumber ?ch_num . + BIND(IRI(CONCAT("urn:my_ns:", ?a)) AS ?a_uri) + } + """) + assert out_df.shape == (2,5) \ No newline at end of file From facbf2763093a593433d463978e8af18661a3976 Mon Sep 17 00:00:00 2001 From: vennyy3 Date: Tue, 16 Jun 2026 11:07:07 +0200 Subject: [PATCH 2/2] added tests --- py_maplib/tests/test_map_df.py | 83 ++++++++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 8 deletions(-) diff --git a/py_maplib/tests/test_map_df.py b/py_maplib/tests/test_map_df.py index 4683ff3c..d3615de5 100644 --- a/py_maplib/tests/test_map_df.py +++ b/py_maplib/tests/test_map_df.py @@ -1,14 +1,12 @@ -import os -from typing import List, Optional - import pytest - from .disk import disk_params -from maplib import Model -from polars.testing import assert_frame_equal +from maplib import ( + Model, + xsd, + RDFType, +) import polars as pl import pathlib -import time pl.Config.set_fmt_str_lengths(300) @@ -32,4 +30,73 @@ def test_map_df_easy(disk): BIND(IRI(CONCAT("urn:my_ns:", ?a)) AS ?a_uri) } """) - assert out_df.shape == (2,5) \ No newline at end of file + assert out_df.shape == (2,5) + +@pytest.mark.parametrize("disk", disk_params()) +def test_map_df_named_graph_arg(disk): + df = pl.DataFrame({ + "a": ["abc", "def"], + "b": [1,2] + }) + m = Model() + m.map_df(df, "urn:maplib:test") + out_df = m.query(""" + SELECT * WHERE { + ?root a fx:root . + ?root fx:child ?row . + ?row xyz:a ?a . + ?row fx:childNumber ?ch_num . + BIND(IRI(CONCAT("urn:my_ns:", ?a)) AS ?a_uri) + } + """, graph="urn:maplib:test") + assert out_df.height == 2 + +def test_map_df_float(): + df = pl.DataFrame( + { + "a": [ + "abc", + "def", + "ghi", + ], + "b": [ + 1.4, + 4.2, + 7.8, + ], + } + ) + m = Model() + m.map_df(df) + res = m.query( + """ + SELECT ?a ?c WHERE {?a xyz:b ?c} ORDER BY ?a ?c + """, solution_mappings=True + ) + print(res) + assert res.rdf_types["c"] == RDFType.Literal(xsd.double) + +def test_map_df_datetime(): + df = pl.DataFrame( + { + "a": [ + "abc", + "def", + "ghi", + ], + "b": [ + "2019-05-15T00:00:00", + "2020-02-02T00:00:00", + "2007-07-07T00:00:00", + ], + } + ) + df = df.with_columns(pl.col("b").str.to_datetime()) + m = Model() + m.map_df(df) + res = m.query( + """ + SELECT ?a ?c WHERE {?a xyz:b ?c} ORDER BY ?a ?c + """, solution_mappings=True + ) + assert res.rdf_types["c"] == RDFType.Literal(xsd.dateTime) \ No newline at end of file