diff --git a/Cargo.lock b/Cargo.lock index 6e767be9..61b5a399 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -83,6 +83,56 @@ dependencies = [ "libc", ] +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + [[package]] name = "anyhow" version = "1.0.102" @@ -390,7 +440,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -401,7 +451,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -441,6 +491,15 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bincode" version = "2.0.1" @@ -461,6 +520,12 @@ dependencies = [ "virtue", ] +[[package]] +name = "binout" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "222fb4925a15bea6a68075021910e03d6aa2d04951d71ff1d956190a551d738f" + [[package]] name = "bitflags" version = "2.13.0" @@ -470,6 +535,12 @@ dependencies = [ "serde_core", ] +[[package]] +name = "bitset-core" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f421f1bcb30aa9d851a03c2920ab5d96ca920d5786645a597b5fc37922f8b89e" + [[package]] name = "bitvec" version = "1.0.1" @@ -534,7 +605,7 @@ dependencies = [ "once_cell", "proc-macro-crate", "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -589,7 +660,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3db406d29fbcd95542e92559bed4d8ad92636d1ca8b3b72ede10b4bcc010e659" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 1.0.109", ] @@ -609,7 +680,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -622,6 +693,12 @@ dependencies = [ "serde", ] +[[package]] +name = "bytesize" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bd91ee7b2422bcb158d90ef4d14f75ef67f340943fc4149891dcce8f8b972a3" + [[package]] name = "castaway" version = "0.2.4" @@ -724,6 +801,58 @@ dependencies = [ "representation", ] +[[package]] +name = "clap" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" +dependencies = [ + "heck", + "proc-macro2", + "quote 1.0.45", + "syn 2.0.117", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "co_sort" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc18e115ded94ba1e1b820c7631d25b7364e27c25f066ecbce37aaf88abdcf4" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + [[package]] name = "comfy-table" version = "7.2.2" @@ -759,6 +888,16 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "console_error_panic_hook" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc" +dependencies = [ + "cfg-if", + "wasm-bindgen", +] + [[package]] name = "const-oid" version = "0.10.2" @@ -825,6 +964,21 @@ dependencies = [ "libc", ] +[[package]] +name = "crc" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "217698eaf96b4a3f0bc4f3662aaa55bdf913cd54d7204591faa790070c6d0853" + [[package]] name = "crc32fast" version = "1.5.0" @@ -946,6 +1100,20 @@ dependencies = [ "memchr", ] +[[package]] +name = "dashmap" +version = "6.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6361d5c062261c78a176addb82d4c821ae42bed6089de0e12603cd25de2059c" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "datalog" version = "0.1.0" @@ -970,7 +1138,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -1011,7 +1179,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -1030,12 +1198,39 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" +[[package]] +name = "dyn_size_of" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a742b95783b1f45b900129082cbc47717b6a77ee8d17eea70a8ea62462f5de3" + [[package]] name = "either" version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" +[[package]] +name = "env_filter" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e90c2accc4b07a8456ea0debdc2e7587bdd890680d71173a15d4ae604f6eef" +dependencies = [ + "log", +] + +[[package]] +name = "env_logger" +version = "0.11.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0621c04f2196ac3f488dd583365b9c09be011a4ab8b9f37248ffcc8f6198b56a" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "log", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -1178,6 +1373,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "fsum" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5f673e5179fc055a5cb48fb40fc3f317160598d60c93b0ef8173504117765b0" + [[package]] name = "fts" version = "0.1.0" @@ -1251,7 +1452,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -1294,6 +1495,17 @@ dependencies = [ "version_check", ] +[[package]] +name = "generic-tests" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9ff6d6584f4f6fa911d5e07856abf1a48dc5599b3734f2eaea130f2c3baa989" +dependencies = [ + "proc-macro2", + "quote 1.0.45", + "syn 2.0.117", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -1393,6 +1605,16 @@ dependencies = [ "ahash 0.7.8", ] +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash 0.8.12", + "allocator-api2", +] + [[package]] name = "hashbrown" version = "0.15.5" @@ -1424,6 +1646,33 @@ version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" +[[package]] +name = "hdt" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2b1ea7cbd25071f796251a1081ded2a164dd24e034d5f75342da35c157cf4e" +dependencies = [ + "bitset-core", + "bytesize", + "console_error_panic_hook", + "crc", + "env_logger", + "getrandom 0.2.17", + "getrandom 0.3.4", + "lasso", + "log", + "mem_dbg", + "mownstr", + "ntriple", + "oxttl", + "qwt", + "rayon", + "serde", + "serde_json", + "thiserror", + "wasm-bindgen", +] + [[package]] name = "heck" version = "0.5.0" @@ -1726,6 +1975,12 @@ version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + [[package]] name = "itertools" version = "0.14.0" @@ -1768,6 +2023,16 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "574b0cd5e90ee2ba03a66d0611fc9a09c9a0c28b2ecc2dc8a181dd31a53ca5d7" +[[package]] +name = "lasso" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e14eda50a3494b3bf7b9ce51c52434a761e383d7238ce1dd5dcec2fbc13e9fb" +dependencies = [ + "dashmap", + "hashbrown 0.14.5", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -1972,6 +2237,27 @@ dependencies = [ "digest 0.11.3", ] +[[package]] +name = "mem_dbg" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728cc9dc97593cd22f7bc81fbef70a2d391d7a9a855e7d658b653318124a6cf0" +dependencies = [ + "bitflags", + "mem_dbg-derive", +] + +[[package]] +name = "mem_dbg-derive" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d84f40c93b0508d5565db79a814d02d5b2545967205ce44be211592aafa34d6c" +dependencies = [ + "proc-macro2", + "quote 1.0.45", + "syn 2.0.117", +] + [[package]] name = "memchr" version = "2.8.2" @@ -1996,6 +2282,18 @@ dependencies = [ "libmimalloc-sys", ] +[[package]] +name = "minimum_redundancy" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28ed9f799347e3fc3b0cc999332dbba53499551bbcf1070fcc12737645ac05b" +dependencies = [ + "binout", + "co_sort", + "dyn_size_of", + "fsum", +] + [[package]] name = "miniz_oxide" version = "0.8.9" @@ -2017,6 +2315,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "mownstr" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b33dce847b8623c1f2e473ed3a05e43d0c395e3b93fab62378b6ae94b0a1c42c" + [[package]] name = "ndarray" version = "0.17.2" @@ -2056,6 +2360,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "ntriple" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "020fb7cf74ddf131e4ba84e13221d2493ae4d17cad3982a9158771442d6b0730" +dependencies = [ + "peg 0.5.7", +] + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -2101,7 +2414,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -2205,6 +2518,12 @@ version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + [[package]] name = "openssl-probe" version = "0.2.1" @@ -2338,6 +2657,21 @@ dependencies = [ "windows-link 0.2.1", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "peg" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40df12dde1d836ed2a4c3bfc2799797e3abaf807d97520d28d6e3f3bf41a5f85" +dependencies = [ + "quote 0.3.15", +] + [[package]] name = "peg" version = "0.8.6" @@ -2356,7 +2690,7 @@ checksum = "ddd8ef6825cae95355031ae26a99b616a2a21f22ba2de0197c43dfb05acbe7ee" dependencies = [ "peg-runtime", "proc-macro2", - "quote", + "quote 1.0.45", ] [[package]] @@ -3020,7 +3354,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "590b0a94aa8f97992d52f1198600ecc1c1f7cfa03c1b31cae057143455804ac0" dependencies = [ "argminmax", - "bincode", + "bincode 2.0.1", "bytemuck", "bytes", "compact_str", @@ -3140,7 +3474,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "16b845dbfca988fa33db069c0e230574d15a3088f147a87b64c7589eb662c9ac" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 1.0.109", ] @@ -3214,7 +3548,7 @@ checksum = "9ac53762fd065daa3194dd09337a38bd793a188100fd1a9304c4ab312d901771" dependencies = [ "proc-macro2", "pyo3-macros-backend", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -3226,6 +3560,8 @@ checksum = "4ca3a1557399783172dc5bf39cfca835157732532cba56b71d2292161e53b362" dependencies = [ "heck", "proc-macro2", + "pyo3-build-config", + "quote 1.0.45", "quote", "syn 2.0.117", ] @@ -3332,6 +3668,12 @@ dependencies = [ "windows-sys 0.60.2", ] +[[package]] +name = "quote" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a6e920b65c65f10b2ae65c831a81a073a89edd28c7cce89475bff467ab4167a" + [[package]] name = "quote" version = "1.0.45" @@ -3341,6 +3683,24 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "qwt" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1518389953b62e3b4bac17d8edc67e8291ec040523ab682479d8c706efc0b7b" +dependencies = [ + "bincode 1.3.3", + "clap", + "generic-tests", + "mem_dbg", + "minimum_redundancy", + "num-traits", + "paste", + "rand 0.8.6", + "serde", + "serde-big-array", +] + [[package]] name = "r-efi" version = "5.3.0" @@ -3496,7 +3856,7 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -3525,7 +3885,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -3672,7 +4032,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84d7b42d4b8d06048d3ac8db0eb31bcb942cbeb709f0b5f2b2ebde398d3038f5" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 1.0.109", ] @@ -3874,6 +4234,15 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde-big-array" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11fc7cc2c76d73e0f27ee52abbd64eec84d46f370c88371120433196934e4b7f" +dependencies = [ + "serde", +] + [[package]] name = "serde_core" version = "1.0.228" @@ -3890,7 +4259,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -4096,7 +4465,7 @@ dependencies = [ "oxilangtag", "oxiri", "oxrdf", - "peg", + "peg 0.8.6", "rand 0.10.1", "thiserror", ] @@ -4134,7 +4503,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "028e551d5e270b31b9f3ea271778d9d827148d4287a5d96167b6bb9787f5cc38" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -4184,6 +4553,12 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "strum_macros" version = "0.27.2" @@ -4192,7 +4567,7 @@ checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" dependencies = [ "heck", "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -4209,7 +4584,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "unicode-ident", ] @@ -4220,7 +4595,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "unicode-ident", ] @@ -4240,7 +4615,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -4277,7 +4652,7 @@ dependencies = [ "oxilangtag", "oxiri", "oxrdf", - "peg", + "peg 0.8.6", "pyo3", "representation", "spargebra", @@ -4302,7 +4677,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -4391,7 +4766,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -4512,7 +4887,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -4563,6 +4938,7 @@ dependencies = [ "cimxml_import", "file_io", "fts", + "hdt", "itoa", "memmap2", "ordered-float", @@ -4678,6 +5054,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "utils" version = "0.1.0" @@ -4830,7 +5212,7 @@ version = "0.2.123" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc" dependencies = [ - "quote", + "quote 1.0.45", "wasm-bindgen-macro-support", ] @@ -4842,7 +5224,7 @@ checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b" dependencies = [ "bumpalo", "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", "wasm-bindgen-shared", ] @@ -5020,7 +5402,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -5031,7 +5413,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -5327,7 +5709,7 @@ dependencies = [ "anyhow", "prettyplease", "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", "wit-bindgen-core", "wit-bindgen-rust", @@ -5409,7 +5791,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", "synstructure", ] @@ -5430,7 +5812,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] @@ -5450,7 +5832,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", "synstructure", ] @@ -5490,7 +5872,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", - "quote", + "quote 1.0.45", "syn 2.0.117", ] diff --git a/Cargo.toml b/Cargo.toml index 92491475..57ff40de 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -60,6 +60,7 @@ peg = "0.8" rand = "0.10.1" oxilangtag = "0.1.5" fundu = "2.0.1" +hdt = { version = "0.6.0", default-features = false, features = ["nt"] } memmap2 = "0.9.5" sprs = "0.11.3" walkdir = "2.5.0" diff --git a/lib/maplib/src/model.rs b/lib/maplib/src/model.rs index f50ad72b..49f7f62a 100644 --- a/lib/maplib/src/model.rs +++ b/lib/maplib/src/model.rs @@ -492,6 +492,16 @@ impl Model { Ok(()) } + pub fn write_hdt( + &mut self, + buffer: &mut W, + graph: &NamedGraph, + ) -> Result<(), MaplibError> { + self.triplestore + .write_hdt(buffer, graph) + .map_err(MaplibError::TriplestoreError) + } + pub fn write_cim_xml( &mut self, buffer: &mut W, diff --git a/lib/triplestore/Cargo.toml b/lib/triplestore/Cargo.toml index 3c0c0410..e1b68669 100644 --- a/lib/triplestore/Cargo.toml +++ b/lib/triplestore/Cargo.toml @@ -29,6 +29,7 @@ uuid.workspace = true thiserror.workspace = true oxrdfio.workspace = true oxttl.workspace = true +hdt.workspace = true memmap2.workspace = true sparesults.workspace = true tracing.workspace = true diff --git a/lib/triplestore/src/errors.rs b/lib/triplestore/src/errors.rs index 5c92775f..4dc0ff7b 100644 --- a/lib/triplestore/src/errors.rs +++ b/lib/triplestore/src/errors.rs @@ -60,6 +60,8 @@ pub enum TriplestoreError { DecodeError(PolarsError), #[error("Error collecting lazy triples {0}")] LazyLoadError(PolarsError), + #[error("HDT error: {0}")] + HDTError(String), } impl From> for TriplestoreError { diff --git a/lib/triplestore/src/triples_read.rs b/lib/triplestore/src/triples_read.rs index ffb30366..0b28a999 100644 --- a/lib/triplestore/src/triples_read.rs +++ b/lib/triplestore/src/triples_read.rs @@ -4,8 +4,9 @@ use crate::{NewTriples, TriplesToAdd}; use std::cmp; use cimxml_import::{fix_cim_quad, Remapper}; +use hdt::Hdt; use memmap2::MmapOptions; -use oxrdf::{BlankNode, GraphName, NamedNode, NamedOrBlankNode, Quad, Term, Triple}; +use oxrdf::{BlankNode, GraphName, Literal, NamedNode, NamedOrBlankNode, Quad, Term, Triple}; use oxrdfio::{ JsonLdProfileSet, LoadedDocument, RdfFormat, RdfParser, RdfSyntaxError, SliceQuadParser, }; @@ -25,7 +26,10 @@ use representation::{ use representation::{OBJECT_COL_NAME, SUBJECT_COL_NAME}; use std::collections::HashMap; use std::fs::File; +use std::io::Cursor; use std::path::Path; +use std::str::FromStr; +use std::sync::Arc; use std::time::Instant; use tracing::{debug, instrument}; @@ -35,6 +39,7 @@ const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF]; pub enum ExtendedRdfFormat { Normal(RdfFormat), CIMXML, + HDT, } impl Triplestore { @@ -70,6 +75,8 @@ impl Triplestore { ExtendedRdfFormat::Normal(RdfFormat::JsonLd { profile: JsonLdProfileSet::empty(), }) + } else if path.extension() == Some("hdt".as_ref()) { + ExtendedRdfFormat::HDT } else { todo!("Have not implemented file format {:?}", path); }; @@ -152,84 +159,23 @@ impl Triplestore { } else { matches!(rdf_format, ExtendedRdfFormat::Normal(RdfFormat::NTriples)) }; - let mut readers = if matches!( - rdf_format, - ExtendedRdfFormat::Normal(RdfFormat::NTriples) - | ExtendedRdfFormat::Normal(RdfFormat::Turtle) - ) && parallel - { - let threads = if let Ok(threads) = std::thread::available_parallelism() { - threads.get() - } else { - 1 - }; - - let mut readers = vec![]; - if rdf_format == ExtendedRdfFormat::Normal(RdfFormat::Turtle) { - let mut parser = TurtleParser::new(); - for (k, v) in prefixes { - parser = parser.with_prefix(k, v.as_str()).unwrap(); - } - if !checked { - parser = parser.lenient(); - } - if let Some(base_iri) = base_iri { - parser = parser.with_base_iri(base_iri).unwrap(); - } - for r in parser.split_slice_for_parallel_parsing(use_slice, threads) { - readers.push(MyFromSliceQuadReader { - parser: MyFromSliceQuadReaderKind::TurtlePar(r), - }); - } - } else if rdf_format == ExtendedRdfFormat::Normal(RdfFormat::NTriples) { - let mut parser = NTriplesParser::new(); - if !checked { - parser = parser.lenient(); - } - for r in parser.split_slice_for_parallel_parsing(use_slice, threads) { - readers.push(MyFromSliceQuadReader { - parser: MyFromSliceQuadReaderKind::NTriplesPar(r), - }); - } - } - readers + let hdt; + let mut readers = if rdf_format == ExtendedRdfFormat::HDT { + hdt = Hdt::read(Cursor::new(use_slice)) + .map_err(|e| TriplestoreError::HDTError(e.to_string()))?; + vec![hdt_reader(&hdt)] + } else if parallel && rdf_format == ExtendedRdfFormat::Normal(RdfFormat::Turtle) { + parallel_turtle_readers(use_slice, prefixes, checked, base_iri) + } else if parallel && rdf_format == ExtendedRdfFormat::Normal(RdfFormat::NTriples) { + parallel_ntriples_readers(use_slice, checked) } else { - let use_format = match rdf_format { - ExtendedRdfFormat::Normal(n) => n, - ExtendedRdfFormat::CIMXML => RdfFormat::RdfXml, - }; - let mut parser = RdfParser::from(use_format.clone()); - if !checked { - parser = parser.lenient(); - } - if let Some(base_iri) = &base_iri { - parser = parser.with_base_iri(base_iri).unwrap(); - } - let mut for_slice = parser.for_slice(use_slice); - if matches!(use_format, RdfFormat::JsonLd { .. }) { - for_slice = for_slice.with_document_loader(move |url| { - if let Some(doc) = known_contexts.get(url) { - Ok(LoadedDocument { - url: url.to_string(), - content: doc.clone().into_bytes(), - format: RdfFormat::JsonLd { - profile: JsonLdProfileSet::empty(), - }, - }) - } else { - Err(Box::new(TriplestoreError::MissingContext(url.to_string()))) - } - }); - } - if matches!(rdf_format, ExtendedRdfFormat::CIMXML) { - vec![MyFromSliceQuadReader { - parser: MyFromSliceQuadReaderKind::CIMXML(for_slice, base_iri.clone()), - }] - } else { - vec![MyFromSliceQuadReader { - parser: MyFromSliceQuadReaderKind::Other(for_slice), - }] - } + vec![rdf_parser_reader( + use_slice, + &rdf_format, + checked, + base_iri, + known_contexts, + )] }; debug!("Effective parallelization for reading is {}", readers.len()); @@ -578,6 +524,101 @@ fn create_predicate_map<'a>( Ok((out_r, graph_predicate_map)) } +fn hdt_reader(hdt: &Hdt) -> MyFromSliceQuadReader<'_> { + MyFromSliceQuadReader { + parser: MyFromSliceQuadReaderKind::HDT(Box::new( + hdt.triples_all().map(hdt_string_triple_to_quad), + )), + } +} + +fn parallel_turtle_readers<'a>( + slice: &'a [u8], + prefixes: &HashMap, + checked: bool, + base_iri: Option, +) -> Vec> { + let threads = std::thread::available_parallelism() + .map(|t| t.get()) + .unwrap_or(1); + let mut parser = TurtleParser::new(); + for (k, v) in prefixes { + parser = parser.with_prefix(k, v.as_str()).unwrap(); + } + if !checked { + parser = parser.lenient(); + } + if let Some(base_iri) = base_iri { + parser = parser.with_base_iri(base_iri).unwrap(); + } + parser + .split_slice_for_parallel_parsing(slice, threads) + .into_iter() + .map(|r| MyFromSliceQuadReader { + parser: MyFromSliceQuadReaderKind::TurtlePar(r), + }) + .collect() +} + +fn parallel_ntriples_readers(slice: &[u8], checked: bool) -> Vec> { + let threads = std::thread::available_parallelism() + .map(|t| t.get()) + .unwrap_or(1); + let mut parser = NTriplesParser::new(); + if !checked { + parser = parser.lenient(); + } + parser + .split_slice_for_parallel_parsing(slice, threads) + .into_iter() + .map(|r| MyFromSliceQuadReader { + parser: MyFromSliceQuadReaderKind::NTriplesPar(r), + }) + .collect() +} + +fn rdf_parser_reader<'a>( + slice: &'a [u8], + rdf_format: &ExtendedRdfFormat, + checked: bool, + base_iri: Option, + known_contexts: HashMap, +) -> MyFromSliceQuadReader<'a> { + let use_format = match rdf_format { + ExtendedRdfFormat::Normal(n) => *n, + ExtendedRdfFormat::CIMXML => RdfFormat::RdfXml, + ExtendedRdfFormat::HDT => unreachable!("HDT is handled in read_triples"), + }; + let mut parser = RdfParser::from(use_format); + if !checked { + parser = parser.lenient(); + } + if let Some(base_iri) = &base_iri { + parser = parser.with_base_iri(base_iri).unwrap(); + } + let mut for_slice = parser.for_slice(slice); + if matches!(use_format, RdfFormat::JsonLd { .. }) { + for_slice = for_slice.with_document_loader(move |url| { + let Some(doc) = known_contexts.get(url) else { + return Err(Box::new(TriplestoreError::MissingContext(url.to_string()))); + }; + Ok(LoadedDocument { + url: url.to_string(), + content: doc.clone().into_bytes(), + format: RdfFormat::JsonLd { + profile: JsonLdProfileSet::empty(), + }, + }) + }); + } + let parser = if matches!(rdf_format, ExtendedRdfFormat::CIMXML) { + MyFromSliceQuadReaderKind::CIMXML(for_slice, base_iri) + } else { + MyFromSliceQuadReaderKind::Other(for_slice) + }; + MyFromSliceQuadReader { parser } +} + //Adapted from proposed change to https://github.com/oxigraph/ #[must_use] pub struct MyFromSliceQuadReader<'a> { @@ -589,6 +630,7 @@ pub enum MyFromSliceQuadReaderKind<'a> { CIMXML(SliceQuadParser<'a>, Option), TurtlePar(SliceTurtleParser<'a>), NTriplesPar(SliceNTriplesParser<'a>), + HDT(Box + Send + 'a>), } impl Iterator for MyFromSliceQuadReader<'_> { @@ -615,10 +657,39 @@ impl Iterator for MyFromSliceQuadReader<'_> { Ok(triple) => Ok(triple.in_graph(GraphName::default())), Err(e) => Err(e.into()), }, + MyFromSliceQuadReaderKind::HDT(iter) => Ok(iter.next()?), }) } } +fn hdt_string_triple_to_quad(t: [Arc; 3]) -> Quad { + let [s, p, o] = t; + let subject = if let Some(label) = s.strip_prefix("_:") { + NamedOrBlankNode::BlankNode(BlankNode::new_unchecked(label)) + } else { + NamedOrBlankNode::NamedNode(NamedNode::new_unchecked(s.as_ref())) + }; + let predicate = NamedNode::new_unchecked(p.as_ref()); + Quad::new( + subject, + predicate, + hdt_object_string_to_term(&o), + GraphName::DefaultGraph, + ) +} + +fn hdt_object_string_to_term(o: &str) -> Term { + if o.starts_with('"') { + return Literal::from_str(o) + .unwrap_or_else(|_| Literal::new_simple_literal(o)) + .into(); + } + if let Some(label) = o.strip_prefix("_:") { + return BlankNode::new_unchecked(label).into(); + } + NamedNode::new_unchecked(o).into() +} + fn get_or_insert_dt( base_rdfnode_type_ref: BaseRDFNodeTypeRef, type_map: &mut HashMap, diff --git a/lib/triplestore/src/triples_write.rs b/lib/triplestore/src/triples_write.rs index 619bec6f..cc6b05d9 100644 --- a/lib/triplestore/src/triples_write.rs +++ b/lib/triplestore/src/triples_write.rs @@ -18,6 +18,7 @@ use std::collections::HashMap; use std::io::Write; mod fast_ntriples; +mod hdt_write; mod pretty_turtle; mod serializers; @@ -132,6 +133,32 @@ impl Triplestore { Ok(()) } + pub fn write_hdt( + &mut self, + buf: &mut W, + graph: &NamedGraph, + ) -> Result<(), TriplestoreError> { + self.check_graph_exists(graph)?; + let mut builder = hdt_write::HdtBuilder::new(); + for (predicate, df_map) in self.graph_triples_map.get(graph).unwrap() { + for ((subject_type, object_type), tt) in df_map { + for (lf, _) in tt.get_lazy_frames(&None, &None)? { + let triples = global_df_as_triples( + lf.collect().unwrap(), + subject_type.clone(), + object_type.clone(), + predicate, + self.global_cats.clone(), + ); + for t in &triples { + builder.add_triple(t); + } + } + } + } + builder.finish(buf) + } + pub(crate) fn check_graph_exists(&self, graph: &NamedGraph) -> Result<(), TriplestoreError> { if !self.graph_triples_map.contains_key(graph) { Err(TriplestoreError::GraphDoesNotExist(graph.to_string())) diff --git a/lib/triplestore/src/triples_write/hdt_write.rs b/lib/triplestore/src/triples_write/hdt_write.rs new file mode 100644 index 00000000..da6d2fec --- /dev/null +++ b/lib/triplestore/src/triples_write/hdt_write.rs @@ -0,0 +1,206 @@ +use crate::errors::TriplestoreError; +use hdt::containers::rdf::{Id, Literal as HdtLiteral, Term as HdtTerm, Triple as HdtTriple}; +use hdt::containers::ControlInfo; +use hdt::dict_sect_pfc::DictSectPFC; +use hdt::four_sect_dict::FourSectDict; +use hdt::header::Header; +use hdt::triples::{TripleId, TriplesBitmap}; +use hdt::IdKind; +use oxrdf::{NamedOrBlankNode, Term, Triple}; +use std::collections::hash_map::Entry; +use std::collections::{BTreeSet, HashMap}; +use std::io::{BufWriter, Write}; + +const BLOCK_SIZE: usize = 16; +const BASE_IRI: &str = "https://github.com/DataTreehouse/maplib"; + +pub(super) struct HdtBuilder { + term_ids: HashMap, + roles: Vec, + triples: Vec<[usize; 3]>, +} + +#[derive(Default, Clone, Copy)] +struct TermRoles { + subject: bool, + predicate: bool, + object: bool, +} + +impl HdtBuilder { + pub(super) fn new() -> Self { + Self { + term_ids: HashMap::new(), + roles: Vec::new(), + triples: Vec::new(), + } + } + + pub(super) fn add_triple(&mut self, t: &Triple) { + let s = self.intern(subject_dict_string(&t.subject)); + let p = self.intern(t.predicate.as_str().to_owned()); + let o = self.intern(object_dict_string(&t.object)); + self.roles[s].subject = true; + self.roles[p].predicate = true; + self.roles[o].object = true; + self.triples.push([s, p, o]); + } + + fn intern(&mut self, term: String) -> usize { + let next_id = self.term_ids.len(); + match self.term_ids.entry(term) { + Entry::Occupied(e) => *e.get(), + Entry::Vacant(e) => { + e.insert(next_id); + self.roles.push(TermRoles::default()); + next_id + } + } + } + + pub(super) fn finish(self, buf: &mut W) -> Result<(), TriplestoreError> { + let dict = self.build_dict(); + let encoded = self.encode_triples(&dict)?; + let triples = TriplesBitmap::from_triples(&encoded); + let header = statistics_header(&dict, encoded.len()); + + let mut writer = BufWriter::new(buf); + ControlInfo::global() + .write(&mut writer) + .map_err(hdt_error)?; + header.write(&mut writer).map_err(hdt_error)?; + dict.write(&mut writer).map_err(hdt_error)?; + triples.write(&mut writer).map_err(hdt_error)?; + writer.flush().map_err(hdt_error)?; + Ok(()) + } + + fn build_dict(&self) -> FourSectDict { + let mut shared = BTreeSet::new(); + let mut subjects = BTreeSet::new(); + let mut predicates = BTreeSet::new(); + let mut objects = BTreeSet::new(); + for (term, &id) in &self.term_ids { + let roles = &self.roles[id]; + if roles.predicate { + predicates.insert(term.as_str()); + } + if roles.subject && roles.object { + shared.insert(term.as_str()); + } else if roles.subject { + subjects.insert(term.as_str()); + } else if roles.object { + objects.insert(term.as_str()); + } + } + FourSectDict { + shared: DictSectPFC::compress(&shared, BLOCK_SIZE), + subjects: DictSectPFC::compress(&subjects, BLOCK_SIZE), + predicates: DictSectPFC::compress(&predicates, BLOCK_SIZE), + objects: DictSectPFC::compress(&objects, BLOCK_SIZE), + } + } + + fn encode_triples(&self, dict: &FourSectDict) -> Result, TriplestoreError> { + let mut term_by_id = vec![""; self.term_ids.len()]; + for (term, &id) in &self.term_ids { + term_by_id[id] = term.as_str(); + } + let mut encoded = Vec::with_capacity(self.triples.len()); + for [s, p, o] in &self.triples { + let triple_id: TripleId = [ + dict.string_to_id(term_by_id[*s], IdKind::Subject), + dict.string_to_id(term_by_id[*p], IdKind::Predicate), + dict.string_to_id(term_by_id[*o], IdKind::Object), + ]; + if triple_id.contains(&0) { + return Err(TriplestoreError::HDTError(format!( + "term of ({}, {}, {}) missing from the HDT dictionary", + term_by_id[*s], term_by_id[*p], term_by_id[*o] + ))); + } + encoded.push(triple_id); + } + encoded.sort_unstable(); + encoded.dedup(); + Ok(encoded) + } +} + +fn subject_dict_string(subject: &NamedOrBlankNode) -> String { + match subject { + NamedOrBlankNode::NamedNode(nn) => nn.as_str().to_owned(), + NamedOrBlankNode::BlankNode(bn) => bn.to_string(), + } +} + +fn object_dict_string(object: &Term) -> String { + match object { + Term::NamedNode(nn) => nn.as_str().to_owned(), + Term::BlankNode(bn) => bn.to_string(), + Term::Literal(lit) => lit.to_string(), + } +} + +fn statistics_header(dict: &FourSectDict, num_triples: usize) -> Header { + use hdt::vocab::*; + + let mut body = BTreeSet::new(); + let base = Id::Named(BASE_IRI.to_owned()); + let stats_id = Id::Blank("statistics".to_owned()); + let pub_id = Id::Blank("publicationInformation".to_owned()); + let format_id = Id::Blank("format".to_owned()); + let dict_id = Id::Blank("dictionary".to_owned()); + let triples_id = Id::Blank("triples".to_owned()); + + let distinct_subjects = dict.subjects.num_strings() + dict.shared.num_strings(); + let distinct_objects = dict.objects.num_strings() + dict.shared.num_strings(); + + insert_literal(&mut body, &base, RDF_TYPE, HDT_CONTAINER); + insert_literal(&mut body, &base, RDF_TYPE, VOID_DATASET); + insert_literal(&mut body, &base, VOID_TRIPLES, num_triples); + insert_literal(&mut body, &base, VOID_PROPERTIES, dict.predicates.num_strings()); + insert_literal(&mut body, &base, VOID_DISTINCT_SUBJECTS, distinct_subjects); + insert_literal(&mut body, &base, VOID_DISTINCT_OBJECTS, distinct_objects); + + insert_id(&mut body, &base, HDT_STATISTICAL_INFORMATION, &stats_id); + insert_id(&mut body, &base, HDT_STATISTICAL_INFORMATION, &pub_id); + insert_id(&mut body, &base, HDT_FORMAT_INFORMATION, &format_id); + insert_id(&mut body, &format_id, HDT_DICTIONARY, &dict_id); + insert_id(&mut body, &format_id, HDT_TRIPLES, &triples_id); + + insert_literal(&mut body, &dict_id, HDT_DICT_SHARED_SO, dict.shared.num_strings()); + insert_literal(&mut body, &dict_id, HDT_DICT_MAPPING, "1"); + insert_literal(&mut body, &dict_id, HDT_DICT_SIZE_STRINGS, dict.size_in_bytes()); + insert_literal(&mut body, &dict_id, HDT_DICT_BLOCK_SIZE, BLOCK_SIZE); + + insert_literal(&mut body, &triples_id, DC_TERMS_FORMAT, HDT_TYPE_BITMAP); + insert_literal(&mut body, &triples_id, HDT_NUM_TRIPLES, num_triples); + insert_literal(&mut body, &triples_id, HDT_TRIPLES_ORDER, "SPO"); + + let mut serialized_body = Vec::new(); + for triple in &body { + writeln!(serialized_body, "{triple}").unwrap(); + } + Header { + format: "ntriples".to_owned(), + length: serialized_body.len(), + body, + } +} + +fn insert_literal(body: &mut BTreeSet, s: &Id, p: &str, o: impl ToString) { + body.insert(HdtTriple::new( + s.clone(), + p.to_owned(), + HdtTerm::Literal(HdtLiteral::new(o.to_string())), + )); +} + +fn insert_id(body: &mut BTreeSet, s: &Id, p: &str, o: &Id) { + body.insert(HdtTriple::new(s.clone(), p.to_owned(), HdtTerm::Id(o.clone()))); +} + +fn hdt_error(e: impl std::fmt::Display) -> TriplestoreError { + TriplestoreError::HDTError(e.to_string()) +} diff --git a/py_maplib/maplib/__init__.pyi b/py_maplib/maplib/__init__.pyi index 205a90df..ff180a31 100644 --- a/py_maplib/maplib/__init__.pyi +++ b/py_maplib/maplib/__init__.pyi @@ -860,7 +860,7 @@ class Model: def read( self, file_path: Union[str, Path], - format: LiteralType["ntriples", "turtle", "rdf/xml", "cim/xml", "json-ld"] = None, + format: LiteralType["ntriples", "turtle", "rdf/xml", "cim/xml", "json-ld", "hdt"] = None, base_iri: str = None, transient: bool = False, parallel: bool = None, @@ -881,7 +881,7 @@ class Model: >>> m.read("my_triples.ttl") :param file_path: The path of the file containing triples - :param format: One of "ntriples", "turtle", "rdf/xml", "json-ld" or "cim/xml", otherwise it is inferred from the file extension. + :param format: One of "ntriples", "turtle", "rdf/xml", "json-ld", "cim/xml" or "hdt", otherwise it is inferred from the file extension. :param base_iri: Base iri :param transient: Should these triples be included when writing the graph to the file system? :param parallel: Parse triples in parallel, currently only NTRiples and Turtle. Assumes all prefixes are in the beginning of the document. Defaults to true only for NTriples. @@ -986,7 +986,7 @@ class Model: def write( self, file_path: Union[str, Path], - format=LiteralType["ntriples", "turtle", "rdf/xml"], + format=LiteralType["ntriples", "turtle", "rdf/xml", "hdt"], graph: str = None, prefixes: Dict[str, str] = None, ) -> None: @@ -998,7 +998,8 @@ class Model: >>> m.write("my_triples.nt", format="ntriples") :param file_path: The path of the file containing triples - :param format: One of "ntriples", "turtle", "rdf/xml". + :param format: One of "ntriples", "turtle", "rdf/xml", "hdt". + HDT is built in memory; literals with special characters are stored N-Triples-escaped, following the Rust hdt crate. :param graph: The IRI of the graph to write. :param prefixes: The prefixes that will be used in turtle serialization. """ @@ -1107,8 +1108,8 @@ class Model: def size(self, graph:str=None) -> int: """ - Get the number of triples in a graph. - + Get the number of triples in a graph. + :param graph: The named graph we are returning the size for :return: The inferred N-Tuples. """ @@ -1176,4 +1177,4 @@ class VirtualizedDatabase: :param:database: An instance of a class containing a query method. :param:resource_sql_map: A dict providing a sqlalchemy Select for each resource. :param:sql_dialect: The SQL dialect accepted by the query method. - """ \ No newline at end of file + """ diff --git a/py_maplib/src/lib.rs b/py_maplib/src/lib.rs index efee53b5..6657f2e2 100644 --- a/py_maplib/src/lib.rs +++ b/py_maplib/src/lib.rs @@ -1578,6 +1578,12 @@ fn reads_mutex( let graph = parse_optional_named_node(graph)?; let named_graph = NamedGraph::from_maybe_named_node(graph.as_ref()); let format = resolve_format(format).map_err(PyMaplibError::from)?; + if format == ExtendedRdfFormat::HDT { + return Err(PyMaplibError::FunctionArgumentError( + "HDT is a binary format, use read() instead of reads()".to_string(), + ) + .into()); + } inner .reads( s, @@ -1603,18 +1609,34 @@ fn write_triples_mutex( prefixes: Option>, ) -> PyResult<()> { let format = if let Some(format) = format { - resolve_normal_format(&format).map_err(PyMaplibError::from)? + resolve_format(&format).map_err(PyMaplibError::from)? } else { - RdfFormat::NTriples + ExtendedRdfFormat::Normal(RdfFormat::NTriples) }; + if format == ExtendedRdfFormat::CIMXML { + return Err(PyMaplibError::FunctionArgumentError( + "Use write_cim_xml to write CIM XML".to_string(), + ) + .into()); + } let path_buf = PathBuf::from(file_path); let mut actual_file = File::create(path_buf.as_path()) .map_err(|x| PyMaplibError::from(MaplibError::FileCreateIOError(x)))?; let graph = parse_optional_named_node(graph)?; let named_graph = NamedGraph::from_maybe_named_node(graph.as_ref()); - inner - .write_triples(&mut actual_file, &named_graph, format, prefixes.as_ref()) - .unwrap(); + match format { + ExtendedRdfFormat::Normal(format) => { + inner + .write_triples(&mut actual_file, &named_graph, format, prefixes.as_ref()) + .unwrap(); + } + ExtendedRdfFormat::HDT => { + inner + .write_hdt(&mut actual_file, &named_graph) + .map_err(PyMaplibError::from)?; + } + ExtendedRdfFormat::CIMXML => unreachable!("rejected above"), + } Ok(()) } @@ -1686,6 +1708,12 @@ fn writes_mutex( prefixes: Option>, ) -> PyResult { let format = if let Some(format) = format { + if format.eq_ignore_ascii_case("hdt") { + return Err(PyMaplibError::FunctionArgumentError( + "HDT is a binary format, use write() instead of writes()".to_string(), + ) + .into()); + } resolve_normal_format(&format).map_err(PyMaplibError::from)? } else { RdfFormat::NTriples @@ -1957,6 +1985,7 @@ fn resolve_normal_format(format: &str) -> Result { fn resolve_format(format: &str) -> Result { match format.to_lowercase().as_str() { "cim" | "cim/xml" | "cimxml" => Ok(ExtendedRdfFormat::CIMXML), + "hdt" => Ok(ExtendedRdfFormat::HDT), _ => match resolve_normal_format(format) { Ok(o) => Ok(ExtendedRdfFormat::Normal(o)), Err(e) => Err(e), diff --git a/py_maplib/tests/test_hdt.py b/py_maplib/tests/test_hdt.py new file mode 100644 index 00000000..495b257c --- /dev/null +++ b/py_maplib/tests/test_hdt.py @@ -0,0 +1,171 @@ +import pathlib + +import polars as pl +import pytest +import rdflib +from rdflib.compare import isomorphic + +from maplib import Model + +pl.Config.set_fmt_str_lengths(300) + +PATH_HERE = pathlib.Path(__file__).parent +TESTDATA_PATH = PATH_HERE / "testdata" + + +def model_as_rdflib_graph(m: Model) -> rdflib.Graph: + g = rdflib.Graph() + g.parse(data=m.writes(format="ntriples"), format="nt") + return g + + +def test_write_read_hdt_round_trip_is_isomorphic(tmp_path): + m = Model() + m.read(str(TESTDATA_PATH / "read_ntriples.nt")) + hdt_path = tmp_path / "out.hdt" + m.write(str(hdt_path), format="hdt") + + m2 = Model() + m2.read(str(hdt_path), format="hdt") + + assert isomorphic(model_as_rdflib_graph(m), model_as_rdflib_graph(m2)) + + +def test_read_hdt_infers_format_from_file_extension(tmp_path): + m = Model() + m.read(str(TESTDATA_PATH / "read_ntriples.nt")) + hdt_path = tmp_path / "out.hdt" + m.write(str(hdt_path), format="hdt") + + m2 = Model() + m2.read(str(hdt_path)) + res = m2.query( + """ + SELECT ?v ?o WHERE { + ?s ?v ?o . + } + """ + ) + assert res.height == 8 + + +def test_hdt_round_trip_preserves_typed_and_language_literals(tmp_path): + m = Model() + m.reads( + """ + "plain" . + "hello"@en . + "42"^^ . + "1.5"^^ . + "line1\\nline2 \\"quoted\\"" . + . + """, + format="ntriples", + ) + hdt_path = tmp_path / "literals.hdt" + m.write(str(hdt_path), format="hdt") + + m2 = Model() + m2.read(str(hdt_path)) + + assert isomorphic(model_as_rdflib_graph(m), model_as_rdflib_graph(m2)) + + +def test_empty_graph_hdt_round_trip(tmp_path): + m = Model() + m.reads(" .", format="ntriples") + m.update("DELETE WHERE { ?s ?p ?o }") + hdt_path = tmp_path / "empty.hdt" + m.write(str(hdt_path), format="hdt") + + m2 = Model() + m2.read(str(hdt_path), format="hdt") + res = m2.query("SELECT ?s ?p ?o WHERE { ?s ?p ?o }") + assert res.height == 0 + + +def test_writes_with_hdt_format_raises_since_hdt_is_binary(): + m = Model() + m.reads(" .", format="ntriples") + with pytest.raises(Exception, match="binary"): + m.writes(format="hdt") + + +def test_reads_with_hdt_format_raises_since_hdt_is_binary(): + m = Model() + with pytest.raises(Exception, match="binary"): + m.reads("anything", format="hdt") + + +def test_hdt_round_trip_preserves_object_position_blank_nodes(tmp_path): + m = Model() + m.reads( + """ + _:b1 . + _:b1 _:b2 . + _:b2 "leaf" . + """, + format="ntriples", + ) + hdt_path = tmp_path / "blanks.hdt" + m.write(str(hdt_path), format="hdt") + + m2 = Model() + m2.read(str(hdt_path)) + + assert isomorphic(model_as_rdflib_graph(m), model_as_rdflib_graph(m2)) + + +def test_hdt_round_trip_preserves_backslashes_in_literals(tmp_path): + m = Model() + m.reads( + " \"C:\\\\new\\\\path\" .", + format="ntriples", + ) + hdt_path = tmp_path / "backslash.hdt" + m.write(str(hdt_path), format="hdt") + + m2 = Model() + m2.read(str(hdt_path)) + res = m2.query("SELECT ?o WHERE { ?s ?p ?o }") + assert res.get_column("o").to_list() == ["C:\\new\\path"] + + +def test_reading_truncated_hdt_file_raises_normal_exception(tmp_path): + m = Model() + m.read(str(TESTDATA_PATH / "read_ntriples.nt")) + hdt_path = tmp_path / "out.hdt" + m.write(str(hdt_path), format="hdt") + + content = hdt_path.read_bytes() + truncated_path = tmp_path / "truncated.hdt" + truncated_path.write_bytes(content[: len(content) // 2]) + + m2 = Model() + with pytest.raises(Exception, match="HDT"): + m2.read(str(truncated_path)) + + +def test_write_cim_xml_format_does_not_truncate_existing_file(tmp_path): + m = Model() + m.reads(" .", format="ntriples") + out_path = tmp_path / "out.xml" + out_path.write_text("precious") + with pytest.raises(Exception, match="write_cim_xml"): + m.write(str(out_path), format="cim/xml") + assert out_path.read_text() == "precious" + + +def test_hdt_round_trip_preserves_language_tags(tmp_path): + m = Model() + m.reads( + ' "hello"@en-US .', + format="ntriples", + ) + hdt_path = tmp_path / "lang.hdt" + m.write(str(hdt_path), format="hdt") + + m2 = Model() + m2.read(str(hdt_path)) + + assert m.writes(format="ntriples").strip() == m2.writes(format="ntriples").strip()