From 35bf67e38c593acc271c45ee9b7127c535ac8f16 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 15 Feb 2026 17:20:27 +0100 Subject: [PATCH 01/11] chore(deps): bump strum 0.27, rand 0.10, zip 7, imageproc 0.26, futures 0.3.32, uuid 1.21, jiff 0.2.20 Co-Authored-By: Claude Opus 4.6 --- .github/dependabot.yml | 7 +- Cargo.lock | 579 +++++++++++++++++++++++++++++++---------- Cargo.toml | 8 +- README.md | 9 +- 4 files changed, 457 insertions(+), 146 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index f68793c..c127dbb 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -1,5 +1,6 @@ version: 2 updates: + # Enable version updates for cargo - package-ecosystem: "cargo" directory: "/" schedule: @@ -11,7 +12,10 @@ updates: labels: - "chore" commit-message: - prefix: "chore(deps-rs)" + prefix: "chore(deps)" + prefix-development: "chore(deps-dev)" + rebase-strategy: "auto" + versioning-strategy: "auto" groups: rust-dependencies: patterns: @@ -60,6 +64,7 @@ updates: - "minor" - "patch" + # Version updates for GitHub Actions - package-ecosystem: "github-actions" directory: "/" schedule: diff --git a/Cargo.lock b/Cargo.lock index c8c1ce5..7fb9d4c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -41,7 +41,7 @@ checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" dependencies = [ "cfg-if", "cipher", - "cpufeatures", + "cpufeatures 0.2.17", ] [[package]] @@ -150,9 +150,6 @@ name = "arbitrary" version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" -dependencies = [ - "derive_arbitrary", -] [[package]] name = "arg_enum_proc_macro" @@ -343,21 +340,11 @@ dependencies = [ [[package]] name = "bzip2" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" -dependencies = [ - "bzip2-sys", -] - -[[package]] -name 
= "bzip2-sys" -version = "0.1.13+1.0.8" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" dependencies = [ - "cc", - "pkg-config", + "libbz2-rs-sys", ] [[package]] @@ -374,7 +361,7 @@ dependencies = [ "log", "quick-xml 0.38.4", "serde", - "zip 7.4.0", + "zip", ] [[package]] @@ -406,6 +393,17 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "chacha20" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "rand_core 0.10.0", +] + [[package]] name = "chrono" version = "0.4.43" @@ -452,9 +450,9 @@ checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "constant_time_eq" -version = "0.3.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" [[package]] name = "core-foundation" @@ -490,6 +488,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crc" version = "3.3.0" @@ -645,17 +652,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "derive_arbitrary" -version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" 
-dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.114", -] - [[package]] name = "derive_more" version = "0.99.20" @@ -947,9 +943,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" dependencies = [ "futures-channel", "futures-core", @@ -962,9 +958,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" dependencies = [ "futures-core", "futures-sink", @@ -972,15 +968,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" [[package]] name = "futures-executor" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" dependencies = [ "futures-core", "futures-task", @@ -989,15 +985,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" [[package]] name = "futures-macro" -version = "0.3.31" +version = "0.3.32" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", @@ -1006,21 +1002,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" [[package]] name = "futures-task" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" [[package]] name = "futures-util" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" dependencies = [ "futures-channel", "futures-core", @@ -1030,7 +1026,6 @@ dependencies = [ "futures-task", "memchr", "pin-project-lite", - "pin-utils", "slab", ] @@ -1069,10 +1064,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ "cfg-if", - "js-sys", "libc", "wasi", - "wasm-bindgen", ] [[package]] @@ -1089,6 +1082,22 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "139ef39800118c7683f2fd3c98c1b23c09ae076556b435f8e9064ae108aaeeec" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "r-efi", + "rand_core 0.10.0", + "wasip2", + "wasip3", + "wasm-bindgen", +] + 
[[package]] name = "gif" version = "0.14.1" @@ -1099,6 +1108,102 @@ dependencies = [ "weezl", ] +[[package]] +name = "glam" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "333928d5eb103c5d4050533cec0384302db6be8ef7d3cebd30ec6a35350353da" + +[[package]] +name = "glam" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3abb554f8ee44336b72d522e0a7fe86a29e09f839a36022fa869a7dfe941a54b" + +[[package]] +name = "glam" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4126c0479ccf7e8664c36a2d719f5f2c140fbb4f9090008098d2c291fa5b3f16" + +[[package]] +name = "glam" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01732b97afd8508eee3333a541b9f7610f454bb818669e66e90f5f57c93a776" + +[[package]] +name = "glam" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525a3e490ba77b8e326fb67d4b44b4bd2f920f44d4cc73ccec50adc68e3bee34" + +[[package]] +name = "glam" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b8509e6791516e81c1a630d0bd7fbac36d2fa8712a9da8662e716b52d5051ca" + +[[package]] +name = "glam" +version = "0.20.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f43e957e744be03f5801a55472f593d43fabdebf25a4585db250f04d86b1675f" + +[[package]] +name = "glam" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "518faa5064866338b013ff9b2350dc318e14cc4fcd6cb8206d7e7c9886c98815" + +[[package]] +name = "glam" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12f597d56c1bd55a811a1be189459e8fad2bbc272616375602443bdfb37fa774" + +[[package]] +name = "glam" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8e4afd9ad95555081e109fe1d21f2a30c691b5f0919c67dfa690a2e1eb6bd51c" + +[[package]] +name = "glam" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5418c17512bdf42730f9032c74e1ae39afc408745ebb2acf72fbc4691c17945" + +[[package]] +name = "glam" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "151665d9be52f9bb40fc7966565d39666f2d1e69233571b71b87791c7e0528b3" + +[[package]] +name = "glam" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e05e7e6723e3455f4818c7b26e855439f7546cf617ef669d1adedb8669e5cb9" + +[[package]] +name = "glam" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "779ae4bf7e8421cf91c0b3b64e7e8b40b862fba4d393f59150042de7c4965a94" + +[[package]] +name = "glam" +version = "0.29.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8babf46d4c1c9d92deac9f7be466f76dfc4482b6452fc5024b5e8daf6ffeb3ee" + +[[package]] +name = "glam" +version = "0.30.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19fc433e8437a212d1b6f1e68c7824af3aed907da60afa994e7f542d18d12aa9" + [[package]] name = "h2" version = "0.4.13" @@ -1406,6 +1511,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "idna" version = "1.1.0" @@ -1463,20 +1574,21 @@ dependencies = [ [[package]] name = "imageproc" -version = "0.25.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2393fb7808960751a52e8a154f67e7dd3f8a2ef9bd80d1553078a7b4e8ed3f0d" +checksum = "3880a67ffee492100224b2ff31f4220ade971fc327c3c96faa63095a3825ea4d" dependencies = [ "ab_glyph", "approx", - "getrandom 0.2.17", + "getrandom 0.3.4", 
"image", - "itertools 0.12.1", + "itertools", "nalgebra", "num", - "rand 0.8.5", + "rand 0.9.2", "rand_distr", "rayon", + "rustdct", ] [[package]] @@ -1493,6 +1605,8 @@ checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" dependencies = [ "equivalent", "hashbrown 0.16.1", + "serde", + "serde_core", ] [[package]] @@ -1555,15 +1669,6 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" -[[package]] -name = "itertools" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.14.0" @@ -1581,9 +1686,9 @@ checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "jiff" -version = "0.2.19" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d89a5b5e10d5a9ad6e5d1f4bd58225f655d6fe9767575a5e8ac5a6fe64e04495" +checksum = "c867c356cc096b33f4981825ab281ecba3db0acefe60329f044c1789d94c6543" dependencies = [ "jiff-static", "jiff-tzdb-platform", @@ -1596,9 +1701,9 @@ dependencies = [ [[package]] name = "jiff-static" -version = "0.2.19" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff7a39c8862fc1369215ccf0a8f12dd4598c7f6484704359f0351bd617034dbf" +checksum = "f7946b4325269738f270bb55b3c19ab5c5040525f83fd625259422a9d25d9be5" dependencies = [ "proc-macro2", "quote", @@ -1646,12 +1751,24 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "lebe" version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8" +[[package]] +name = "libbz2-rs-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" + [[package]] name = "libc" version = "0.2.181" @@ -1731,24 +1848,13 @@ dependencies = [ ] [[package]] -name = "lzma-rs" -version = "0.3.0" +name = "lzma-rust2" +version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e" +checksum = "1670343e58806300d87950e3401e820b519b9384281bbabfb15e3636689ffd69" dependencies = [ - "byteorder", "crc", -] - -[[package]] -name = "lzma-sys" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" -dependencies = [ - "cc", - "libc", - "pkg-config", + "sha2", ] [[package]] @@ -1921,11 +2027,27 @@ dependencies = [ [[package]] name = "nalgebra" -version = "0.32.6" +version = "0.34.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b5c17de023a86f59ed79891b2e5d5a94c705dbe904a5b5c9c952ea6221b03e4" +checksum = "c4d5b3eff5cd580f93da45e64715e8c20a3996342f1e466599cf7a267a0c2f5f" dependencies = [ "approx", + "glam 0.14.0", + "glam 0.15.2", + "glam 0.16.0", + "glam 0.17.3", + "glam 0.18.0", + "glam 0.19.0", + "glam 0.20.5", + "glam 0.21.3", + "glam 0.22.0", + "glam 0.23.0", + "glam 0.24.2", + "glam 0.25.0", + "glam 0.27.0", + "glam 0.28.0", + "glam 0.29.3", + "glam 0.30.10", "matrixmultiply", "num-complex", "num-rational", @@ -1988,7 +2110,6 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" dependencies = [ - "num-bigint", "num-complex", "num-integer", "num-iter", @@ -2105,7 +2226,7 @@ dependencies = [ "nvisy-core", "nvisy-ontology", "petgraph", - "rand 0.9.2", + "rand 0.10.0", "schemars", "serde", "serde_json", @@ -2138,7 +2259,7 @@ dependencies = [ "tokio", "tracing", "uuid", - "zip 2.4.2", + "zip", ] [[package]] @@ -2490,6 +2611,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "ppmd-rust" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efca4c95a19a79d1c98f791f10aebd5c1363b473244630bb7dbde1dc98455a24" + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -2505,6 +2632,25 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn 2.0.114", +] + +[[package]] +name = "primal-check" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc0d895b311e3af9902528fbb8f928688abbd95872819320517cc24ca6b2bd08" +dependencies = [ + "num-integer", +] + [[package]] name = "proc-macro2" version = "1.0.106" @@ -2676,6 +2822,17 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" +dependencies = [ + "chacha20", + "getrandom 0.4.1", + "rand_core 0.10.0", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -2714,14 +2871,20 @@ dependencies = [ 
"getrandom 0.3.4", ] +[[package]] +name = "rand_core" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba" + [[package]] name = "rand_distr" -version = "0.4.3" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" dependencies = [ "num-traits", - "rand 0.8.5", + "rand 0.9.2", ] [[package]] @@ -2746,7 +2909,7 @@ dependencies = [ "built", "cfg-if", "interpolate_name", - "itertools 0.14.0", + "itertools", "libc", "libfuzzer-sys", "log", @@ -2909,6 +3072,29 @@ version = "0.8.52" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c6a884d2998352bb4daf0183589aec883f16a6da1f4dde84d8e2e9a5409a1ce" +[[package]] +name = "rustdct" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b61555105d6a9bf98797c063c362a1d24ed8ab0431655e38f1cf51e52089551" +dependencies = [ + "rustfft", +] + +[[package]] +name = "rustfft" +version = "6.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21db5f9893e91f41798c88680037dba611ca6674703c1a18601b01a72c8adb89" +dependencies = [ + "num-complex", + "num-integer", + "num-traits", + "primal-check", + "strength_reduce", + "transpose", +] + [[package]] name = "rustix" version = "1.1.3" @@ -3153,7 +3339,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "digest", ] @@ -3164,7 +3350,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", 
"digest", ] @@ -3186,9 +3372,9 @@ dependencies = [ [[package]] name = "simba" -version = "0.8.1" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061507c94fc6ab4ba1c9a0305018408e312e17c041eb63bef8aa726fa33aceae" +checksum = "c99284beb21666094ba2b75bbceda012e610f5479dfcc2d6e2426f53197ffd95" dependencies = [ "approx", "num-complex", @@ -3252,6 +3438,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "strength_reduce" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" + [[package]] name = "string_cache" version = "0.8.9" @@ -3279,23 +3471,22 @@ dependencies = [ [[package]] name = "strum" -version = "0.26.3" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" dependencies = [ "strum_macros", ] [[package]] name = "strum_macros" -version = "0.26.4" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" dependencies = [ "heck", "proc-macro2", "quote", - "rustversion", "syn 2.0.114", ] @@ -3360,7 +3551,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1" dependencies = [ "fastrand", - "getrandom 0.3.4", + "getrandom 0.4.1", "once_cell", "rustix", "windows-sys 0.61.2", @@ -3419,6 +3610,7 @@ checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", "itoa", + "js-sys", 
"num-conv", "powerfmt", "serde_core", @@ -3605,6 +3797,16 @@ dependencies = [ "once_cell", ] +[[package]] +name = "transpose" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e" +dependencies = [ + "num-integer", + "strength_reduce", +] + [[package]] name = "try-lock" version = "0.2.5" @@ -3709,11 +3911,11 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.20.0" +version = "1.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee48d38b119b0cd71fe4141b30f5ba9c7c5d9f4e7a3a8b4a674e4b6ef789976f" +checksum = "b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb" dependencies = [ - "getrandom 0.3.4", + "getrandom 0.4.1", "js-sys", "serde_core", "wasm-bindgen", @@ -3766,6 +3968,15 @@ dependencies = [ "wit-bindgen", ] +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen", +] + [[package]] name = "wasm-bindgen" version = "0.2.108" @@ -3825,6 +4036,28 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + [[package]] name = "wasm-streams" version = "0.4.2" @@ -3838,6 +4071,18 @@ dependencies = [ "web-sys", ] +[[package]] +name = "wasmparser" 
+version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + [[package]] name = "web-sys" version = "0.3.85" @@ -4011,6 +4256,88 @@ name = "wit-bindgen" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn 2.0.114", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.114", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] [[package]] name = "writeable" @@ -4033,15 +4360,6 @@ dependencies = [ "xml-rs", ] -[[package]] -name = "xz2" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" -dependencies = [ - "lzma-sys", -] - [[package]] name = "y4m" version = "0.8.0" @@ -4167,48 +4485,31 @@ dependencies = [ [[package]] name = "zip" -version = "2.4.2" +version = "7.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50" +checksum = "cc12baa6db2b15a140161ce53d72209dacea594230798c24774139b54ecaa980" dependencies = [ "aes", - "arbitrary", "bzip2", "constant_time_eq", "crc32fast", - "crossbeam-utils", "deflate64", - "displaydoc", "flate2", - "getrandom 0.3.4", + "getrandom 0.4.1", "hmac", "indexmap", - "lzma-rs", + "lzma-rust2", "memchr", "pbkdf2", + "ppmd-rust", "sha1", - "thiserror", "time", - "xz2", + "typed-path", "zeroize", "zopfli", "zstd", ] -[[package]] -name = "zip" -version = "7.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc12baa6db2b15a140161ce53d72209dacea594230798c24774139b54ecaa980" -dependencies = [ - "crc32fast", - "flate2", - "indexmap", - "memchr", - "typed-path", - "zopfli", -] - [[package]] name = "zlib-rs" version = "0.6.0" diff --git a/Cargo.toml b/Cargo.toml index f62acce..55ffb06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -59,7 +59,7 @@ schemars = { version = "1", features = ["uuid1", "bytes1"] } thiserror = { version = "2.0", features = [] } anyhow = { version = "1.0", features = [] } derive_more = { version = "1", features = ["display", 
"from"] } -strum = { version = "0.26", features = ["derive"] } +strum = { version = "0.27", features = ["derive"] } # Primitive datatypes uuid = { version = "1", features = ["serde", "v4", "v7"] } @@ -88,14 +88,14 @@ minio = { version = "0.3", features = [] } # Image processing image = { version = "0.25", default-features = false, features = ["png", "jpeg", "tiff"] } -imageproc = { version = "0.25", features = [] } +imageproc = { version = "0.26", features = [] } # Document parsing pdf-extract = { version = "0.7", features = [] } lopdf = { version = "0.34", features = [] } scraper = { version = "0.22", features = [] } calamine = { version = "0.33", features = [] } -zip = { version = "2", features = [] } +zip = { version = "7", features = [] } quick-xml = { version = "0.37", features = [] } # Semantic versioning @@ -105,4 +105,4 @@ semver = { version = "1", features = ["serde"] } tempfile = { version = "3", features = [] } # Randomness -rand = { version = "0.9", features = [] } +rand = { version = "0.10", features = [] } diff --git a/README.md b/README.md index 7201a3e..e651886 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,11 @@ make help # list all targets ## Documentation -See [`docs/`](docs/) for architecture and development documentation. +See [`docs/`](docs/) for architecture, security, and API documentation. + +## Changelog + +See [CHANGELOG.md](CHANGELOG.md) for release notes and version history. 
## License @@ -49,5 +53,6 @@ Apache 2.0 License, see [LICENSE.txt](LICENSE.txt) ## Support - **Documentation**: [docs.nvisy.com](https://docs.nvisy.com) -- **Issues**: [GitHub Issues](https://github.com/nvisycom/runtime/issues) +- **Issues**: [GitHub Issues](https://github.com/nvisycom/server/issues) - **Email**: [support@nvisy.com](mailto:support@nvisy.com) +- **API Status**: [nvisy.openstatus.dev](https://nvisy.openstatus.dev) From 86497be46bff8cc317802d2da4cfd073975b07ac Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 16 Feb 2026 07:42:05 +0100 Subject: [PATCH 02/11] refactor: rename nvisy-ingest to nvisy-codec, restructure render modules, extract text redaction primitives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename nvisy-ingest crate to nvisy-codec and update all workspace references. Split render/image.rs into a folder-module (mod.rs, blur.rs, block.rs) with improved documentation. Create render/text/ module and move text redaction primitives (PendingRedaction → PendingReplacement, apply_text_redactions → apply_replacements, apply_cell_redaction → mask_cell, hash_string) from pipeline's apply.rs into nvisy-codec. Enable jpeg alongside png for the pipeline's image-redaction feature. Remove old pipeline render/block.rs and render/blur.rs modules. 
Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 58 ++++----- Cargo.toml | 4 +- .../{nvisy-ingest => nvisy-codec}/Cargo.toml | 15 ++- crates/nvisy-codec/README.md | 8 ++ .../src/document/edit_stream.rs | 0 .../src/document/mod.rs | 0 .../src/document/view_stream.rs | 0 .../src/handler/audio/mod.rs | 0 .../src/handler/audio/mp3.rs | 0 .../src/handler/audio/wav.rs | 0 .../src/handler/document/docx.rs | 0 .../src/handler/document/mod.rs | 0 .../src/handler/document/pdf.rs | 0 .../src/handler/encoding.rs | 0 .../src/handler/image/jpeg.rs | 0 .../src/handler/image/mod.rs | 2 +- .../src/handler/image/png.rs | 20 ++- .../src/handler/mod.rs | 2 + .../src/handler/span.rs | 0 .../src/handler/tabular/mod.rs | 0 .../src/handler/tabular/xlsx.rs | 0 .../src/handler/text/csv_handler.rs | 0 .../src/handler/text/csv_loader.rs | 0 .../src/handler/text/html.rs | 0 .../src/handler/text/json_handler.rs | 0 .../src/handler/text/json_loader.rs | 0 .../src/handler/text/mod.rs | 0 .../src/handler/text/txt_handler.rs | 0 .../src/handler/text/txt_loader.rs | 0 .../{nvisy-ingest => nvisy-codec}/src/lib.rs | 1 + .../src/prelude.rs | 0 .../src/render/image}/block.rs | 17 ++- .../src/render/image}/blur.rs | 18 ++- crates/nvisy-codec/src/render/image/mod.rs | 54 ++++++++ crates/nvisy-codec/src/render/mod.rs | 8 ++ crates/nvisy-codec/src/render/text/mask.rs | 47 +++++++ crates/nvisy-codec/src/render/text/mod.rs | 17 +++ crates/nvisy-codec/src/render/text/replace.rs | 44 +++++++ crates/nvisy-ingest/README.md | 9 -- crates/nvisy-object/src/client/mod.rs | 7 + crates/nvisy-object/src/providers/s3.rs | 9 +- crates/nvisy-pipeline/Cargo.toml | 15 +-- .../src/detection/dictionary.rs | 4 +- crates/nvisy-pipeline/src/detection/ner.rs | 6 +- crates/nvisy-pipeline/src/detection/regex.rs | 4 +- .../nvisy-pipeline/src/detection/tabular.rs | 4 +- crates/nvisy-pipeline/src/generation/ocr.rs | 4 +- .../src/generation/transcribe.rs | 4 +- crates/nvisy-pipeline/src/provider.rs | 24 ++-- 
crates/nvisy-pipeline/src/redaction/apply.rs | 123 +++--------------- .../src/redaction/evaluate_policy.rs | 9 +- crates/nvisy-pipeline/src/redaction/mod.rs | 3 - .../src/redaction/render/mod.rs | 6 - crates/nvisy-python/Cargo.toml | 7 +- crates/nvisy-python/src/actions/mod.rs | 47 +++---- crates/nvisy-python/src/actions/ocr.rs | 34 +++-- crates/nvisy-python/src/bridge/mod.rs | 7 + crates/nvisy-python/src/prelude.rs | 5 +- crates/nvisy-python/src/provider/mod.rs | 11 +- 59 files changed, 389 insertions(+), 268 deletions(-) rename crates/{nvisy-ingest => nvisy-codec}/Cargo.toml (83%) create mode 100644 crates/nvisy-codec/README.md rename crates/{nvisy-ingest => nvisy-codec}/src/document/edit_stream.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/document/mod.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/document/view_stream.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/audio/mod.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/audio/mp3.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/audio/wav.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/document/docx.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/document/mod.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/document/pdf.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/encoding.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/image/jpeg.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/image/mod.rs (59%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/image/png.rs (57%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/mod.rs (97%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/span.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/tabular/mod.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/tabular/xlsx.rs (100%) rename crates/{nvisy-ingest => 
nvisy-codec}/src/handler/text/csv_handler.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/text/csv_loader.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/text/html.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/text/json_handler.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/text/json_loader.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/text/mod.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/text/txt_handler.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/handler/text/txt_loader.rs (100%) rename crates/{nvisy-ingest => nvisy-codec}/src/lib.rs (91%) rename crates/{nvisy-ingest => nvisy-codec}/src/prelude.rs (100%) rename crates/{nvisy-pipeline/src/redaction/render => nvisy-codec/src/render/image}/block.rs (60%) rename crates/{nvisy-pipeline/src/redaction/render => nvisy-codec/src/render/image}/blur.rs (59%) create mode 100644 crates/nvisy-codec/src/render/image/mod.rs create mode 100644 crates/nvisy-codec/src/render/mod.rs create mode 100644 crates/nvisy-codec/src/render/text/mask.rs create mode 100644 crates/nvisy-codec/src/render/text/mod.rs create mode 100644 crates/nvisy-codec/src/render/text/replace.rs delete mode 100644 crates/nvisy-ingest/README.md delete mode 100644 crates/nvisy-pipeline/src/redaction/render/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 7fb9d4c..ee53333 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2194,6 +2194,32 @@ dependencies = [ "libm", ] +[[package]] +name = "nvisy-codec" +version = "0.1.0" +dependencies = [ + "async-trait", + "bytes", + "calamine", + "csv", + "futures", + "image", + "imageproc", + "infer", + "lopdf", + "nvisy-core", + "nvisy-ontology", + "pdf-extract", + "quick-xml 0.37.5", + "scraper", + "serde", + "serde_json", + "tokio", + "tracing", + "uuid", + "zip", +] + [[package]] name = "nvisy-core" version = "0.1.0" @@ -2237,31 +2263,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "nvisy-ingest" 
-version = "0.1.0" -dependencies = [ - "async-trait", - "bytes", - "calamine", - "csv", - "futures", - "image", - "infer", - "lopdf", - "nvisy-core", - "nvisy-ontology", - "pdf-extract", - "quick-xml 0.37.5", - "scraper", - "serde", - "serde_json", - "tokio", - "tracing", - "uuid", - "zip", -] - [[package]] name = "nvisy-object" version = "0.1.0" @@ -2309,12 +2310,9 @@ version = "0.1.0" dependencies = [ "aho-corasick", "async-trait", - "bytes", - "image", - "imageproc", "jiff", + "nvisy-codec", "nvisy-core", - "nvisy-ingest", "nvisy-ontology", "nvisy-pattern", "regex", @@ -2330,8 +2328,8 @@ name = "nvisy-python" version = "0.1.0" dependencies = [ "async-trait", + "nvisy-codec", "nvisy-core", - "nvisy-ingest", "nvisy-ontology", "nvisy-pipeline", "pyo3", diff --git a/Cargo.toml b/Cargo.toml index 55ffb06..0e2cf50 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ resolver = "2" members = [ "./crates/nvisy-core", "./crates/nvisy-engine", - "./crates/nvisy-ingest", + "./crates/nvisy-codec", "./crates/nvisy-object", "./crates/nvisy-pattern", "./crates/nvisy-pipeline", @@ -34,7 +34,7 @@ documentation = "https://docs.rs/nvisy-runtime" # Internal crates nvisy-core = { path = "./crates/nvisy-core", version = "0.1.0" } nvisy-engine = { path = "./crates/nvisy-engine", version = "0.1.0" } -nvisy-ingest = { path = "./crates/nvisy-ingest", version = "0.1.0" } +nvisy-codec = { path = "./crates/nvisy-codec", version = "0.1.0" } nvisy-object = { path = "./crates/nvisy-object", version = "0.1.0" } nvisy-pattern = { path = "./crates/nvisy-pattern", version = "0.1.0" } nvisy-pipeline = { path = "./crates/nvisy-pipeline", version = "0.1.0" } diff --git a/crates/nvisy-ingest/Cargo.toml b/crates/nvisy-codec/Cargo.toml similarity index 83% rename from crates/nvisy-ingest/Cargo.toml rename to crates/nvisy-codec/Cargo.toml index d367962..b17ef7d 100644 --- a/crates/nvisy-ingest/Cargo.toml +++ b/crates/nvisy-codec/Cargo.toml @@ -1,10 +1,10 @@ # 
https://doc.rust-lang.org/cargo/reference/manifest.html [package] -name = "nvisy-ingest" -description = "File-format loaders and unified Document type for the Nvisy multimodal redaction platform" -keywords = ["nvisy", "ingest", "loader", "pdf", "docx"] -categories = ["parser-implementations"] +name = "nvisy-codec" +description = "File-format codecs — read, edit, and write documents for the Nvisy multimodal redaction platform" +keywords = ["nvisy", "codec", "loader", "pdf", "docx"] +categories = ["parser-implementations", "encoding"] version = { workspace = true } rust-version = { workspace = true } @@ -33,9 +33,9 @@ html = ["dep:scraper"] xlsx = ["dep:calamine"] # Convenience alias: all image formats image = ["jpeg", "png"] -# Individual image format handlers (each requires dep:image) -jpeg = ["dep:image"] -png = ["dep:image"] +# Individual image format handlers (each pulls in image + imageproc for rendering) +jpeg = ["dep:image", "dep:imageproc"] +png = ["dep:image", "dep:imageproc"] # Audio format handlers (no additional dependencies) wav = [] mp3 = [] @@ -75,6 +75,7 @@ quick-xml = { workspace = true, optional = true, features = [] } scraper = { workspace = true, optional = true, features = [] } calamine = { workspace = true, optional = true, features = [] } image = { workspace = true, optional = true, features = [] } +imageproc = { workspace = true, optional = true, features = [] } [dev-dependencies] tokio = { workspace = true, features = ["macros", "rt"] } diff --git a/crates/nvisy-codec/README.md b/crates/nvisy-codec/README.md new file mode 100644 index 0000000..8b783eb --- /dev/null +++ b/crates/nvisy-codec/README.md @@ -0,0 +1,8 @@ +# nvisy-codec + +File-format codecs for the Nvisy multimodal redaction platform. + +This crate provides handlers for reading, editing, and writing PDF, DOCX, +HTML, Image, XLSX, Audio, CSV, JSON, and plain-text files. 
Each handler +implements the [`Handler`](crate::handler::Handler) trait and provides +span-based access to content for detection and redaction. diff --git a/crates/nvisy-ingest/src/document/edit_stream.rs b/crates/nvisy-codec/src/document/edit_stream.rs similarity index 100% rename from crates/nvisy-ingest/src/document/edit_stream.rs rename to crates/nvisy-codec/src/document/edit_stream.rs diff --git a/crates/nvisy-ingest/src/document/mod.rs b/crates/nvisy-codec/src/document/mod.rs similarity index 100% rename from crates/nvisy-ingest/src/document/mod.rs rename to crates/nvisy-codec/src/document/mod.rs diff --git a/crates/nvisy-ingest/src/document/view_stream.rs b/crates/nvisy-codec/src/document/view_stream.rs similarity index 100% rename from crates/nvisy-ingest/src/document/view_stream.rs rename to crates/nvisy-codec/src/document/view_stream.rs diff --git a/crates/nvisy-ingest/src/handler/audio/mod.rs b/crates/nvisy-codec/src/handler/audio/mod.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/audio/mod.rs rename to crates/nvisy-codec/src/handler/audio/mod.rs diff --git a/crates/nvisy-ingest/src/handler/audio/mp3.rs b/crates/nvisy-codec/src/handler/audio/mp3.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/audio/mp3.rs rename to crates/nvisy-codec/src/handler/audio/mp3.rs diff --git a/crates/nvisy-ingest/src/handler/audio/wav.rs b/crates/nvisy-codec/src/handler/audio/wav.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/audio/wav.rs rename to crates/nvisy-codec/src/handler/audio/wav.rs diff --git a/crates/nvisy-ingest/src/handler/document/docx.rs b/crates/nvisy-codec/src/handler/document/docx.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/document/docx.rs rename to crates/nvisy-codec/src/handler/document/docx.rs diff --git a/crates/nvisy-ingest/src/handler/document/mod.rs b/crates/nvisy-codec/src/handler/document/mod.rs similarity index 100% rename from 
crates/nvisy-ingest/src/handler/document/mod.rs rename to crates/nvisy-codec/src/handler/document/mod.rs diff --git a/crates/nvisy-ingest/src/handler/document/pdf.rs b/crates/nvisy-codec/src/handler/document/pdf.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/document/pdf.rs rename to crates/nvisy-codec/src/handler/document/pdf.rs diff --git a/crates/nvisy-ingest/src/handler/encoding.rs b/crates/nvisy-codec/src/handler/encoding.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/encoding.rs rename to crates/nvisy-codec/src/handler/encoding.rs diff --git a/crates/nvisy-ingest/src/handler/image/jpeg.rs b/crates/nvisy-codec/src/handler/image/jpeg.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/image/jpeg.rs rename to crates/nvisy-codec/src/handler/image/jpeg.rs diff --git a/crates/nvisy-ingest/src/handler/image/mod.rs b/crates/nvisy-codec/src/handler/image/mod.rs similarity index 59% rename from crates/nvisy-ingest/src/handler/image/mod.rs rename to crates/nvisy-codec/src/handler/image/mod.rs index 46bcfdd..e3c4cb0 100644 --- a/crates/nvisy-ingest/src/handler/image/mod.rs +++ b/crates/nvisy-codec/src/handler/image/mod.rs @@ -1,4 +1,4 @@ -//! Image format handlers and shared decode helper. +//! Image format handlers. #[cfg(feature = "jpeg")] pub mod jpeg; diff --git a/crates/nvisy-ingest/src/handler/image/png.rs b/crates/nvisy-codec/src/handler/image/png.rs similarity index 57% rename from crates/nvisy-ingest/src/handler/image/png.rs rename to crates/nvisy-codec/src/handler/image/png.rs index a761862..fe0c02f 100644 --- a/crates/nvisy-ingest/src/handler/image/png.rs +++ b/crates/nvisy-codec/src/handler/image/png.rs @@ -1,13 +1,15 @@ //! PNG handler (stub — awaiting migration to Loader/Handler pattern). 
use bytes::Bytes; +use image::DynamicImage; -use nvisy_core::error::Error; +use nvisy_core::error::{Error, ErrorKind}; use nvisy_ontology::entity::DocumentType; use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; use crate::handler::Handler; +use crate::render::image::AsImage; #[derive(Debug, Clone)] pub struct PngHandler { @@ -44,3 +46,19 @@ impl Handler for PngHandler { Ok(()) } } + +impl AsImage for PngHandler { + fn decode(&self) -> Result { + image::load_from_memory(&self.bytes).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("PNG decode failed: {e}")) + }) + } + + fn encode(image: &DynamicImage) -> Result { + let mut buf = std::io::Cursor::new(Vec::new()); + image.write_to(&mut buf, image::ImageFormat::Png).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("PNG encode failed: {e}")) + })?; + Ok(Self::new(Bytes::from(buf.into_inner()))) + } +} diff --git a/crates/nvisy-ingest/src/handler/mod.rs b/crates/nvisy-codec/src/handler/mod.rs similarity index 97% rename from crates/nvisy-ingest/src/handler/mod.rs rename to crates/nvisy-codec/src/handler/mod.rs index 13fea3b..ebfc755 100644 --- a/crates/nvisy-ingest/src/handler/mod.rs +++ b/crates/nvisy-codec/src/handler/mod.rs @@ -38,6 +38,8 @@ pub use text::json_loader::{JsonParams, JsonLoader}; #[cfg(feature = "png")] pub use image::png::PngHandler; +#[cfg(any(feature = "png", feature = "jpeg"))] +pub use crate::render::image::AsImage; #[cfg(feature = "wav")] pub use audio::wav::WavHandler; diff --git a/crates/nvisy-ingest/src/handler/span.rs b/crates/nvisy-codec/src/handler/span.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/span.rs rename to crates/nvisy-codec/src/handler/span.rs diff --git a/crates/nvisy-ingest/src/handler/tabular/mod.rs b/crates/nvisy-codec/src/handler/tabular/mod.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/tabular/mod.rs rename to crates/nvisy-codec/src/handler/tabular/mod.rs diff --git 
a/crates/nvisy-ingest/src/handler/tabular/xlsx.rs b/crates/nvisy-codec/src/handler/tabular/xlsx.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/tabular/xlsx.rs rename to crates/nvisy-codec/src/handler/tabular/xlsx.rs diff --git a/crates/nvisy-ingest/src/handler/text/csv_handler.rs b/crates/nvisy-codec/src/handler/text/csv_handler.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/text/csv_handler.rs rename to crates/nvisy-codec/src/handler/text/csv_handler.rs diff --git a/crates/nvisy-ingest/src/handler/text/csv_loader.rs b/crates/nvisy-codec/src/handler/text/csv_loader.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/text/csv_loader.rs rename to crates/nvisy-codec/src/handler/text/csv_loader.rs diff --git a/crates/nvisy-ingest/src/handler/text/html.rs b/crates/nvisy-codec/src/handler/text/html.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/text/html.rs rename to crates/nvisy-codec/src/handler/text/html.rs diff --git a/crates/nvisy-ingest/src/handler/text/json_handler.rs b/crates/nvisy-codec/src/handler/text/json_handler.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/text/json_handler.rs rename to crates/nvisy-codec/src/handler/text/json_handler.rs diff --git a/crates/nvisy-ingest/src/handler/text/json_loader.rs b/crates/nvisy-codec/src/handler/text/json_loader.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/text/json_loader.rs rename to crates/nvisy-codec/src/handler/text/json_loader.rs diff --git a/crates/nvisy-ingest/src/handler/text/mod.rs b/crates/nvisy-codec/src/handler/text/mod.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/text/mod.rs rename to crates/nvisy-codec/src/handler/text/mod.rs diff --git a/crates/nvisy-ingest/src/handler/text/txt_handler.rs b/crates/nvisy-codec/src/handler/text/txt_handler.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/text/txt_handler.rs rename to 
crates/nvisy-codec/src/handler/text/txt_handler.rs diff --git a/crates/nvisy-ingest/src/handler/text/txt_loader.rs b/crates/nvisy-codec/src/handler/text/txt_loader.rs similarity index 100% rename from crates/nvisy-ingest/src/handler/text/txt_loader.rs rename to crates/nvisy-codec/src/handler/text/txt_loader.rs diff --git a/crates/nvisy-ingest/src/lib.rs b/crates/nvisy-codec/src/lib.rs similarity index 91% rename from crates/nvisy-ingest/src/lib.rs rename to crates/nvisy-codec/src/lib.rs index ed421f6..ecf5d8f 100644 --- a/crates/nvisy-ingest/src/lib.rs +++ b/crates/nvisy-codec/src/lib.rs @@ -4,6 +4,7 @@ pub mod handler; pub mod document; +pub mod render; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-ingest/src/prelude.rs b/crates/nvisy-codec/src/prelude.rs similarity index 100% rename from crates/nvisy-ingest/src/prelude.rs rename to crates/nvisy-codec/src/prelude.rs diff --git a/crates/nvisy-pipeline/src/redaction/render/block.rs b/crates/nvisy-codec/src/render/image/block.rs similarity index 60% rename from crates/nvisy-pipeline/src/redaction/render/block.rs rename to crates/nvisy-codec/src/render/image/block.rs index 05d69a3..eba1545 100644 --- a/crates/nvisy-pipeline/src/redaction/render/block.rs +++ b/crates/nvisy-codec/src/render/image/block.rs @@ -1,17 +1,24 @@ -//! Solid color block overlay for image regions. +//! Solid-color block overlay rendering for bounding-box regions. +//! +//! For each region the algorithm creates an opaque [`RgbaImage`] rectangle +//! filled with the requested colour and composites it onto the target image +//! using alpha-over blending. Regions are clamped to image bounds. -use image::{DynamicImage, Rgba, RgbaImage}; +use ::image::{DynamicImage, Rgba, RgbaImage}; use nvisy_ontology::entity::BoundingBox; /// Apply a solid color block overlay to the specified regions of an image. 
/// /// Each [`BoundingBox`] describes a rectangular region (in pixel coordinates) -/// that will be covered with an opaque rectangle of the given `color`. +/// that will be covered with an opaque rectangle of the given `color` (RGBA). +/// The overlay is composited using alpha-over blending via +/// [`imageops::overlay`](::image::imageops::overlay). pub fn apply_block_overlay( image: &DynamicImage, regions: &[BoundingBox], - color: Rgba, + color: [u8; 4], ) -> DynamicImage { + let color = Rgba(color); let mut result = image.to_rgba8(); let img_w = result.width(); let img_h = result.height(); @@ -29,7 +36,7 @@ pub fn apply_block_overlay( let h = h.min(img_h - y); let block = RgbaImage::from_pixel(w, h, color); - image::imageops::overlay(&mut result, &block, x as i64, y as i64); + ::image::imageops::overlay(&mut result, &block, x as i64, y as i64); } DynamicImage::ImageRgba8(result) diff --git a/crates/nvisy-pipeline/src/redaction/render/blur.rs b/crates/nvisy-codec/src/render/image/blur.rs similarity index 59% rename from crates/nvisy-pipeline/src/redaction/render/blur.rs rename to crates/nvisy-codec/src/render/image/blur.rs index 468c49e..2d99acd 100644 --- a/crates/nvisy-pipeline/src/redaction/render/blur.rs +++ b/crates/nvisy-codec/src/render/image/blur.rs @@ -1,13 +1,23 @@ -//! Gaussian blur for image regions. +//! Gaussian blur rendering for bounding-box regions. +//! +//! The algorithm works per-region: +//! 1. Crop the rectangular area from the source image. +//! 2. Apply a gaussian blur with the given `sigma` to the cropped sub-image. +//! 3. Paste the blurred sub-image back over the original at the same position. +//! +//! Regions are clamped to image bounds so that out-of-range coordinates are +//! silently ignored rather than causing a panic. -use image::DynamicImage; +use ::image::DynamicImage; use imageproc::filter::gaussian_blur_f32; use nvisy_ontology::entity::BoundingBox; /// Apply gaussian blur to the specified regions of an image. 
/// /// Each [`BoundingBox`] describes a rectangular region (in pixel coordinates) -/// that will be blurred with the given `sigma` value. +/// that will be blurred with the given `sigma` value. The algorithm crops +/// each region, applies [`gaussian_blur_f32`], and pastes the result back, +/// leaving the rest of the image untouched. pub fn apply_gaussian_blur( image: &DynamicImage, regions: &[BoundingBox], @@ -36,7 +46,7 @@ pub fn apply_gaussian_blur( // Crop the region, blur it, paste it back let sub = result.crop_imm(x, y, w, h); let blurred = DynamicImage::ImageRgba8(gaussian_blur_f32(&sub.to_rgba8(), sigma)); - image::imageops::overlay(&mut result, &blurred, x as i64, y as i64); + ::image::imageops::overlay(&mut result, &blurred, x as i64, y as i64); } result diff --git a/crates/nvisy-codec/src/render/image/mod.rs b/crates/nvisy-codec/src/render/image/mod.rs new file mode 100644 index 0000000..3d043b8 --- /dev/null +++ b/crates/nvisy-codec/src/render/image/mod.rs @@ -0,0 +1,54 @@ +//! Image rendering primitives for redaction overlays. +//! +//! Provides gaussian blur and solid-color block overlay functions that +//! operate on [`DynamicImage`] values using bounding-box regions. +//! +//! # Trait +//! +//! [`AsImage`] is the main extension point: image format handlers implement +//! [`decode`](AsImage::decode) and [`encode`](AsImage::encode) to round-trip +//! through [`DynamicImage`], and then get [`blur`](AsImage::blur) and +//! [`block`](AsImage::block) convenience methods for free via default +//! implementations. +//! +//! # Sub-modules +//! +//! | Module | Description | +//! |--------|-------------| +//! | [`blur`] | Gaussian blur rendering | +//! | [`block`] | Solid-color block overlay rendering | + +mod blur; +mod block; + +pub use blur::apply_gaussian_blur; +pub use block::apply_block_overlay; + +use ::image::DynamicImage; +use nvisy_core::error::Error; +use nvisy_ontology::entity::BoundingBox; + +/// Trait for handlers that wrap a raster image. 
+/// +/// Provides [`decode`](Self::decode) / [`encode`](Self::encode) for +/// round-tripping through [`DynamicImage`], plus convenience methods for +/// applying blur and block-overlay redactions in a single call. +pub trait AsImage: Sized { + /// Decode the handler's raw bytes into a [`DynamicImage`]. + fn decode(&self) -> Result; + + /// Encode a [`DynamicImage`] back into a new handler instance. + fn encode(image: &DynamicImage) -> Result; + + /// Apply gaussian blur to the given bounding-box regions. + fn blur(&self, regions: &[BoundingBox], sigma: f32) -> Result { + let img = apply_gaussian_blur(&self.decode()?, regions, sigma); + Self::encode(&img) + } + + /// Apply a solid-color block overlay to the given bounding-box regions. + fn block(&self, regions: &[BoundingBox], color: [u8; 4]) -> Result { + let img = apply_block_overlay(&self.decode()?, regions, color); + Self::encode(&img) + } +} diff --git a/crates/nvisy-codec/src/render/mod.rs b/crates/nvisy-codec/src/render/mod.rs new file mode 100644 index 0000000..4492209 --- /dev/null +++ b/crates/nvisy-codec/src/render/mod.rs @@ -0,0 +1,8 @@ +//! Rendering primitives for redaction overlays. + +/// Image rendering: blur and block overlay for bounding-box regions. +#[cfg(any(feature = "png", feature = "jpeg"))] +pub mod image; + +/// Text rendering: byte-offset replacement engine and cell-level masking. +pub mod text; diff --git a/crates/nvisy-codec/src/render/text/mask.rs b/crates/nvisy-codec/src/render/text/mask.rs new file mode 100644 index 0000000..c7703bc --- /dev/null +++ b/crates/nvisy-codec/src/render/text/mask.rs @@ -0,0 +1,47 @@ +//! Cell-level masking and hashing utilities. +//! +//! These functions are used by tabular redaction actions to transform +//! individual cell values according to a [`RedactionOutput`] variant. + +use nvisy_ontology::redaction::{RedactionOutput, TextRedactionOutput}; + +/// Redact a single cell value according to `output`. 
+/// +/// Dispatches on the [`RedactionOutput`] variant: +/// - **Mask**: preserve the last 4 characters, replacing the rest with the +/// mask character from the output. +/// - **Remove**: return an empty string. +/// - **Hash**: return `[HASH:{hex}]` using [`hash_string`]. +/// - **Other text variants**: use [`replacement_value()`](RedactionOutput::replacement_value), +/// falling back to repeating `default_mask` for the cell length. +pub fn mask_cell(cell: &str, output: &RedactionOutput, default_mask: char) -> String { + match output { + RedactionOutput::Text(TextRedactionOutput::Mask { mask_char, .. }) => { + if cell.len() > 4 { + format!( + "{}{}", + mask_char.to_string().repeat(cell.len() - 4), + &cell[cell.len() - 4..] + ) + } else { + mask_char.to_string().repeat(cell.len()) + } + } + RedactionOutput::Text(TextRedactionOutput::Remove) => String::new(), + RedactionOutput::Text(TextRedactionOutput::Hash { .. }) => { + format!("[HASH:{:x}]", hash_string(cell)) + } + _ => output + .replacement_value() + .map(|v| v.to_string()) + .unwrap_or_else(|| default_mask.to_string().repeat(cell.len())), + } +} + +/// Compute a deterministic 64-bit hash of `s` using [`DefaultHasher`](std::collections::hash_map::DefaultHasher). +pub fn hash_string(s: &str) -> u64 { + use std::hash::{Hash, Hasher}; + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + s.hash(&mut hasher); + hasher.finish() +} diff --git a/crates/nvisy-codec/src/render/text/mod.rs b/crates/nvisy-codec/src/render/text/mod.rs new file mode 100644 index 0000000..bea4c46 --- /dev/null +++ b/crates/nvisy-codec/src/render/text/mod.rs @@ -0,0 +1,17 @@ +//! Text rendering and redaction primitives. +//! +//! Low-level utilities for applying text replacements and cell-level +//! masking, used by pipeline redaction actions. +//! +//! # Sub-modules +//! +//! | Module | Description | +//! |--------|-------------| +//! | [`replace`] | Byte-offset text replacement engine | +//! 
| [`mask`] | Cell-level masking and hashing utilities | + +pub mod mask; +pub mod replace; + +pub use mask::{hash_string, mask_cell}; +pub use replace::{apply_replacements, PendingReplacement}; diff --git a/crates/nvisy-codec/src/render/text/replace.rs b/crates/nvisy-codec/src/render/text/replace.rs new file mode 100644 index 0000000..733c328 --- /dev/null +++ b/crates/nvisy-codec/src/render/text/replace.rs @@ -0,0 +1,44 @@ +//! Byte-offset text replacement engine. +//! +//! Provides a simple but correct algorithm for applying multiple +//! non-overlapping replacements to a string by processing them +//! right-to-left (descending start offset). This ensures that each +//! substitution does not invalidate the byte offsets of earlier +//! (leftward) replacements. + +/// A single text replacement that has been resolved but not yet applied. +pub struct PendingReplacement { + /// Byte offset where the replacement starts in the original text. + pub start: usize, + /// Byte offset where the replacement ends (exclusive) in the original text. + pub end: usize, + /// The string that will replace the original span. + pub value: String, +} + +/// Apply a set of pending replacements to `text`, returning the result. +/// +/// Replacements are applied right-to-left (descending start offset) so that +/// earlier byte offsets remain valid after each substitution. Out-of-range +/// offsets are clamped to the text length and empty spans are skipped. +pub fn apply_replacements(text: &str, pending: &mut [PendingReplacement]) -> String { + // Sort by start offset descending (right-to-left) to preserve positions + pending.sort_by(|a, b| b.start.cmp(&a.start)); + + let mut result = text.to_string(); + for replacement in pending.iter() { + let start = replacement.start.min(result.len()); + let end = replacement.end.min(result.len()); + if start >= end { + continue; + } + + result = format!( + "{}{}{}", + &result[..start], + replacement.value, + &result[end..] 
+ ); + } + result +} diff --git a/crates/nvisy-ingest/README.md b/crates/nvisy-ingest/README.md deleted file mode 100644 index 378ce7b..0000000 --- a/crates/nvisy-ingest/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# nvisy-ingest - -File-format loaders for the Nvisy multimodal redaction platform. - -This crate provides loaders for PDF, DOCX, HTML, Image, XLSX, Audio, -CSV, JSON, and plain-text files. Each loader implements the -[`Loader`](crate::loaders::Loader) trait and converts raw -blob bytes into structured `Document`, `ImageData`, or `TabularData` -artifacts. diff --git a/crates/nvisy-object/src/client/mod.rs b/crates/nvisy-object/src/client/mod.rs index df925d2..bc762fc 100644 --- a/crates/nvisy-object/src/client/mod.rs +++ b/crates/nvisy-object/src/client/mod.rs @@ -5,6 +5,7 @@ //! a concrete client so it can be passed through the engine as `Box`. use bytes::Bytes; +use nvisy_pipeline::provider::ConnectedInstance; /// Result returned by [`ObjectStoreClient::list`]. pub struct ListResult { @@ -51,3 +52,9 @@ impl ObjectStoreBox { Self(Box::new(client)) } } + +impl ConnectedInstance for ObjectStoreBox { + fn disconnect(self) -> Option + Send>>> { + None + } +} diff --git a/crates/nvisy-object/src/providers/s3.rs b/crates/nvisy-object/src/providers/s3.rs index c1cc8e7..3f8b5eb 100644 --- a/crates/nvisy-object/src/providers/s3.rs +++ b/crates/nvisy-object/src/providers/s3.rs @@ -14,7 +14,7 @@ use minio::s3::types::{S3Api, ToStream}; use minio::s3::{Client as MinioClient, ClientBuilder as MinioClientBuilder}; use nvisy_core::error::Error; -use nvisy_pipeline::provider::{ConnectedInstance, Provider}; +use nvisy_pipeline::provider::Provider; use crate::client::{GetResult, ListResult, ObjectStoreBox, ObjectStoreClient}; /// S3-compatible object store client. 
@@ -149,7 +149,7 @@ impl Provider for S3Provider { Ok(()) } - async fn connect(&self, creds: &Self::Credentials) -> Result, Error> { + async fn connect(&self, creds: &Self::Credentials) -> Result { let endpoint = creds.endpoint.as_deref().unwrap_or("https://s3.amazonaws.com"); let mut base_url: BaseUrl = endpoint.parse().map_err(|e| { @@ -175,9 +175,6 @@ impl Provider for S3Provider { let store_client = S3ObjectStoreClient::new(client, creds.bucket.clone()); - Ok(ConnectedInstance { - client: ObjectStoreBox::new(store_client), - disconnect: None, - }) + Ok(ObjectStoreBox::new(store_client)) } } diff --git a/crates/nvisy-pipeline/Cargo.toml b/crates/nvisy-pipeline/Cargo.toml index 368e7ba..0e0fa50 100644 --- a/crates/nvisy-pipeline/Cargo.toml +++ b/crates/nvisy-pipeline/Cargo.toml @@ -23,16 +23,16 @@ rustdoc-args = ["--cfg", "docsrs"] [features] default = ["image-redaction", "audio-redaction"] -# Image blur/block redaction via image + imageproc; enables nvisy-ingest/png for PngHandler -image-redaction = ["dep:image", "dep:imageproc", "nvisy-ingest/png"] -# Audio redaction pass-through; enables nvisy-ingest/wav for WavHandler -audio-redaction = ["nvisy-ingest/wav"] +# Image redaction (blur, block); enables nvisy-codec image handlers and rendering +image-redaction = ["nvisy-codec/png", "nvisy-codec/jpeg"] +# Audio redaction pass-through; enables nvisy-codec/wav for WavHandler +audio-redaction = ["nvisy-codec/wav"] [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } nvisy-ontology = { workspace = true, features = [] } -nvisy-ingest = { workspace = true, features = [] } +nvisy-codec = { workspace = true, features = [] } nvisy-pattern = { workspace = true, features = [] } # (De)serialization @@ -45,7 +45,6 @@ async-trait = { workspace = true, features = [] } # Primitive datatypes uuid = { workspace = true, features = ["v4"] } -bytes = { workspace = true, features = [] } # Time jiff = { workspace = true, features = [] } @@ -56,7 +55,3 @@ 
aho-corasick = { workspace = true, features = [] } # Observability tracing = { workspace = true, features = [] } - -# Image processing (feature-gated) -image = { workspace = true, optional = true, features = [] } -imageproc = { workspace = true, optional = true, features = [] } diff --git a/crates/nvisy-pipeline/src/detection/dictionary.rs b/crates/nvisy-pipeline/src/detection/dictionary.rs index e653724..51f1fed 100644 --- a/crates/nvisy-pipeline/src/detection/dictionary.rs +++ b/crates/nvisy-pipeline/src/detection/dictionary.rs @@ -3,8 +3,8 @@ use aho_corasick::AhoCorasick; use serde::Deserialize; -use nvisy_ingest::handler::{TxtHandler, CsvHandler}; -use nvisy_ingest::document::Document; +use nvisy_codec::handler::{TxtHandler, CsvHandler}; +use nvisy_codec::document::Document; use nvisy_ontology::entity::{ DetectionMethod, Entity, EntityCategory, TabularLocation, TextLocation, }; diff --git a/crates/nvisy-pipeline/src/detection/ner.rs b/crates/nvisy-pipeline/src/detection/ner.rs index 305e7c8..9ee780f 100644 --- a/crates/nvisy-pipeline/src/detection/ner.rs +++ b/crates/nvisy-pipeline/src/detection/ner.rs @@ -2,13 +2,13 @@ use serde::Deserialize; -use nvisy_ingest::document::Document; -use nvisy_ingest::handler::TxtHandler; +use nvisy_codec::document::Document; +use nvisy_codec::handler::TxtHandler; use nvisy_ontology::entity::Entity; use nvisy_core::error::Error; #[cfg(feature = "image-redaction")] -use nvisy_ingest::handler::PngHandler; +use nvisy_codec::handler::PngHandler; use crate::action::Action; diff --git a/crates/nvisy-pipeline/src/detection/regex.rs b/crates/nvisy-pipeline/src/detection/regex.rs index 79326c9..f7cc05d 100644 --- a/crates/nvisy-pipeline/src/detection/regex.rs +++ b/crates/nvisy-pipeline/src/detection/regex.rs @@ -3,8 +3,8 @@ use regex::Regex; use serde::Deserialize; -use nvisy_ingest::handler::TxtHandler; -use nvisy_ingest::document::Document; +use nvisy_codec::handler::TxtHandler; +use nvisy_codec::document::Document; use 
nvisy_ontology::entity::{DetectionMethod, Entity, TextLocation}; use nvisy_core::error::Error; use nvisy_pattern::patterns::{self, PatternDefinition}; diff --git a/crates/nvisy-pipeline/src/detection/tabular.rs b/crates/nvisy-pipeline/src/detection/tabular.rs index 92bcfcf..db5d6f4 100644 --- a/crates/nvisy-pipeline/src/detection/tabular.rs +++ b/crates/nvisy-pipeline/src/detection/tabular.rs @@ -3,8 +3,8 @@ use regex::Regex; use serde::Deserialize; -use nvisy_ingest::handler::CsvHandler; -use nvisy_ingest::document::Document; +use nvisy_codec::handler::CsvHandler; +use nvisy_codec::document::Document; use nvisy_ontology::entity::{ DetectionMethod, Entity, EntityCategory, TabularLocation, }; diff --git a/crates/nvisy-pipeline/src/generation/ocr.rs b/crates/nvisy-pipeline/src/generation/ocr.rs index 9fbc022..0d1e6dd 100644 --- a/crates/nvisy-pipeline/src/generation/ocr.rs +++ b/crates/nvisy-pipeline/src/generation/ocr.rs @@ -3,8 +3,8 @@ use serde::Deserialize; -use nvisy_ingest::document::Document; -use nvisy_ingest::handler::{PngHandler, TxtHandler}; +use nvisy_codec::document::Document; +use nvisy_codec::handler::{PngHandler, TxtHandler}; use nvisy_ontology::entity::Entity; use nvisy_core::error::Error; diff --git a/crates/nvisy-pipeline/src/generation/transcribe.rs b/crates/nvisy-pipeline/src/generation/transcribe.rs index 3620e1b..d705660 100644 --- a/crates/nvisy-pipeline/src/generation/transcribe.rs +++ b/crates/nvisy-pipeline/src/generation/transcribe.rs @@ -3,8 +3,8 @@ use serde::Deserialize; -use nvisy_ingest::document::Document; -use nvisy_ingest::handler::{WavHandler, TxtHandler}; +use nvisy_codec::document::Document; +use nvisy_codec::handler::{WavHandler, TxtHandler}; use nvisy_ontology::entity::Entity; use nvisy_core::error::Error; diff --git a/crates/nvisy-pipeline/src/provider.rs b/crates/nvisy-pipeline/src/provider.rs index b14ee26..443a3ed 100644 --- a/crates/nvisy-pipeline/src/provider.rs +++ b/crates/nvisy-pipeline/src/provider.rs @@ -7,13 +7,17 
@@ use serde::de::DeserializeOwned; use nvisy_core::error::Error; -/// A connected provider instance holding a typed client and an -/// optional async disconnect callback. -pub struct ConnectedInstance { - /// Typed client handle. - pub client: C, - /// Optional cleanup function called when the connection is no longer needed. - pub disconnect: Option Pin + Send>> + Send>>, +/// Implemented by provider clients that support lifecycle management. +/// +/// The [`disconnect`](ConnectedInstance::disconnect) method is called when +/// the connection is no longer needed. Implementations that hold no +/// resources can simply return `None`. +pub trait ConnectedInstance: Send + 'static { + /// Optional async cleanup when the connection is released. + /// + /// Return `None` if no cleanup is needed. + #[allow(clippy::type_complexity)] + fn disconnect(self) -> Option + Send>>>; } /// Factory for creating authenticated connections to an external service. @@ -25,7 +29,7 @@ pub trait Provider: Send + Sync + 'static { /// Strongly-typed credentials for this provider. type Credentials: DeserializeOwned + Send; /// The client type produced by [`connect`](Self::connect). - type Client: Send + 'static; + type Client: ConnectedInstance; /// Unique identifier (e.g. "s3", "openai"). fn id(&self) -> &str; @@ -36,9 +40,9 @@ pub trait Provider: Send + Sync + 'static { /// Verify credentials by attempting a lightweight connection. async fn verify(&self, creds: &Self::Credentials) -> Result<(), Error>; - /// Create a connected instance. + /// Create a connected client instance. 
async fn connect( &self, creds: &Self::Credentials, - ) -> Result, Error>; + ) -> Result; } diff --git a/crates/nvisy-pipeline/src/redaction/apply.rs b/crates/nvisy-pipeline/src/redaction/apply.rs index 7927c9a..6d5fff7 100644 --- a/crates/nvisy-pipeline/src/redaction/apply.rs +++ b/crates/nvisy-pipeline/src/redaction/apply.rs @@ -4,25 +4,22 @@ use std::collections::HashMap; use uuid::Uuid; use serde::Deserialize; -use nvisy_ingest::handler::{TxtHandler, TxtData, CsvHandler}; -use nvisy_ingest::document::Document; +use nvisy_codec::handler::{TxtHandler, TxtData, CsvHandler}; +use nvisy_codec::document::Document; +use nvisy_codec::render::text::{PendingReplacement, apply_replacements, mask_cell}; use nvisy_ontology::entity::Entity; -use nvisy_ontology::redaction::{Redaction, RedactionOutput, TextRedactionOutput}; +use nvisy_ontology::redaction::{Redaction, RedactionOutput}; use nvisy_core::error::Error; #[cfg(feature = "image-redaction")] -use bytes::Bytes; -#[cfg(feature = "image-redaction")] -use nvisy_ingest::handler::PngHandler; +use nvisy_codec::handler::{PngHandler, AsImage}; #[cfg(feature = "image-redaction")] use nvisy_ontology::entity::BoundingBox; #[cfg(feature = "image-redaction")] use nvisy_ontology::redaction::ImageRedactionOutput; -#[cfg(feature = "image-redaction")] -use nvisy_core::error::ErrorKind; #[cfg(feature = "audio-redaction")] -use nvisy_ingest::handler::WavHandler; +use nvisy_codec::handler::WavHandler; use crate::action::Action; @@ -114,16 +111,6 @@ pub struct ApplyRedactionAction { params: ApplyRedactionParams, } -/// A single text replacement that has been resolved but not yet applied. -struct PendingRedaction { - /// Byte offset where the redaction starts in the original text. - start_offset: usize, - /// Byte offset where the redaction ends (exclusive) in the original text. - end_offset: usize, - /// The string that will replace the original span. 
- replacement_value: String, -} - #[async_trait::async_trait] impl Action for ApplyRedactionAction { type Params = ApplyRedactionParams; @@ -215,7 +202,7 @@ fn apply_text_doc( content.push('\n'); } - let mut pending: Vec = Vec::new(); + let mut pending: Vec = Vec::new(); for (entity_id, redaction) in redaction_map { let entity = match entity_map.get(entity_id) { @@ -234,7 +221,7 @@ fn apply_text_doc( None => continue, }; - let replacement_value = match redaction.output.replacement_value() { + let value = match redaction.output.replacement_value() { Some(v) => v.to_string(), None => { let span_len = end_offset.saturating_sub(start_offset); @@ -242,10 +229,10 @@ fn apply_text_doc( } }; - pending.push(PendingRedaction { - start_offset, - end_offset, - replacement_value, + pending.push(PendingReplacement { + start: start_offset, + end: end_offset, + value, }); } @@ -253,7 +240,7 @@ fn apply_text_doc( return doc.clone(); } - let redacted_content = apply_text_redactions(&content, &mut pending); + let redacted_content = apply_replacements(&content, &mut pending); let trailing_newline = redacted_content.ends_with('\n'); let new_lines: Vec = redacted_content.lines().map(String::from).collect(); @@ -266,32 +253,6 @@ fn apply_text_doc( result } -/// Applies a set of pending redactions to `text`, returning the redacted result. -/// -/// Replacements are applied right-to-left (descending start offset) so that -/// earlier byte offsets remain valid after each substitution. 
-fn apply_text_redactions(text: &str, pending: &mut [PendingRedaction]) -> String { - // Sort by start offset descending (right-to-left) to preserve positions - pending.sort_by(|a, b| b.start_offset.cmp(&a.start_offset)); - - let mut result = text.to_string(); - for redaction in pending.iter() { - let start = redaction.start_offset.min(result.len()); - let end = redaction.end_offset.min(result.len()); - if start >= end { - continue; - } - - result = format!( - "{}{}{}", - &result[..start], - redaction.replacement_value, - &result[end..] - ); - } - result -} - // --------------------------------------------------------------------------- // Image redaction (feature-gated) // --------------------------------------------------------------------------- @@ -304,10 +265,6 @@ fn apply_image_doc( blur_sigma: f32, block_color: [u8; 4], ) -> Result, Error> { - use crate::redaction::render::{blur, block}; - - let image_bytes = doc.handler().bytes(); - let mut blur_regions: Vec = Vec::new(); let mut block_regions: Vec = Vec::new(); @@ -332,29 +289,15 @@ fn apply_image_doc( return Ok(doc.clone()); } - let dyn_img = image::load_from_memory(image_bytes).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("image decode failed: {e}")) - })?; - - let mut result = dyn_img; + let mut handler = doc.handler().clone(); if !blur_regions.is_empty() { - result = blur::apply_gaussian_blur(&result, &blur_regions, blur_sigma); + handler = handler.blur(&blur_regions, blur_sigma)?; } if !block_regions.is_empty() { - let color = image::Rgba(block_color); - result = block::apply_block_overlay(&result, &block_regions, color); + handler = handler.block(&block_regions, block_color)?; } - // Encode back to PNG - let mut buf = std::io::Cursor::new(Vec::new()); - result - .write_to(&mut buf, image::ImageFormat::Png) - .map_err(|e| { - Error::new(ErrorKind::Runtime, format!("image encode failed: {e}")) - })?; - - let new_doc = Document::new(PngHandler::new(Bytes::from(buf.into_inner()))); - 
Ok(new_doc) + Ok(Document::new(handler)) } // --------------------------------------------------------------------------- @@ -385,7 +328,7 @@ fn apply_tabular_doc( if let Some(redaction) = redaction_map.get(&entity.source.as_uuid()) { if let Some(row) = result.handler_mut().rows_mut().get_mut(row_idx) { if let Some(cell) = row.get_mut(col_idx) { - *cell = apply_cell_redaction(cell, &redaction.output, params.mask_char); + *cell = mask_cell(cell, &redaction.output, params.mask_char); } } } @@ -395,33 +338,3 @@ fn apply_tabular_doc( result } -fn apply_cell_redaction(cell: &str, output: &RedactionOutput, default_mask: char) -> String { - match output { - RedactionOutput::Text(TextRedactionOutput::Mask { mask_char, .. }) => { - if cell.len() > 4 { - format!( - "{}{}", - mask_char.to_string().repeat(cell.len() - 4), - &cell[cell.len() - 4..] - ) - } else { - mask_char.to_string().repeat(cell.len()) - } - } - RedactionOutput::Text(TextRedactionOutput::Remove) => String::new(), - RedactionOutput::Text(TextRedactionOutput::Hash { .. }) => { - format!("[HASH:{:x}]", hash_string(cell)) - } - _ => output - .replacement_value() - .map(|v| v.to_string()) - .unwrap_or_else(|| default_mask.to_string().repeat(cell.len())), - } -} - -fn hash_string(s: &str) -> u64 { - use std::hash::{Hash, Hasher}; - let mut hasher = std::collections::hash_map::DefaultHasher::new(); - s.hash(&mut hasher); - hasher.finish() -} diff --git a/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs b/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs index 92e61d0..d7b1236 100644 --- a/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs +++ b/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs @@ -100,12 +100,9 @@ impl Action for EvaluatePolicyAction { /// Returns the first enabled rule whose [`EntitySelector`] matches the given entity, /// or `None` if no rule applies. 
fn find_matching_rule<'a>(entity: &Entity, rules: &'a [PolicyRule]) -> Option<&'a PolicyRule> { - for rule in rules { - if rule.selector.matches(&entity.category, &entity.entity_type, entity.confidence) { - return Some(rule); - } - } - None + rules.iter().find(|rule| { + rule.selector.matches(&entity.category, &entity.entity_type, entity.confidence) + }) } /// Expands a replacement template using entity metadata. diff --git a/crates/nvisy-pipeline/src/redaction/mod.rs b/crates/nvisy-pipeline/src/redaction/mod.rs index 66ccd46..69bdd32 100644 --- a/crates/nvisy-pipeline/src/redaction/mod.rs +++ b/crates/nvisy-pipeline/src/redaction/mod.rs @@ -5,9 +5,6 @@ /// Applies pending redactions to document content (text, image, tabular, audio). pub mod apply; -/// Image rendering primitives for redaction overlays. -#[cfg(feature = "image-redaction")] -pub mod render; /// Emits audit trail records for every applied redaction. pub mod emit_audit; /// Evaluates policy rules against detected entities and produces redaction instructions. diff --git a/crates/nvisy-pipeline/src/redaction/render/mod.rs b/crates/nvisy-pipeline/src/redaction/render/mod.rs deleted file mode 100644 index 1796d48..0000000 --- a/crates/nvisy-pipeline/src/redaction/render/mod.rs +++ /dev/null @@ -1,6 +0,0 @@ -//! Image rendering primitives for redaction overlays. - -/// Gaussian blur for image regions. -pub mod blur; -/// Solid color block overlay for image regions. 
-pub mod block; diff --git a/crates/nvisy-python/Cargo.toml b/crates/nvisy-python/Cargo.toml index 089355f..de125ea 100644 --- a/crates/nvisy-python/Cargo.toml +++ b/crates/nvisy-python/Cargo.toml @@ -21,12 +21,17 @@ documentation = { workspace = true } all-features = true rustdoc-args = ["--cfg", "docsrs"] +[features] +default = ["png"] +# Image-based NER and OCR actions (requires PngHandler) +png = ["nvisy-codec/png"] + [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } nvisy-ontology = { workspace = true, features = [] } nvisy-pipeline = { workspace = true, features = [] } -nvisy-ingest = { workspace = true, features = [] } +nvisy-codec = { workspace = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } diff --git a/crates/nvisy-python/src/actions/mod.rs b/crates/nvisy-python/src/actions/mod.rs index cd5e2b1..37d7026 100644 --- a/crates/nvisy-python/src/actions/mod.rs +++ b/crates/nvisy-python/src/actions/mod.rs @@ -6,13 +6,15 @@ //! - [`OcrDetectAction`] -- performs OCR on images to extract text regions. /// OCR detection pipeline action. 
+#[cfg(feature = "png")] pub mod ocr; use serde::Deserialize; -use nvisy_ingest::handler::{FormatHandler, TxtHandler}; -use nvisy_ingest::document::Document; -use nvisy_ingest::document::data::*; +use nvisy_codec::handler::{TxtHandler, TxtData}; +#[cfg(feature = "png")] +use nvisy_codec::handler::PngHandler; +use nvisy_codec::document::Document; use nvisy_ontology::entity::Entity; use nvisy_core::error::Error; use nvisy_core::io::ContentData; @@ -70,7 +72,7 @@ impl DetectNerAction { #[async_trait::async_trait] impl Action for DetectNerAction { type Params = DetectNerParams; - type Input = (ContentData, Vec>); + type Input = (ContentData, Vec>); type Output = Vec; fn id(&self) -> &str { "detect-ner" } @@ -93,20 +95,20 @@ impl Action for DetectNerAction { "python/ner", false, ))?; - vec![Document::new( - FormatHandler::Txt(TxtHandler), - DocumentData::Text(TextData { text: text.to_string() }), - )] + let handler = TxtHandler::new(TxtData { + lines: text.lines().map(String::from).collect(), + trailing_newline: text.ends_with('\n'), + }); + vec![Document::new(handler)] } else { documents }; let mut all_entities = Vec::new(); for doc in &docs { - if let Some(content) = doc.text() { - let entities = ner::detect_ner(&self.bridge, content, &config).await?; - all_entities.extend(entities); - } + let text = doc.handler().lines().join("\n"); + let entities = ner::detect_ner(&self.bridge, &text, &config).await?; + all_entities.extend(entities); } Ok(all_entities) @@ -119,12 +121,14 @@ impl Action for DetectNerAction { /// provided, the raw content is treated as a single image whose MIME type /// is inferred from the content metadata. Detected entities are returned /// directly. +#[cfg(feature = "png")] pub struct DetectNerImageAction { /// Python bridge used to call the NER model. pub bridge: PythonBridge, params: DetectNerParams, } +#[cfg(feature = "png")] impl DetectNerImageAction { /// Replace the default bridge with a pre-configured one. 
pub fn with_bridge(mut self, bridge: PythonBridge) -> Self { @@ -133,10 +137,11 @@ impl DetectNerImageAction { } } +#[cfg(feature = "png")] #[async_trait::async_trait] impl Action for DetectNerImageAction { type Params = DetectNerParams; - type Input = (ContentData, Vec>); + type Input = (ContentData, Vec>); type Output = Vec; fn id(&self) -> &str { "detect-ner-image" } @@ -167,15 +172,13 @@ impl Action for DetectNerImageAction { all_entities.extend(entities); } else { for doc in &images { - if let Some(image) = doc.image() { - let entities = ner::detect_ner_image( - &self.bridge, - &image.bytes, - &image.mime_type, - &config, - ).await?; - all_entities.extend(entities); - } + let entities = ner::detect_ner_image( + &self.bridge, + doc.handler().bytes(), + "image/png", + &config, + ).await?; + all_entities.extend(entities); } } diff --git a/crates/nvisy-python/src/actions/ocr.rs b/crates/nvisy-python/src/actions/ocr.rs index 9d54e17..60f2016 100644 --- a/crates/nvisy-python/src/actions/ocr.rs +++ b/crates/nvisy-python/src/actions/ocr.rs @@ -2,9 +2,8 @@ use serde::Deserialize; -use nvisy_ingest::handler::{FormatHandler, TxtHandler}; -use nvisy_ingest::document::Document; -use nvisy_ingest::document::data::*; +use nvisy_codec::handler::{TxtHandler, TxtData, PngHandler}; +use nvisy_codec::document::Document; use nvisy_ontology::entity::Entity; use nvisy_core::error::Error; use nvisy_core::io::ContentData; @@ -57,8 +56,8 @@ impl OcrDetectAction { #[async_trait::async_trait] impl Action for OcrDetectAction { type Params = OcrDetectParams; - type Input = (ContentData, Vec>); - type Output = (Vec, Vec>); + type Input = (ContentData, Vec>); + type Output = (Vec, Vec>); fn id(&self) -> &str { "detect-ocr" @@ -96,26 +95,25 @@ impl Action for OcrDetectAction { all_entities.extend(entities); } else { for doc in &images { - if let Some(image) = doc.image() { - let entities = - ocr::detect_ocr(&self.bridge, &image.bytes, &image.mime_type, &config) - .await?; - for entity in 
&entities { - all_ocr_text.push(entity.value.clone()); - } - all_entities.extend(entities); + let entities = + ocr::detect_ocr(&self.bridge, doc.handler().bytes(), "image/png", &config) + .await?; + for entity in &entities { + all_ocr_text.push(entity.value.clone()); } + all_entities.extend(entities); } } // Create a Document from concatenated OCR text for downstream processing let mut documents = Vec::new(); if !all_ocr_text.is_empty() { - let ocr_doc = Document::new( - FormatHandler::Txt(TxtHandler), - DocumentData::Text(TextData { text: all_ocr_text.join("\n") }), - ); - documents.push(ocr_doc); + let text = all_ocr_text.join("\n"); + let handler = TxtHandler::new(TxtData { + lines: text.lines().map(String::from).collect(), + trailing_newline: text.ends_with('\n'), + }); + documents.push(Document::new(handler)); } Ok((all_entities, documents)) diff --git a/crates/nvisy-python/src/bridge/mod.rs b/crates/nvisy-python/src/bridge/mod.rs index d785427..e3044d6 100644 --- a/crates/nvisy-python/src/bridge/mod.rs +++ b/crates/nvisy-python/src/bridge/mod.rs @@ -2,6 +2,7 @@ use pyo3::prelude::*; use nvisy_core::error::Error; +use nvisy_pipeline::provider::ConnectedInstance; use crate::error::from_pyerr; /// Lightweight handle to a Python NER module. @@ -43,3 +44,9 @@ impl Default for PythonBridge { Self::new("nvisy_ai") } } + +impl ConnectedInstance for PythonBridge { + fn disconnect(self) -> Option + Send>>> { + None + } +} diff --git a/crates/nvisy-python/src/prelude.rs b/crates/nvisy-python/src/prelude.rs index 325beb1..b1f49dd 100644 --- a/crates/nvisy-python/src/prelude.rs +++ b/crates/nvisy-python/src/prelude.rs @@ -1,5 +1,8 @@ //! Convenience re-exports. 
-pub use crate::actions::{DetectNerAction, DetectNerImageAction}; +pub use crate::actions::DetectNerAction; +#[cfg(feature = "png")] +pub use crate::actions::DetectNerImageAction; +#[cfg(feature = "png")] pub use crate::actions::ocr::OcrDetectAction; pub use crate::bridge::PythonBridge; pub use crate::provider::AiProvider; diff --git a/crates/nvisy-python/src/provider/mod.rs b/crates/nvisy-python/src/provider/mod.rs index efae0f7..2e9faa0 100644 --- a/crates/nvisy-python/src/provider/mod.rs +++ b/crates/nvisy-python/src/provider/mod.rs @@ -6,7 +6,7 @@ use serde::Deserialize; use nvisy_core::error::Error; -use nvisy_pipeline::provider::{ConnectedInstance, Provider}; +use nvisy_pipeline::provider::Provider; use crate::bridge::PythonBridge; /// Typed credentials for the AI provider. @@ -39,14 +39,9 @@ impl Provider for AiProvider { self.validate_credentials(creds) } - async fn connect(&self, _creds: &Self::Credentials) -> Result, Error> { - let bridge = PythonBridge::default(); + async fn connect(&self, _creds: &Self::Credentials) -> Result { // Don't init here — Python might not be available at connect time // Init happens lazily when detect_ner is called - - Ok(ConnectedInstance { - client: bridge, - disconnect: None, - }) + Ok(PythonBridge::default()) } } From 76249a23bcef9b8de24fc226631a380bba204a4e Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 16 Feb 2026 14:19:26 +0100 Subject: [PATCH 03/11] refactor(ontology): reduce duplication, add missing fields, improve type safety Move method() and replacement_value() to inner enums (TextRedactionOutput, ImageRedactionOutput, AudioRedactionOutput, TextRedactionSpec, etc.) so RedactionOutput and RedactionSpec delegate instead of duplicating match logic. Drop Copy/Eq/Hash from AudioRedactionSpec for consistency. Add audio_id to AudioLocation, type AnnotationLabel.scope as AnnotationScope enum, make Redaction.original_value non-optional, and add confidence field. 
Delete duplicated replacement_value() in nvisy-codec. Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-codec/src/render/text/mod.rs | 71 +++++++++++- .../src/detection/annotation.rs | 17 ++- crates/nvisy-ontology/src/detection/mod.rs | 2 +- crates/nvisy-ontology/src/entity/location.rs | 3 + crates/nvisy-ontology/src/prelude.rs | 3 +- crates/nvisy-ontology/src/redaction/mod.rs | 21 ++-- crates/nvisy-ontology/src/redaction/output.rs | 103 +++++++++++------- crates/nvisy-ontology/src/redaction/spec.rs | 71 +++++++----- .../src/redaction/evaluate_policy.rs | 8 +- 9 files changed, 219 insertions(+), 80 deletions(-) diff --git a/crates/nvisy-codec/src/render/text/mod.rs b/crates/nvisy-codec/src/render/text/mod.rs index bea4c46..d522178 100644 --- a/crates/nvisy-codec/src/render/text/mod.rs +++ b/crates/nvisy-codec/src/render/text/mod.rs @@ -1,7 +1,16 @@ //! Text rendering and redaction primitives. //! -//! Low-level utilities for applying text replacements and cell-level -//! masking, used by pipeline redaction actions. +//! Provides byte-offset replacement, cell-level masking, and the +//! [`AsText`] trait that text-bearing handlers implement to support +//! redaction in a single call. +//! +//! # Trait +//! +//! [`AsText`] is the main extension point: text format handlers implement +//! [`content`](AsText::content) and [`replace_content`](AsText::replace_content) +//! to read and write their backing text, and then get a +//! [`redact`](AsText::redact) convenience method for free via the default +//! implementation. //! //! # Sub-modules //! @@ -15,3 +24,61 @@ pub mod replace; pub use mask::{hash_string, mask_cell}; pub use replace::{apply_replacements, PendingReplacement}; + +use nvisy_core::error::Error; +use nvisy_ontology::redaction::TextRedactionOutput; + +/// A located text redaction: pairs a byte range with a +/// [`TextRedactionOutput`] that carries the already-resolved replacement. 
+pub struct TextRedaction { + /// Byte offset where the redacted span starts in the content. + pub start: usize, + /// Byte offset where the redacted span ends (exclusive) in the content. + pub end: usize, + /// The redaction output that carries the replacement value. + pub output: TextRedactionOutput, +} + +/// Trait for handlers that hold redactable text content. +/// +/// Mirrors [`AsImage`](super::image::AsImage) for the text modality. +/// Handlers implement [`content`](Self::content) and +/// [`replace_content`](Self::replace_content), and get +/// [`redact`](Self::redact) for free. +pub trait AsText: Sized { + /// Return the handler's full text content as a single string. + fn content(&self) -> String; + + /// Build a new handler instance with the given text content. + fn replace_content(&self, content: &str) -> Result; + + /// Apply a batch of text redactions, returning a new handler. + /// + /// Each [`TextRedaction`] identifies a byte range and a + /// [`TextRedactionOutput`] whose replacement value is written into + /// the content. Replacements are applied right-to-left so that byte + /// offsets remain valid. 
+ fn redact(&self, redactions: &[TextRedaction]) -> Result { + if redactions.is_empty() { + return self.replace_content(&self.content()); + } + + let content = self.content(); + let mut pending: Vec = redactions + .iter() + .map(|r| { + let value = r.output.replacement_value() + .unwrap_or_default() + .to_string(); + PendingReplacement { + start: r.start, + end: r.end, + value, + } + }) + .collect(); + + let result = apply_replacements(&content, &mut pending); + self.replace_content(&result) + } +} diff --git a/crates/nvisy-ontology/src/detection/annotation.rs b/crates/nvisy-ontology/src/detection/annotation.rs index eb7b8ae..e2ec81b 100644 --- a/crates/nvisy-ontology/src/detection/annotation.rs +++ b/crates/nvisy-ontology/src/detection/annotation.rs @@ -25,15 +25,28 @@ pub enum AnnotationKind { Label, } +/// The scope to which an annotation label applies. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum AnnotationScope { + /// Label applies to the entire document. + Document, + /// Label applies to a specific page. + Page, + /// Label applies to a specific region or element. + Region, +} + /// A classification label attached to a document or region. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct AnnotationLabel { /// Label name (e.g. `"contains-phi"`, `"gdpr-request"`). pub name: String, - /// Scope of the label: `"document"` or a region identifier. + /// Scope of the label. #[serde(skip_serializing_if = "Option::is_none")] - pub scope: Option, + pub scope: Option, /// Confidence of the label assignment. 
#[serde(skip_serializing_if = "Option::is_none")] pub confidence: Option, diff --git a/crates/nvisy-ontology/src/detection/mod.rs b/crates/nvisy-ontology/src/detection/mod.rs index 9af8a56..4e31a01 100644 --- a/crates/nvisy-ontology/src/detection/mod.rs +++ b/crates/nvisy-ontology/src/detection/mod.rs @@ -8,7 +8,7 @@ mod annotation; mod classification; mod sensitivity; -pub use annotation::{Annotation, AnnotationKind, AnnotationLabel}; +pub use annotation::{Annotation, AnnotationKind, AnnotationLabel, AnnotationScope}; pub use classification::ClassificationResult; pub use sensitivity::{Sensitivity, SensitivityLevel}; diff --git a/crates/nvisy-ontology/src/entity/location.rs b/crates/nvisy-ontology/src/entity/location.rs index d8a4ba9..1ea734c 100644 --- a/crates/nvisy-ontology/src/entity/location.rs +++ b/crates/nvisy-ontology/src/entity/location.rs @@ -92,6 +92,9 @@ pub struct AudioLocation { /// Speaker identifier from diarization. #[serde(skip_serializing_if = "Option::is_none")] pub speaker_id: Option, + /// Links this entity to a specific audio document. + #[serde(skip_serializing_if = "Option::is_none")] + pub audio_id: Option, } /// Location of an entity within a video stream. 
diff --git a/crates/nvisy-ontology/src/prelude.rs b/crates/nvisy-ontology/src/prelude.rs index c2dc957..a71439c 100644 --- a/crates/nvisy-ontology/src/prelude.rs +++ b/crates/nvisy-ontology/src/prelude.rs @@ -4,7 +4,8 @@ pub use crate::audit::{ Audit, AuditAction, Auditable, Explainable, Explanation, RetentionPolicy, RetentionScope, }; pub use crate::detection::{ - Annotation, AnnotationKind, AnnotationLabel, ClassificationResult, DetectionResult, + Annotation, AnnotationKind, AnnotationLabel, AnnotationScope, ClassificationResult, + DetectionResult, Sensitivity, SensitivityLevel, }; pub use crate::entity::{ diff --git a/crates/nvisy-ontology/src/redaction/mod.rs b/crates/nvisy-ontology/src/redaction/mod.rs index 568a419..d5eb01e 100644 --- a/crates/nvisy-ontology/src/redaction/mod.rs +++ b/crates/nvisy-ontology/src/redaction/mod.rs @@ -68,8 +68,9 @@ pub struct Redaction { /// Redaction output recording the method used and its result data. pub output: RedactionOutput, /// The original sensitive value, retained for audit purposes. - #[serde(skip_serializing_if = "Option::is_none")] - pub original_value: Option, + pub original_value: String, + /// Detection confidence that led to this redaction. + pub confidence: f64, /// Identifier of the policy rule that triggered this redaction. #[serde(skip_serializing_if = "Option::is_none")] pub policy_rule_id: Option, @@ -84,12 +85,18 @@ pub struct Redaction { impl Redaction { /// Create a new pending redaction for the given entity. - pub fn new(entity_id: Uuid, output: impl Into) -> Self { + pub fn new( + entity_id: Uuid, + output: impl Into, + original_value: impl Into, + confidence: f64, + ) -> Self { Self { source: ContentSource::new(), entity_id, output: output.into(), - original_value: None, + original_value: original_value.into(), + confidence, policy_rule_id: None, applied: false, version: 1, @@ -97,12 +104,6 @@ impl Redaction { } } - /// Record the original sensitive value for audit trail purposes. 
- pub fn with_original_value(mut self, value: impl Into) -> Self { - self.original_value = Some(value.into()); - self - } - /// Associate this redaction with the policy rule that triggered it. pub fn with_policy_rule_id(mut self, id: Uuid) -> Self { self.policy_rule_id = Some(id); diff --git a/crates/nvisy-ontology/src/redaction/output.rs b/crates/nvisy-ontology/src/redaction/output.rs index 5392b9c..ee96557 100644 --- a/crates/nvisy-ontology/src/redaction/output.rs +++ b/crates/nvisy-ontology/src/redaction/output.rs @@ -80,6 +80,68 @@ pub enum AudioRedactionOutput { Synthesize, } +impl TextRedactionOutput { + /// Returns the [`TextRedactionMethod`] tag this output corresponds to. + pub fn method(&self) -> TextRedactionMethod { + match self { + Self::Mask { .. } => TextRedactionMethod::Mask, + Self::Replace { .. } => TextRedactionMethod::Replace, + Self::Hash { .. } => TextRedactionMethod::Hash, + Self::Encrypt { .. } => TextRedactionMethod::Encrypt, + Self::Remove => TextRedactionMethod::Remove, + Self::Synthesize { .. } => TextRedactionMethod::Synthesize, + Self::Pseudonymize { .. } => TextRedactionMethod::Pseudonymize, + Self::Tokenize { .. } => TextRedactionMethod::Tokenize, + Self::Aggregate { .. } => TextRedactionMethod::Aggregate, + Self::Generalize { .. } => TextRedactionMethod::Generalize, + Self::DateShift { .. } => TextRedactionMethod::DateShift, + } + } + + /// Returns the text replacement string, regardless of specific method. + /// + /// Returns `None` for [`Remove`](Self::Remove) — the caller should + /// treat that as an empty string (span deleted). + pub fn replacement_value(&self) -> Option<&str> { + match self { + Self::Mask { replacement, .. } => Some(replacement), + Self::Replace { replacement } => Some(replacement), + Self::Hash { hash_value } => Some(hash_value), + Self::Encrypt { ciphertext, .. 
} => Some(ciphertext), + Self::Remove => None, + Self::Synthesize { replacement } => Some(replacement), + Self::Pseudonymize { pseudonym } => Some(pseudonym), + Self::Tokenize { token, .. } => Some(token), + Self::Aggregate { replacement } => Some(replacement), + Self::Generalize { replacement, .. } => Some(replacement), + Self::DateShift { replacement, .. } => Some(replacement), + } + } +} + +impl ImageRedactionOutput { + /// Returns the [`ImageRedactionMethod`] tag this output corresponds to. + pub fn method(&self) -> ImageRedactionMethod { + match self { + Self::Blur { .. } => ImageRedactionMethod::Blur, + Self::Block { .. } => ImageRedactionMethod::Block, + Self::Pixelate { .. } => ImageRedactionMethod::Pixelate, + Self::Synthesize => ImageRedactionMethod::Synthesize, + } + } +} + +impl AudioRedactionOutput { + /// Returns the [`AudioRedactionMethod`] tag this output corresponds to. + pub fn method(&self) -> AudioRedactionMethod { + match self { + Self::Silence => AudioRedactionMethod::Silence, + Self::Remove => AudioRedactionMethod::Remove, + Self::Synthesize => AudioRedactionMethod::Synthesize, + } + } +} + /// Unified redaction output that wraps modality-specific output variants. /// /// Carries method-specific result data (replacement strings, ciphertext, @@ -103,19 +165,7 @@ impl RedactionOutput { /// Returns `None` for image and audio outputs, or text `Remove`. pub fn replacement_value(&self) -> Option<&str> { match self { - Self::Text(t) => match t { - TextRedactionOutput::Mask { replacement, .. } => Some(replacement), - TextRedactionOutput::Replace { replacement } => Some(replacement), - TextRedactionOutput::Hash { hash_value } => Some(hash_value), - TextRedactionOutput::Encrypt { ciphertext, .. } => Some(ciphertext), - TextRedactionOutput::Remove => None, - TextRedactionOutput::Synthesize { replacement } => Some(replacement), - TextRedactionOutput::Pseudonymize { pseudonym } => Some(pseudonym), - TextRedactionOutput::Tokenize { token, .. 
} => Some(token), - TextRedactionOutput::Aggregate { replacement } => Some(replacement), - TextRedactionOutput::Generalize { replacement, .. } => Some(replacement), - TextRedactionOutput::DateShift { replacement, .. } => Some(replacement), - }, + Self::Text(t) => t.replacement_value(), Self::Image(_) | Self::Audio(_) => None, } } @@ -123,30 +173,9 @@ impl RedactionOutput { /// Returns the [`RedactionMethod`] tag this output corresponds to. pub fn method(&self) -> RedactionMethod { match self { - Self::Text(t) => RedactionMethod::Text(match t { - TextRedactionOutput::Mask { .. } => TextRedactionMethod::Mask, - TextRedactionOutput::Replace { .. } => TextRedactionMethod::Replace, - TextRedactionOutput::Hash { .. } => TextRedactionMethod::Hash, - TextRedactionOutput::Encrypt { .. } => TextRedactionMethod::Encrypt, - TextRedactionOutput::Remove => TextRedactionMethod::Remove, - TextRedactionOutput::Synthesize { .. } => TextRedactionMethod::Synthesize, - TextRedactionOutput::Pseudonymize { .. } => TextRedactionMethod::Pseudonymize, - TextRedactionOutput::Tokenize { .. } => TextRedactionMethod::Tokenize, - TextRedactionOutput::Aggregate { .. } => TextRedactionMethod::Aggregate, - TextRedactionOutput::Generalize { .. } => TextRedactionMethod::Generalize, - TextRedactionOutput::DateShift { .. } => TextRedactionMethod::DateShift, - }), - Self::Image(i) => RedactionMethod::Image(match i { - ImageRedactionOutput::Blur { .. } => ImageRedactionMethod::Blur, - ImageRedactionOutput::Block { .. } => ImageRedactionMethod::Block, - ImageRedactionOutput::Pixelate { .. 
} => ImageRedactionMethod::Pixelate, - ImageRedactionOutput::Synthesize => ImageRedactionMethod::Synthesize, - }), - Self::Audio(a) => RedactionMethod::Audio(match a { - AudioRedactionOutput::Silence => AudioRedactionMethod::Silence, - AudioRedactionOutput::Remove => AudioRedactionMethod::Remove, - AudioRedactionOutput::Synthesize => AudioRedactionMethod::Synthesize, - }), + Self::Text(t) => RedactionMethod::Text(t.method()), + Self::Image(i) => RedactionMethod::Image(i.method()), + Self::Audio(a) => RedactionMethod::Audio(a.method()), } } } diff --git a/crates/nvisy-ontology/src/redaction/spec.rs b/crates/nvisy-ontology/src/redaction/spec.rs index 5c7a039..ed146af 100644 --- a/crates/nvisy-ontology/src/redaction/spec.rs +++ b/crates/nvisy-ontology/src/redaction/spec.rs @@ -106,7 +106,7 @@ fn default_block_size() -> u32 { } /// Audio redaction specification. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(tag = "method", rename_all = "snake_case")] pub enum AudioRedactionSpec { @@ -118,6 +118,48 @@ pub enum AudioRedactionSpec { Synthesize, } +impl TextRedactionSpec { + /// Returns the [`TextRedactionMethod`] tag this spec corresponds to. + pub fn method(&self) -> TextRedactionMethod { + match self { + Self::Mask { .. } => TextRedactionMethod::Mask, + Self::Replace { .. } => TextRedactionMethod::Replace, + Self::Hash => TextRedactionMethod::Hash, + Self::Encrypt { .. } => TextRedactionMethod::Encrypt, + Self::Remove => TextRedactionMethod::Remove, + Self::Synthesize => TextRedactionMethod::Synthesize, + Self::Pseudonymize => TextRedactionMethod::Pseudonymize, + Self::Tokenize { .. } => TextRedactionMethod::Tokenize, + Self::Aggregate => TextRedactionMethod::Aggregate, + Self::Generalize { .. } => TextRedactionMethod::Generalize, + Self::DateShift { .. 
} => TextRedactionMethod::DateShift, + } + } +} + +impl ImageRedactionSpec { + /// Returns the [`ImageRedactionMethod`] tag this spec corresponds to. + pub fn method(&self) -> ImageRedactionMethod { + match self { + Self::Blur { .. } => ImageRedactionMethod::Blur, + Self::Block { .. } => ImageRedactionMethod::Block, + Self::Pixelate { .. } => ImageRedactionMethod::Pixelate, + Self::Synthesize => ImageRedactionMethod::Synthesize, + } + } +} + +impl AudioRedactionSpec { + /// Returns the [`AudioRedactionMethod`] tag this spec corresponds to. + pub fn method(&self) -> AudioRedactionMethod { + match self { + Self::Silence => AudioRedactionMethod::Silence, + Self::Remove => AudioRedactionMethod::Remove, + Self::Synthesize => AudioRedactionMethod::Synthesize, + } + } +} + /// Unified redaction specification submitted to the engine. /// /// Carries the method to apply and its configuration parameters. @@ -139,30 +181,9 @@ impl RedactionSpec { /// Returns the [`RedactionMethod`] tag this spec corresponds to. pub fn method(&self) -> RedactionMethod { match self { - Self::Text(t) => RedactionMethod::Text(match t { - TextRedactionSpec::Mask { .. } => TextRedactionMethod::Mask, - TextRedactionSpec::Replace { .. } => TextRedactionMethod::Replace, - TextRedactionSpec::Hash => TextRedactionMethod::Hash, - TextRedactionSpec::Encrypt { .. } => TextRedactionMethod::Encrypt, - TextRedactionSpec::Remove => TextRedactionMethod::Remove, - TextRedactionSpec::Synthesize => TextRedactionMethod::Synthesize, - TextRedactionSpec::Pseudonymize => TextRedactionMethod::Pseudonymize, - TextRedactionSpec::Tokenize { .. } => TextRedactionMethod::Tokenize, - TextRedactionSpec::Aggregate => TextRedactionMethod::Aggregate, - TextRedactionSpec::Generalize { .. } => TextRedactionMethod::Generalize, - TextRedactionSpec::DateShift { .. } => TextRedactionMethod::DateShift, - }), - Self::Image(i) => RedactionMethod::Image(match i { - ImageRedactionSpec::Blur { .. 
} => ImageRedactionMethod::Blur, - ImageRedactionSpec::Block { .. } => ImageRedactionMethod::Block, - ImageRedactionSpec::Pixelate { .. } => ImageRedactionMethod::Pixelate, - ImageRedactionSpec::Synthesize => ImageRedactionMethod::Synthesize, - }), - Self::Audio(a) => RedactionMethod::Audio(match a { - AudioRedactionSpec::Silence => AudioRedactionMethod::Silence, - AudioRedactionSpec::Remove => AudioRedactionMethod::Remove, - AudioRedactionSpec::Synthesize => AudioRedactionMethod::Synthesize, - }), + Self::Text(t) => RedactionMethod::Text(t.method()), + Self::Image(i) => RedactionMethod::Image(i.method()), + Self::Audio(a) => RedactionMethod::Audio(a.method()), } } } diff --git a/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs b/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs index d7b1236..5af6b59 100644 --- a/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs +++ b/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs @@ -83,8 +83,12 @@ impl Action for EvaluatePolicyAction { build_default_output(entity, spec) }; - let mut redaction = Redaction::new(entity.source.as_uuid(), output); - redaction = redaction.with_original_value(&entity.value); + let mut redaction = Redaction::new( + entity.source.as_uuid(), + output, + &entity.value, + entity.confidence, + ); if let Some(r) = rule { redaction = redaction.with_policy_rule_id(r.id); } From 322ba430e71266620d6c94d00063ba96aba907d7 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 16 Feb 2026 14:19:58 +0100 Subject: [PATCH 04/11] refactor(codec, pipeline): implement AsText for TxtHandler, simplify text/tabular redaction apply Implement AsText trait on TxtHandler so apply_text_doc can use the trait's redact() method instead of manually reconstructing content. Narrow mask_cell to accept TextRedactionOutput directly instead of RedactionOutput, removing the default_mask_char param. Simplify apply_tabular_doc accordingly. 
Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-codec/src/handler/mod.rs | 2 + .../src/handler/text/txt_handler.rs | 20 ++++++ crates/nvisy-codec/src/render/text/mask.rs | 29 +++++---- crates/nvisy-pipeline/src/redaction/apply.rs | 61 ++++++------------- 4 files changed, 56 insertions(+), 56 deletions(-) diff --git a/crates/nvisy-codec/src/handler/mod.rs b/crates/nvisy-codec/src/handler/mod.rs index ebfc755..81b9dee 100644 --- a/crates/nvisy-codec/src/handler/mod.rs +++ b/crates/nvisy-codec/src/handler/mod.rs @@ -36,6 +36,8 @@ pub use text::json_handler::{ }; pub use text::json_loader::{JsonParams, JsonLoader}; +pub use crate::render::text::AsText; + #[cfg(feature = "png")] pub use image::png::PngHandler; #[cfg(any(feature = "png", feature = "jpeg"))] diff --git a/crates/nvisy-codec/src/handler/text/txt_handler.rs b/crates/nvisy-codec/src/handler/text/txt_handler.rs index d6c5932..3f7a212 100644 --- a/crates/nvisy-codec/src/handler/text/txt_handler.rs +++ b/crates/nvisy-codec/src/handler/text/txt_handler.rs @@ -22,6 +22,7 @@ use nvisy_ontology::entity::DocumentType; use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; use crate::handler::{Handler, Span}; +use crate::render::text::AsText; /// 0-based line index identifying a span within a plain-text document. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -108,6 +109,25 @@ impl TxtHandler { } } +impl AsText for TxtHandler { + fn content(&self) -> String { + let mut s = self.data.lines.join("\n"); + if self.data.trailing_newline { + s.push('\n'); + } + s + } + + fn replace_content(&self, content: &str) -> Result { + let trailing_newline = content.ends_with('\n'); + let lines: Vec = content.lines().map(String::from).collect(); + Ok(Self::new(TxtData { + lines, + trailing_newline, + })) + } +} + /// Iterator over lines of a plain-text document. 
struct TxtSpanIter<'a> { lines: &'a [String], diff --git a/crates/nvisy-codec/src/render/text/mask.rs b/crates/nvisy-codec/src/render/text/mask.rs index c7703bc..4ce46b5 100644 --- a/crates/nvisy-codec/src/render/text/mask.rs +++ b/crates/nvisy-codec/src/render/text/mask.rs @@ -1,22 +1,21 @@ //! Cell-level masking and hashing utilities. //! //! These functions are used by tabular redaction actions to transform -//! individual cell values according to a [`RedactionOutput`] variant. +//! individual cell values according to a [`TextRedactionOutput`] variant. -use nvisy_ontology::redaction::{RedactionOutput, TextRedactionOutput}; +use nvisy_ontology::redaction::TextRedactionOutput; /// Redact a single cell value according to `output`. /// -/// Dispatches on the [`RedactionOutput`] variant: +/// Dispatches on the [`TextRedactionOutput`] variant: /// - **Mask**: preserve the last 4 characters, replacing the rest with the /// mask character from the output. /// - **Remove**: return an empty string. /// - **Hash**: return `[HASH:{hex}]` using [`hash_string`]. -/// - **Other text variants**: use [`replacement_value()`](RedactionOutput::replacement_value), -/// falling back to repeating `default_mask` for the cell length. -pub fn mask_cell(cell: &str, output: &RedactionOutput, default_mask: char) -> String { +/// - **Other variants**: use the output's replacement value directly. +pub fn mask_cell(cell: &str, output: &TextRedactionOutput) -> String { match output { - RedactionOutput::Text(TextRedactionOutput::Mask { mask_char, .. }) => { + TextRedactionOutput::Mask { mask_char, .. } => { if cell.len() > 4 { format!( "{}{}", @@ -27,14 +26,18 @@ pub fn mask_cell(cell: &str, output: &RedactionOutput, default_mask: char) -> St mask_char.to_string().repeat(cell.len()) } } - RedactionOutput::Text(TextRedactionOutput::Remove) => String::new(), - RedactionOutput::Text(TextRedactionOutput::Hash { .. 
}) => { + TextRedactionOutput::Remove => String::new(), + TextRedactionOutput::Hash { .. } => { format!("[HASH:{:x}]", hash_string(cell)) } - _ => output - .replacement_value() - .map(|v| v.to_string()) - .unwrap_or_else(|| default_mask.to_string().repeat(cell.len())), + TextRedactionOutput::Replace { replacement } + | TextRedactionOutput::Synthesize { replacement } + | TextRedactionOutput::Aggregate { replacement } + | TextRedactionOutput::Generalize { replacement, .. } + | TextRedactionOutput::DateShift { replacement, .. } => replacement.clone(), + TextRedactionOutput::Encrypt { ciphertext, .. } => ciphertext.clone(), + TextRedactionOutput::Pseudonymize { pseudonym } => pseudonym.clone(), + TextRedactionOutput::Tokenize { token, .. } => token.clone(), } } diff --git a/crates/nvisy-pipeline/src/redaction/apply.rs b/crates/nvisy-pipeline/src/redaction/apply.rs index 6d5fff7..5017ccc 100644 --- a/crates/nvisy-pipeline/src/redaction/apply.rs +++ b/crates/nvisy-pipeline/src/redaction/apply.rs @@ -4,9 +4,9 @@ use std::collections::HashMap; use uuid::Uuid; use serde::Deserialize; -use nvisy_codec::handler::{TxtHandler, TxtData, CsvHandler}; +use nvisy_codec::handler::{TxtHandler, CsvHandler, AsText}; use nvisy_codec::document::Document; -use nvisy_codec::render::text::{PendingReplacement, apply_replacements, mask_cell}; +use nvisy_codec::render::text::{TextRedaction, mask_cell}; use nvisy_ontology::entity::Entity; use nvisy_ontology::redaction::{Redaction, RedactionOutput}; use nvisy_core::error::Error; @@ -27,9 +27,6 @@ use crate::action::Action; #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] pub struct ApplyRedactionParams { - /// Default mask character for text [`Mask`](nvisy_ontology::redaction::TextRedactionOutput::Mask) redactions. - #[serde(default = "default_mask_char")] - pub mask_char: char, /// Sigma value for gaussian blur (image redaction). 
#[cfg(feature = "image-redaction")] #[serde(default = "default_sigma")] @@ -48,9 +45,6 @@ pub struct ApplyRedactionParams { pub crossfade_secs: f64, } -fn default_mask_char() -> char { - '*' -} #[cfg(feature = "image-redaction")] fn default_sigma() -> f32 { 15.0 @@ -140,7 +134,7 @@ impl Action for ApplyRedactionAction { // Text documents let mut result_text = Vec::new(); for doc in &input.text_docs { - let redacted = apply_text_doc(doc, &entity_map, &redaction_map, &self.params); + let redacted = apply_text_doc(doc, &entity_map, &redaction_map); result_text.push(redacted); } @@ -171,7 +165,7 @@ impl Action for ApplyRedactionAction { // Tabular documents let mut result_tabular = Vec::new(); for doc in &input.tabular_docs { - let redacted = apply_tabular_doc(doc, &input.entities, &redaction_map, &self.params); + let redacted = apply_tabular_doc(doc, &input.entities, &redaction_map); result_tabular.push(redacted); } @@ -194,15 +188,8 @@ fn apply_text_doc( doc: &Document, entity_map: &HashMap, redaction_map: &HashMap, - params: &ApplyRedactionParams, ) -> Document { - let lines = doc.handler().lines(); - let mut content = lines.join("\n"); - if doc.handler().trailing_newline() { - content.push('\n'); - } - - let mut pending: Vec = Vec::new(); + let mut redactions: Vec = Vec::new(); for (entity_id, redaction) in redaction_map { let entity = match entity_map.get(entity_id) { @@ -211,43 +198,28 @@ fn apply_text_doc( }; // Check entity belongs to this document - let belongs = entity.source.parent_id() == Some(doc.source.as_uuid()); - if !belongs { + if entity.source.parent_id() != Some(doc.source.as_uuid()) { continue; } - let (start_offset, end_offset) = match &entity.text_location { + let (start, end) = match &entity.text_location { Some(loc) => (loc.start_offset, loc.end_offset), None => continue, }; - let value = match redaction.output.replacement_value() { - Some(v) => v.to_string(), - None => { - let span_len = end_offset.saturating_sub(start_offset); - 
params.mask_char.to_string().repeat(span_len) - } + let output = match &redaction.output { + RedactionOutput::Text(t) => t.clone(), + _ => continue, }; - pending.push(PendingReplacement { - start: start_offset, - end: end_offset, - value, - }); + redactions.push(TextRedaction { start, end, output }); } - if pending.is_empty() { + if redactions.is_empty() { return doc.clone(); } - let redacted_content = apply_replacements(&content, &mut pending); - - let trailing_newline = redacted_content.ends_with('\n'); - let new_lines: Vec = redacted_content.lines().map(String::from).collect(); - let handler = TxtHandler::new(TxtData { - lines: new_lines, - trailing_newline, - }); + let handler = doc.handler().redact(&redactions).expect("text redaction failed"); let mut result = Document::new(handler); result.source.set_parent_id(Some(doc.source.as_uuid())); result @@ -318,7 +290,6 @@ fn apply_tabular_doc( doc: &Document, entities: &[Entity], redaction_map: &HashMap, - params: &ApplyRedactionParams, ) -> Document { let mut result = doc.clone(); @@ -326,9 +297,13 @@ fn apply_tabular_doc( if let Some(ref tab_loc) = entity.tabular_location { let (row_idx, col_idx) = (tab_loc.row_index, tab_loc.column_index); if let Some(redaction) = redaction_map.get(&entity.source.as_uuid()) { + let output = match &redaction.output { + RedactionOutput::Text(t) => t, + _ => continue, + }; if let Some(row) = result.handler_mut().rows_mut().get_mut(row_idx) { if let Some(cell) = row.get_mut(col_idx) { - *cell = mask_cell(cell, &redaction.output, params.mask_char); + *cell = mask_cell(cell, output); } } } From 50c18f1e24cce5f1535d4500b9c77bc3dba92edf Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 16 Feb 2026 16:05:04 +0100 Subject: [PATCH 05/11] refactor(codec, ontology): split redaction traits, add BoundingBoxU32, tighten module visibility Split AsText into AsText + AsRedactableText and AsImage into AsImage + AsRedactableImage with blanket impls, mirroring the codec/redaction separation. 
Add BoundingBoxU32 for pixel-exact rendering and use it in blur/block/pixelate renderers. Make render submodules private, remove internal re-exports from handler/mod.rs, and add pixelate renderer. Also includes ontology improvements: strum Display derives, typed AnnotationScope, non-optional Redaction.original_value, Entity helper methods, and deduplicated spec/output method() delegation. Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 1 + crates/nvisy-codec/Cargo.toml | 5 +- crates/nvisy-codec/src/handler/mod.rs | 4 - crates/nvisy-codec/src/render/image/block.rs | 16 +-- crates/nvisy-codec/src/render/image/blur.rs | 16 +-- crates/nvisy-codec/src/render/image/mod.rs | 98 +++++++++---- .../nvisy-codec/src/render/image/pixelate.rs | 55 +++++++ crates/nvisy-codec/src/render/text/mask.rs | 29 ++-- crates/nvisy-codec/src/render/text/mod.rs | 52 ++++--- crates/nvisy-ontology/Cargo.toml | 3 + crates/nvisy-ontology/src/audit/mod.rs | 3 +- .../src/detection/annotation.rs | 6 +- .../src/detection/sensitivity.rs | 3 +- crates/nvisy-ontology/src/entity/location.rs | 38 +++++ crates/nvisy-ontology/src/entity/mod.rs | 18 ++- .../nvisy-ontology/src/policy/regulation.rs | 4 +- crates/nvisy-ontology/src/policy/rule.rs | 3 +- crates/nvisy-ontology/src/prelude.rs | 2 +- crates/nvisy-ontology/src/redaction/method.rs | 9 +- crates/nvisy-ontology/src/redaction/mod.rs | 5 +- crates/nvisy-ontology/src/redaction/spec.rs | 20 ++- .../nvisy-pipeline/src/detection/checksum.rs | 6 +- crates/nvisy-pipeline/src/redaction/apply.rs | 91 ++++-------- .../src/redaction/evaluate_policy.rs | 135 +++++++++--------- 24 files changed, 373 insertions(+), 249 deletions(-) create mode 100644 crates/nvisy-codec/src/render/image/pixelate.rs diff --git a/Cargo.lock b/Cargo.lock index ee53333..df8d1e9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2292,6 +2292,7 @@ dependencies = [ "semver", "serde", "serde_json", + "strum", "uuid", ] diff --git a/crates/nvisy-codec/Cargo.toml b/crates/nvisy-codec/Cargo.toml 
index b17ef7d..64e3217 100644 --- a/crates/nvisy-codec/Cargo.toml +++ b/crates/nvisy-codec/Cargo.toml @@ -23,6 +23,7 @@ rustdoc-args = ["--cfg", "docsrs"] [features] default = ["pdf", "docx", "html", "xlsx", "image", "wav", "mp3"] + # PDF parsing and text extraction via pdf-extract + lopdf; enables png for extracted images pdf = ["dep:pdf-extract", "dep:lopdf", "png"] # Microsoft Word (.docx) parsing via zip + quick-xml; enables image formats for extracted images @@ -31,11 +32,13 @@ docx = ["dep:zip", "dep:quick-xml", "jpeg", "png"] html = ["dep:scraper"] # Excel (.xlsx) spreadsheet parsing via calamine xlsx = ["dep:calamine"] + # Convenience alias: all image formats image = ["jpeg", "png"] # Individual image format handlers (each pulls in image + imageproc for rendering) jpeg = ["dep:image", "dep:imageproc"] png = ["dep:image", "dep:imageproc"] + # Audio format handlers (no additional dependencies) wav = [] mp3 = [] @@ -48,8 +51,6 @@ nvisy-ontology = { workspace = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, features = [] } - -# Text parsing csv = { workspace = true, features = [] } # Async runtime diff --git a/crates/nvisy-codec/src/handler/mod.rs b/crates/nvisy-codec/src/handler/mod.rs index 81b9dee..13fea3b 100644 --- a/crates/nvisy-codec/src/handler/mod.rs +++ b/crates/nvisy-codec/src/handler/mod.rs @@ -36,12 +36,8 @@ pub use text::json_handler::{ }; pub use text::json_loader::{JsonParams, JsonLoader}; -pub use crate::render::text::AsText; - #[cfg(feature = "png")] pub use image::png::PngHandler; -#[cfg(any(feature = "png", feature = "jpeg"))] -pub use crate::render::image::AsImage; #[cfg(feature = "wav")] pub use audio::wav::WavHandler; diff --git a/crates/nvisy-codec/src/render/image/block.rs b/crates/nvisy-codec/src/render/image/block.rs index eba1545..78a17a4 100644 --- a/crates/nvisy-codec/src/render/image/block.rs +++ b/crates/nvisy-codec/src/render/image/block.rs @@ -5,17 
+5,16 @@ //! using alpha-over blending. Regions are clamped to image bounds. use ::image::{DynamicImage, Rgba, RgbaImage}; -use nvisy_ontology::entity::BoundingBox; +use nvisy_ontology::entity::BoundingBoxU32; /// Apply a solid color block overlay to the specified regions of an image. /// -/// Each [`BoundingBox`] describes a rectangular region (in pixel coordinates) -/// that will be covered with an opaque rectangle of the given `color` (RGBA). -/// The overlay is composited using alpha-over blending via -/// [`imageops::overlay`](::image::imageops::overlay). +/// Each [`BoundingBoxU32`] describes a rectangular region (in pixel +/// coordinates) that will be covered with an opaque rectangle of the +/// given `color` (RGBA). pub fn apply_block_overlay( image: &DynamicImage, - regions: &[BoundingBox], + regions: &[BoundingBoxU32], color: [u8; 4], ) -> DynamicImage { let color = Rgba(color); @@ -24,10 +23,7 @@ pub fn apply_block_overlay( let img_h = result.height(); for region in regions { - let x = region.x.round() as u32; - let y = region.y.round() as u32; - let w = region.width.round() as u32; - let h = region.height.round() as u32; + let (x, y, w, h) = (region.x, region.y, region.width, region.height); if x >= img_w || y >= img_h { continue; diff --git a/crates/nvisy-codec/src/render/image/blur.rs b/crates/nvisy-codec/src/render/image/blur.rs index 2d99acd..d05f806 100644 --- a/crates/nvisy-codec/src/render/image/blur.rs +++ b/crates/nvisy-codec/src/render/image/blur.rs @@ -10,26 +10,21 @@ use ::image::DynamicImage; use imageproc::filter::gaussian_blur_f32; -use nvisy_ontology::entity::BoundingBox; +use nvisy_ontology::entity::BoundingBoxU32; /// Apply gaussian blur to the specified regions of an image. /// -/// Each [`BoundingBox`] describes a rectangular region (in pixel coordinates) -/// that will be blurred with the given `sigma` value. 
The algorithm crops -/// each region, applies [`gaussian_blur_f32`], and pastes the result back, -/// leaving the rest of the image untouched. +/// Each [`BoundingBoxU32`] describes a rectangular region (in pixel +/// coordinates) that will be blurred with the given `sigma` value. pub fn apply_gaussian_blur( image: &DynamicImage, - regions: &[BoundingBox], + regions: &[BoundingBoxU32], sigma: f32, ) -> DynamicImage { let mut result = image.clone(); for region in regions { - let x = region.x.round() as u32; - let y = region.y.round() as u32; - let w = region.width.round() as u32; - let h = region.height.round() as u32; + let (x, y, w, h) = (region.x, region.y, region.width, region.height); // Clamp to image bounds let img_w = result.width(); @@ -43,7 +38,6 @@ pub fn apply_gaussian_blur( continue; } - // Crop the region, blur it, paste it back let sub = result.crop_imm(x, y, w, h); let blurred = DynamicImage::ImageRgba8(gaussian_blur_f32(&sub.to_rgba8(), sigma)); ::image::imageops::overlay(&mut result, &blurred, x as i64, y as i64); diff --git a/crates/nvisy-codec/src/render/image/mod.rs b/crates/nvisy-codec/src/render/image/mod.rs index 3d043b8..cf02254 100644 --- a/crates/nvisy-codec/src/render/image/mod.rs +++ b/crates/nvisy-codec/src/render/image/mod.rs @@ -1,54 +1,96 @@ //! Image rendering primitives for redaction overlays. //! -//! Provides gaussian blur and solid-color block overlay functions that -//! operate on [`DynamicImage`] values using bounding-box regions. +//! Provides gaussian blur, solid-color block overlay, and pixelation +//! functions that operate on [`DynamicImage`] values using bounding-box +//! regions. //! -//! # Trait +//! # Traits //! -//! [`AsImage`] is the main extension point: image format handlers implement -//! [`decode`](AsImage::decode) and [`encode`](AsImage::encode) to round-trip -//! through [`DynamicImage`], and then get [`blur`](AsImage::blur) and -//! [`block`](AsImage::block) convenience methods for free via default -//! 
implementations. +//! [`AsImage`] is the codec extension point: image format handlers +//! implement [`decode`](AsImage::decode) and [`encode`](AsImage::encode) +//! to round-trip through [`DynamicImage`]. //! -//! # Sub-modules -//! -//! | Module | Description | -//! |--------|-------------| -//! | [`blur`] | Gaussian blur rendering | -//! | [`block`] | Solid-color block overlay rendering | +//! [`AsRedactableImage`] adds a [`redact`](AsRedactableImage::redact) +//! convenience method that dispatches [`ImageRedactionOutput`] variants +//! to the appropriate rendering primitive. It is automatically +//! implemented for every type that implements [`AsImage`]. mod blur; mod block; +mod pixelate; -pub use blur::apply_gaussian_blur; -pub use block::apply_block_overlay; +use blur::apply_gaussian_blur; +use block::apply_block_overlay; +use pixelate::apply_pixelate; use ::image::DynamicImage; use nvisy_core::error::Error; -use nvisy_ontology::entity::BoundingBox; +use nvisy_ontology::entity::{BoundingBox, BoundingBoxU32}; +use nvisy_ontology::redaction::ImageRedactionOutput; + +/// A located image redaction: pairs a bounding box with an +/// [`ImageRedactionOutput`] that carries the method-specific parameters. +pub struct ImageRedaction { + /// Bounding box of the region to redact. + pub bounding_box: BoundingBox, + /// The redaction output that determines the rendering method. + pub output: ImageRedactionOutput, +} /// Trait for handlers that wrap a raster image. /// -/// Provides [`decode`](Self::decode) / [`encode`](Self::encode) for -/// round-tripping through [`DynamicImage`], plus convenience methods for -/// applying blur and block-overlay redactions in a single call. +/// Handlers implement [`decode`](Self::decode) and [`encode`](Self::encode) +/// to round-trip through [`DynamicImage`]. See [`AsRedactableImage`] for +/// the higher-level redaction API. pub trait AsImage: Sized { /// Decode the handler's raw bytes into a [`DynamicImage`]. 
fn decode(&self) -> Result; /// Encode a [`DynamicImage`] back into a new handler instance. fn encode(image: &DynamicImage) -> Result; +} - /// Apply gaussian blur to the given bounding-box regions. - fn blur(&self, regions: &[BoundingBox], sigma: f32) -> Result { - let img = apply_gaussian_blur(&self.decode()?, regions, sigma); - Self::encode(&img) - } +/// Extension trait that adds [`ImageRedactionOutput`]-driven redaction +/// to any [`AsImage`] implementor. +/// +/// This trait is automatically implemented for every type that implements +/// [`AsImage`] — handler authors only need to implement [`AsImage`]. +pub trait AsRedactableImage: AsImage { + /// Apply a batch of image redactions, returning a new handler. + /// + /// Each [`ImageRedaction`] identifies a bounding box and an + /// [`ImageRedactionOutput`] that determines the rendering method + /// (blur, block, pixelate). The image is decoded once, all + /// redactions are applied in order, and then re-encoded. + fn redact(&self, redactions: &[ImageRedaction]) -> Result { + if redactions.is_empty() { + return Self::encode(&self.decode()?); + } + + let mut img = self.decode()?; + + for r in redactions { + let region = BoundingBoxU32::from(&r.bounding_box); + let regions = std::slice::from_ref(®ion); + match &r.output { + ImageRedactionOutput::Blur { sigma } => { + img = apply_gaussian_blur(&img, regions, *sigma); + } + ImageRedactionOutput::Block { color } => { + img = apply_block_overlay(&img, regions, *color); + } + ImageRedactionOutput::Pixelate { block_size } => { + img = apply_pixelate(&img, regions, *block_size); + } + ImageRedactionOutput::Synthesize => { + img = apply_block_overlay(&img, regions, [0, 0, 0, 255]); + } + } + } - /// Apply a solid-color block overlay to the given bounding-box regions. 
- fn block(&self, regions: &[BoundingBox], color: [u8; 4]) -> Result { - let img = apply_block_overlay(&self.decode()?, regions, color); Self::encode(&img) } } + +/// Blanket implementation: every [`AsImage`] type gets [`AsRedactableImage`] for free. +impl AsRedactableImage for T {} diff --git a/crates/nvisy-codec/src/render/image/pixelate.rs b/crates/nvisy-codec/src/render/image/pixelate.rs new file mode 100644 index 0000000..a0bba2e --- /dev/null +++ b/crates/nvisy-codec/src/render/image/pixelate.rs @@ -0,0 +1,55 @@ +//! Pixelation (mosaic) rendering for bounding-box regions. +//! +//! The algorithm works per-region: +//! 1. Crop the rectangular area from the source image. +//! 2. Downscale the crop by the block size factor. +//! 3. Upscale back to the original size using nearest-neighbour sampling. +//! 4. Paste the pixelated sub-image back over the original at the same position. +//! +//! Regions are clamped to image bounds so that out-of-range coordinates are +//! silently ignored rather than causing a panic. + +use ::image::imageops::FilterType; +use ::image::DynamicImage; +use nvisy_ontology::entity::BoundingBoxU32; + +/// Apply pixelation (mosaic effect) to the specified regions of an image. +/// +/// Each [`BoundingBoxU32`] describes a rectangular region (in pixel +/// coordinates) that will be pixelated. The `block_size` controls mosaic +/// granularity — higher values produce larger, blockier pixels. 
+pub fn apply_pixelate( + image: &DynamicImage, + regions: &[BoundingBoxU32], + block_size: u32, +) -> DynamicImage { + let mut result = image.clone(); + let block_size = block_size.max(1); + + for region in regions { + let (x, y, w, h) = (region.x, region.y, region.width, region.height); + + // Clamp to image bounds + let img_w = result.width(); + let img_h = result.height(); + if x >= img_w || y >= img_h { + continue; + } + let w = w.min(img_w - x); + let h = h.min(img_h - y); + if w == 0 || h == 0 { + continue; + } + + let small_w = (w / block_size).max(1); + let small_h = (h / block_size).max(1); + + let sub = result.crop_imm(x, y, w, h); + let small = sub.resize_exact(small_w, small_h, FilterType::Nearest); + let pixelated = small.resize_exact(w, h, FilterType::Nearest); + + ::image::imageops::overlay(&mut result, &pixelated, x as i64, y as i64); + } + + result +} diff --git a/crates/nvisy-codec/src/render/text/mask.rs b/crates/nvisy-codec/src/render/text/mask.rs index 4ce46b5..e891b36 100644 --- a/crates/nvisy-codec/src/render/text/mask.rs +++ b/crates/nvisy-codec/src/render/text/mask.rs @@ -16,28 +16,27 @@ use nvisy_ontology::redaction::TextRedactionOutput; pub fn mask_cell(cell: &str, output: &TextRedactionOutput) -> String { match output { TextRedactionOutput::Mask { mask_char, .. } => { - if cell.len() > 4 { - format!( - "{}{}", - mask_char.to_string().repeat(cell.len() - 4), - &cell[cell.len() - 4..] - ) + let char_count = cell.chars().count(); + if char_count > 4 { + let masked: String = cell + .chars() + .take(char_count - 4) + .map(|_| *mask_char) + .collect(); + let tail: String = cell.chars().skip(char_count - 4).collect(); + format!("{masked}{tail}") } else { - mask_char.to_string().repeat(cell.len()) + mask_char.to_string().repeat(char_count) } } TextRedactionOutput::Remove => String::new(), TextRedactionOutput::Hash { .. 
} => { format!("[HASH:{:x}]", hash_string(cell)) } - TextRedactionOutput::Replace { replacement } - | TextRedactionOutput::Synthesize { replacement } - | TextRedactionOutput::Aggregate { replacement } - | TextRedactionOutput::Generalize { replacement, .. } - | TextRedactionOutput::DateShift { replacement, .. } => replacement.clone(), - TextRedactionOutput::Encrypt { ciphertext, .. } => ciphertext.clone(), - TextRedactionOutput::Pseudonymize { pseudonym } => pseudonym.clone(), - TextRedactionOutput::Tokenize { token, .. } => token.clone(), + _ => output + .replacement_value() + .unwrap_or_default() + .to_string(), } } diff --git a/crates/nvisy-codec/src/render/text/mod.rs b/crates/nvisy-codec/src/render/text/mod.rs index d522178..46bb7a5 100644 --- a/crates/nvisy-codec/src/render/text/mod.rs +++ b/crates/nvisy-codec/src/render/text/mod.rs @@ -1,29 +1,27 @@ //! Text rendering and redaction primitives. //! //! Provides byte-offset replacement, cell-level masking, and the -//! [`AsText`] trait that text-bearing handlers implement to support -//! redaction in a single call. +//! [`AsText`] / [`AsRedactableText`] traits that text-bearing handlers +//! implement to support redaction in a single call. //! -//! # Trait +//! # Traits //! -//! [`AsText`] is the main extension point: text format handlers implement -//! [`content`](AsText::content) and [`replace_content`](AsText::replace_content) -//! to read and write their backing text, and then get a -//! [`redact`](AsText::redact) convenience method for free via the default -//! implementation. +//! [`AsText`] is the codec extension point: text format handlers +//! implement [`content`](AsText::content) and +//! [`replace_content`](AsText::replace_content) to read and write their +//! backing text. //! -//! # Sub-modules -//! -//! | Module | Description | -//! |--------|-------------| -//! | [`replace`] | Byte-offset text replacement engine | -//! | [`mask`] | Cell-level masking and hashing utilities | +//! 
[`AsRedactableText`] adds a [`redact`](AsRedactableText::redact) +//! convenience method that resolves [`TextRedaction`] items into +//! byte-offset replacements. It is automatically implemented for every +//! type that implements [`AsText`]. + +mod mask; +mod replace; -pub mod mask; -pub mod replace; +pub use mask::mask_cell; -pub use mask::{hash_string, mask_cell}; -pub use replace::{apply_replacements, PendingReplacement}; +use replace::{apply_replacements, PendingReplacement}; use nvisy_core::error::Error; use nvisy_ontology::redaction::TextRedactionOutput; @@ -39,19 +37,26 @@ pub struct TextRedaction { pub output: TextRedactionOutput, } -/// Trait for handlers that hold redactable text content. +/// Trait for handlers that wrap text content. /// -/// Mirrors [`AsImage`](super::image::AsImage) for the text modality. /// Handlers implement [`content`](Self::content) and -/// [`replace_content`](Self::replace_content), and get -/// [`redact`](Self::redact) for free. +/// [`replace_content`](Self::replace_content) to round-trip through +/// plain text. See [`AsRedactableText`] for the higher-level redaction +/// API. pub trait AsText: Sized { /// Return the handler's full text content as a single string. fn content(&self) -> String; /// Build a new handler instance with the given text content. fn replace_content(&self, content: &str) -> Result; +} +/// Extension trait that adds [`TextRedactionOutput`]-driven redaction +/// to any [`AsText`] implementor. +/// +/// This trait is automatically implemented for every type that implements +/// [`AsText`] — handler authors only need to implement [`AsText`]. +pub trait AsRedactableText: AsText { /// Apply a batch of text redactions, returning a new handler. /// /// Each [`TextRedaction`] identifies a byte range and a @@ -82,3 +87,6 @@ pub trait AsText: Sized { self.replace_content(&result) } } + +/// Blanket implementation: every [`AsText`] type gets [`AsRedactableText`] for free. 
+impl AsRedactableText for T {} diff --git a/crates/nvisy-ontology/Cargo.toml b/crates/nvisy-ontology/Cargo.toml index a5bb2e8..bed8e13 100644 --- a/crates/nvisy-ontology/Cargo.toml +++ b/crates/nvisy-ontology/Cargo.toml @@ -39,5 +39,8 @@ jiff = { workspace = true, features = [] } # Semantic versioning semver = { workspace = true, features = [] } +# Enum utilities +strum = { workspace = true, features = ["derive"] } + # Error handling derive_more = { workspace = true, features = ["display", "from"] } diff --git a/crates/nvisy-ontology/src/audit/mod.rs b/crates/nvisy-ontology/src/audit/mod.rs index fad2171..0cd4921 100644 --- a/crates/nvisy-ontology/src/audit/mod.rs +++ b/crates/nvisy-ontology/src/audit/mod.rs @@ -22,9 +22,10 @@ pub trait Auditable { } /// Kind of auditable action recorded in an [`Audit`] entry. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] pub enum AuditAction { /// A sensitive entity was detected. Detection, diff --git a/crates/nvisy-ontology/src/detection/annotation.rs b/crates/nvisy-ontology/src/detection/annotation.rs index e2ec81b..ead0bf7 100644 --- a/crates/nvisy-ontology/src/detection/annotation.rs +++ b/crates/nvisy-ontology/src/detection/annotation.rs @@ -13,9 +13,10 @@ use crate::entity::{ }; /// The kind of annotation applied to a content region. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] pub enum AnnotationKind { /// Pre-identified sensitive region that should be treated as a detection. 
Inclusion, @@ -26,9 +27,10 @@ pub enum AnnotationKind { } /// The scope to which an annotation label applies. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] pub enum AnnotationScope { /// Label applies to the entire document. Document, diff --git a/crates/nvisy-ontology/src/detection/sensitivity.rs b/crates/nvisy-ontology/src/detection/sensitivity.rs index 783f2a7..55170d5 100644 --- a/crates/nvisy-ontology/src/detection/sensitivity.rs +++ b/crates/nvisy-ontology/src/detection/sensitivity.rs @@ -6,9 +6,10 @@ use serde::{Deserialize, Serialize}; /// /// Drives downstream policy: rules can be scoped to specific sensitivity /// levels via [`RuleCondition`](crate::policy::RuleCondition). -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] pub enum SensitivityLevel { /// No sensitive data detected or all data is publicly available. Public, diff --git a/crates/nvisy-ontology/src/entity/location.rs b/crates/nvisy-ontology/src/entity/location.rs index 1ea734c..25ec432 100644 --- a/crates/nvisy-ontology/src/entity/location.rs +++ b/crates/nvisy-ontology/src/entity/location.rs @@ -14,6 +14,10 @@ pub struct TimeSpan { } /// Axis-aligned bounding box for image-based entity locations. +/// +/// Coordinates are `f64` to support both pixel and normalized (0.0–1.0) +/// values from detection models. Use [`BoundingBoxU32`] (or [`Into`]) +/// when integer pixel coordinates are needed for rendering. 
#[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct BoundingBox { @@ -27,6 +31,40 @@ pub struct BoundingBox { pub height: f64, } +/// Integer pixel-coordinate bounding box for rendering operations. +/// +/// Converted from [`BoundingBox`] by rounding each field to the nearest +/// integer. Use this at the rendering boundary where pixel-exact +/// coordinates are required. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct BoundingBoxU32 { + /// Horizontal offset of the top-left corner in pixels. + pub x: u32, + /// Vertical offset of the top-left corner in pixels. + pub y: u32, + /// Width in pixels. + pub width: u32, + /// Height in pixels. + pub height: u32, +} + +impl From<&BoundingBox> for BoundingBoxU32 { + fn from(bb: &BoundingBox) -> Self { + Self { + x: bb.x.round() as u32, + y: bb.y.round() as u32, + width: bb.width.round() as u32, + height: bb.height.round() as u32, + } + } +} + +impl From for BoundingBoxU32 { + fn from(bb: BoundingBox) -> Self { + Self::from(&bb) + } +} + /// Location of an entity within text content. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] diff --git a/crates/nvisy-ontology/src/entity/mod.rs b/crates/nvisy-ontology/src/entity/mod.rs index 88fe2d3..8e20a0f 100644 --- a/crates/nvisy-ontology/src/entity/mod.rs +++ b/crates/nvisy-ontology/src/entity/mod.rs @@ -11,7 +11,7 @@ mod selector; pub use document::DocumentType; pub use location::{ - AudioLocation, BoundingBox, ImageLocation, TabularLocation, + AudioLocation, BoundingBox, BoundingBoxU32, ImageLocation, TabularLocation, TextLocation, TimeSpan, VideoLocation, }; pub use model::{ModelInfo, ModelKind}; @@ -23,9 +23,10 @@ use serde_json::{Map, Value}; use nvisy_core::path::ContentSource; /// Category of sensitive data an entity belongs to. 
-#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] pub enum EntityCategory { /// Personally Identifiable Information (names, SSNs, addresses, etc.). Pii, @@ -40,13 +41,15 @@ pub enum EntityCategory { /// Biometric data (fingerprints, iris scans, voiceprints). Biometric, /// User-defined or plugin-specific category. + #[strum(to_string = "{0}")] Custom(String), } /// Method used to detect a sensitive entity. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] pub enum DetectionMethod { /// Regular expression pattern matching. Regex, @@ -180,4 +183,13 @@ impl Entity { self.source = self.source.with_parent(parent); self } + + /// Copy all location fields from another entity. + pub fn copy_locations_from(&mut self, other: &Self) { + self.text_location = other.text_location.clone(); + self.image_location = other.image_location.clone(); + self.tabular_location = other.tabular_location.clone(); + self.audio_location = other.audio_location.clone(); + self.video_location = other.video_location.clone(); + } } diff --git a/crates/nvisy-ontology/src/policy/regulation.rs b/crates/nvisy-ontology/src/policy/regulation.rs index 8007b1f..5cad331 100644 --- a/crates/nvisy-ontology/src/policy/regulation.rs +++ b/crates/nvisy-ontology/src/policy/regulation.rs @@ -3,9 +3,10 @@ use serde::{Deserialize, Serialize}; /// A compliance regulation or framework that a policy targets. 
-#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] pub enum RegulationKind { /// Health Insurance Portability and Accountability Act. Hipaa, @@ -22,5 +23,6 @@ pub enum RegulationKind { /// Sarbanes-Oxley Act. Sox, /// User-defined regulation or framework. + #[strum(to_string = "{0}")] Custom(String), } diff --git a/crates/nvisy-ontology/src/policy/rule.rs b/crates/nvisy-ontology/src/policy/rule.rs index c7e1af5..4c223f7 100644 --- a/crates/nvisy-ontology/src/policy/rule.rs +++ b/crates/nvisy-ontology/src/policy/rule.rs @@ -29,9 +29,10 @@ pub struct RuleCondition { } /// Classifies what a policy rule does when it matches. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] pub enum RuleKind { /// Apply a redaction to the matched entity. 
Redaction, diff --git a/crates/nvisy-ontology/src/prelude.rs b/crates/nvisy-ontology/src/prelude.rs index a71439c..f5e0150 100644 --- a/crates/nvisy-ontology/src/prelude.rs +++ b/crates/nvisy-ontology/src/prelude.rs @@ -9,7 +9,7 @@ pub use crate::detection::{ Sensitivity, SensitivityLevel, }; pub use crate::entity::{ - AudioLocation, BoundingBox, DetectionMethod, DocumentType, Entity, EntityCategory, + AudioLocation, BoundingBox, BoundingBoxU32, DetectionMethod, DocumentType, Entity, EntityCategory, EntitySelector, ImageLocation, ModelInfo, ModelKind, TabularLocation, TextLocation, TimeSpan, VideoLocation, }; diff --git a/crates/nvisy-ontology/src/redaction/method.rs b/crates/nvisy-ontology/src/redaction/method.rs index 13290a2..e74b84a 100644 --- a/crates/nvisy-ontology/src/redaction/method.rs +++ b/crates/nvisy-ontology/src/redaction/method.rs @@ -9,9 +9,10 @@ use derive_more::From; use serde::{Deserialize, Serialize}; /// Redaction strategies for text and tabular content. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] pub enum TextRedactionMethod { /// Replace characters with a mask character (e.g. `***-**-1234`). Mask, @@ -38,9 +39,10 @@ pub enum TextRedactionMethod { } /// Redaction strategies for image and video regions. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] pub enum ImageRedactionMethod { /// Apply a gaussian blur to the region. Blur, @@ -53,9 +55,10 @@ pub enum ImageRedactionMethod { } /// Redaction strategies for audio segments. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] pub enum AudioRedactionMethod { /// Replace the audio segment with silence. Silence, diff --git a/crates/nvisy-ontology/src/redaction/mod.rs b/crates/nvisy-ontology/src/redaction/mod.rs index d5eb01e..6bc0c42 100644 --- a/crates/nvisy-ontology/src/redaction/mod.rs +++ b/crates/nvisy-ontology/src/redaction/mod.rs @@ -34,7 +34,10 @@ pub use output::{ AudioRedactionOutput, ImageRedactionOutput, RedactionOutput, TextRedactionOutput, }; pub use review::{ReviewDecision, ReviewStatus}; -pub use spec::{AudioRedactionSpec, ImageRedactionSpec, RedactionSpec, TextRedactionSpec}; +pub use spec::{ + AudioRedactionSpec, ImageRedactionSpec, RedactionSpec, TextRedactionSpec, + DEFAULT_BLOCK_COLOR, DEFAULT_BLUR_SIGMA, DEFAULT_MASK_CHAR, DEFAULT_PIXELATE_BLOCK_SIZE, +}; pub use summary::RedactionSummary; use serde::{Deserialize, Serialize}; diff --git a/crates/nvisy-ontology/src/redaction/spec.rs b/crates/nvisy-ontology/src/redaction/spec.rs index ed146af..c87f784 100644 --- a/crates/nvisy-ontology/src/redaction/spec.rs +++ b/crates/nvisy-ontology/src/redaction/spec.rs @@ -64,8 +64,20 @@ pub enum TextRedactionSpec { }, } +/// Default mask character for text redaction. +pub const DEFAULT_MASK_CHAR: char = '*'; + +/// Default gaussian blur sigma value. +pub const DEFAULT_BLUR_SIGMA: f32 = 15.0; + +/// Default RGBA color for block overlays (opaque black). +pub const DEFAULT_BLOCK_COLOR: [u8; 4] = [0, 0, 0, 255]; + +/// Default pixel block size for pixelation/mosaic. +pub const DEFAULT_PIXELATE_BLOCK_SIZE: u32 = 10; + fn default_mask_char() -> char { - '*' + DEFAULT_MASK_CHAR } /// Image redaction specification with method-specific configuration. 
@@ -96,13 +108,13 @@ pub enum ImageRedactionSpec { } fn default_sigma() -> f32 { - 15.0 + DEFAULT_BLUR_SIGMA } fn default_block_color() -> [u8; 4] { - [0, 0, 0, 255] + DEFAULT_BLOCK_COLOR } fn default_block_size() -> u32 { - 10 + DEFAULT_PIXELATE_BLOCK_SIZE } /// Audio redaction specification. diff --git a/crates/nvisy-pipeline/src/detection/checksum.rs b/crates/nvisy-pipeline/src/detection/checksum.rs index f301e00..78c8c46 100644 --- a/crates/nvisy-pipeline/src/detection/checksum.rs +++ b/crates/nvisy-pipeline/src/detection/checksum.rs @@ -78,11 +78,7 @@ impl Action for DetectChecksumAction { DetectionMethod::Checksum, (entity.confidence + confidence_boost).min(1.0), ); - boosted.text_location = entity.text_location.clone(); - boosted.image_location = entity.image_location.clone(); - boosted.tabular_location = entity.tabular_location.clone(); - boosted.audio_location = entity.audio_location.clone(); - boosted.video_location = entity.video_location.clone(); + boosted.copy_locations_from(&entity); boosted.source.set_parent_id(entity.source.parent_id()); result.push(boosted); diff --git a/crates/nvisy-pipeline/src/redaction/apply.rs b/crates/nvisy-pipeline/src/redaction/apply.rs index 5017ccc..5ab9981 100644 --- a/crates/nvisy-pipeline/src/redaction/apply.rs +++ b/crates/nvisy-pipeline/src/redaction/apply.rs @@ -4,19 +4,17 @@ use std::collections::HashMap; use uuid::Uuid; use serde::Deserialize; -use nvisy_codec::handler::{TxtHandler, CsvHandler, AsText}; +use nvisy_codec::handler::{TxtHandler, CsvHandler}; use nvisy_codec::document::Document; -use nvisy_codec::render::text::{TextRedaction, mask_cell}; +use nvisy_codec::render::text::{TextRedaction, AsRedactableText, mask_cell}; use nvisy_ontology::entity::Entity; use nvisy_ontology::redaction::{Redaction, RedactionOutput}; use nvisy_core::error::Error; #[cfg(feature = "image-redaction")] -use nvisy_codec::handler::{PngHandler, AsImage}; +use nvisy_codec::handler::PngHandler; #[cfg(feature = "image-redaction")] -use 
nvisy_ontology::entity::BoundingBox; -#[cfg(feature = "image-redaction")] -use nvisy_ontology::redaction::ImageRedactionOutput; +use nvisy_codec::render::image::{ImageRedaction, AsRedactableImage}; #[cfg(feature = "audio-redaction")] use nvisy_codec::handler::WavHandler; @@ -27,36 +25,12 @@ use crate::action::Action; #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] pub struct ApplyRedactionParams { - /// Sigma value for gaussian blur (image redaction). - #[cfg(feature = "image-redaction")] - #[serde(default = "default_sigma")] - pub blur_sigma: f32, - /// RGBA color for block overlays (image redaction). - #[cfg(feature = "image-redaction")] - #[serde(default = "default_block_color")] - pub block_color: [u8; 4], - /// Pixel block size for pixelation/mosaic (image redaction). - #[cfg(feature = "image-redaction")] - #[serde(default = "default_pixelate_block_size")] - pub pixelate_block_size: u32, /// Duration in seconds to crossfade at silence boundaries (audio redaction). #[cfg(feature = "audio-redaction")] #[serde(default = "default_crossfade_secs")] pub crossfade_secs: f64, } -#[cfg(feature = "image-redaction")] -fn default_sigma() -> f32 { - 15.0 -} -#[cfg(feature = "image-redaction")] -fn default_block_color() -> [u8; 4] { - [0, 0, 0, 255] -} -#[cfg(feature = "image-redaction")] -fn default_pixelate_block_size() -> u32 { - 10 -} #[cfg(feature = "audio-redaction")] fn default_crossfade_secs() -> f64 { 0.05 @@ -102,6 +76,7 @@ pub struct ApplyRedactionOutput { /// - **Audio documents**: stub pass-through (feature-gated) /// - **Tabular documents**: cell-level redaction pub struct ApplyRedactionAction { + #[allow(dead_code)] params: ApplyRedactionParams, } @@ -134,7 +109,7 @@ impl Action for ApplyRedactionAction { // Text documents let mut result_text = Vec::new(); for doc in &input.text_docs { - let redacted = apply_text_doc(doc, &entity_map, &redaction_map); + let redacted = apply_text_doc(doc, &entity_map, &redaction_map)?; 
result_text.push(redacted); } @@ -143,13 +118,7 @@ impl Action for ApplyRedactionAction { let mut result_image = Vec::new(); #[cfg(feature = "image-redaction")] for doc in &input.image_docs { - let redacted = apply_image_doc( - doc, - &input.entities, - &redaction_map, - self.params.blur_sigma, - self.params.block_color, - )?; + let redacted = apply_image_doc(doc, &input.entities, &redaction_map)?; result_image.push(redacted); } @@ -188,7 +157,7 @@ fn apply_text_doc( doc: &Document, entity_map: &HashMap, redaction_map: &HashMap, -) -> Document { +) -> Result, Error> { let mut redactions: Vec = Vec::new(); for (entity_id, redaction) in redaction_map { @@ -216,13 +185,13 @@ fn apply_text_doc( } if redactions.is_empty() { - return doc.clone(); + return Ok(doc.clone()); } - let handler = doc.handler().redact(&redactions).expect("text redaction failed"); + let handler = doc.handler().redact(&redactions)?; let mut result = Document::new(handler); result.source.set_parent_id(Some(doc.source.as_uuid())); - result + Ok(result) } // --------------------------------------------------------------------------- @@ -234,42 +203,32 @@ fn apply_image_doc( doc: &Document, entities: &[Entity], redaction_map: &HashMap, - blur_sigma: f32, - block_color: [u8; 4], ) -> Result, Error> { - let mut blur_regions: Vec = Vec::new(); - let mut block_regions: Vec = Vec::new(); + let mut redactions: Vec = Vec::new(); for entity in entities { if let Some(ref img_loc) = entity.image_location { - let bbox = &img_loc.bounding_box; if let Some(redaction) = redaction_map.get(&entity.source.as_uuid()) { - match &redaction.output { - RedactionOutput::Image(ImageRedactionOutput::Blur { .. }) => { - blur_regions.push(bbox.clone()) - } - RedactionOutput::Image(ImageRedactionOutput::Block { .. 
}) => { - block_regions.push(bbox.clone()) - } - _ => block_regions.push(bbox.clone()), - } + let output = match &redaction.output { + RedactionOutput::Image(img) => img.clone(), + _ => continue, + }; + redactions.push(ImageRedaction { + bounding_box: img_loc.bounding_box.clone(), + output, + }); } } } - if blur_regions.is_empty() && block_regions.is_empty() { + if redactions.is_empty() { return Ok(doc.clone()); } - let mut handler = doc.handler().clone(); - if !blur_regions.is_empty() { - handler = handler.blur(&blur_regions, blur_sigma)?; - } - if !block_regions.is_empty() { - handler = handler.block(&block_regions, block_color)?; - } - - Ok(Document::new(handler)) + let handler = doc.handler().redact(&redactions)?; + let mut result = Document::new(handler); + result.source.set_parent_id(Some(doc.source.as_uuid())); + Ok(result) } // --------------------------------------------------------------------------- diff --git a/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs b/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs index 5af6b59..b7590ec 100644 --- a/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs +++ b/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs @@ -53,7 +53,8 @@ impl Action for EvaluatePolicyAction { "evaluate-policy" } - async fn connect(params: Self::Params) -> Result { + async fn connect(mut params: Self::Params) -> Result { + params.rules.sort_by_key(|r| r.priority); Ok(Self { params }) } @@ -64,13 +65,10 @@ impl Action for EvaluatePolicyAction { let default_spec = &self.params.default_spec; let default_threshold = self.params.default_confidence_threshold; - let mut sorted_rules = self.params.rules.clone(); - sorted_rules.sort_by_key(|r| r.priority); - let mut redactions = Vec::new(); for entity in &entities { - let rule = find_matching_rule(entity, &sorted_rules); + let rule = find_matching_rule(entity, &self.params.rules); let spec = rule.map(|r| &r.spec).unwrap_or(default_spec); if rule.is_none() && entity.confidence < 
default_threshold { @@ -115,10 +113,7 @@ fn find_matching_rule<'a>(entity: &Entity, rules: &'a [PolicyRule]) -> Option<&' fn apply_template(template: &str, entity: &Entity) -> String { template .replace("{entityType}", &entity.entity_type) - .replace( - "{category}", - &format!("{:?}", entity.category).to_lowercase(), - ) + .replace("{category}", &entity.category.to_string()) .replace("{value}", &entity.value) } @@ -157,71 +152,75 @@ fn build_default_output(entity: &Entity, spec: &RedactionSpec) -> RedactionOutpu TextRedactionSpec::Generalize { .. } => format!("[GEN:{}]", entity.entity_type), TextRedactionSpec::DateShift { .. } => format!("[SHIFTED:{}]", entity.entity_type), }; - build_output_with_replacement(spec, replacement) + RedactionOutput::Text(build_text_output(text, replacement)) } - RedactionSpec::Image(img) => RedactionOutput::Image(match img { - ImageRedactionSpec::Blur { sigma } => ImageRedactionOutput::Blur { sigma: *sigma }, - ImageRedactionSpec::Block { color } => ImageRedactionOutput::Block { color: *color }, - ImageRedactionSpec::Pixelate { block_size } => { - ImageRedactionOutput::Pixelate { block_size: *block_size } - } - ImageRedactionSpec::Synthesize => ImageRedactionOutput::Synthesize, - }), - RedactionSpec::Audio(audio) => RedactionOutput::Audio(match audio { - AudioRedactionSpec::Silence => AudioRedactionOutput::Silence, - AudioRedactionSpec::Remove => AudioRedactionOutput::Remove, - AudioRedactionSpec::Synthesize => AudioRedactionOutput::Synthesize, - }), + RedactionSpec::Image(img) => RedactionOutput::Image(build_image_output(img)), + RedactionSpec::Audio(audio) => RedactionOutput::Audio(build_audio_output(audio)), } } /// Builds a [`RedactionOutput`] from a spec and a replacement string. 
fn build_output_with_replacement(spec: &RedactionSpec, replacement: String) -> RedactionOutput { match spec { - RedactionSpec::Text(text) => RedactionOutput::Text(match text { - TextRedactionSpec::Mask { mask_char } => TextRedactionOutput::Mask { - replacement, - mask_char: *mask_char, - }, - TextRedactionSpec::Replace { .. } => TextRedactionOutput::Replace { replacement }, - TextRedactionSpec::Hash => TextRedactionOutput::Hash { - hash_value: replacement, - }, - TextRedactionSpec::Encrypt { key_id } => TextRedactionOutput::Encrypt { - ciphertext: replacement, - key_id: key_id.clone(), - }, - TextRedactionSpec::Remove => TextRedactionOutput::Remove, - TextRedactionSpec::Synthesize => TextRedactionOutput::Synthesize { replacement }, - TextRedactionSpec::Pseudonymize => TextRedactionOutput::Pseudonymize { - pseudonym: replacement, - }, - TextRedactionSpec::Tokenize { vault_id } => TextRedactionOutput::Tokenize { - token: replacement, - vault_id: vault_id.clone(), - }, - TextRedactionSpec::Aggregate => TextRedactionOutput::Aggregate { replacement }, - TextRedactionSpec::Generalize { level } => TextRedactionOutput::Generalize { - replacement, - level: *level, - }, - TextRedactionSpec::DateShift { offset_days } => TextRedactionOutput::DateShift { - replacement, - offset_days: *offset_days, - }, - }), - RedactionSpec::Image(img) => RedactionOutput::Image(match img { - ImageRedactionSpec::Blur { sigma } => ImageRedactionOutput::Blur { sigma: *sigma }, - ImageRedactionSpec::Block { color } => ImageRedactionOutput::Block { color: *color }, - ImageRedactionSpec::Pixelate { block_size } => { - ImageRedactionOutput::Pixelate { block_size: *block_size } - } - ImageRedactionSpec::Synthesize => ImageRedactionOutput::Synthesize, - }), - RedactionSpec::Audio(audio) => RedactionOutput::Audio(match audio { - AudioRedactionSpec::Silence => AudioRedactionOutput::Silence, - AudioRedactionSpec::Remove => AudioRedactionOutput::Remove, - AudioRedactionSpec::Synthesize => 
AudioRedactionOutput::Synthesize, - }), + RedactionSpec::Text(text) => RedactionOutput::Text(build_text_output(text, replacement)), + RedactionSpec::Image(img) => RedactionOutput::Image(build_image_output(img)), + RedactionSpec::Audio(audio) => RedactionOutput::Audio(build_audio_output(audio)), + } +} + +/// Maps a [`TextRedactionSpec`] and replacement string to a [`TextRedactionOutput`]. +fn build_text_output(spec: &TextRedactionSpec, replacement: String) -> TextRedactionOutput { + match spec { + TextRedactionSpec::Mask { mask_char } => TextRedactionOutput::Mask { + replacement, + mask_char: *mask_char, + }, + TextRedactionSpec::Replace { .. } => TextRedactionOutput::Replace { replacement }, + TextRedactionSpec::Hash => TextRedactionOutput::Hash { + hash_value: replacement, + }, + TextRedactionSpec::Encrypt { key_id } => TextRedactionOutput::Encrypt { + ciphertext: replacement, + key_id: key_id.clone(), + }, + TextRedactionSpec::Remove => TextRedactionOutput::Remove, + TextRedactionSpec::Synthesize => TextRedactionOutput::Synthesize { replacement }, + TextRedactionSpec::Pseudonymize => TextRedactionOutput::Pseudonymize { + pseudonym: replacement, + }, + TextRedactionSpec::Tokenize { vault_id } => TextRedactionOutput::Tokenize { + token: replacement, + vault_id: vault_id.clone(), + }, + TextRedactionSpec::Aggregate => TextRedactionOutput::Aggregate { replacement }, + TextRedactionSpec::Generalize { level } => TextRedactionOutput::Generalize { + replacement, + level: *level, + }, + TextRedactionSpec::DateShift { offset_days } => TextRedactionOutput::DateShift { + replacement, + offset_days: *offset_days, + }, + } +} + +/// Maps an [`ImageRedactionSpec`] to an [`ImageRedactionOutput`]. 
+fn build_image_output(spec: &ImageRedactionSpec) -> ImageRedactionOutput { + match spec { + ImageRedactionSpec::Blur { sigma } => ImageRedactionOutput::Blur { sigma: *sigma }, + ImageRedactionSpec::Block { color } => ImageRedactionOutput::Block { color: *color }, + ImageRedactionSpec::Pixelate { block_size } => { + ImageRedactionOutput::Pixelate { block_size: *block_size } + } + ImageRedactionSpec::Synthesize => ImageRedactionOutput::Synthesize, + } +} + +/// Maps an [`AudioRedactionSpec`] to an [`AudioRedactionOutput`]. +fn build_audio_output(spec: &AudioRedactionSpec) -> AudioRedactionOutput { + match spec { + AudioRedactionSpec::Silence => AudioRedactionOutput::Silence, + AudioRedactionSpec::Remove => AudioRedactionOutput::Remove, + AudioRedactionSpec::Synthesize => AudioRedactionOutput::Synthesize, } } From 47e6a793fb708226ed9259fa681f5b6ea7172511 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 16 Feb 2026 23:30:21 +0100 Subject: [PATCH 06/11] refactor: delete nvisy-ontology, decouple python from domain types, redistribute redaction/audit ownership Strip nvisy-python to a thin PyO3 bridge returning Vec via pythonize::depythonize. Add NerBackend/OcrBackend traits to pipeline (implemented by PythonBridge). Move ontology modules (entity, detection, policy, redaction, audit) from nvisy-ontology into nvisy-pipeline. Move RedactionOutput types to nvisy-codec. Move RedactionMethod tag enums, EntityCategory, BoundingBox, DocumentType to nvisy-core. Delete nvisy-ontology crate entirely. 
Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 44 ++-- Cargo.toml | 3 +- crates/nvisy-codec/Cargo.toml | 7 +- crates/nvisy-codec/src/document/mod.rs | 2 +- crates/nvisy-codec/src/handler/audio/mp3.rs | 2 +- crates/nvisy-codec/src/handler/audio/wav.rs | 2 +- .../nvisy-codec/src/handler/document/docx.rs | 2 +- .../nvisy-codec/src/handler/document/pdf.rs | 2 +- crates/nvisy-codec/src/handler/image/jpeg.rs | 2 +- crates/nvisy-codec/src/handler/image/png.rs | 2 +- crates/nvisy-codec/src/handler/mod.rs | 2 +- .../nvisy-codec/src/handler/tabular/xlsx.rs | 2 +- .../src/handler/text/csv_handler.rs | 2 +- .../src/handler/text/csv_loader.rs | 2 +- crates/nvisy-codec/src/handler/text/html.rs | 2 +- .../src/handler/text/json_handler.rs | 2 +- .../src/handler/text/json_loader.rs | 2 +- .../src/handler/text/txt_handler.rs | 2 +- .../src/handler/text/txt_loader.rs | 2 +- crates/nvisy-codec/src/render/image/block.rs | 2 +- crates/nvisy-codec/src/render/image/blur.rs | 2 +- crates/nvisy-codec/src/render/image/mod.rs | 4 +- .../nvisy-codec/src/render/image/pixelate.rs | 2 +- crates/nvisy-codec/src/render/mod.rs | 3 + .../src/render}/output.rs | 15 +- crates/nvisy-codec/src/render/text/mask.rs | 2 +- crates/nvisy-codec/src/render/text/mod.rs | 2 +- crates/nvisy-core/Cargo.toml | 3 + crates/nvisy-core/src/entity.rs | 29 +++ .../src/fs/document_type.rs} | 2 +- crates/nvisy-core/src/fs/mod.rs | 2 + crates/nvisy-core/src/lib.rs | 3 + crates/nvisy-core/src/math.rs | 68 ++++++ .../src/redaction/method.rs | 8 +- crates/nvisy-core/src/redaction/mod.rs | 15 ++ crates/nvisy-engine/Cargo.toml | 5 +- crates/nvisy-engine/src/engine.rs | 8 +- crates/nvisy-engine/src/lib.rs | 1 + .../src/ontology/audit.rs} | 8 +- crates/nvisy-engine/src/ontology/mod.rs | 3 + crates/nvisy-ontology/Cargo.toml | 46 ---- crates/nvisy-ontology/README.md | 3 - crates/nvisy-ontology/src/lib.rs | 12 -- crates/nvisy-ontology/src/prelude.rs | 24 --- crates/nvisy-pattern/Cargo.toml | 2 +- 
crates/nvisy-pattern/src/patterns/mod.rs | 2 +- crates/nvisy-pipeline/Cargo.toml | 13 +- .../nvisy-pipeline/src/detection/checksum.rs | 2 +- .../nvisy-pipeline/src/detection/classify.rs | 8 +- .../src/detection/dictionary.rs | 5 +- crates/nvisy-pipeline/src/detection/manual.rs | 4 +- crates/nvisy-pipeline/src/detection/mod.rs | 2 +- crates/nvisy-pipeline/src/detection/ner.rs | 169 +++++++++++++-- crates/nvisy-pipeline/src/detection/regex.rs | 2 +- .../nvisy-pipeline/src/detection/tabular.rs | 5 +- crates/nvisy-pipeline/src/generation/ocr.rs | 140 ++++++++++-- .../src/generation/synthetic.rs | 4 +- .../src/generation/transcribe.rs | 4 +- crates/nvisy-pipeline/src/lib.rs | 6 +- .../src/ontology}/audit/mod.rs | 13 +- .../src/ontology}/audit/retention.rs | 4 +- .../src/ontology}/detection/annotation.rs | 19 +- .../src/ontology}/detection/classification.rs | 1 - .../src/ontology}/detection/mod.rs | 7 +- .../src/ontology}/detection/sensitivity.rs | 11 - .../src/ontology}/entity/location.rs | 74 +------ .../src/ontology}/entity/mod.rs | 38 +--- .../src/ontology}/entity/model.rs | 4 +- .../src/ontology}/entity/selector.rs | 6 +- crates/nvisy-pipeline/src/ontology/mod.rs | 9 + .../src/ontology}/policy/evaluation.rs | 8 +- .../src/ontology}/policy/mod.rs | 12 +- .../src/ontology}/policy/regulation.rs | 1 - .../src/ontology}/policy/rule.rs | 20 +- .../src/ontology}/redaction/mod.rs | 54 +---- .../src/ontology}/redaction/review.rs | 6 +- .../src/ontology}/redaction/spec.rs | 15 +- .../src/ontology}/redaction/summary.rs | 2 +- .../src/ontology/redaction/trait_.rs | 12 ++ crates/nvisy-pipeline/src/prelude.rs | 3 +- crates/nvisy-pipeline/src/redaction/apply.rs | 5 +- .../src/redaction/emit_audit.rs | 5 +- .../src/redaction/evaluate_policy.rs | 13 +- crates/nvisy-python/Cargo.toml | 14 +- crates/nvisy-python/src/actions/mod.rs | 199 ------------------ crates/nvisy-python/src/actions/ocr.rs | 121 ----------- crates/nvisy-python/src/lib.rs | 11 +- crates/nvisy-python/src/ner/mod.rs 
| 150 ++++--------- crates/nvisy-python/src/ocr/mod.rs | 125 +++-------- crates/nvisy-python/src/prelude.rs | 5 - 90 files changed, 665 insertions(+), 1028 deletions(-) rename crates/{nvisy-ontology/src/redaction => nvisy-codec/src/render}/output.rs (93%) create mode 100644 crates/nvisy-core/src/entity.rs rename crates/{nvisy-ontology/src/entity/document.rs => nvisy-core/src/fs/document_type.rs} (90%) create mode 100644 crates/nvisy-core/src/math.rs rename crates/{nvisy-ontology => nvisy-core}/src/redaction/method.rs (93%) create mode 100644 crates/nvisy-core/src/redaction/mod.rs rename crates/{nvisy-ontology/src/audit/explanation.rs => nvisy-engine/src/ontology/audit.rs} (85%) create mode 100644 crates/nvisy-engine/src/ontology/mod.rs delete mode 100644 crates/nvisy-ontology/Cargo.toml delete mode 100644 crates/nvisy-ontology/README.md delete mode 100644 crates/nvisy-ontology/src/lib.rs delete mode 100644 crates/nvisy-ontology/src/prelude.rs rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/audit/mod.rs (82%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/audit/retention.rs (91%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/detection/annotation.rs (78%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/detection/classification.rs (86%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/detection/mod.rs (80%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/detection/sensitivity.rs (66%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/entity/location.rs (55%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/entity/mod.rs (79%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/entity/model.rs (87%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/entity/selector.rs (89%) create mode 100644 crates/nvisy-pipeline/src/ontology/mod.rs rename crates/{nvisy-ontology/src => 
nvisy-pipeline/src/ontology}/policy/evaluation.rs (66%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/policy/mod.rs (70%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/policy/regulation.rs (92%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/policy/rule.rs (75%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/redaction/mod.rs (53%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/redaction/review.rs (83%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/redaction/spec.rs (92%) rename crates/{nvisy-ontology/src => nvisy-pipeline/src/ontology}/redaction/summary.rs (89%) create mode 100644 crates/nvisy-pipeline/src/ontology/redaction/trait_.rs delete mode 100644 crates/nvisy-python/src/actions/mod.rs delete mode 100644 crates/nvisy-python/src/actions/ocr.rs diff --git a/Cargo.lock b/Cargo.lock index df8d1e9..ce47d04 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2202,15 +2202,16 @@ dependencies = [ "bytes", "calamine", "csv", + "derive_more 1.0.0", "futures", "image", "imageproc", "infer", "lopdf", "nvisy-core", - "nvisy-ontology", "pdf-extract", "quick-xml 0.37.5", + "schemars", "scraper", "serde", "serde_json", @@ -2232,6 +2233,7 @@ dependencies = [ "infer", "jiff", "schemars", + "semver", "serde", "serde_json", "sha2", @@ -2250,10 +2252,11 @@ dependencies = [ "anyhow", "jiff", "nvisy-core", - "nvisy-ontology", + "nvisy-pipeline", "petgraph", "rand 0.10.0", "schemars", + "semver", "serde", "serde_json", "thiserror", @@ -2281,26 +2284,11 @@ dependencies = [ "uuid", ] -[[package]] -name = "nvisy-ontology" -version = "0.1.0" -dependencies = [ - "derive_more 1.0.0", - "jiff", - "nvisy-core", - "schemars", - "semver", - "serde", - "serde_json", - "strum", - "uuid", -] - [[package]] name = "nvisy-pattern" version = "0.1.0" dependencies = [ - "nvisy-ontology", + "nvisy-core", "serde", "serde_json", ] @@ -2311,14 +2299,17 @@ version = "0.1.0" dependencies = [ 
"aho-corasick", "async-trait", + "derive_more 1.0.0", "jiff", "nvisy-codec", "nvisy-core", - "nvisy-ontology", "nvisy-pattern", "regex", + "schemars", + "semver", "serde", "serde_json", + "strum", "tokio", "tracing", "uuid", @@ -2329,17 +2320,15 @@ name = "nvisy-python" version = "0.1.0" dependencies = [ "async-trait", - "nvisy-codec", "nvisy-core", - "nvisy-ontology", "nvisy-pipeline", "pyo3", + "pythonize", "serde", "serde_json", "thiserror", "tokio", "tracing", - "uuid", ] [[package]] @@ -2702,7 +2691,6 @@ dependencies = [ "pyo3-build-config", "pyo3-ffi", "pyo3-macros", - "serde", "unindent", ] @@ -2751,6 +2739,16 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "pythonize" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91a6ee7a084f913f98d70cdc3ebec07e852b735ae3059a1500db2661265da9ff" +dependencies = [ + "pyo3", + "serde", +] + [[package]] name = "qoi" version = "0.4.1" diff --git a/Cargo.toml b/Cargo.toml index 0e2cf50..706a90d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,6 @@ members = [ "./crates/nvisy-object", "./crates/nvisy-pattern", "./crates/nvisy-pipeline", - "./crates/nvisy-ontology", "./crates/nvisy-python", ] @@ -38,7 +37,6 @@ nvisy-codec = { path = "./crates/nvisy-codec", version = "0.1.0" } nvisy-object = { path = "./crates/nvisy-object", version = "0.1.0" } nvisy-pattern = { path = "./crates/nvisy-pattern", version = "0.1.0" } nvisy-pipeline = { path = "./crates/nvisy-pipeline", version = "0.1.0" } -nvisy-ontology = { path = "./crates/nvisy-ontology", version = "0.1.0" } nvisy-python = { path = "./crates/nvisy-python", version = "0.1.0" } # Async runtime @@ -82,6 +80,7 @@ infer = { version = "0.19", features = [] } # Python interop pyo3 = { version = "0.23", features = [] } +pythonize = { version = "0.23", features = [] } # S3-compatible object storage minio = { version = "0.3", features = [] } diff --git a/crates/nvisy-codec/Cargo.toml b/crates/nvisy-codec/Cargo.toml index 
64e3217..aac99f7 100644 --- a/crates/nvisy-codec/Cargo.toml +++ b/crates/nvisy-codec/Cargo.toml @@ -46,7 +46,12 @@ mp3 = [] [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } -nvisy-ontology = { workspace = true, features = [] } + +# Derive macros +derive_more = { workspace = true, features = ["from"] } + +# JSON Schema generation +schemars = { workspace = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } diff --git a/crates/nvisy-codec/src/document/mod.rs b/crates/nvisy-codec/src/document/mod.rs index c74fb3e..b4d97cd 100644 --- a/crates/nvisy-codec/src/document/mod.rs +++ b/crates/nvisy-codec/src/document/mod.rs @@ -5,7 +5,7 @@ pub mod edit_stream; use nvisy_core::io::ContentData; use nvisy_core::path::ContentSource; -use nvisy_ontology::entity::DocumentType; +use nvisy_core::fs::DocumentType; use crate::handler::Handler; diff --git a/crates/nvisy-codec/src/handler/audio/mp3.rs b/crates/nvisy-codec/src/handler/audio/mp3.rs index 5e0a962..0cbad0e 100644 --- a/crates/nvisy-codec/src/handler/audio/mp3.rs +++ b/crates/nvisy-codec/src/handler/audio/mp3.rs @@ -3,7 +3,7 @@ use bytes::Bytes; use nvisy_core::error::Error; -use nvisy_ontology::entity::DocumentType; +use nvisy_core::fs::DocumentType; use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; diff --git a/crates/nvisy-codec/src/handler/audio/wav.rs b/crates/nvisy-codec/src/handler/audio/wav.rs index c8cb4e4..4eaef70 100644 --- a/crates/nvisy-codec/src/handler/audio/wav.rs +++ b/crates/nvisy-codec/src/handler/audio/wav.rs @@ -3,7 +3,7 @@ use bytes::Bytes; use nvisy_core::error::Error; -use nvisy_ontology::entity::DocumentType; +use nvisy_core::fs::DocumentType; use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; diff --git a/crates/nvisy-codec/src/handler/document/docx.rs b/crates/nvisy-codec/src/handler/document/docx.rs index 90b7d9e..805e589 100644 --- 
a/crates/nvisy-codec/src/handler/document/docx.rs +++ b/crates/nvisy-codec/src/handler/document/docx.rs @@ -1,7 +1,7 @@ //! DOCX handler (stub — awaiting migration to Loader/Handler pattern). use nvisy_core::error::Error; -use nvisy_ontology::entity::DocumentType; +use nvisy_core::fs::DocumentType; use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; diff --git a/crates/nvisy-codec/src/handler/document/pdf.rs b/crates/nvisy-codec/src/handler/document/pdf.rs index 4c8ac68..aff3137 100644 --- a/crates/nvisy-codec/src/handler/document/pdf.rs +++ b/crates/nvisy-codec/src/handler/document/pdf.rs @@ -1,7 +1,7 @@ //! PDF handler (stub — awaiting migration to Loader/Handler pattern). use nvisy_core::error::Error; -use nvisy_ontology::entity::DocumentType; +use nvisy_core::fs::DocumentType; use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; diff --git a/crates/nvisy-codec/src/handler/image/jpeg.rs b/crates/nvisy-codec/src/handler/image/jpeg.rs index 9b56f21..6e9e291 100644 --- a/crates/nvisy-codec/src/handler/image/jpeg.rs +++ b/crates/nvisy-codec/src/handler/image/jpeg.rs @@ -1,7 +1,7 @@ //! JPEG handler (stub — awaiting migration to Loader/Handler pattern). 
use nvisy_core::error::Error; -use nvisy_ontology::entity::DocumentType; +use nvisy_core::fs::DocumentType; use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; diff --git a/crates/nvisy-codec/src/handler/image/png.rs b/crates/nvisy-codec/src/handler/image/png.rs index fe0c02f..bfa9320 100644 --- a/crates/nvisy-codec/src/handler/image/png.rs +++ b/crates/nvisy-codec/src/handler/image/png.rs @@ -4,7 +4,7 @@ use bytes::Bytes; use image::DynamicImage; use nvisy_core::error::{Error, ErrorKind}; -use nvisy_ontology::entity::DocumentType; +use nvisy_core::fs::DocumentType; use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; diff --git a/crates/nvisy-codec/src/handler/mod.rs b/crates/nvisy-codec/src/handler/mod.rs index 13fea3b..f2922d9 100644 --- a/crates/nvisy-codec/src/handler/mod.rs +++ b/crates/nvisy-codec/src/handler/mod.rs @@ -9,7 +9,7 @@ use nvisy_core::error::Error; use nvisy_core::io::ContentData; -use nvisy_ontology::entity::DocumentType; +use nvisy_core::fs::DocumentType; use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; diff --git a/crates/nvisy-codec/src/handler/tabular/xlsx.rs b/crates/nvisy-codec/src/handler/tabular/xlsx.rs index c4c3bad..acf3abd 100644 --- a/crates/nvisy-codec/src/handler/tabular/xlsx.rs +++ b/crates/nvisy-codec/src/handler/tabular/xlsx.rs @@ -1,7 +1,7 @@ //! XLSX handler (stub — awaiting migration to Loader/Handler pattern). 
use nvisy_core::error::Error; -use nvisy_ontology::entity::DocumentType; +use nvisy_core::fs::DocumentType; use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; diff --git a/crates/nvisy-codec/src/handler/text/csv_handler.rs b/crates/nvisy-codec/src/handler/text/csv_handler.rs index f05416f..60c21fb 100644 --- a/crates/nvisy-codec/src/handler/text/csv_handler.rs +++ b/crates/nvisy-codec/src/handler/text/csv_handler.rs @@ -18,7 +18,7 @@ use futures::StreamExt; use nvisy_core::error::Error; -use nvisy_ontology::entity::DocumentType; +use nvisy_core::fs::DocumentType; use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; diff --git a/crates/nvisy-codec/src/handler/text/csv_loader.rs b/crates/nvisy-codec/src/handler/text/csv_loader.rs index 40c7d45..71c05ea 100644 --- a/crates/nvisy-codec/src/handler/text/csv_loader.rs +++ b/crates/nvisy-codec/src/handler/text/csv_loader.rs @@ -116,7 +116,7 @@ mod tests { use bytes::Bytes; use futures::StreamExt; use nvisy_core::path::ContentSource; - use nvisy_ontology::entity::DocumentType; + use nvisy_core::fs::DocumentType; fn content_from_str(s: &str) -> ContentData { ContentData::new(ContentSource::new(), Bytes::from(s.to_owned())) diff --git a/crates/nvisy-codec/src/handler/text/html.rs b/crates/nvisy-codec/src/handler/text/html.rs index 318bd46..a594ee9 100644 --- a/crates/nvisy-codec/src/handler/text/html.rs +++ b/crates/nvisy-codec/src/handler/text/html.rs @@ -1,7 +1,7 @@ //! HTML handler (stub — awaiting migration to Loader/Handler pattern). 
use nvisy_core::error::Error; -use nvisy_ontology::entity::DocumentType; +use nvisy_core::fs::DocumentType; use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; diff --git a/crates/nvisy-codec/src/handler/text/json_handler.rs b/crates/nvisy-codec/src/handler/text/json_handler.rs index 74598cb..43eaccd 100644 --- a/crates/nvisy-codec/src/handler/text/json_handler.rs +++ b/crates/nvisy-codec/src/handler/text/json_handler.rs @@ -25,7 +25,7 @@ use futures::StreamExt; use serde::{Deserialize, Serialize}; use nvisy_core::error::Error; -use nvisy_ontology::entity::DocumentType; +use nvisy_core::fs::DocumentType; use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; diff --git a/crates/nvisy-codec/src/handler/text/json_loader.rs b/crates/nvisy-codec/src/handler/text/json_loader.rs index 5baf657..fadc966 100644 --- a/crates/nvisy-codec/src/handler/text/json_loader.rs +++ b/crates/nvisy-codec/src/handler/text/json_loader.rs @@ -95,7 +95,7 @@ mod tests { use super::*; use bytes::Bytes; use nvisy_core::path::ContentSource; - use nvisy_ontology::entity::DocumentType; + use nvisy_core::fs::DocumentType; use serde_json::json; fn content_from_str(s: &str) -> ContentData { diff --git a/crates/nvisy-codec/src/handler/text/txt_handler.rs b/crates/nvisy-codec/src/handler/text/txt_handler.rs index 3f7a212..8213945 100644 --- a/crates/nvisy-codec/src/handler/text/txt_handler.rs +++ b/crates/nvisy-codec/src/handler/text/txt_handler.rs @@ -17,7 +17,7 @@ use futures::StreamExt; use nvisy_core::error::Error; -use nvisy_ontology::entity::DocumentType; +use nvisy_core::fs::DocumentType; use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; diff --git a/crates/nvisy-codec/src/handler/text/txt_loader.rs b/crates/nvisy-codec/src/handler/text/txt_loader.rs index 5347415..26d67bd 100644 --- a/crates/nvisy-codec/src/handler/text/txt_loader.rs +++ 
b/crates/nvisy-codec/src/handler/text/txt_loader.rs @@ -57,7 +57,7 @@ mod tests { use bytes::Bytes; use futures::StreamExt; use nvisy_core::path::ContentSource; - use nvisy_ontology::entity::DocumentType; + use nvisy_core::fs::DocumentType; fn content_from_str(s: &str) -> ContentData { ContentData::new(ContentSource::new(), Bytes::from(s.to_owned())) diff --git a/crates/nvisy-codec/src/render/image/block.rs b/crates/nvisy-codec/src/render/image/block.rs index 78a17a4..b456f7f 100644 --- a/crates/nvisy-codec/src/render/image/block.rs +++ b/crates/nvisy-codec/src/render/image/block.rs @@ -5,7 +5,7 @@ //! using alpha-over blending. Regions are clamped to image bounds. use ::image::{DynamicImage, Rgba, RgbaImage}; -use nvisy_ontology::entity::BoundingBoxU32; +use nvisy_core::math::BoundingBoxU32; /// Apply a solid color block overlay to the specified regions of an image. /// diff --git a/crates/nvisy-codec/src/render/image/blur.rs b/crates/nvisy-codec/src/render/image/blur.rs index d05f806..e548440 100644 --- a/crates/nvisy-codec/src/render/image/blur.rs +++ b/crates/nvisy-codec/src/render/image/blur.rs @@ -10,7 +10,7 @@ use ::image::DynamicImage; use imageproc::filter::gaussian_blur_f32; -use nvisy_ontology::entity::BoundingBoxU32; +use nvisy_core::math::BoundingBoxU32; /// Apply gaussian blur to the specified regions of an image. 
/// diff --git a/crates/nvisy-codec/src/render/image/mod.rs b/crates/nvisy-codec/src/render/image/mod.rs index cf02254..ceb8f0c 100644 --- a/crates/nvisy-codec/src/render/image/mod.rs +++ b/crates/nvisy-codec/src/render/image/mod.rs @@ -25,8 +25,8 @@ use pixelate::apply_pixelate; use ::image::DynamicImage; use nvisy_core::error::Error; -use nvisy_ontology::entity::{BoundingBox, BoundingBoxU32}; -use nvisy_ontology::redaction::ImageRedactionOutput; +use nvisy_core::math::{BoundingBox, BoundingBoxU32}; +use crate::render::output::ImageRedactionOutput; /// A located image redaction: pairs a bounding box with an /// [`ImageRedactionOutput`] that carries the method-specific parameters. diff --git a/crates/nvisy-codec/src/render/image/pixelate.rs b/crates/nvisy-codec/src/render/image/pixelate.rs index a0bba2e..db76248 100644 --- a/crates/nvisy-codec/src/render/image/pixelate.rs +++ b/crates/nvisy-codec/src/render/image/pixelate.rs @@ -11,7 +11,7 @@ use ::image::imageops::FilterType; use ::image::DynamicImage; -use nvisy_ontology::entity::BoundingBoxU32; +use nvisy_core::math::BoundingBoxU32; /// Apply pixelation (mosaic effect) to the specified regions of an image. /// diff --git a/crates/nvisy-codec/src/render/mod.rs b/crates/nvisy-codec/src/render/mod.rs index 4492209..b6c9b1b 100644 --- a/crates/nvisy-codec/src/render/mod.rs +++ b/crates/nvisy-codec/src/render/mod.rs @@ -1,5 +1,8 @@ //! Rendering primitives for redaction overlays. +/// Redaction output types recording what was done. +pub mod output; + /// Image rendering: blur and block overlay for bounding-box regions. 
#[cfg(any(feature = "png", feature = "jpeg"))] pub mod image; diff --git a/crates/nvisy-ontology/src/redaction/output.rs b/crates/nvisy-codec/src/render/output.rs similarity index 93% rename from crates/nvisy-ontology/src/redaction/output.rs rename to crates/nvisy-codec/src/render/output.rs index ee96557..3498a38 100644 --- a/crates/nvisy-ontology/src/redaction/output.rs +++ b/crates/nvisy-codec/src/render/output.rs @@ -1,19 +1,18 @@ //! Data-carrying redaction output enums recording what was done. //! //! A [`RedactionOutput`] records the method that was applied and its result -//! data (replacement string, ciphertext, blur sigma, etc.). Stored on -//! [`Redaction`](super::Redaction). +//! data (replacement string, ciphertext, blur sigma, etc.). use derive_more::From; use serde::{Deserialize, Serialize}; -use super::method::{ +use nvisy_core::redaction::{ AudioRedactionMethod, ImageRedactionMethod, RedactionMethod, TextRedactionMethod, }; /// Text redaction output — records the method used and its replacement data. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(tag = "method", rename_all = "snake_case")] pub enum TextRedactionOutput { /// Characters replaced with a mask character. @@ -54,7 +53,7 @@ pub enum TextRedactionOutput { /// Image redaction output — records the method used and its parameters. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(tag = "method", rename_all = "snake_case")] pub enum ImageRedactionOutput { /// Gaussian blur applied to the region. @@ -69,7 +68,7 @@ pub enum ImageRedactionOutput { /// Audio redaction output — records the method used. 
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(tag = "method", rename_all = "snake_case")] pub enum AudioRedactionOutput { /// Segment replaced with silence. @@ -145,9 +144,9 @@ impl AudioRedactionOutput { /// Unified redaction output that wraps modality-specific output variants. /// /// Carries method-specific result data (replacement strings, ciphertext, -/// blur sigma, etc.). Stored on [`Redaction`](super::Redaction). +/// blur sigma, etc.). #[derive(Debug, Clone, PartialEq, From, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] pub enum RedactionOutput { /// Text/tabular redaction output. diff --git a/crates/nvisy-codec/src/render/text/mask.rs b/crates/nvisy-codec/src/render/text/mask.rs index e891b36..1682878 100644 --- a/crates/nvisy-codec/src/render/text/mask.rs +++ b/crates/nvisy-codec/src/render/text/mask.rs @@ -3,7 +3,7 @@ //! These functions are used by tabular redaction actions to transform //! individual cell values according to a [`TextRedactionOutput`] variant. -use nvisy_ontology::redaction::TextRedactionOutput; +use crate::render::output::TextRedactionOutput; /// Redact a single cell value according to `output`. /// diff --git a/crates/nvisy-codec/src/render/text/mod.rs b/crates/nvisy-codec/src/render/text/mod.rs index 46bb7a5..17ded0a 100644 --- a/crates/nvisy-codec/src/render/text/mod.rs +++ b/crates/nvisy-codec/src/render/text/mod.rs @@ -24,7 +24,7 @@ pub use mask::mask_cell; use replace::{apply_replacements, PendingReplacement}; use nvisy_core::error::Error; -use nvisy_ontology::redaction::TextRedactionOutput; +use crate::render::output::TextRedactionOutput; /// A located text redaction: pairs a byte range with a /// [`TextRedactionOutput`] that carries the already-resolved replacement. 
diff --git a/crates/nvisy-core/Cargo.toml b/crates/nvisy-core/Cargo.toml index 8172dd7..9f49039 100644 --- a/crates/nvisy-core/Cargo.toml +++ b/crates/nvisy-core/Cargo.toml @@ -50,6 +50,9 @@ hipstr = { workspace = true, features = [] } sha2 = { workspace = true, features = [] } hex = { workspace = true, features = [] } +# Semantic versioning +semver = { workspace = true, features = [] } + # Enum derives strum = { workspace = true, features = [] } diff --git a/crates/nvisy-core/src/entity.rs b/crates/nvisy-core/src/entity.rs new file mode 100644 index 0000000..21fabd9 --- /dev/null +++ b/crates/nvisy-core/src/entity.rs @@ -0,0 +1,29 @@ +//! Shared entity category tag. +//! +//! [`EntityCategory`] classifies detected sensitive data into broad +//! categories used by both detection and pattern matching crates. + +use serde::{Deserialize, Serialize}; + +/// Category of sensitive data an entity belongs to. +#[derive(Debug, Clone, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] +#[derive(schemars::JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum EntityCategory { + /// Personally Identifiable Information (names, SSNs, addresses, etc.). + Pii, + /// Protected Health Information (HIPAA-regulated data). + Phi, + /// Financial data (credit card numbers, bank accounts, etc.). + Financial, + /// Secrets and credentials (API keys, passwords, tokens). + Credentials, + /// Legal documents and privileged communications. + Legal, + /// Biometric data (fingerprints, iris scans, voiceprints). + Biometric, + /// User-defined or plugin-specific category. 
+ #[strum(to_string = "{0}")] + Custom(String), +} diff --git a/crates/nvisy-ontology/src/entity/document.rs b/crates/nvisy-core/src/fs/document_type.rs similarity index 90% rename from crates/nvisy-ontology/src/entity/document.rs rename to crates/nvisy-core/src/fs/document_type.rs index 01d6d68..ba0cec7 100644 --- a/crates/nvisy-ontology/src/entity/document.rs +++ b/crates/nvisy-core/src/fs/document_type.rs @@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize}; /// Document format that content can be classified as. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] pub enum DocumentType { /// Plain text (`.txt`, `.log`, etc.). diff --git a/crates/nvisy-core/src/fs/mod.rs b/crates/nvisy-core/src/fs/mod.rs index 920670e..e032b14 100644 --- a/crates/nvisy-core/src/fs/mod.rs +++ b/crates/nvisy-core/src/fs/mod.rs @@ -33,6 +33,7 @@ mod content_handler; mod content_kind; mod content_metadata; mod content_registry; +mod document_type; // Re-export main types pub use content_file::ContentFile; @@ -40,3 +41,4 @@ pub use content_handler::ContentHandler; pub use content_kind::ContentKind; pub use content_metadata::ContentMetadata; pub use content_registry::ContentRegistry; +pub use document_type::DocumentType; diff --git a/crates/nvisy-core/src/lib.rs b/crates/nvisy-core/src/lib.rs index ad85b54..9c381be 100644 --- a/crates/nvisy-core/src/lib.rs +++ b/crates/nvisy-core/src/lib.rs @@ -2,10 +2,13 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] +pub mod entity; pub mod error; pub mod fs; pub mod io; +pub mod math; pub mod path; +pub mod redaction; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-core/src/math.rs b/crates/nvisy-core/src/math.rs new file mode 100644 index 0000000..374c028 --- /dev/null +++ b/crates/nvisy-core/src/math.rs @@ -0,0 +1,68 @@ +//! 
Spatial and temporal primitive types. +//! +//! Bounding boxes and time spans used across entity locations, +//! rendering, and redaction operations. + +use serde::{Deserialize, Serialize}; + +/// A time interval within an audio or video stream. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(schemars::JsonSchema)] +pub struct TimeSpan { + /// Start time in seconds from the beginning of the stream. + pub start_secs: f64, + /// End time in seconds from the beginning of the stream. + pub end_secs: f64, +} + +/// Axis-aligned bounding box for image-based entity locations. +/// +/// Coordinates are `f64` to support both pixel and normalized (0.0–1.0) +/// values from detection models. Use [`BoundingBoxU32`] (or [`Into`]) +/// when integer pixel coordinates are needed for rendering. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(schemars::JsonSchema)] +pub struct BoundingBox { + /// Horizontal offset of the top-left corner (pixels or normalized). + pub x: f64, + /// Vertical offset of the top-left corner (pixels or normalized). + pub y: f64, + /// Width of the bounding box. + pub width: f64, + /// Height of the bounding box. + pub height: f64, +} + +/// Integer pixel-coordinate bounding box for rendering operations. +/// +/// Converted from [`BoundingBox`] by rounding each field to the nearest +/// integer. Use this at the rendering boundary where pixel-exact +/// coordinates are required. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct BoundingBoxU32 { + /// Horizontal offset of the top-left corner in pixels. + pub x: u32, + /// Vertical offset of the top-left corner in pixels. + pub y: u32, + /// Width in pixels. + pub width: u32, + /// Height in pixels. 
+ pub height: u32, +} + +impl From<&BoundingBox> for BoundingBoxU32 { + fn from(bb: &BoundingBox) -> Self { + Self { + x: bb.x.round() as u32, + y: bb.y.round() as u32, + width: bb.width.round() as u32, + height: bb.height.round() as u32, + } + } +} + +impl From<BoundingBox> for BoundingBoxU32 { + fn from(bb: BoundingBox) -> Self { + Self::from(&bb) + } +} diff --git a/crates/nvisy-ontology/src/redaction/method.rs b/crates/nvisy-core/src/redaction/method.rs similarity index 93% rename from crates/nvisy-ontology/src/redaction/method.rs rename to crates/nvisy-core/src/redaction/method.rs index e74b84a..15cfa7f 100644 --- a/crates/nvisy-ontology/src/redaction/method.rs +++ b/crates/nvisy-core/src/redaction/method.rs @@ -10,7 +10,7 @@ use serde::{Deserialize, Serialize}; /// Redaction strategies for text and tabular content. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum TextRedactionMethod { @@ -40,7 +40,7 @@ pub enum TextRedactionMethod { /// Redaction strategies for image and video regions. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum ImageRedactionMethod { @@ -56,7 +56,7 @@ pub enum ImageRedactionMethod { /// Redaction strategies for audio segments. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum AudioRedactionMethod { @@ -74,7 +74,7 @@ pub enum AudioRedactionMethod { /// configuration data.
For a data-carrying request use [`RedactionSpec`](super::RedactionSpec); /// for a data-carrying result use [`RedactionOutput`](super::RedactionOutput). #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, From, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] pub enum RedactionMethod { /// Text/tabular redaction strategy. diff --git a/crates/nvisy-core/src/redaction/mod.rs b/crates/nvisy-core/src/redaction/mod.rs new file mode 100644 index 0000000..647b9d6 --- /dev/null +++ b/crates/nvisy-core/src/redaction/mod.rs @@ -0,0 +1,15 @@ +//! Redaction method tag enums. +//! +//! Lightweight identifiers that name a redaction algorithm without +//! carrying any configuration data. +//! +//! - [`TextRedactionMethod`] — text/tabular strategies (mask, replace, hash, etc.) +//! - [`ImageRedactionMethod`] — image/video strategies (blur, block, pixelate) +//! - [`AudioRedactionMethod`] — audio strategies (silence, remove) +//! 
- [`RedactionMethod`] — unified wrapper + +mod method; + +pub use method::{ + AudioRedactionMethod, ImageRedactionMethod, RedactionMethod, TextRedactionMethod, +}; diff --git a/crates/nvisy-engine/Cargo.toml b/crates/nvisy-engine/Cargo.toml index 095deba..ef870c0 100644 --- a/crates/nvisy-engine/Cargo.toml +++ b/crates/nvisy-engine/Cargo.toml @@ -20,11 +20,14 @@ documentation = { workspace = true } [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } -nvisy-ontology = { workspace = true, features = [] } +nvisy-pipeline = { workspace = true, features = [] } # JSON Schema generation schemars = { workspace = true, features = [] } +# Semantic versioning +semver = { workspace = true, features = [] } + # (De)serialization serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, features = [] } diff --git a/crates/nvisy-engine/src/engine.rs b/crates/nvisy-engine/src/engine.rs index b271095..09adca8 100644 --- a/crates/nvisy-engine/src/engine.rs +++ b/crates/nvisy-engine/src/engine.rs @@ -10,10 +10,10 @@ use uuid::Uuid; use nvisy_core::error::Error; use nvisy_core::fs::ContentHandler; -use nvisy_ontology::audit::Audit; -use nvisy_ontology::detection::{ClassificationResult, DetectionResult}; -use nvisy_ontology::policy::{Policies, PolicyEvaluation}; -use nvisy_ontology::redaction::RedactionSummary; +use nvisy_pipeline::ontology::audit::Audit; +use nvisy_pipeline::ontology::redaction::RedactionSummary; +use nvisy_pipeline::ontology::detection::{ClassificationResult, DetectionResult}; +use nvisy_pipeline::ontology::policy::{Policies, PolicyEvaluation}; use crate::compiler::graph::Graph; use crate::connections::Connections; diff --git a/crates/nvisy-engine/src/lib.rs b/crates/nvisy-engine/src/lib.rs index b3e6edc..04a86b0 100644 --- a/crates/nvisy-engine/src/lib.rs +++ b/crates/nvisy-engine/src/lib.rs @@ -12,6 +12,7 @@ pub mod compiler; pub mod connections; pub mod engine; pub mod executor; +pub mod ontology; pub mod 
policies; pub mod runs; diff --git a/crates/nvisy-ontology/src/audit/explanation.rs b/crates/nvisy-engine/src/ontology/audit.rs similarity index 85% rename from crates/nvisy-ontology/src/audit/explanation.rs rename to crates/nvisy-engine/src/ontology/audit.rs index c4b70fd..6f49ac6 100644 --- a/crates/nvisy-ontology/src/audit/explanation.rs +++ b/crates/nvisy-engine/src/ontology/audit.rs @@ -8,7 +8,7 @@ use semver::Version; use serde::{Deserialize, Serialize}; use uuid::Uuid; -use crate::entity::{DetectionMethod, ModelInfo}; +use nvisy_pipeline::ontology::entity::{DetectionMethod, ModelInfo}; /// Types that carry explainability metadata. pub trait Explainable { @@ -19,9 +19,9 @@ pub trait Explainable { /// Structured explainability metadata for a data protection decision. /// /// Records why an action was taken, which model and rule were involved, -/// and who reviewed it. Complements the freeform `details` field on [`Audit`](super::Audit). +/// and who reviewed it. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct Explanation { /// Detection model that produced the decision. #[serde(skip_serializing_if = "Option::is_none")] @@ -40,7 +40,7 @@ pub struct Explanation { pub reason: Option<String>, /// Version of the policy that was evaluated. #[serde(skip_serializing_if = "Option::is_none")] - #[cfg_attr(feature = "jsonschema", schemars(with = "Option<String>"))] + #[schemars(with = "Option<String>")] pub policy_version: Option<Version>, /// Identifier of the reviewer who approved/rejected. #[serde(skip_serializing_if = "Option::is_none")] diff --git a/crates/nvisy-engine/src/ontology/mod.rs b/crates/nvisy-engine/src/ontology/mod.rs new file mode 100644 index 0000000..7d0757a --- /dev/null +++ b/crates/nvisy-engine/src/ontology/mod.rs @@ -0,0 +1,3 @@ +//! Engine-level domain types.
+ +pub mod audit; diff --git a/crates/nvisy-ontology/Cargo.toml b/crates/nvisy-ontology/Cargo.toml deleted file mode 100644 index bed8e13..0000000 --- a/crates/nvisy-ontology/Cargo.toml +++ /dev/null @@ -1,46 +0,0 @@ -# https://doc.rust-lang.org/cargo/reference/manifest.html - -[package] -name = "nvisy-ontology" -description = "Detection ontology and redaction policy types for the Nvisy platform" -keywords = ["nvisy", "ontology", "redaction", "policy"] -categories = ["data-structures"] - -version = { workspace = true } -rust-version = { workspace = true } -edition = { workspace = true } -license = { workspace = true } -publish = { workspace = true } - -authors = { workspace = true } -repository = { workspace = true } -homepage = { workspace = true } -documentation = { workspace = true } - -[features] -default = [] -jsonschema = ["dep:schemars"] - -[dependencies] -# Internal crates -nvisy-core = { workspace = true, features = [] } - -# JSON Schema generation (optional) -schemars = { workspace = true, optional = true, features = [] } - -# (De)serialization -serde = { workspace = true, features = ["derive"] } -serde_json = { workspace = true, features = [] } - -# Primitive datatypes -uuid = { workspace = true, features = ["serde", "v4"] } -jiff = { workspace = true, features = [] } - -# Semantic versioning -semver = { workspace = true, features = [] } - -# Enum utilities -strum = { workspace = true, features = ["derive"] } - -# Error handling -derive_more = { workspace = true, features = ["display", "from"] } diff --git a/crates/nvisy-ontology/README.md b/crates/nvisy-ontology/README.md deleted file mode 100644 index 00ccb86..0000000 --- a/crates/nvisy-ontology/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# nvisy-ontology - -Detection ontology and redaction policy types for the Nvisy platform. Defines entities, redaction methods, audit records, and policy rules that all detection and redaction crates depend on. 
diff --git a/crates/nvisy-ontology/src/lib.rs b/crates/nvisy-ontology/src/lib.rs deleted file mode 100644 index 48686f2..0000000 --- a/crates/nvisy-ontology/src/lib.rs +++ /dev/null @@ -1,12 +0,0 @@ -#![forbid(unsafe_code)] -#![cfg_attr(docsrs, feature(doc_cfg))] -#![doc = include_str!("../README.md")] - -pub mod audit; -pub mod detection; -pub mod entity; -pub mod policy; -pub mod redaction; - -#[doc(hidden)] -pub mod prelude; diff --git a/crates/nvisy-ontology/src/prelude.rs b/crates/nvisy-ontology/src/prelude.rs deleted file mode 100644 index f5e0150..0000000 --- a/crates/nvisy-ontology/src/prelude.rs +++ /dev/null @@ -1,24 +0,0 @@ -//! Convenience re-exports for common nvisy-ontology types. - -pub use crate::audit::{ - Audit, AuditAction, Auditable, Explainable, Explanation, RetentionPolicy, RetentionScope, -}; -pub use crate::detection::{ - Annotation, AnnotationKind, AnnotationLabel, AnnotationScope, ClassificationResult, - DetectionResult, - Sensitivity, SensitivityLevel, -}; -pub use crate::entity::{ - AudioLocation, BoundingBox, BoundingBoxU32, DetectionMethod, DocumentType, Entity, EntityCategory, - EntitySelector, ImageLocation, ModelInfo, ModelKind, TabularLocation, - TextLocation, TimeSpan, VideoLocation, -}; -pub use crate::policy::{ - Policies, Policy, PolicyEvaluation, PolicyRule, RegulationKind, RuleCondition, RuleKind, -}; -pub use crate::redaction::{ - AudioRedactionMethod, AudioRedactionOutput, AudioRedactionSpec, ImageRedactionMethod, - ImageRedactionOutput, ImageRedactionSpec, Redactable, Redaction, RedactionMethod, - RedactionOutput, RedactionSpec, RedactionSummary, ReviewDecision, ReviewStatus, - TextRedactionMethod, TextRedactionOutput, TextRedactionSpec, -}; diff --git a/crates/nvisy-pattern/Cargo.toml b/crates/nvisy-pattern/Cargo.toml index f51b7c9..0428128 100644 --- a/crates/nvisy-pattern/Cargo.toml +++ b/crates/nvisy-pattern/Cargo.toml @@ -23,7 +23,7 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates -nvisy-ontology 
= { workspace = true, features = [] } +nvisy-core = { workspace = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } diff --git a/crates/nvisy-pattern/src/patterns/mod.rs b/crates/nvisy-pattern/src/patterns/mod.rs index f4b79f2..7285a0e 100644 --- a/crates/nvisy-pattern/src/patterns/mod.rs +++ b/crates/nvisy-pattern/src/patterns/mod.rs @@ -9,7 +9,7 @@ pub mod validators; use std::collections::HashMap; use std::sync::LazyLock; -use nvisy_ontology::entity::EntityCategory; +use nvisy_core::entity::EntityCategory; /// JSON representation of a pattern loaded from disk. #[derive(Debug, Clone, serde::Deserialize)] diff --git a/crates/nvisy-pipeline/Cargo.toml b/crates/nvisy-pipeline/Cargo.toml index 0e0fa50..5c1d1cb 100644 --- a/crates/nvisy-pipeline/Cargo.toml +++ b/crates/nvisy-pipeline/Cargo.toml @@ -31,7 +31,6 @@ audio-redaction = ["nvisy-codec/wav"] [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } -nvisy-ontology = { workspace = true, features = [] } nvisy-codec = { workspace = true, features = [] } nvisy-pattern = { workspace = true, features = [] } @@ -49,6 +48,18 @@ uuid = { workspace = true, features = ["v4"] } # Time jiff = { workspace = true, features = [] } +# Semantic versioning +semver = { workspace = true, features = [] } + +# Derive macros +derive_more = { workspace = true, features = ["from"] } + +# Enum utilities +strum = { workspace = true, features = ["derive"] } + +# JSON Schema generation +schemars = { workspace = true, features = [] } + # Text processing regex = { workspace = true, features = [] } aho-corasick = { workspace = true, features = [] } diff --git a/crates/nvisy-pipeline/src/detection/checksum.rs b/crates/nvisy-pipeline/src/detection/checksum.rs index 78c8c46..3a1b764 100644 --- a/crates/nvisy-pipeline/src/detection/checksum.rs +++ b/crates/nvisy-pipeline/src/detection/checksum.rs @@ -2,7 +2,7 @@ use serde::Deserialize; -use 
nvisy_ontology::entity::{DetectionMethod, Entity}; +use crate::ontology::entity::{DetectionMethod, Entity}; use nvisy_core::error::Error; use nvisy_pattern::patterns::validators::luhn_check; diff --git a/crates/nvisy-pipeline/src/detection/classify.rs b/crates/nvisy-pipeline/src/detection/classify.rs index 0e4b0d4..34a69a1 100644 --- a/crates/nvisy-pipeline/src/detection/classify.rs +++ b/crates/nvisy-pipeline/src/detection/classify.rs @@ -1,8 +1,8 @@ //! Sensitivity classification action. -pub use nvisy_ontology::detection::ClassificationResult; -use nvisy_ontology::detection::{Sensitivity, SensitivityLevel}; -use nvisy_ontology::entity::Entity; +pub use crate::ontology::detection::ClassificationResult; +use crate::ontology::detection::{Sensitivity, SensitivityLevel}; +use crate::ontology::entity::Entity; use nvisy_core::error::Error; use crate::action::Action; @@ -60,7 +60,7 @@ fn compute_sensitivity_level(entities: &[Entity]) -> SensitivityLevel { let has_critical_types = entities.iter().any(|e| { matches!( e.category, - nvisy_ontology::entity::EntityCategory::Credentials + nvisy_core::entity::EntityCategory::Credentials ) || e.entity_type == "ssn" || e.entity_type == "credit_card" }); diff --git a/crates/nvisy-pipeline/src/detection/dictionary.rs b/crates/nvisy-pipeline/src/detection/dictionary.rs index 51f1fed..20b6aed 100644 --- a/crates/nvisy-pipeline/src/detection/dictionary.rs +++ b/crates/nvisy-pipeline/src/detection/dictionary.rs @@ -5,8 +5,9 @@ use serde::Deserialize; use nvisy_codec::handler::{TxtHandler, CsvHandler}; use nvisy_codec::document::Document; -use nvisy_ontology::entity::{ - DetectionMethod, Entity, EntityCategory, TabularLocation, TextLocation, +use nvisy_core::entity::EntityCategory; +use crate::ontology::entity::{ + DetectionMethod, Entity, TabularLocation, TextLocation, }; use nvisy_core::error::{Error, ErrorKind}; use nvisy_pattern::dictionaries; diff --git a/crates/nvisy-pipeline/src/detection/manual.rs 
b/crates/nvisy-pipeline/src/detection/manual.rs index 80f89d2..6314d6e 100644 --- a/crates/nvisy-pipeline/src/detection/manual.rs +++ b/crates/nvisy-pipeline/src/detection/manual.rs @@ -4,8 +4,8 @@ use serde::Deserialize; -use nvisy_ontology::entity::{DetectionMethod, Entity}; -use nvisy_ontology::detection::{Annotation, AnnotationKind}; +use crate::ontology::entity::{DetectionMethod, Entity}; +use crate::ontology::detection::{Annotation, AnnotationKind}; use nvisy_core::error::Error; use crate::action::Action; diff --git a/crates/nvisy-pipeline/src/detection/mod.rs b/crates/nvisy-pipeline/src/detection/mod.rs index 88c6648..d55af68 100644 --- a/crates/nvisy-pipeline/src/detection/mod.rs +++ b/crates/nvisy-pipeline/src/detection/mod.rs @@ -1,7 +1,7 @@ //! Entity detection actions. //! //! Each sub-module exposes a single [`Action`](crate::action::Action) -//! that produces [`Entity`](nvisy_ontology::entity::Entity) values from +//! that produces [`Entity`](crate::ontology::entity::Entity) values from //! document content. /// Validates detected entities using checksum algorithms (e.g. Luhn). diff --git a/crates/nvisy-pipeline/src/detection/ner.rs b/crates/nvisy-pipeline/src/detection/ner.rs index 9ee780f..d74c938 100644 --- a/crates/nvisy-pipeline/src/detection/ner.rs +++ b/crates/nvisy-pipeline/src/detection/ner.rs @@ -1,21 +1,58 @@ //! AI-powered named-entity recognition (NER) detection action. use serde::Deserialize; +use serde_json::Value; use nvisy_codec::document::Document; use nvisy_codec::handler::TxtHandler; -use nvisy_ontology::entity::Entity; +use nvisy_core::entity::EntityCategory; use nvisy_core::error::Error; +use crate::ontology::entity::{DetectionMethod, Entity, TextLocation}; + #[cfg(feature = "image-redaction")] use nvisy_codec::handler::PngHandler; -use crate::action::Action; - fn default_confidence() -> f64 { 0.5 } +/// Configuration passed to an [`NerBackend`] implementation. 
+/// +/// Contains only the model-agnostic parameters that every backend needs. +/// Provider-specific fields (API key, model name, etc.) belong in the +/// action's [`DetectNerParams`] or the provider's credentials. +#[derive(Debug, Clone)] +pub struct NerConfig { + /// Entity type labels to detect (e.g., `["PERSON", "SSN"]`). + pub entity_types: Vec, + /// Minimum confidence score to include a detection (0.0 -- 1.0). + pub confidence_threshold: f64, +} + +/// Backend trait for NER providers. +/// +/// Implementations call an external NER service (e.g. via Python, HTTP) +/// and return raw JSON results. Entity construction from the raw dicts +/// is handled by [`DetectNerAction`]. +#[async_trait::async_trait] +pub trait NerBackend: Send + Sync + 'static { + /// Detect entities in text, returning raw dicts. + async fn detect_text( + &self, + text: &str, + config: &NerConfig, + ) -> Result, Error>; + + /// Detect entities in an image, returning raw dicts. + async fn detect_image( + &self, + image_data: &[u8], + mime_type: &str, + config: &NerConfig, + ) -> Result, Error>; +} + /// Typed parameters for [`DetectNerAction`]. #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] @@ -37,28 +74,120 @@ pub struct DetectNerInput { pub image_docs: Vec>, } -/// AI NER detection stub — delegates to an NER model provider at runtime. -pub struct DetectNerAction; +/// AI NER detection — delegates to an [`NerBackend`] at runtime. +pub struct DetectNerAction { + backend: B, + params: DetectNerParams, +} -#[async_trait::async_trait] -impl Action for DetectNerAction { - type Params = DetectNerParams; - type Input = DetectNerInput; - type Output = Vec; +impl DetectNerAction { + /// Create a new action with the given backend and params. + pub fn new(backend: B, params: DetectNerParams) -> Self { + Self { backend, params } + } - fn id(&self) -> &str { - "detect-ner" + /// Build the [`NerConfig`] from action parameters. 
+ fn config(&self) -> NerConfig { + NerConfig { + entity_types: self.params.entity_types.clone(), + confidence_threshold: self.params.confidence_threshold, + } } - async fn connect(_params: Self::Params) -> Result { - Ok(Self) + /// Execute NER detection on text documents and (optionally) image documents. + pub async fn run(&self, input: DetectNerInput) -> Result, Error> { + let config = self.config(); + let mut entities = Vec::new(); + + for doc in &input.text_docs { + let text = doc.handler().lines().join("\n"); + let raw = self.backend.detect_text(&text, &config).await?; + entities.extend(parse_ner_entities(&raw)?); + } + + #[cfg(feature = "image-redaction")] + for doc in &input.image_docs { + let raw = self + .backend + .detect_image(doc.handler().bytes(), "image/png", &config) + .await?; + entities.extend(parse_ner_entities(&raw)?); + } + + Ok(entities) } +} - async fn execute( - &self, - _input: Self::Input, - ) -> Result, Error> { - // Stub: real implementation will call an NER model provider. - Ok(Vec::new()) +/// Parse raw JSON dicts from an NER backend into [`Entity`] values. +/// +/// Expected dict keys: `category`, `entity_type`, `value`, `confidence`, +/// and optionally `start_offset` / `end_offset`. 
+pub fn parse_ner_entities(raw: &[Value]) -> Result, Error> { + let mut entities = Vec::new(); + + for item in raw { + let obj = item.as_object().ok_or_else(|| { + Error::python("Expected JSON object in NER results".to_string()) + })?; + + let category_str = obj + .get("category") + .and_then(Value::as_str) + .ok_or_else(|| Error::python("Missing 'category'".to_string()))?; + + let category = match category_str { + "pii" => EntityCategory::Pii, + "phi" => EntityCategory::Phi, + "financial" => EntityCategory::Financial, + "credentials" => EntityCategory::Credentials, + other => EntityCategory::Custom(other.to_string()), + }; + + let entity_type = obj + .get("entity_type") + .and_then(Value::as_str) + .ok_or_else(|| Error::python("Missing 'entity_type'".to_string()))?; + + let value = obj + .get("value") + .and_then(Value::as_str) + .ok_or_else(|| Error::python("Missing 'value'".to_string()))?; + + let confidence = obj + .get("confidence") + .and_then(Value::as_f64) + .ok_or_else(|| Error::python("Missing 'confidence'".to_string()))?; + + let start_offset = obj + .get("start_offset") + .and_then(Value::as_u64) + .map(|v| v as usize) + .unwrap_or(0); + + let end_offset = obj + .get("end_offset") + .and_then(Value::as_u64) + .map(|v| v as usize) + .unwrap_or(0); + + let entity = Entity::new( + category, + entity_type, + value, + DetectionMethod::Ner, + confidence, + ) + .with_text_location(TextLocation { + start_offset, + end_offset, + context_start_offset: None, + context_end_offset: None, + element_id: None, + page_number: None, + }); + + entities.push(entity); } + + Ok(entities) } diff --git a/crates/nvisy-pipeline/src/detection/regex.rs b/crates/nvisy-pipeline/src/detection/regex.rs index f7cc05d..48806f3 100644 --- a/crates/nvisy-pipeline/src/detection/regex.rs +++ b/crates/nvisy-pipeline/src/detection/regex.rs @@ -5,7 +5,7 @@ use serde::Deserialize; use nvisy_codec::handler::TxtHandler; use nvisy_codec::document::Document; -use 
nvisy_ontology::entity::{DetectionMethod, Entity, TextLocation}; +use crate::ontology::entity::{DetectionMethod, Entity, TextLocation}; use nvisy_core::error::Error; use nvisy_pattern::patterns::{self, PatternDefinition}; diff --git a/crates/nvisy-pipeline/src/detection/tabular.rs b/crates/nvisy-pipeline/src/detection/tabular.rs index db5d6f4..7db34e3 100644 --- a/crates/nvisy-pipeline/src/detection/tabular.rs +++ b/crates/nvisy-pipeline/src/detection/tabular.rs @@ -5,8 +5,9 @@ use serde::Deserialize; use nvisy_codec::handler::CsvHandler; use nvisy_codec::document::Document; -use nvisy_ontology::entity::{ - DetectionMethod, Entity, EntityCategory, TabularLocation, +use nvisy_core::entity::EntityCategory; +use crate::ontology::entity::{ + DetectionMethod, Entity, TabularLocation, }; use nvisy_core::error::{Error, ErrorKind}; diff --git a/crates/nvisy-pipeline/src/generation/ocr.rs b/crates/nvisy-pipeline/src/generation/ocr.rs index 0d1e6dd..3fde8fe 100644 --- a/crates/nvisy-pipeline/src/generation/ocr.rs +++ b/crates/nvisy-pipeline/src/generation/ocr.rs @@ -2,13 +2,15 @@ //! from image documents. use serde::Deserialize; +use serde_json::Value; use nvisy_codec::document::Document; -use nvisy_codec::handler::{PngHandler, TxtHandler}; -use nvisy_ontology::entity::Entity; +use nvisy_codec::handler::{PngHandler, TxtHandler, TxtData}; +use nvisy_core::entity::EntityCategory; use nvisy_core::error::Error; +use nvisy_core::math::BoundingBox; -use crate::action::Action; +use crate::ontology::entity::{DetectionMethod, Entity, ImageLocation}; fn default_language() -> String { "eng".into() @@ -22,6 +24,32 @@ fn default_confidence() -> f64 { 0.5 } +/// Configuration passed to an [`OcrBackend`] implementation. +#[derive(Debug, Clone)] +pub struct OcrConfig { + /// Language hint (e.g. `"eng"` for English). + pub language: String, + /// OCR engine to use (`"tesseract"`, `"google-vision"`, `"aws-textract"`). + pub engine: String, + /// Minimum confidence threshold for OCR results. 
+ pub confidence_threshold: f64, +} + +/// Backend trait for OCR providers. +/// +/// Implementations call an external OCR service and return raw JSON +/// results. Entity construction is handled by [`GenerateOcrAction`]. +#[async_trait::async_trait] +pub trait OcrBackend: Send + Sync + 'static { + /// Run OCR on image bytes, returning raw dicts. + async fn detect_ocr( + &self, + image_data: &[u8], + mime_type: &str, + config: &OcrConfig, + ) -> Result, Error>; +} + /// Typed parameters for [`GenerateOcrAction`]. #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] @@ -51,31 +79,99 @@ pub struct GenerateOcrOutput { pub text_docs: Vec>, } -/// OCR generation stub — delegates to an OCR engine provider at runtime. -pub struct GenerateOcrAction; - -#[async_trait::async_trait] -impl Action for GenerateOcrAction { - type Params = GenerateOcrParams; - type Input = GenerateOcrInput; - type Output = GenerateOcrOutput; +/// OCR generation — delegates to an [`OcrBackend`] at runtime. +pub struct GenerateOcrAction { + backend: B, + params: GenerateOcrParams, +} - fn id(&self) -> &str { - "generate-ocr" +impl GenerateOcrAction { + /// Create a new action with the given backend and params. + pub fn new(backend: B, params: GenerateOcrParams) -> Self { + Self { backend, params } } - async fn connect(_params: Self::Params) -> Result { - Ok(Self) + /// Build the [`OcrConfig`] from action parameters. + fn config(&self) -> OcrConfig { + OcrConfig { + language: self.params.language.clone(), + engine: self.params.engine.clone(), + confidence_threshold: self.params.confidence_threshold, + } } - async fn execute( - &self, - _input: Self::Input, - ) -> Result { - // Stub: real implementation will call an OCR engine provider. + /// Execute OCR on image documents. 
+ pub async fn run(&self, input: GenerateOcrInput) -> Result { + let config = self.config(); + let mut all_entities = Vec::new(); + let mut all_ocr_text = Vec::new(); + + for doc in &input.image_docs { + let raw = self + .backend + .detect_ocr(doc.handler().bytes(), "image/png", &config) + .await?; + let entities = parse_ocr_entities(&raw)?; + for entity in &entities { + all_ocr_text.push(entity.value.clone()); + } + all_entities.extend(entities); + } + + let mut text_docs = Vec::new(); + if !all_ocr_text.is_empty() { + let text = all_ocr_text.join("\n"); + let handler = TxtHandler::new(TxtData { + lines: text.lines().map(String::from).collect(), + trailing_newline: text.ends_with('\n'), + }); + text_docs.push(Document::new(handler)); + } + Ok(GenerateOcrOutput { - entities: Vec::new(), - text_docs: Vec::new(), + entities: all_entities, + text_docs, }) } } + +/// Parse raw JSON dicts from an OCR backend into [`Entity`] values. +/// +/// Expected dict keys: `text`, `x`, `y`, `width`, `height`, `confidence`. 
+pub fn parse_ocr_entities(raw: &[Value]) -> Result, Error> { + let mut entities = Vec::new(); + + for item in raw { + let obj = item.as_object().ok_or_else(|| { + Error::python("Expected JSON object in OCR results".to_string()) + })?; + + let text = obj + .get("text") + .and_then(Value::as_str) + .ok_or_else(|| Error::python("Missing 'text' in OCR result".to_string()))?; + + let x = obj.get("x").and_then(Value::as_f64).unwrap_or(0.0); + let y = obj.get("y").and_then(Value::as_f64).unwrap_or(0.0); + let width = obj.get("width").and_then(Value::as_f64).unwrap_or(0.0); + let height = obj.get("height").and_then(Value::as_f64).unwrap_or(0.0); + let confidence = obj.get("confidence").and_then(Value::as_f64).unwrap_or(0.0); + + let entity = Entity::new( + EntityCategory::Pii, + "ocr_text", + text, + DetectionMethod::Ocr, + confidence, + ) + .with_image_location(ImageLocation { + bounding_box: BoundingBox { x, y, width, height }, + image_id: None, + page_number: None, + }); + + entities.push(entity); + } + + Ok(entities) +} diff --git a/crates/nvisy-pipeline/src/generation/synthetic.rs b/crates/nvisy-pipeline/src/generation/synthetic.rs index d6200b4..78ca0ad 100644 --- a/crates/nvisy-pipeline/src/generation/synthetic.rs +++ b/crates/nvisy-pipeline/src/generation/synthetic.rs @@ -3,8 +3,8 @@ use serde::Deserialize; -use nvisy_ontology::entity::Entity; -use nvisy_ontology::redaction::Redaction; +use crate::ontology::redaction::Redaction; +use crate::ontology::entity::Entity; use nvisy_core::error::Error; use crate::action::Action; diff --git a/crates/nvisy-pipeline/src/generation/transcribe.rs b/crates/nvisy-pipeline/src/generation/transcribe.rs index d705660..52a6228 100644 --- a/crates/nvisy-pipeline/src/generation/transcribe.rs +++ b/crates/nvisy-pipeline/src/generation/transcribe.rs @@ -5,7 +5,7 @@ use serde::Deserialize; use nvisy_codec::document::Document; use nvisy_codec::handler::{WavHandler, TxtHandler}; -use nvisy_ontology::entity::Entity; +use 
crate::ontology::entity::Entity; use nvisy_core::error::Error; use crate::action::Action; @@ -41,7 +41,7 @@ pub struct GenerateTranscribeInput { /// Typed output for [`GenerateTranscribeAction`]. pub struct GenerateTranscribeOutput { - /// Detected entities with [`AudioLocation`](nvisy_ontology::entity::AudioLocation). + /// Detected entities with [`AudioLocation`](crate::ontology::entity::AudioLocation). pub entities: Vec, /// Transcripts as new text documents. pub text_docs: Vec>, diff --git a/crates/nvisy-pipeline/src/lib.rs b/crates/nvisy-pipeline/src/lib.rs index b5618c3..e46646d 100644 --- a/crates/nvisy-pipeline/src/lib.rs +++ b/crates/nvisy-pipeline/src/lib.rs @@ -15,9 +15,11 @@ pub mod action; pub mod provider; /// Entity detection actions. pub mod detection; -/// Redaction actions (policy evaluation, apply, audit). -pub mod redaction; /// Content generation actions (OCR, transcription, synthetic data). pub mod generation; +/// Domain types: entity, detection, policy, and redaction. +pub mod ontology; +/// Redaction actions (policy evaluation, apply, audit). +pub mod redaction; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-ontology/src/audit/mod.rs b/crates/nvisy-pipeline/src/ontology/audit/mod.rs similarity index 82% rename from crates/nvisy-ontology/src/audit/mod.rs rename to crates/nvisy-pipeline/src/ontology/audit/mod.rs index 0cd4921..880f44c 100644 --- a/crates/nvisy-ontology/src/audit/mod.rs +++ b/crates/nvisy-pipeline/src/ontology/audit/mod.rs @@ -1,12 +1,10 @@ //! Audit trail records for data protection events. //! //! An [`Audit`] entry records an immutable event in the data protection -//! pipeline, carrying structured [`Explanation`] metadata for compliance. +//! pipeline, carrying structured metadata for compliance. 
-mod explanation; mod retention; -pub use explanation::{Explainable, Explanation}; pub use retention::{RetentionPolicy, RetentionScope}; use jiff::Timestamp; @@ -23,7 +21,7 @@ pub trait Auditable { /// Kind of auditable action recorded in an [`Audit`] entry. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum AuditAction { @@ -40,7 +38,7 @@ pub enum AuditAction { /// Audit entries are emitted by pipeline actions and form a tamper-evident /// log of all detection, redaction, and policy decisions. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct Audit { /// Content source identity and lineage. #[serde(flatten)] @@ -48,7 +46,7 @@ pub struct Audit { /// The kind of event this audit entry records. pub action: AuditAction, /// UTC timestamp when the event occurred. - #[cfg_attr(feature = "jsonschema", schemars(with = "String"))] + #[schemars(with = "String")] pub timestamp: Timestamp, /// Identifier of the related entity, if applicable. #[serde(skip_serializing_if = "Option::is_none")] @@ -68,7 +66,4 @@ pub struct Audit { /// Human or service account that triggered the event. #[serde(skip_serializing_if = "Option::is_none")] pub actor: Option, - /// Structured explainability metadata. 
- #[serde(skip_serializing_if = "Option::is_none")] - pub explanation: Option, } diff --git a/crates/nvisy-ontology/src/audit/retention.rs b/crates/nvisy-pipeline/src/ontology/audit/retention.rs similarity index 91% rename from crates/nvisy-ontology/src/audit/retention.rs rename to crates/nvisy-pipeline/src/ontology/audit/retention.rs index 612b636..808ecd3 100644 --- a/crates/nvisy-ontology/src/audit/retention.rs +++ b/crates/nvisy-pipeline/src/ontology/audit/retention.rs @@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize}; /// What class of data a retention policy applies to. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] pub enum RetentionScope { /// Original ingested content before redaction. @@ -19,7 +19,7 @@ pub enum RetentionScope { /// A retention policy governing how long data is kept. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct RetentionPolicy { /// What class of data this policy applies to. pub scope: RetentionScope, diff --git a/crates/nvisy-ontology/src/detection/annotation.rs b/crates/nvisy-pipeline/src/ontology/detection/annotation.rs similarity index 78% rename from crates/nvisy-ontology/src/detection/annotation.rs rename to crates/nvisy-pipeline/src/ontology/detection/annotation.rs index ead0bf7..19c0730 100644 --- a/crates/nvisy-ontology/src/detection/annotation.rs +++ b/crates/nvisy-pipeline/src/ontology/detection/annotation.rs @@ -1,20 +1,15 @@ //! Annotation types for pre-identified regions and classification labels. -//! -//! Annotations allow users and upstream systems to mark regions of content -//! before detection runs. They replace the previous `ManualAnnotation` type -//! with a unified model supporting three kinds: inclusions (pre-identified -//! 
sensitive regions), exclusions (known-safe regions to skip), and -//! classification labels. use serde::{Deserialize, Serialize}; -use crate::entity::{ - AudioLocation, EntityCategory, ImageLocation, TabularLocation, TextLocation, VideoLocation, +use nvisy_core::entity::EntityCategory; + +use crate::ontology::entity::{ + AudioLocation, ImageLocation, TabularLocation, TextLocation, VideoLocation, }; /// The kind of annotation applied to a content region. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum AnnotationKind { @@ -28,7 +23,6 @@ pub enum AnnotationKind { /// The scope to which an annotation label applies. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum AnnotationScope { @@ -42,7 +36,6 @@ pub enum AnnotationScope { /// A classification label attached to a document or region. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct AnnotationLabel { /// Label name (e.g. `"contains-phi"`, `"gdpr-request"`). pub name: String, @@ -55,11 +48,7 @@ pub struct AnnotationLabel { } /// A user-provided or upstream annotation on a content region. -/// -/// Replaces the previous `ManualAnnotation` with a unified type that -/// supports inclusions, exclusions, and classification labels. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct Annotation { /// What kind of annotation this is. 
pub kind: AnnotationKind, diff --git a/crates/nvisy-ontology/src/detection/classification.rs b/crates/nvisy-pipeline/src/ontology/detection/classification.rs similarity index 86% rename from crates/nvisy-ontology/src/detection/classification.rs rename to crates/nvisy-pipeline/src/ontology/detection/classification.rs index f107961..f090703 100644 --- a/crates/nvisy-ontology/src/detection/classification.rs +++ b/crates/nvisy-pipeline/src/ontology/detection/classification.rs @@ -6,7 +6,6 @@ use super::Sensitivity; /// Result of sensitivity classification over a set of detected entities. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct ClassificationResult { /// Sensitivity assessment (level + risk score). pub sensitivity: Sensitivity, diff --git a/crates/nvisy-ontology/src/detection/mod.rs b/crates/nvisy-pipeline/src/ontology/detection/mod.rs similarity index 80% rename from crates/nvisy-ontology/src/detection/mod.rs rename to crates/nvisy-pipeline/src/ontology/detection/mod.rs index 4e31a01..a7ea42e 100644 --- a/crates/nvisy-ontology/src/detection/mod.rs +++ b/crates/nvisy-pipeline/src/ontology/detection/mod.rs @@ -1,8 +1,4 @@ //! Detection result types. -//! -//! A [`DetectionResult`] groups the output of a detection pass as a -//! first-class type, carrying the detected entities alongside pipeline -//! and policy metadata. mod annotation; mod classification; @@ -17,11 +13,10 @@ use uuid::Uuid; use nvisy_core::path::ContentSource; -use crate::entity::Entity; +use crate::ontology::entity::Entity; /// The output of a detection pass over a single content source. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct DetectionResult { /// Content source identity and lineage. 
#[serde(flatten)] diff --git a/crates/nvisy-ontology/src/detection/sensitivity.rs b/crates/nvisy-pipeline/src/ontology/detection/sensitivity.rs similarity index 66% rename from crates/nvisy-ontology/src/detection/sensitivity.rs rename to crates/nvisy-pipeline/src/ontology/detection/sensitivity.rs index 55170d5..7691e97 100644 --- a/crates/nvisy-ontology/src/detection/sensitivity.rs +++ b/crates/nvisy-pipeline/src/ontology/detection/sensitivity.rs @@ -3,11 +3,7 @@ use serde::{Deserialize, Serialize}; /// Sensitivity classification assigned to a document or content region. -/// -/// Drives downstream policy: rules can be scoped to specific sensitivity -/// levels via [`RuleCondition`](crate::policy::RuleCondition). #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum SensitivityLevel { @@ -22,18 +18,11 @@ pub enum SensitivityLevel { } /// Combined sensitivity assessment for a content source. -/// -/// Pairs a discrete [`SensitivityLevel`] with an optional continuous -/// re-identification risk score in `[0.0, 1.0]`. #[derive(Debug, Clone, Copy, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct Sensitivity { /// Discrete sensitivity classification. pub level: SensitivityLevel, /// Re-identification risk score in the range `[0.0, 1.0]`. - /// - /// Estimates the likelihood that a data subject could be re-identified - /// from the entities remaining after redaction. 
#[serde(skip_serializing_if = "Option::is_none")] pub risk_score: Option, } diff --git a/crates/nvisy-ontology/src/entity/location.rs b/crates/nvisy-pipeline/src/ontology/entity/location.rs similarity index 55% rename from crates/nvisy-ontology/src/entity/location.rs rename to crates/nvisy-pipeline/src/ontology/entity/location.rs index 25ec432..65641ba 100644 --- a/crates/nvisy-ontology/src/entity/location.rs +++ b/crates/nvisy-pipeline/src/ontology/entity/location.rs @@ -1,83 +1,18 @@ -//! Spatial and temporal location types for entity positions. +//! Modality-specific entity location types. use serde::{Deserialize, Serialize}; use uuid::Uuid; -/// A time interval within an audio or video stream. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] -pub struct TimeSpan { - /// Start time in seconds from the beginning of the stream. - pub start_secs: f64, - /// End time in seconds from the beginning of the stream. - pub end_secs: f64, -} - -/// Axis-aligned bounding box for image-based entity locations. -/// -/// Coordinates are `f64` to support both pixel and normalized (0.0–1.0) -/// values from detection models. Use [`BoundingBoxU32`] (or [`Into`]) -/// when integer pixel coordinates are needed for rendering. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] -pub struct BoundingBox { - /// Horizontal offset of the top-left corner (pixels or normalized). - pub x: f64, - /// Vertical offset of the top-left corner (pixels or normalized). - pub y: f64, - /// Width of the bounding box. - pub width: f64, - /// Height of the bounding box. - pub height: f64, -} - -/// Integer pixel-coordinate bounding box for rendering operations. -/// -/// Converted from [`BoundingBox`] by rounding each field to the nearest -/// integer. Use this at the rendering boundary where pixel-exact -/// coordinates are required. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct BoundingBoxU32 { - /// Horizontal offset of the top-left corner in pixels. - pub x: u32, - /// Vertical offset of the top-left corner in pixels. - pub y: u32, - /// Width in pixels. - pub width: u32, - /// Height in pixels. - pub height: u32, -} - -impl From<&BoundingBox> for BoundingBoxU32 { - fn from(bb: &BoundingBox) -> Self { - Self { - x: bb.x.round() as u32, - y: bb.y.round() as u32, - width: bb.width.round() as u32, - height: bb.height.round() as u32, - } - } -} - -impl From for BoundingBoxU32 { - fn from(bb: BoundingBox) -> Self { - Self::from(&bb) - } -} +use nvisy_core::math::{BoundingBox, TimeSpan}; /// Location of an entity within text content. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct TextLocation { /// Byte or character offset where the entity starts. pub start_offset: usize, /// Byte or character offset where the entity ends. pub end_offset: usize, /// Start offset of the surrounding context window for redaction. - /// - /// When set, the redaction may expand to cover surrounding text - /// (e.g. +/- N characters around an SSN) to prevent contextual - /// re-identification. #[serde(skip_serializing_if = "Option::is_none")] pub context_start_offset: Option, /// End offset of the surrounding context window for redaction. @@ -93,7 +28,6 @@ pub struct TextLocation { /// Location of an entity within an image. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct ImageLocation { /// Bounding box of the entity in the image. pub bounding_box: BoundingBox, @@ -107,7 +41,6 @@ pub struct ImageLocation { /// Location of an entity within tabular data. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct TabularLocation { /// Row index (0-based). 
pub row_index: usize, @@ -123,7 +56,6 @@ pub struct TabularLocation { /// Location of an entity within an audio stream. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct AudioLocation { /// Time interval of the entity. pub time_span: TimeSpan, @@ -137,7 +69,6 @@ pub struct AudioLocation { /// Location of an entity within a video stream. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct VideoLocation { /// Bounding box of the entity in the frame. pub bounding_box: BoundingBox, @@ -153,4 +84,3 @@ pub struct VideoLocation { #[serde(skip_serializing_if = "Option::is_none")] pub speaker_id: Option, } - diff --git a/crates/nvisy-ontology/src/entity/mod.rs b/crates/nvisy-pipeline/src/ontology/entity/mod.rs similarity index 79% rename from crates/nvisy-ontology/src/entity/mod.rs rename to crates/nvisy-pipeline/src/ontology/entity/mod.rs index 8e20a0f..41dd754 100644 --- a/crates/nvisy-ontology/src/entity/mod.rs +++ b/crates/nvisy-pipeline/src/ontology/entity/mod.rs @@ -1,18 +1,14 @@ //! Sensitive-data entity types and detection metadata. //! //! An [`Entity`] represents a single occurrence of sensitive data detected -//! within a document. Entities are produced by detection actions and consumed -//! by redaction and audit stages of the pipeline. +//! within a document. 
-mod document; mod location; mod model; mod selector; -pub use document::DocumentType; pub use location::{ - AudioLocation, BoundingBox, BoundingBoxU32, ImageLocation, TabularLocation, - TextLocation, TimeSpan, VideoLocation, + AudioLocation, ImageLocation, TabularLocation, TextLocation, VideoLocation, }; pub use model::{ModelInfo, ModelKind}; pub use selector::EntitySelector; @@ -20,34 +16,12 @@ pub use selector::EntitySelector; use serde::{Deserialize, Serialize}; use serde_json::{Map, Value}; +use nvisy_core::entity::EntityCategory; use nvisy_core::path::ContentSource; -/// Category of sensitive data an entity belongs to. -#[derive(Debug, Clone, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] -#[serde(rename_all = "snake_case")] -#[strum(serialize_all = "snake_case")] -pub enum EntityCategory { - /// Personally Identifiable Information (names, SSNs, addresses, etc.). - Pii, - /// Protected Health Information (HIPAA-regulated data). - Phi, - /// Financial data (credit card numbers, bank accounts, etc.). - Financial, - /// Secrets and credentials (API keys, passwords, tokens). - Credentials, - /// Legal documents and privileged communications. - Legal, - /// Biometric data (fingerprints, iris scans, voiceprints). - Biometric, - /// User-defined or plugin-specific category. - #[strum(to_string = "{0}")] - Custom(String), -} - /// Method used to detect a sensitive entity. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum DetectionMethod { @@ -76,11 +50,7 @@ pub enum DetectionMethod { } /// A detected sensitive data occurrence within a document. -/// -/// Entities are produced by detection actions (regex, NER, checksum, etc.) 
-/// and later consumed by redaction and audit actions. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct Entity { /// Content source identity and lineage. #[serde(flatten)] diff --git a/crates/nvisy-ontology/src/entity/model.rs b/crates/nvisy-pipeline/src/ontology/entity/model.rs similarity index 87% rename from crates/nvisy-ontology/src/entity/model.rs rename to crates/nvisy-pipeline/src/ontology/entity/model.rs index a372704..7506f0f 100644 --- a/crates/nvisy-ontology/src/entity/model.rs +++ b/crates/nvisy-pipeline/src/ontology/entity/model.rs @@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize}; /// Provenance or licensing classification of a detection model. #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] pub enum ModelKind { /// Open-source model (e.g. spaCy, Hugging Face community models). @@ -19,7 +19,7 @@ pub enum ModelKind { /// Identity and version of the model used for detection. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct ModelInfo { /// Model name (e.g. `"spacy-en-core-web-lg"`, `"gpt-4"`). pub name: String, diff --git a/crates/nvisy-ontology/src/entity/selector.rs b/crates/nvisy-pipeline/src/ontology/entity/selector.rs similarity index 89% rename from crates/nvisy-ontology/src/entity/selector.rs rename to crates/nvisy-pipeline/src/ontology/entity/selector.rs index 19bfb51..15c77ac 100644 --- a/crates/nvisy-ontology/src/entity/selector.rs +++ b/crates/nvisy-pipeline/src/ontology/entity/selector.rs @@ -1,11 +1,8 @@ //! Entity selection criteria for policy rules. -//! -//! An [`EntitySelector`] describes which entities a policy rule or redaction -//! 
applies to, based on category, type, and confidence constraints. use serde::{Deserialize, Serialize}; -use super::EntityCategory; +use nvisy_core::entity::EntityCategory; /// Criteria for selecting which entities a policy rule applies to. /// @@ -13,7 +10,6 @@ use super::EntityCategory; /// matches every category, an empty `entity_types` list matches every type, /// and so on. When multiple fields are set, they are combined with AND logic. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct EntitySelector { /// Entity categories this selector matches. Empty means all categories. #[serde(default, skip_serializing_if = "Vec::is_empty")] diff --git a/crates/nvisy-pipeline/src/ontology/mod.rs b/crates/nvisy-pipeline/src/ontology/mod.rs new file mode 100644 index 0000000..df77149 --- /dev/null +++ b/crates/nvisy-pipeline/src/ontology/mod.rs @@ -0,0 +1,9 @@ +//! Domain types for pipeline processing. +//! +//! Entity, detection, policy, redaction, and audit types used by pipeline actions. + +pub mod audit; +pub mod detection; +pub mod entity; +pub mod policy; +pub mod redaction; diff --git a/crates/nvisy-ontology/src/policy/evaluation.rs b/crates/nvisy-pipeline/src/ontology/policy/evaluation.rs similarity index 66% rename from crates/nvisy-ontology/src/policy/evaluation.rs rename to crates/nvisy-pipeline/src/ontology/policy/evaluation.rs index d0f540b..bea3093 100644 --- a/crates/nvisy-ontology/src/policy/evaluation.rs +++ b/crates/nvisy-pipeline/src/ontology/policy/evaluation.rs @@ -3,14 +3,10 @@ use serde::{Deserialize, Serialize}; use uuid::Uuid; -use crate::redaction::Redaction; +use crate::ontology::redaction::Redaction; -/// Full outcome of evaluating a [`Policy`](crate::policy::Policy) against a set of entities. -/// -/// Captures every rule kind's effect: redactions to apply, entities pending -/// human review, entities suppressed from output, blocked entities, and alerts. 
+/// Full outcome of evaluating a policy against a set of entities. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct PolicyEvaluation { /// Identifier of the policy that was evaluated. pub policy_id: Uuid, diff --git a/crates/nvisy-ontology/src/policy/mod.rs b/crates/nvisy-pipeline/src/ontology/policy/mod.rs similarity index 70% rename from crates/nvisy-ontology/src/policy/mod.rs rename to crates/nvisy-pipeline/src/ontology/policy/mod.rs index 35de77e..f1f30c4 100644 --- a/crates/nvisy-ontology/src/policy/mod.rs +++ b/crates/nvisy-pipeline/src/ontology/policy/mod.rs @@ -1,8 +1,4 @@ //! Redaction policies and rules. -//! -//! A [`Policy`] is a named, versioned set of [`PolicyRule`]s that govern -//! how detected entities are redacted. Policies may be associated with a -//! [`RegulationKind`] and support inheritance via the `extends` field. mod evaluation; mod regulation; @@ -16,21 +12,16 @@ use semver::Version; use serde::{Deserialize, Serialize}; use uuid::Uuid; -use crate::redaction::RedactionSpec; +use crate::ontology::redaction::RedactionSpec; /// A named redaction policy containing an ordered set of rules. -/// -/// Policies are pure configuration — they describe *what* to detect and -/// *how* to handle it, independent of any specific content source. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct Policy { /// Unique identifier for this policy. pub id: Uuid, /// Human-readable policy name. pub name: String, /// Policy version. - #[cfg_attr(feature = "jsonschema", schemars(with = "String"))] pub version: Version, /// Description of the policy's purpose. #[serde(skip_serializing_if = "Option::is_none")] @@ -51,7 +42,6 @@ pub struct Policy { /// A collection of policies to apply during a pipeline run. 
#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct Policies { /// The policies to evaluate, in order. pub policies: Vec, diff --git a/crates/nvisy-ontology/src/policy/regulation.rs b/crates/nvisy-pipeline/src/ontology/policy/regulation.rs similarity index 92% rename from crates/nvisy-ontology/src/policy/regulation.rs rename to crates/nvisy-pipeline/src/ontology/policy/regulation.rs index 5cad331..d7d0981 100644 --- a/crates/nvisy-ontology/src/policy/regulation.rs +++ b/crates/nvisy-pipeline/src/ontology/policy/regulation.rs @@ -4,7 +4,6 @@ use serde::{Deserialize, Serialize}; /// A compliance regulation or framework that a policy targets. #[derive(Debug, Clone, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum RegulationKind { diff --git a/crates/nvisy-ontology/src/policy/rule.rs b/crates/nvisy-pipeline/src/ontology/policy/rule.rs similarity index 75% rename from crates/nvisy-ontology/src/policy/rule.rs rename to crates/nvisy-pipeline/src/ontology/policy/rule.rs index 4c223f7..7b9e78b 100644 --- a/crates/nvisy-ontology/src/policy/rule.rs +++ b/crates/nvisy-pipeline/src/ontology/policy/rule.rs @@ -1,18 +1,16 @@ //! Policy rule types. -//! -//! A [`PolicyRule`] defines when and how a specific redaction is applied, -//! based on entity categories, types, and confidence thresholds. use serde::{Deserialize, Serialize}; use uuid::Uuid; -use crate::detection::SensitivityLevel; -use crate::entity::{DocumentType, EntitySelector}; -use crate::redaction::RedactionSpec; +use nvisy_core::fs::DocumentType; +use crate::ontology::redaction::RedactionSpec; + +use crate::ontology::detection::SensitivityLevel; +use crate::ontology::entity::EntitySelector; /// Conditions that must be met for a [`PolicyRule`] to apply. 
#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct RuleCondition { /// Document formats this rule applies to. Empty means all formats. #[serde(default, skip_serializing_if = "Vec::is_empty")] @@ -30,7 +28,6 @@ pub struct RuleCondition { /// Classifies what a policy rule does when it matches. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum RuleKind { @@ -46,13 +43,8 @@ pub enum RuleKind { Suppress, } -/// A single rule within a redaction [`Policy`](super::Policy). -/// -/// Rules specify which entity categories and types they match, the minimum -/// confidence threshold, and the action to take. Rules are evaluated in -/// ascending priority order. +/// A single rule within a redaction policy. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct PolicyRule { /// Unique identifier for this rule. pub id: Uuid, diff --git a/crates/nvisy-ontology/src/redaction/mod.rs b/crates/nvisy-pipeline/src/ontology/redaction/mod.rs similarity index 53% rename from crates/nvisy-ontology/src/redaction/mod.rs rename to crates/nvisy-pipeline/src/ontology/redaction/mod.rs index 6bc0c42..2abe0e4 100644 --- a/crates/nvisy-ontology/src/redaction/mod.rs +++ b/crates/nvisy-pipeline/src/ontology/redaction/mod.rs @@ -1,67 +1,35 @@ -//! Redaction methods, specifications, outputs, and records. +//! Redaction specifications, records, reviews, and summaries. //! -//! This module contains three layers of redaction types: -//! -//! 1. **Method** ([`RedactionMethod`]) — a plain tag enum naming a redaction -//! strategy. Used as a lightweight identifier (e.g. in logs, serialized -//! references, or when the caller only needs to know *which* algorithm). -//! -//! 2. 
**Spec** ([`RedactionSpec`]) — a data-carrying enum that describes a -//! redaction request submitted to the engine: which method to apply and -//! the configuration parameters it needs (mask char, blur sigma, key id, -//! etc.). Used on [`PolicyRule`](crate::policy::PolicyRule) and -//! [`Policy`](crate::policy::Policy). -//! -//! 3. **Output** ([`RedactionOutput`]) — a data-carrying enum that records -//! what was actually done and the result data (replacement string, -//! ciphertext, shifted date, etc.). Stored on [`Redaction`]. -//! -//! All three are organized by modality: -//! - Text / tabular: [`TextRedactionMethod`], [`TextRedactionSpec`], [`TextRedactionOutput`] -//! - Image / video: [`ImageRedactionMethod`], [`ImageRedactionSpec`], [`ImageRedactionOutput`] -//! - Audio: [`AudioRedactionMethod`], [`AudioRedactionSpec`], [`AudioRedactionOutput`] +//! - [`RedactionSpec`] — describes *how* to redact (method + config params). +//! - [`Redaction`] — records a redaction decision for a specific entity. +//! - [`ReviewDecision`] / [`ReviewStatus`] — human-in-the-loop review. +//! - [`RedactionSummary`] — per-source redaction counts. +//! - [`Redactable`] trait — types that produce redaction decisions. 
-mod method; -mod output; mod review; mod spec; mod summary; +mod trait_; -pub use method::{ - AudioRedactionMethod, ImageRedactionMethod, RedactionMethod, TextRedactionMethod, -}; -pub use output::{ - AudioRedactionOutput, ImageRedactionOutput, RedactionOutput, TextRedactionOutput, -}; pub use review::{ReviewDecision, ReviewStatus}; pub use spec::{ AudioRedactionSpec, ImageRedactionSpec, RedactionSpec, TextRedactionSpec, DEFAULT_BLOCK_COLOR, DEFAULT_BLUR_SIGMA, DEFAULT_MASK_CHAR, DEFAULT_PIXELATE_BLOCK_SIZE, }; pub use summary::RedactionSummary; +pub use trait_::Redactable; use serde::{Deserialize, Serialize}; use uuid::Uuid; +use nvisy_codec::render::output::RedactionOutput; use nvisy_core::path::ContentSource; -use crate::entity::Entity; -use crate::policy::Policy; - -/// Types that produce redaction decisions. -pub trait Redactable { - /// The entities detected in this content. - fn entities(&self) -> &[Entity]; - /// The policy governing redaction. - fn policy(&self) -> Option<&Policy>; -} - /// A redaction decision recording how a specific entity was (or will be) redacted. /// -/// Each `Redaction` is linked to exactly one [`Entity`](crate::entity::Entity) -/// via `entity_id`. +/// Each `Redaction` is linked to exactly one entity via `entity_id`. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct Redaction { /// Content source identity and lineage. 
#[serde(flatten)] diff --git a/crates/nvisy-ontology/src/redaction/review.rs b/crates/nvisy-pipeline/src/ontology/redaction/review.rs similarity index 83% rename from crates/nvisy-ontology/src/redaction/review.rs rename to crates/nvisy-pipeline/src/ontology/redaction/review.rs index d2e25fd..549f0c3 100644 --- a/crates/nvisy-ontology/src/redaction/review.rs +++ b/crates/nvisy-pipeline/src/ontology/redaction/review.rs @@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize}; /// Status of a human review on a redaction decision. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] pub enum ReviewStatus { /// Awaiting human review. @@ -20,14 +20,14 @@ pub enum ReviewStatus { /// A review decision recorded against a redaction. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct ReviewDecision { /// Outcome of the review. pub status: ReviewStatus, /// Identifier of the reviewer (human or service account). pub reviewer_id: String, /// When the review decision was made. - #[cfg_attr(feature = "jsonschema", schemars(with = "String"))] + #[schemars(with = "String")] pub timestamp: Timestamp, /// Optional reason for the decision. #[serde(skip_serializing_if = "Option::is_none")] diff --git a/crates/nvisy-ontology/src/redaction/spec.rs b/crates/nvisy-pipeline/src/ontology/redaction/spec.rs similarity index 92% rename from crates/nvisy-ontology/src/redaction/spec.rs rename to crates/nvisy-pipeline/src/ontology/redaction/spec.rs index c87f784..e2f0ed3 100644 --- a/crates/nvisy-ontology/src/redaction/spec.rs +++ b/crates/nvisy-pipeline/src/ontology/redaction/spec.rs @@ -2,19 +2,18 @@ //! //! A [`RedactionSpec`] describes *how* to redact — which method to apply and //! 
the configuration parameters it needs (mask char, blur sigma, encryption -//! key id, etc.). Used on [`PolicyRule`](crate::policy::PolicyRule) and -//! [`Policy`](crate::policy::Policy). +//! key id, etc.). use derive_more::From; use serde::{Deserialize, Serialize}; -use super::method::{ +use nvisy_core::redaction::{ AudioRedactionMethod, ImageRedactionMethod, RedactionMethod, TextRedactionMethod, }; /// Text redaction specification with method-specific configuration. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(tag = "method", rename_all = "snake_case")] pub enum TextRedactionSpec { /// Replace characters with a mask character. @@ -82,7 +81,7 @@ fn default_mask_char() -> char { /// Image redaction specification with method-specific configuration. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(tag = "method", rename_all = "snake_case")] pub enum ImageRedactionSpec { /// Apply a gaussian blur. @@ -119,7 +118,7 @@ fn default_block_size() -> u32 { /// Audio redaction specification. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(tag = "method", rename_all = "snake_case")] pub enum AudioRedactionSpec { /// Replace with silence. @@ -175,10 +174,8 @@ impl AudioRedactionSpec { /// Unified redaction specification submitted to the engine. /// /// Carries the method to apply and its configuration parameters. -/// Used on [`PolicyRule`](crate::policy::PolicyRule) and -/// [`Policy`](crate::policy::Policy). 
#[derive(Debug, Clone, PartialEq, From, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] pub enum RedactionSpec { /// Text/tabular redaction specification. diff --git a/crates/nvisy-ontology/src/redaction/summary.rs b/crates/nvisy-pipeline/src/ontology/redaction/summary.rs similarity index 89% rename from crates/nvisy-ontology/src/redaction/summary.rs rename to crates/nvisy-pipeline/src/ontology/redaction/summary.rs index 5246387..2887221 100644 --- a/crates/nvisy-ontology/src/redaction/summary.rs +++ b/crates/nvisy-pipeline/src/ontology/redaction/summary.rs @@ -6,7 +6,7 @@ use nvisy_core::path::ContentSource; /// Summary of redactions applied to a single content source. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct RedactionSummary { /// The content source these counts apply to. #[serde(flatten)] diff --git a/crates/nvisy-pipeline/src/ontology/redaction/trait_.rs b/crates/nvisy-pipeline/src/ontology/redaction/trait_.rs new file mode 100644 index 0000000..d635455 --- /dev/null +++ b/crates/nvisy-pipeline/src/ontology/redaction/trait_.rs @@ -0,0 +1,12 @@ +//! Pipeline-level redaction trait. + +use crate::ontology::entity::Entity; +use crate::ontology::policy::Policy; + +/// Types that produce redaction decisions. +pub trait Redactable { + /// The entities detected in this content. + fn entities(&self) -> &[Entity]; + /// The policy governing redaction. 
+ fn policy(&self) -> Option<&Policy>; +} diff --git a/crates/nvisy-pipeline/src/prelude.rs b/crates/nvisy-pipeline/src/prelude.rs index 4387f8c..2c45862 100644 --- a/crates/nvisy-pipeline/src/prelude.rs +++ b/crates/nvisy-pipeline/src/prelude.rs @@ -8,7 +8,7 @@ pub use crate::detection::dictionary::{DetectDictionaryAction, DetectDictionaryP pub use crate::detection::tabular::{DetectTabularAction, DetectTabularParams, ColumnRule}; pub use crate::detection::manual::{DetectManualAction, DetectManualParams}; pub use crate::detection::checksum::{DetectChecksumAction, DetectChecksumParams}; -pub use crate::detection::ner::{DetectNerAction, DetectNerParams, DetectNerInput}; +pub use crate::detection::ner::{DetectNerAction, DetectNerParams, DetectNerInput, NerBackend, NerConfig}; pub use crate::detection::classify::{ClassifyAction, ClassificationResult}; pub use crate::redaction::evaluate_policy::{EvaluatePolicyAction, EvaluatePolicyParams}; pub use crate::redaction::apply::{ @@ -22,6 +22,7 @@ pub use crate::generation::synthetic::{ #[cfg(feature = "image-redaction")] pub use crate::generation::ocr::{ GenerateOcrAction, GenerateOcrParams, GenerateOcrInput, GenerateOcrOutput, + OcrBackend, OcrConfig, }; #[cfg(feature = "audio-redaction")] diff --git a/crates/nvisy-pipeline/src/redaction/apply.rs b/crates/nvisy-pipeline/src/redaction/apply.rs index 5ab9981..c8696ff 100644 --- a/crates/nvisy-pipeline/src/redaction/apply.rs +++ b/crates/nvisy-pipeline/src/redaction/apply.rs @@ -7,8 +7,9 @@ use serde::Deserialize; use nvisy_codec::handler::{TxtHandler, CsvHandler}; use nvisy_codec::document::Document; use nvisy_codec::render::text::{TextRedaction, AsRedactableText, mask_cell}; -use nvisy_ontology::entity::Entity; -use nvisy_ontology::redaction::{Redaction, RedactionOutput}; +use nvisy_codec::render::output::RedactionOutput; +use crate::ontology::redaction::Redaction; +use crate::ontology::entity::Entity; use nvisy_core::error::Error; #[cfg(feature = "image-redaction")] diff 
--git a/crates/nvisy-pipeline/src/redaction/emit_audit.rs b/crates/nvisy-pipeline/src/redaction/emit_audit.rs index 0c9a7f1..8cd5799 100644 --- a/crates/nvisy-pipeline/src/redaction/emit_audit.rs +++ b/crates/nvisy-pipeline/src/redaction/emit_audit.rs @@ -6,8 +6,8 @@ use uuid::Uuid; use nvisy_core::error::Error; use nvisy_core::path::ContentSource; -use nvisy_ontology::audit::{Audit, AuditAction}; -use nvisy_ontology::redaction::Redaction; +use crate::ontology::audit::{Audit, AuditAction}; +use crate::ontology::redaction::Redaction; use crate::action::Action; @@ -65,7 +65,6 @@ impl Action for EmitAuditAction { source_id: None, run_id: self.params.run_id, actor: self.params.actor.clone(), - explanation: None, }; audits.push(audit); diff --git a/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs b/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs index b7590ec..aea0d21 100644 --- a/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs +++ b/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs @@ -2,12 +2,15 @@ use serde::Deserialize; -use nvisy_ontology::entity::Entity; -use nvisy_ontology::policy::PolicyRule; -use nvisy_ontology::redaction::{ - AudioRedactionOutput, AudioRedactionSpec, ImageRedactionOutput, ImageRedactionSpec, Redaction, - RedactionOutput, RedactionSpec, TextRedactionOutput, TextRedactionSpec, +use nvisy_codec::render::output::{ + AudioRedactionOutput, ImageRedactionOutput, RedactionOutput, TextRedactionOutput, }; +use crate::ontology::redaction::{ + AudioRedactionSpec, ImageRedactionSpec, Redaction, + RedactionSpec, TextRedactionSpec, +}; +use crate::ontology::entity::Entity; +use crate::ontology::policy::PolicyRule; use nvisy_core::error::Error; use crate::action::Action; diff --git a/crates/nvisy-python/Cargo.toml b/crates/nvisy-python/Cargo.toml index de125ea..57e35d2 100644 --- a/crates/nvisy-python/Cargo.toml +++ b/crates/nvisy-python/Cargo.toml @@ -2,7 +2,7 @@ [package] name = "nvisy-python" -description = "PyO3 bridge for AI NER 
detection via embedded Python" +description = "PyO3 bridge for AI NER/OCR detection via embedded Python" keywords = ["nvisy", "python", "pyo3", "ner"] categories = ["api-bindings"] @@ -23,15 +23,13 @@ rustdoc-args = ["--cfg", "docsrs"] [features] default = ["png"] -# Image-based NER and OCR actions (requires PngHandler) -png = ["nvisy-codec/png"] +# Image-based OCR actions (requires PngHandler via pipeline) +png = ["nvisy-pipeline/image-redaction"] [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } -nvisy-ontology = { workspace = true, features = [] } nvisy-pipeline = { workspace = true, features = [] } -nvisy-codec = { workspace = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } @@ -41,11 +39,9 @@ serde_json = { workspace = true, features = [] } tokio = { workspace = true, features = ["sync", "rt"] } async-trait = { workspace = true, features = [] } -# Primitive datatypes -uuid = { workspace = true, features = ["v4"] } - # Python interop -pyo3 = { workspace = true, features = ["auto-initialize", "serde"] } +pyo3 = { workspace = true, features = ["auto-initialize"] } +pythonize = { workspace = true, features = [] } # Error handling thiserror = { workspace = true, features = [] } diff --git a/crates/nvisy-python/src/actions/mod.rs b/crates/nvisy-python/src/actions/mod.rs deleted file mode 100644 index 37d7026..0000000 --- a/crates/nvisy-python/src/actions/mod.rs +++ /dev/null @@ -1,199 +0,0 @@ -//! Pipeline actions that perform AI-powered named-entity recognition and OCR. -//! -//! Three actions are provided: -//! - [`DetectNerAction`] -- runs NER over text documents. -//! - [`DetectNerImageAction`] -- runs NER over images (OCR + entity detection). -//! - [`OcrDetectAction`] -- performs OCR on images to extract text regions. - -/// OCR detection pipeline action. 
-#[cfg(feature = "png")] -pub mod ocr; - -use serde::Deserialize; - -use nvisy_codec::handler::{TxtHandler, TxtData}; -#[cfg(feature = "png")] -use nvisy_codec::handler::PngHandler; -use nvisy_codec::document::Document; -use nvisy_ontology::entity::Entity; -use nvisy_core::error::Error; -use nvisy_core::io::ContentData; -use nvisy_pipeline::action::Action; -use crate::bridge::PythonBridge; -use crate::ner::{self, NerConfig}; - -/// Typed parameters for NER actions. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct DetectNerParams { - /// Entity type labels to detect (e.g., `["PERSON", "SSN"]`). - #[serde(default)] - pub entity_types: Vec, - /// Minimum confidence score to include a detection (0.0 -- 1.0). - #[serde(default = "default_confidence_threshold")] - pub confidence_threshold: f64, - /// Sampling temperature forwarded to the AI model. - #[serde(default)] - pub temperature: f64, - /// API key for the AI provider. - #[serde(default)] - pub api_key: String, - /// Model identifier (e.g., `"gpt-4"`). - #[serde(default = "default_model")] - pub model: String, - /// AI provider name (e.g., `"openai"`). - #[serde(default = "default_provider")] - pub provider: String, -} - -fn default_confidence_threshold() -> f64 { 0.5 } -fn default_model() -> String { "gpt-4".to_string() } -fn default_provider() -> String { "openai".to_string() } - -/// Pipeline action that detects named entities in text documents. -/// -/// Each document's text is sent through the NER model. If no documents are -/// provided, the raw content is interpreted as UTF-8 text. Detected entities -/// are returned directly. -pub struct DetectNerAction { - /// Python bridge used to call the NER model. - pub bridge: PythonBridge, - params: DetectNerParams, -} - -impl DetectNerAction { - /// Replace the default bridge with a pre-configured one. 
- pub fn with_bridge(mut self, bridge: PythonBridge) -> Self { - self.bridge = bridge; - self - } -} - -#[async_trait::async_trait] -impl Action for DetectNerAction { - type Params = DetectNerParams; - type Input = (ContentData, Vec>); - type Output = Vec; - - fn id(&self) -> &str { "detect-ner" } - - async fn connect(params: Self::Params) -> Result { - Ok(Self { bridge: PythonBridge::default(), params }) - } - - async fn execute( - &self, - input: Self::Input, - ) -> Result { - let (content, documents) = input; - let config = ner_config_from_params(&self.params); - - let docs = if documents.is_empty() { - let text = content.as_str() - .map_err(|e| Error::runtime( - format!("Content is not valid UTF-8: {}", e), - "python/ner", - false, - ))?; - let handler = TxtHandler::new(TxtData { - lines: text.lines().map(String::from).collect(), - trailing_newline: text.ends_with('\n'), - }); - vec![Document::new(handler)] - } else { - documents - }; - - let mut all_entities = Vec::new(); - for doc in &docs { - let text = doc.handler().lines().join("\n"); - let entities = ner::detect_ner(&self.bridge, &text, &config).await?; - all_entities.extend(entities); - } - - Ok(all_entities) - } -} - -/// Pipeline action that detects named entities in images. -/// -/// Each image is processed individually through NER. If no images are -/// provided, the raw content is treated as a single image whose MIME type -/// is inferred from the content metadata. Detected entities are returned -/// directly. -#[cfg(feature = "png")] -pub struct DetectNerImageAction { - /// Python bridge used to call the NER model. - pub bridge: PythonBridge, - params: DetectNerParams, -} - -#[cfg(feature = "png")] -impl DetectNerImageAction { - /// Replace the default bridge with a pre-configured one. 
- pub fn with_bridge(mut self, bridge: PythonBridge) -> Self { - self.bridge = bridge; - self - } -} - -#[cfg(feature = "png")] -#[async_trait::async_trait] -impl Action for DetectNerImageAction { - type Params = DetectNerParams; - type Input = (ContentData, Vec>); - type Output = Vec; - - fn id(&self) -> &str { "detect-ner-image" } - - async fn connect(params: Self::Params) -> Result { - Ok(Self { bridge: PythonBridge::default(), params }) - } - - async fn execute( - &self, - input: Self::Input, - ) -> Result { - let (content, images) = input; - let config = ner_config_from_params(&self.params); - - let mut all_entities = Vec::new(); - - if images.is_empty() { - let mime_type = content.content_type() - .unwrap_or("application/octet-stream") - .to_string(); - let entities = ner::detect_ner_image( - &self.bridge, - content.as_bytes(), - &mime_type, - &config, - ).await?; - all_entities.extend(entities); - } else { - for doc in &images { - let entities = ner::detect_ner_image( - &self.bridge, - doc.handler().bytes(), - "image/png", - &config, - ).await?; - all_entities.extend(entities); - } - } - - Ok(all_entities) - } -} - -/// Convert [`DetectNerParams`] into the internal [`NerConfig`]. -fn ner_config_from_params(params: &DetectNerParams) -> NerConfig { - NerConfig { - entity_types: params.entity_types.clone(), - confidence_threshold: params.confidence_threshold, - temperature: params.temperature, - api_key: params.api_key.clone(), - model: params.model.clone(), - provider: params.provider.clone(), - } -} diff --git a/crates/nvisy-python/src/actions/ocr.rs b/crates/nvisy-python/src/actions/ocr.rs deleted file mode 100644 index 60f2016..0000000 --- a/crates/nvisy-python/src/actions/ocr.rs +++ /dev/null @@ -1,121 +0,0 @@ -//! OCR detection pipeline action. 
- -use serde::Deserialize; - -use nvisy_codec::handler::{TxtHandler, TxtData, PngHandler}; -use nvisy_codec::document::Document; -use nvisy_ontology::entity::Entity; -use nvisy_core::error::Error; -use nvisy_core::io::ContentData; -use nvisy_pipeline::action::Action; -use crate::bridge::PythonBridge; -use crate::ocr::{self, OcrConfig}; - -/// Typed parameters for [`OcrDetectAction`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct OcrDetectParams { - /// Language hint (default `"eng"`). - #[serde(default = "default_language")] - pub language: String, - /// OCR engine to use. - #[serde(default = "default_engine")] - pub engine: String, - /// Minimum confidence threshold. - #[serde(default = "default_confidence")] - pub confidence_threshold: f64, -} - -fn default_language() -> String { - "eng".to_string() -} -fn default_engine() -> String { - "tesseract".to_string() -} -fn default_confidence() -> f64 { - 0.5 -} - -/// Pipeline action that performs OCR on images and produces entities -/// with bounding boxes, plus `Document` artifacts from concatenated -/// OCR text so downstream regex/dictionary/NER can process it. -pub struct OcrDetectAction { - /// Python bridge used to call the OCR backend. - pub bridge: PythonBridge, - params: OcrDetectParams, -} - -impl OcrDetectAction { - /// Replace the default bridge with a pre-configured one. 
- pub fn with_bridge(mut self, bridge: PythonBridge) -> Self { - self.bridge = bridge; - self - } -} - -#[async_trait::async_trait] -impl Action for OcrDetectAction { - type Params = OcrDetectParams; - type Input = (ContentData, Vec>); - type Output = (Vec, Vec>); - - fn id(&self) -> &str { - "detect-ocr" - } - - async fn connect(params: Self::Params) -> Result { - Ok(Self { bridge: PythonBridge::default(), params }) - } - - async fn execute( - &self, - input: Self::Input, - ) -> Result { - let (content, images) = input; - let config = OcrConfig { - language: self.params.language.clone(), - engine: self.params.engine.clone(), - confidence_threshold: self.params.confidence_threshold, - }; - - let mut all_entities = Vec::new(); - let mut all_ocr_text = Vec::new(); - - if images.is_empty() { - // Treat content as a single image - let mime_type = content - .content_type() - .unwrap_or("application/octet-stream") - .to_string(); - let entities = - ocr::detect_ocr(&self.bridge, content.as_bytes(), &mime_type, &config).await?; - for entity in &entities { - all_ocr_text.push(entity.value.clone()); - } - all_entities.extend(entities); - } else { - for doc in &images { - let entities = - ocr::detect_ocr(&self.bridge, doc.handler().bytes(), "image/png", &config) - .await?; - for entity in &entities { - all_ocr_text.push(entity.value.clone()); - } - all_entities.extend(entities); - } - } - - // Create a Document from concatenated OCR text for downstream processing - let mut documents = Vec::new(); - if !all_ocr_text.is_empty() { - let text = all_ocr_text.join("\n"); - let handler = TxtHandler::new(TxtData { - lines: text.lines().map(String::from).collect(), - trailing_newline: text.ends_with('\n'), - }); - documents.push(Document::new(handler)); - } - - Ok((all_entities, documents)) - } -} diff --git a/crates/nvisy-python/src/lib.rs b/crates/nvisy-python/src/lib.rs index 00f0411..a630b23 100644 --- a/crates/nvisy-python/src/lib.rs +++ b/crates/nvisy-python/src/lib.rs @@ -1,16 
+1,15 @@ -//! Python/PyO3 bridge for AI-powered NER detection. +//! Python/PyO3 bridge for AI-powered NER and OCR detection. //! //! This crate embeds a CPython interpreter via PyO3 and delegates named-entity -//! recognition (NER) to a Python module (`nvisy_ai`). It exposes pipeline -//! [`Action`](nvisy_pipeline::action::Action) implementations as well as a -//! [`Provider`](nvisy_pipeline::provider::Provider) for the -//! `"ai"` provider. +//! recognition (NER) and OCR to a Python module (`nvisy_ai`). It implements +//! the [`NerBackend`](nvisy_pipeline::detection::ner::NerBackend) and +//! [`OcrBackend`](nvisy_pipeline::generation::ocr::OcrBackend) traits for +//! [`PythonBridge`](bridge::PythonBridge), returning raw JSON to the pipeline. #![deny(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] -pub mod actions; pub mod bridge; pub mod error; pub mod ner; diff --git a/crates/nvisy-python/src/ner/mod.rs b/crates/nvisy-python/src/ner/mod.rs index e16d5fd..ece5989 100644 --- a/crates/nvisy-python/src/ner/mod.rs +++ b/crates/nvisy-python/src/ner/mod.rs @@ -1,39 +1,26 @@ //! Named-entity recognition (NER) detection via a Python AI backend. //! //! Functions in this module acquire the GIL, call into the Python `nvisy_ai` -//! module, and convert the returned list of dicts into [`Entity`] values. +//! module, and return raw JSON values. Entity construction is handled by +//! the pipeline's [`NerBackend`] / [`DetectNerAction`] layer. use pyo3::prelude::*; -use pyo3::types::{PyDict, PyList}; +use pyo3::types::PyDict; +use serde_json::Value; -use nvisy_ontology::entity::{DetectionMethod, Entity, EntityCategory, TextLocation}; use nvisy_core::error::Error; +use nvisy_pipeline::detection::ner::{NerBackend, NerConfig}; use crate::bridge::PythonBridge; use crate::error::from_pyerr; -/// Configuration for NER detection passed to the Python backend. 
-#[derive(Debug, Clone)] -pub struct NerConfig { - /// Entity type labels to detect (e.g., `["PERSON", "SSN"]`). - pub entity_types: Vec, - /// Minimum confidence score to include a detection (0.0 -- 1.0). - pub confidence_threshold: f64, - /// Sampling temperature forwarded to the AI model. - pub temperature: f64, - /// API key for the AI provider. - pub api_key: String, - /// Model identifier (e.g., `"gpt-4"`). - pub model: String, - /// AI provider name (e.g., `"openai"`). - pub provider: String, -} - -/// Call Python detect_ner function via GIL + spawn_blocking. +/// Call Python `detect_ner()` via GIL + `spawn_blocking`. +/// +/// Returns raw JSON dicts — no domain-type construction. pub async fn detect_ner( bridge: &PythonBridge, text: &str, config: &NerConfig, -) -> Result, Error> { +) -> Result, Error> { let module_name = bridge.module_name().to_string(); let text = text.to_string(); let config = config.clone(); @@ -46,29 +33,29 @@ pub async fn detect_ner( kwargs.set_item("text", &text).map_err(from_pyerr)?; kwargs.set_item("entity_types", &config.entity_types).map_err(from_pyerr)?; kwargs.set_item("confidence_threshold", config.confidence_threshold).map_err(from_pyerr)?; - kwargs.set_item("temperature", config.temperature).map_err(from_pyerr)?; - kwargs.set_item("api_key", &config.api_key).map_err(from_pyerr)?; - kwargs.set_item("model", &config.model).map_err(from_pyerr)?; - kwargs.set_item("provider", &config.provider).map_err(from_pyerr)?; let result = module .call_method("detect_ner", (), Some(&kwargs)) .map_err(from_pyerr)?; - parse_python_entities(py, result) + pythonize::depythonize::>(&result).map_err(|e| { + Error::python(format!("Failed to deserialize NER result: {}", e)) + }) }) }) .await .map_err(|e| Error::python(format!("Task join error: {}", e)))? } -/// Call Python detect_ner_image function via GIL + spawn_blocking. +/// Call Python `detect_ner_image()` via GIL + `spawn_blocking`. 
+/// +/// Returns raw JSON dicts — no domain-type construction. pub async fn detect_ner_image( bridge: &PythonBridge, image_data: &[u8], mime_type: &str, config: &NerConfig, -) -> Result, Error> { +) -> Result, Error> { let module_name = bridge.module_name().to_string(); let image_data = image_data.to_vec(); let mime_type = mime_type.to_string(); @@ -83,100 +70,39 @@ pub async fn detect_ner_image( kwargs.set_item("mime_type", &mime_type).map_err(from_pyerr)?; kwargs.set_item("entity_types", &config.entity_types).map_err(from_pyerr)?; kwargs.set_item("confidence_threshold", config.confidence_threshold).map_err(from_pyerr)?; - kwargs.set_item("api_key", &config.api_key).map_err(from_pyerr)?; - kwargs.set_item("model", &config.model).map_err(from_pyerr)?; - kwargs.set_item("provider", &config.provider).map_err(from_pyerr)?; let result = module .call_method("detect_ner_image", (), Some(&kwargs)) .map_err(from_pyerr)?; - parse_python_entities(py, result) + pythonize::depythonize::>(&result).map_err(|e| { + Error::python(format!("Failed to deserialize NER image result: {}", e)) + }) }) }) .await .map_err(|e| Error::python(format!("Task join error: {}", e)))? } -/// Parse Python list[dict] response into Vec. -fn parse_python_entities(_py: Python<'_>, result: Bound<'_, PyAny>) -> Result, Error> { - let list: &Bound<'_, PyList> = result.downcast().map_err(|e| { - Error::python(format!("Expected list from Python: {}", e)) - })?; - - let mut entities = Vec::new(); - - for item in list.iter() { - let dict: &Bound<'_, PyDict> = item.downcast().map_err(|e| { - Error::python(format!("Expected dict in list: {}", e)) - })?; - - let category_str: String = dict - .get_item("category") - .map_err(from_pyerr)? - .ok_or_else(|| Error::python("Missing 'category'"))? 
- .extract() - .map_err(from_pyerr)?; - - let category = match category_str.as_str() { - "pii" => EntityCategory::Pii, - "phi" => EntityCategory::Phi, - "financial" => EntityCategory::Financial, - "credentials" => EntityCategory::Credentials, - other => EntityCategory::Custom(other.to_string()), - }; - - let entity_type: String = dict - .get_item("entity_type") - .map_err(from_pyerr)? - .ok_or_else(|| Error::python("Missing 'entity_type'"))? - .extract() - .map_err(from_pyerr)?; - - let value: String = dict - .get_item("value") - .map_err(from_pyerr)? - .ok_or_else(|| Error::python("Missing 'value'"))? - .extract() - .map_err(from_pyerr)?; - - let confidence: f64 = dict - .get_item("confidence") - .map_err(from_pyerr)? - .ok_or_else(|| Error::python("Missing 'confidence'"))? - .extract() - .map_err(from_pyerr)?; - - let start_offset: usize = dict - .get_item("start_offset") - .map_err(from_pyerr)? - .and_then(|v| v.extract().ok()) - .unwrap_or(0); - - let end_offset: usize = dict - .get_item("end_offset") - .map_err(from_pyerr)? - .and_then(|v| v.extract().ok()) - .unwrap_or(0); - - let entity = Entity::new( - category, - entity_type, - value, - DetectionMethod::Ner, - confidence, - ) - .with_text_location(TextLocation { - start_offset, - end_offset, - context_start_offset: None, - context_end_offset: None, - element_id: None, - page_number: None, - }); - - entities.push(entity); +/// [`NerBackend`] implementation for [`PythonBridge`]. +/// +/// Delegates to the `detect_ner` / `detect_ner_image` functions above. 
+#[async_trait::async_trait] +impl NerBackend for PythonBridge { + async fn detect_text( + &self, + text: &str, + config: &NerConfig, + ) -> Result, Error> { + detect_ner(self, text, config).await } - Ok(entities) + async fn detect_image( + &self, + image_data: &[u8], + mime_type: &str, + config: &NerConfig, + ) -> Result, Error> { + detect_ner_image(self, image_data, mime_type, config).await + } } diff --git a/crates/nvisy-python/src/ocr/mod.rs b/crates/nvisy-python/src/ocr/mod.rs index e2fd1a8..b793133 100644 --- a/crates/nvisy-python/src/ocr/mod.rs +++ b/crates/nvisy-python/src/ocr/mod.rs @@ -1,40 +1,31 @@ //! OCR text extraction via the Python backend. //! //! Calls `nvisy_ai.detect_ocr()` through the Python bridge to perform -//! optical character recognition on images, returning text regions with -//! bounding boxes. +//! optical character recognition on images, returning raw JSON values. +//! Entity construction is handled by the pipeline's [`OcrBackend`] / +//! [`GenerateOcrAction`] layer. use pyo3::prelude::*; -use pyo3::types::{PyDict, PyList}; +use pyo3::types::PyDict; +use serde_json::Value; -use nvisy_ontology::entity::{ - BoundingBox, DetectionMethod, Entity, EntityCategory, ImageLocation, -}; use nvisy_core::error::Error; use crate::bridge::PythonBridge; use crate::error::from_pyerr; -/// Configuration for OCR detection. -#[derive(Debug, Clone)] -pub struct OcrConfig { - /// Language hint (e.g. `"eng"` for English). - pub language: String, - /// OCR engine to use (`"tesseract"`, `"google-vision"`, `"aws-textract"`). - pub engine: String, - /// Minimum confidence threshold for OCR results. - pub confidence_threshold: f64, -} +#[cfg(feature = "png")] +use nvisy_pipeline::generation::ocr::{OcrBackend, OcrConfig}; /// Call Python `detect_ocr()` via GIL + `spawn_blocking`. /// -/// Returns a list of entities with `DetectionMethod::Ocr`, each carrying -/// a bounding box indicating where the text was found in the image. 
+/// Returns raw JSON dicts — no domain-type construction. +#[cfg(feature = "png")] pub async fn detect_ocr( bridge: &PythonBridge, image_data: &[u8], mime_type: &str, config: &OcrConfig, -) -> Result, Error> { +) -> Result, Error> { let module_name = bridge.module_name().to_string(); let image_data = image_data.to_vec(); let mime_type = mime_type.to_string(); @@ -55,93 +46,27 @@ pub async fn detect_ocr( .call_method("detect_ocr", (), Some(&kwargs)) .map_err(from_pyerr)?; - parse_ocr_results(result) + pythonize::depythonize::>(&result).map_err(|e| { + Error::python(format!("Failed to deserialize OCR result: {}", e)) + }) }) }) .await .map_err(|e| Error::python(format!("Task join error: {}", e)))? } -/// Parse Python list[dict] OCR response into Vec. +/// [`OcrBackend`] implementation for [`PythonBridge`]. /// -/// Expected Python response format: -/// ```python -/// [ -/// { -/// "text": "John Doe", -/// "x": 100.0, -/// "y": 200.0, -/// "width": 150.0, -/// "height": 30.0, -/// "confidence": 0.95 -/// }, -/// ... -/// ] -/// ``` -fn parse_ocr_results(result: Bound<'_, PyAny>) -> Result, Error> { - let list: &Bound<'_, PyList> = result.downcast().map_err(|e| { - Error::python(format!("Expected list from Python OCR: {}", e)) - })?; - - let mut entities = Vec::new(); - - for item in list.iter() { - let dict: &Bound<'_, PyDict> = item.downcast().map_err(|e| { - Error::python(format!("Expected dict in OCR list: {}", e)) - })?; - - let text: String = dict - .get_item("text") - .map_err(from_pyerr)? - .ok_or_else(|| Error::python("Missing 'text' in OCR result"))? - .extract() - .map_err(from_pyerr)?; - - let x: f64 = dict - .get_item("x") - .map_err(from_pyerr)? - .and_then(|v| v.extract().ok()) - .unwrap_or(0.0); - - let y: f64 = dict - .get_item("y") - .map_err(from_pyerr)? - .and_then(|v| v.extract().ok()) - .unwrap_or(0.0); - - let width: f64 = dict - .get_item("width") - .map_err(from_pyerr)? 
- .and_then(|v| v.extract().ok()) - .unwrap_or(0.0); - - let height: f64 = dict - .get_item("height") - .map_err(from_pyerr)? - .and_then(|v| v.extract().ok()) - .unwrap_or(0.0); - - let confidence: f64 = dict - .get_item("confidence") - .map_err(from_pyerr)? - .and_then(|v| v.extract().ok()) - .unwrap_or(0.0); - - let entity = Entity::new( - EntityCategory::Pii, - "ocr_text", - &text, - DetectionMethod::Ocr, - confidence, - ) - .with_image_location(ImageLocation { - bounding_box: BoundingBox { x, y, width, height }, - image_id: None, - page_number: None, - }); - - entities.push(entity); +/// Delegates to the `detect_ocr` function above. +#[cfg(feature = "png")] +#[async_trait::async_trait] +impl OcrBackend for PythonBridge { + async fn detect_ocr( + &self, + image_data: &[u8], + mime_type: &str, + config: &OcrConfig, + ) -> Result, Error> { + self::detect_ocr(self, image_data, mime_type, config).await } - - Ok(entities) } diff --git a/crates/nvisy-python/src/prelude.rs b/crates/nvisy-python/src/prelude.rs index b1f49dd..1ed955c 100644 --- a/crates/nvisy-python/src/prelude.rs +++ b/crates/nvisy-python/src/prelude.rs @@ -1,8 +1,3 @@ //! Convenience re-exports. 
-pub use crate::actions::DetectNerAction; -#[cfg(feature = "png")] -pub use crate::actions::DetectNerImageAction; -#[cfg(feature = "png")] -pub use crate::actions::ocr::OcrDetectAction; pub use crate::bridge::PythonBridge; pub use crate::provider::AiProvider; From 171df51d1909875a4c53dfd0962688e519db18c0 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Tue, 17 Feb 2026 23:57:47 +0100 Subject: [PATCH 07/11] refactor: span-aware handlers, handler/loader split, remove feature gates and RedactionMethod - Make TextHandler and ImageHandler async traits with default impls using view_spans/edit_spans for span-aware redaction - Split all handlers into _handler/_loader files (image, audio, document, tabular) with Loader trait + Params structs - Replace PngHandler/JpegHandler encode/decode with DynamicImage spans - Delete nvisy-core redaction module (RedactionMethod enums) - Remove png/jpeg/wav/mp3/image feature flags from nvisy-codec; imageproc is now a required dependency - Remove image-redaction/audio-redaction features from nvisy-pipeline - Remove png feature from nvisy-python - Privatize handler/render submodules, re-export via pub use - Delete render/text/replace.rs (absorbed into TextHandler default impl) Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-codec/Cargo.toml | 24 +-- crates/nvisy-codec/src/handler/audio/mod.rs | 15 +- .../handler/audio/{mp3.rs => mp3_handler.rs} | 0 .../src/handler/audio/mp3_loader.rs | 33 ++++ .../handler/audio/{wav.rs => wav_handler.rs} | 0 .../src/handler/audio/wav_loader.rs | 33 ++++ .../document/{docx.rs => docx_handler.rs} | 0 .../src/handler/document/docx_loader.rs | 33 ++++ .../nvisy-codec/src/handler/document/mod.rs | 21 ++- .../document/{pdf.rs => pdf_handler.rs} | 0 .../src/handler/document/pdf_loader.rs | 33 ++++ crates/nvisy-codec/src/handler/image/jpeg.rs | 32 ---- .../src/handler/image/jpeg_handler.rs | 80 +++++++++ .../src/handler/image/jpeg_loader.rs | 40 +++++ crates/nvisy-codec/src/handler/image/mod.rs | 17 +- 
crates/nvisy-codec/src/handler/image/png.rs | 64 ------- .../src/handler/image/png_handler.rs | 80 +++++++++ .../src/handler/image/png_loader.rs | 40 +++++ crates/nvisy-codec/src/handler/mod.rs | 31 ++-- crates/nvisy-codec/src/handler/tabular/mod.rs | 9 +- .../tabular/{xlsx.rs => xlsx_handler.rs} | 2 +- .../src/handler/tabular/xlsx_loader.rs | 33 ++++ crates/nvisy-codec/src/handler/text/mod.rs | 23 ++- .../src/handler/text/txt_handler.rs | 21 +-- crates/nvisy-codec/src/render/audio/mod.rs | 28 +++ crates/nvisy-codec/src/render/audio/output.rs | 16 ++ crates/nvisy-codec/src/render/image/mod.rs | 92 +++++----- crates/nvisy-codec/src/render/image/output.rs | 18 ++ crates/nvisy-codec/src/render/mod.rs | 16 +- crates/nvisy-codec/src/render/output.rs | 164 +----------------- crates/nvisy-codec/src/render/text/mask.rs | 69 ++++---- crates/nvisy-codec/src/render/text/mod.rs | 157 +++++++++-------- crates/nvisy-codec/src/render/text/output.rs | 66 +++++++ crates/nvisy-codec/src/render/text/replace.rs | 44 ----- crates/nvisy-core/src/lib.rs | 2 - crates/nvisy-core/src/redaction/method.rs | 112 ------------ crates/nvisy-core/src/redaction/mod.rs | 15 -- crates/nvisy-pipeline/Cargo.toml | 11 -- crates/nvisy-pipeline/src/detection/ner.rs | 14 +- crates/nvisy-pipeline/src/generation/mod.rs | 2 - crates/nvisy-pipeline/src/generation/ocr.rs | 3 +- .../src/ontology/redaction/mod.rs | 2 +- .../src/ontology/redaction/spec.rs | 57 ------ crates/nvisy-pipeline/src/prelude.rs | 4 - crates/nvisy-pipeline/src/redaction/apply.rs | 131 +++++++++----- .../src/redaction/evaluate_policy.rs | 2 +- crates/nvisy-python/Cargo.toml | 5 - crates/nvisy-python/src/ocr/mod.rs | 3 - 48 files changed, 889 insertions(+), 808 deletions(-) rename crates/nvisy-codec/src/handler/audio/{mp3.rs => mp3_handler.rs} (100%) create mode 100644 crates/nvisy-codec/src/handler/audio/mp3_loader.rs rename crates/nvisy-codec/src/handler/audio/{wav.rs => wav_handler.rs} (100%) create mode 100644 
crates/nvisy-codec/src/handler/audio/wav_loader.rs rename crates/nvisy-codec/src/handler/document/{docx.rs => docx_handler.rs} (100%) create mode 100644 crates/nvisy-codec/src/handler/document/docx_loader.rs rename crates/nvisy-codec/src/handler/document/{pdf.rs => pdf_handler.rs} (100%) create mode 100644 crates/nvisy-codec/src/handler/document/pdf_loader.rs delete mode 100644 crates/nvisy-codec/src/handler/image/jpeg.rs create mode 100644 crates/nvisy-codec/src/handler/image/jpeg_handler.rs create mode 100644 crates/nvisy-codec/src/handler/image/jpeg_loader.rs delete mode 100644 crates/nvisy-codec/src/handler/image/png.rs create mode 100644 crates/nvisy-codec/src/handler/image/png_handler.rs create mode 100644 crates/nvisy-codec/src/handler/image/png_loader.rs rename crates/nvisy-codec/src/handler/tabular/{xlsx.rs => xlsx_handler.rs} (90%) create mode 100644 crates/nvisy-codec/src/handler/tabular/xlsx_loader.rs create mode 100644 crates/nvisy-codec/src/render/audio/mod.rs create mode 100644 crates/nvisy-codec/src/render/audio/output.rs create mode 100644 crates/nvisy-codec/src/render/image/output.rs create mode 100644 crates/nvisy-codec/src/render/text/output.rs delete mode 100644 crates/nvisy-codec/src/render/text/replace.rs delete mode 100644 crates/nvisy-core/src/redaction/method.rs delete mode 100644 crates/nvisy-core/src/redaction/mod.rs diff --git a/crates/nvisy-codec/Cargo.toml b/crates/nvisy-codec/Cargo.toml index aac99f7..c1704f8 100644 --- a/crates/nvisy-codec/Cargo.toml +++ b/crates/nvisy-codec/Cargo.toml @@ -22,27 +22,17 @@ all-features = true rustdoc-args = ["--cfg", "docsrs"] [features] -default = ["pdf", "docx", "html", "xlsx", "image", "wav", "mp3"] +default = ["pdf", "docx", "html", "xlsx"] -# PDF parsing and text extraction via pdf-extract + lopdf; enables png for extracted images -pdf = ["dep:pdf-extract", "dep:lopdf", "png"] -# Microsoft Word (.docx) parsing via zip + quick-xml; enables image formats for extracted images -docx = ["dep:zip", 
"dep:quick-xml", "jpeg", "png"] +# PDF parsing and text extraction via pdf-extract + lopdf +pdf = ["dep:pdf-extract", "dep:lopdf"] +# Microsoft Word (.docx) parsing via zip + quick-xml +docx = ["dep:zip", "dep:quick-xml"] # HTML parsing and text extraction via scraper html = ["dep:scraper"] # Excel (.xlsx) spreadsheet parsing via calamine xlsx = ["dep:calamine"] -# Convenience alias: all image formats -image = ["jpeg", "png"] -# Individual image format handlers (each pulls in image + imageproc for rendering) -jpeg = ["dep:image", "dep:imageproc"] -png = ["dep:image", "dep:imageproc"] - -# Audio format handlers (no additional dependencies) -wav = [] -mp3 = [] - [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } @@ -80,8 +70,8 @@ zip = { workspace = true, optional = true, features = [] } quick-xml = { workspace = true, optional = true, features = [] } scraper = { workspace = true, optional = true, features = [] } calamine = { workspace = true, optional = true, features = [] } -image = { workspace = true, optional = true, features = [] } -imageproc = { workspace = true, optional = true, features = [] } +image = { workspace = true, features = [] } +imageproc = { workspace = true, features = [] } [dev-dependencies] tokio = { workspace = true, features = ["macros", "rt"] } diff --git a/crates/nvisy-codec/src/handler/audio/mod.rs b/crates/nvisy-codec/src/handler/audio/mod.rs index 5a1fb22..a3c3b06 100644 --- a/crates/nvisy-codec/src/handler/audio/mod.rs +++ b/crates/nvisy-codec/src/handler/audio/mod.rs @@ -1,6 +1,11 @@ -//! Audio format handlers. +//! Audio format handlers and loaders. 
-#[cfg(feature = "wav")] -pub mod wav; -#[cfg(feature = "mp3")] -pub mod mp3; +mod wav_handler; +mod wav_loader; +mod mp3_handler; +mod mp3_loader; + +pub use wav_handler::WavHandler; +pub use wav_loader::{WavLoader, WavParams}; +pub use mp3_handler::Mp3Handler; +pub use mp3_loader::{Mp3Loader, Mp3Params}; diff --git a/crates/nvisy-codec/src/handler/audio/mp3.rs b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs similarity index 100% rename from crates/nvisy-codec/src/handler/audio/mp3.rs rename to crates/nvisy-codec/src/handler/audio/mp3_handler.rs diff --git a/crates/nvisy-codec/src/handler/audio/mp3_loader.rs b/crates/nvisy-codec/src/handler/audio/mp3_loader.rs new file mode 100644 index 0000000..51bd24a --- /dev/null +++ b/crates/nvisy-codec/src/handler/audio/mp3_loader.rs @@ -0,0 +1,33 @@ +//! MP3 loader — wraps raw audio bytes into a [`Document`]. + +use nvisy_core::error::Error; +use nvisy_core::io::ContentData; + +use crate::document::Document; +use crate::handler::{Loader, Mp3Handler}; + +/// Parameters for [`Mp3Loader`]. +#[derive(Debug, Default)] +pub struct Mp3Params; + +/// Loader that wraps raw MP3 bytes. +/// +/// Produces a single [`Document`] per input. 
+#[derive(Debug)] +pub struct Mp3Loader; + +#[async_trait::async_trait] +impl Loader for Mp3Loader { + type Handler = Mp3Handler; + type Params = Mp3Params; + + async fn load( + &self, + content: &ContentData, + _params: &Self::Params, + ) -> Result>, Error> { + let handler = Mp3Handler::new(content.to_bytes()); + let doc = Document::new(handler).with_parent(content); + Ok(vec![doc]) + } +} diff --git a/crates/nvisy-codec/src/handler/audio/wav.rs b/crates/nvisy-codec/src/handler/audio/wav_handler.rs similarity index 100% rename from crates/nvisy-codec/src/handler/audio/wav.rs rename to crates/nvisy-codec/src/handler/audio/wav_handler.rs diff --git a/crates/nvisy-codec/src/handler/audio/wav_loader.rs b/crates/nvisy-codec/src/handler/audio/wav_loader.rs new file mode 100644 index 0000000..efd4c42 --- /dev/null +++ b/crates/nvisy-codec/src/handler/audio/wav_loader.rs @@ -0,0 +1,33 @@ +//! WAV loader — wraps raw audio bytes into a [`Document`]. + +use nvisy_core::error::Error; +use nvisy_core::io::ContentData; + +use crate::document::Document; +use crate::handler::{Loader, WavHandler}; + +/// Parameters for [`WavLoader`]. +#[derive(Debug, Default)] +pub struct WavParams; + +/// Loader that wraps raw WAV bytes. +/// +/// Produces a single [`Document`] per input. 
+#[derive(Debug)] +pub struct WavLoader; + +#[async_trait::async_trait] +impl Loader for WavLoader { + type Handler = WavHandler; + type Params = WavParams; + + async fn load( + &self, + content: &ContentData, + _params: &Self::Params, + ) -> Result>, Error> { + let handler = WavHandler::new(content.to_bytes()); + let doc = Document::new(handler).with_parent(content); + Ok(vec![doc]) + } +} diff --git a/crates/nvisy-codec/src/handler/document/docx.rs b/crates/nvisy-codec/src/handler/document/docx_handler.rs similarity index 100% rename from crates/nvisy-codec/src/handler/document/docx.rs rename to crates/nvisy-codec/src/handler/document/docx_handler.rs diff --git a/crates/nvisy-codec/src/handler/document/docx_loader.rs b/crates/nvisy-codec/src/handler/document/docx_loader.rs new file mode 100644 index 0000000..6ec27f3 --- /dev/null +++ b/crates/nvisy-codec/src/handler/document/docx_loader.rs @@ -0,0 +1,33 @@ +//! DOCX loader (stub — awaiting real implementation). + +use nvisy_core::error::Error; +use nvisy_core::io::ContentData; + +use crate::document::Document; +use crate::handler::{Loader, DocxHandler}; + +/// Parameters for [`DocxLoader`]. +#[derive(Debug, Default)] +pub struct DocxParams; + +/// Loader that creates a stub DOCX handler. +/// +/// Produces a single [`Document`] per input. +#[derive(Debug)] +pub struct DocxLoader; + +#[async_trait::async_trait] +impl Loader for DocxLoader { + type Handler = DocxHandler; + type Params = DocxParams; + + async fn load( + &self, + content: &ContentData, + _params: &Self::Params, + ) -> Result>, Error> { + let handler = DocxHandler; + let doc = Document::new(handler).with_parent(content); + Ok(vec![doc]) + } +} diff --git a/crates/nvisy-codec/src/handler/document/mod.rs b/crates/nvisy-codec/src/handler/document/mod.rs index 9c62d32..34aba63 100644 --- a/crates/nvisy-codec/src/handler/document/mod.rs +++ b/crates/nvisy-codec/src/handler/document/mod.rs @@ -1,6 +1,21 @@ -//! Rich document format handlers. +//! 
Rich document format handlers and loaders. #[cfg(feature = "pdf")] -pub mod pdf; +mod pdf_handler; +#[cfg(feature = "pdf")] +mod pdf_loader; + +#[cfg(feature = "docx")] +mod docx_handler; +#[cfg(feature = "docx")] +mod docx_loader; + +#[cfg(feature = "pdf")] +pub use pdf_handler::PdfHandler; +#[cfg(feature = "pdf")] +pub use pdf_loader::{PdfLoader, PdfParams}; + +#[cfg(feature = "docx")] +pub use docx_handler::DocxHandler; #[cfg(feature = "docx")] -pub mod docx; +pub use docx_loader::{DocxLoader, DocxParams}; diff --git a/crates/nvisy-codec/src/handler/document/pdf.rs b/crates/nvisy-codec/src/handler/document/pdf_handler.rs similarity index 100% rename from crates/nvisy-codec/src/handler/document/pdf.rs rename to crates/nvisy-codec/src/handler/document/pdf_handler.rs diff --git a/crates/nvisy-codec/src/handler/document/pdf_loader.rs b/crates/nvisy-codec/src/handler/document/pdf_loader.rs new file mode 100644 index 0000000..0c2baba --- /dev/null +++ b/crates/nvisy-codec/src/handler/document/pdf_loader.rs @@ -0,0 +1,33 @@ +//! PDF loader (stub — awaiting real implementation). + +use nvisy_core::error::Error; +use nvisy_core::io::ContentData; + +use crate::document::Document; +use crate::handler::{Loader, PdfHandler}; + +/// Parameters for [`PdfLoader`]. +#[derive(Debug, Default)] +pub struct PdfParams; + +/// Loader that creates a stub PDF handler. +/// +/// Produces a single [`Document`] per input. 
+#[derive(Debug)] +pub struct PdfLoader; + +#[async_trait::async_trait] +impl Loader for PdfLoader { + type Handler = PdfHandler; + type Params = PdfParams; + + async fn load( + &self, + content: &ContentData, + _params: &Self::Params, + ) -> Result>, Error> { + let handler = PdfHandler; + let doc = Document::new(handler).with_parent(content); + Ok(vec![doc]) + } +} diff --git a/crates/nvisy-codec/src/handler/image/jpeg.rs b/crates/nvisy-codec/src/handler/image/jpeg.rs deleted file mode 100644 index 6e9e291..0000000 --- a/crates/nvisy-codec/src/handler/image/jpeg.rs +++ /dev/null @@ -1,32 +0,0 @@ -//! JPEG handler (stub — awaiting migration to Loader/Handler pattern). - -use nvisy_core::error::Error; -use nvisy_core::fs::DocumentType; - -use crate::document::edit_stream::SpanEditStream; -use crate::document::view_stream::SpanStream; -use crate::handler::Handler; - -#[derive(Debug)] -pub struct JpegHandler; - -#[async_trait::async_trait] -impl Handler for JpegHandler { - fn document_type(&self) -> DocumentType { - DocumentType::Jpeg - } - - type SpanId = (); - type SpanData = (); - - async fn view_spans(&self) -> SpanStream<'_, (), ()> { - SpanStream::new(futures::stream::empty()) - } - - async fn edit_spans( - &mut self, - _edits: SpanEditStream<'_, (), ()>, - ) -> Result<(), Error> { - Ok(()) - } -} diff --git a/crates/nvisy-codec/src/handler/image/jpeg_handler.rs b/crates/nvisy-codec/src/handler/image/jpeg_handler.rs new file mode 100644 index 0000000..3c279a7 --- /dev/null +++ b/crates/nvisy-codec/src/handler/image/jpeg_handler.rs @@ -0,0 +1,80 @@ +//! JPEG handler — holds a decoded image and provides single-span access +//! via [`Handler`]. +//! +//! # Span model +//! +//! [`Handler::view_spans`] yields exactly one [`Span`] whose data is the +//! current [`DynamicImage`]. [`Handler::edit_spans`] replaces the image +//! in-place. 
+ +use image::DynamicImage; + +use nvisy_core::error::{Error, ErrorKind}; +use nvisy_core::fs::DocumentType; + +use crate::document::edit_stream::SpanEditStream; +use crate::document::view_stream::SpanStream; +use crate::handler::{Handler, Span}; +use crate::render::ImageHandler; + +use futures::StreamExt; + +/// Handler for loaded JPEG content. +/// +/// Stores the decoded [`DynamicImage`] directly. The raw JPEG bytes +/// can be produced on demand via [`JpegHandler::encode_bytes`]. +#[derive(Debug, Clone)] +pub struct JpegHandler { + image: DynamicImage, +} + +impl JpegHandler { + /// Create a handler from an already-decoded image. + pub fn new(image: DynamicImage) -> Self { + Self { image } + } + + /// Reference to the decoded image. + pub fn image(&self) -> &DynamicImage { + &self.image + } + + /// Encode the current image to JPEG bytes. + pub fn encode_bytes(&self) -> Result, Error> { + let mut buf = std::io::Cursor::new(Vec::new()); + self.image + .write_to(&mut buf, image::ImageFormat::Jpeg) + .map_err(|e| Error::new(ErrorKind::Runtime, format!("JPEG encode failed: {e}")))?; + Ok(buf.into_inner()) + } +} + +#[async_trait::async_trait] +impl Handler for JpegHandler { + fn document_type(&self) -> DocumentType { + DocumentType::Jpeg + } + + type SpanId = (); + type SpanData = DynamicImage; + + async fn view_spans(&self) -> SpanStream<'_, (), DynamicImage> { + SpanStream::new(futures::stream::iter(std::iter::once(Span { + id: (), + data: self.image.clone(), + }))) + } + + async fn edit_spans( + &mut self, + edits: SpanEditStream<'_, (), DynamicImage>, + ) -> Result<(), Error> { + let edits: Vec<_> = edits.collect().await; + if let Some(edit) = edits.into_iter().next() { + self.image = edit.data; + } + Ok(()) + } +} + +impl ImageHandler for JpegHandler {} diff --git a/crates/nvisy-codec/src/handler/image/jpeg_loader.rs b/crates/nvisy-codec/src/handler/image/jpeg_loader.rs new file mode 100644 index 0000000..496c1c4 --- /dev/null +++ 
b/crates/nvisy-codec/src/handler/image/jpeg_loader.rs @@ -0,0 +1,40 @@ +//! JPEG loader — validates and decodes raw JPEG bytes into a +//! [`Document`]. + +use image::DynamicImage; + +use nvisy_core::error::{Error, ErrorKind}; +use nvisy_core::io::ContentData; + +use crate::document::Document; +use crate::handler::{Loader, JpegHandler}; + +/// Parameters for [`JpegLoader`]. +#[derive(Debug, Default)] +pub struct JpegParams; + +/// Loader that validates and decodes JPEG files. +/// +/// Produces a single [`Document`] per input. +#[derive(Debug)] +pub struct JpegLoader; + +#[async_trait::async_trait] +impl Loader for JpegLoader { + type Handler = JpegHandler; + type Params = JpegParams; + + async fn load( + &self, + content: &ContentData, + _params: &Self::Params, + ) -> Result>, Error> { + let raw = content.to_bytes(); + let image: DynamicImage = image::load_from_memory(&raw) + .map_err(|e| Error::new(ErrorKind::Runtime, format!("JPEG decode failed: {e}")))?; + + let handler = JpegHandler::new(image); + let doc = Document::new(handler).with_parent(content); + Ok(vec![doc]) + } +} diff --git a/crates/nvisy-codec/src/handler/image/mod.rs b/crates/nvisy-codec/src/handler/image/mod.rs index e3c4cb0..9460820 100644 --- a/crates/nvisy-codec/src/handler/image/mod.rs +++ b/crates/nvisy-codec/src/handler/image/mod.rs @@ -1,6 +1,13 @@ -//! Image format handlers. +//! Image format handlers and loaders. -#[cfg(feature = "jpeg")] -pub mod jpeg; -#[cfg(feature = "png")] -pub mod png; +mod jpeg_handler; +mod jpeg_loader; + +mod png_handler; +mod png_loader; + +pub use png_handler::PngHandler; +pub use png_loader::{PngLoader, PngParams}; + +pub use jpeg_handler::JpegHandler; +pub use jpeg_loader::{JpegLoader, JpegParams}; diff --git a/crates/nvisy-codec/src/handler/image/png.rs b/crates/nvisy-codec/src/handler/image/png.rs deleted file mode 100644 index bfa9320..0000000 --- a/crates/nvisy-codec/src/handler/image/png.rs +++ /dev/null @@ -1,64 +0,0 @@ -//! 
PNG handler (stub — awaiting migration to Loader/Handler pattern). - -use bytes::Bytes; -use image::DynamicImage; - -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::fs::DocumentType; - -use crate::document::edit_stream::SpanEditStream; -use crate::document::view_stream::SpanStream; -use crate::handler::Handler; -use crate::render::image::AsImage; - -#[derive(Debug, Clone)] -pub struct PngHandler { - pub(crate) bytes: Bytes, -} - -impl PngHandler { - pub fn new(bytes: Bytes) -> Self { - Self { bytes } - } - - pub fn bytes(&self) -> &Bytes { - &self.bytes - } -} - -#[async_trait::async_trait] -impl Handler for PngHandler { - fn document_type(&self) -> DocumentType { - DocumentType::Png - } - - type SpanId = (); - type SpanData = (); - - async fn view_spans(&self) -> SpanStream<'_, (), ()> { - SpanStream::new(futures::stream::empty()) - } - - async fn edit_spans( - &mut self, - _edits: SpanEditStream<'_, (), ()>, - ) -> Result<(), Error> { - Ok(()) - } -} - -impl AsImage for PngHandler { - fn decode(&self) -> Result { - image::load_from_memory(&self.bytes).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("PNG decode failed: {e}")) - }) - } - - fn encode(image: &DynamicImage) -> Result { - let mut buf = std::io::Cursor::new(Vec::new()); - image.write_to(&mut buf, image::ImageFormat::Png).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("PNG encode failed: {e}")) - })?; - Ok(Self::new(Bytes::from(buf.into_inner()))) - } -} diff --git a/crates/nvisy-codec/src/handler/image/png_handler.rs b/crates/nvisy-codec/src/handler/image/png_handler.rs new file mode 100644 index 0000000..898ec6e --- /dev/null +++ b/crates/nvisy-codec/src/handler/image/png_handler.rs @@ -0,0 +1,80 @@ +//! PNG handler — holds a decoded image and provides single-span access +//! via [`Handler`]. +//! +//! # Span model +//! +//! [`Handler::view_spans`] yields exactly one [`Span`] whose data is the +//! current [`DynamicImage`]. [`Handler::edit_spans`] replaces the image +//! 
in-place. + +use image::DynamicImage; + +use nvisy_core::error::{Error, ErrorKind}; +use nvisy_core::fs::DocumentType; + +use crate::document::edit_stream::SpanEditStream; +use crate::document::view_stream::SpanStream; +use crate::handler::{Handler, Span}; +use crate::render::ImageHandler; + +use futures::StreamExt; + +/// Handler for loaded PNG content. +/// +/// Stores the decoded [`DynamicImage`] directly. The raw PNG bytes +/// can be produced on demand via [`PngHandler::encode_bytes`]. +#[derive(Debug, Clone)] +pub struct PngHandler { + image: DynamicImage, +} + +impl PngHandler { + /// Create a handler from an already-decoded image. + pub fn new(image: DynamicImage) -> Self { + Self { image } + } + + /// Reference to the decoded image. + pub fn image(&self) -> &DynamicImage { + &self.image + } + + /// Encode the current image to PNG bytes. + pub fn encode_bytes(&self) -> Result, Error> { + let mut buf = std::io::Cursor::new(Vec::new()); + self.image + .write_to(&mut buf, image::ImageFormat::Png) + .map_err(|e| Error::new(ErrorKind::Runtime, format!("PNG encode failed: {e}")))?; + Ok(buf.into_inner()) + } +} + +#[async_trait::async_trait] +impl Handler for PngHandler { + fn document_type(&self) -> DocumentType { + DocumentType::Png + } + + type SpanId = (); + type SpanData = DynamicImage; + + async fn view_spans(&self) -> SpanStream<'_, (), DynamicImage> { + SpanStream::new(futures::stream::iter(std::iter::once(Span { + id: (), + data: self.image.clone(), + }))) + } + + async fn edit_spans( + &mut self, + edits: SpanEditStream<'_, (), DynamicImage>, + ) -> Result<(), Error> { + let edits: Vec<_> = edits.collect().await; + if let Some(edit) = edits.into_iter().next() { + self.image = edit.data; + } + Ok(()) + } +} + +impl ImageHandler for PngHandler {} diff --git a/crates/nvisy-codec/src/handler/image/png_loader.rs b/crates/nvisy-codec/src/handler/image/png_loader.rs new file mode 100644 index 0000000..ffb0183 --- /dev/null +++ 
b/crates/nvisy-codec/src/handler/image/png_loader.rs @@ -0,0 +1,40 @@ +//! PNG loader — validates and decodes raw PNG bytes into a +//! [`Document`]. + +use image::DynamicImage; + +use nvisy_core::error::{Error, ErrorKind}; +use nvisy_core::io::ContentData; + +use crate::document::Document; +use crate::handler::{Loader, PngHandler}; + +/// Parameters for [`PngLoader`]. +#[derive(Debug, Default)] +pub struct PngParams; + +/// Loader that validates and decodes PNG files. +/// +/// Produces a single [`Document`] per input. +#[derive(Debug)] +pub struct PngLoader; + +#[async_trait::async_trait] +impl Loader for PngLoader { + type Handler = PngHandler; + type Params = PngParams; + + async fn load( + &self, + content: &ContentData, + _params: &Self::Params, + ) -> Result>, Error> { + let raw = content.to_bytes(); + let image: DynamicImage = image::load_from_memory(&raw) + .map_err(|e| Error::new(ErrorKind::Runtime, format!("PNG decode failed: {e}")))?; + + let handler = PngHandler::new(image); + let doc = Document::new(handler).with_parent(content); + Ok(vec![doc]) + } +} diff --git a/crates/nvisy-codec/src/handler/mod.rs b/crates/nvisy-codec/src/handler/mod.rs index f2922d9..1656bdb 100644 --- a/crates/nvisy-codec/src/handler/mod.rs +++ b/crates/nvisy-codec/src/handler/mod.rs @@ -18,31 +18,20 @@ use crate::document::Document; pub mod encoding; pub mod span; -pub mod text; -pub mod document; -pub mod image; -pub mod tabular; -pub mod audio; +mod text; +mod document; +mod image; +mod tabular; +mod audio; pub use encoding::TextEncoding; pub use span::{Span, SpanEdit}; -pub use text::txt_handler::{TxtData, TxtHandler, TxtSpan}; -pub use text::txt_loader::{TxtLoader, TxtParams}; -pub use text::csv_handler::{CsvData, CsvHandler, CsvSpan}; -pub use text::csv_loader::{CsvLoader, CsvParams}; -pub use text::json_handler::{ - JsonData, JsonHandler, JsonIndent, JsonPath, -}; -pub use text::json_loader::{JsonParams, JsonLoader}; - -#[cfg(feature = "png")] -pub use 
image::png::PngHandler; - -#[cfg(feature = "wav")] -pub use audio::wav::WavHandler; -#[cfg(feature = "mp3")] -pub use audio::mp3::Mp3Handler; +pub use text::*; +pub use document::*; +pub use image::*; +pub use tabular::*; +pub use audio::*; /// Trait implemented by all format handlers. /// diff --git a/crates/nvisy-codec/src/handler/tabular/mod.rs b/crates/nvisy-codec/src/handler/tabular/mod.rs index bb7cea7..2e3db39 100644 --- a/crates/nvisy-codec/src/handler/tabular/mod.rs +++ b/crates/nvisy-codec/src/handler/tabular/mod.rs @@ -1,4 +1,11 @@ //! Tabular/spreadsheet format handlers. #[cfg(feature = "xlsx")] -pub mod xlsx; +mod xlsx_handler; +#[cfg(feature = "xlsx")] +mod xlsx_loader; + +#[cfg(feature = "xlsx")] +pub use xlsx_handler::XlsxHandler; +#[cfg(feature = "xlsx")] +pub use xlsx_loader::{XlsxLoader, XlsxParams}; diff --git a/crates/nvisy-codec/src/handler/tabular/xlsx.rs b/crates/nvisy-codec/src/handler/tabular/xlsx_handler.rs similarity index 90% rename from crates/nvisy-codec/src/handler/tabular/xlsx.rs rename to crates/nvisy-codec/src/handler/tabular/xlsx_handler.rs index acf3abd..799ea2c 100644 --- a/crates/nvisy-codec/src/handler/tabular/xlsx.rs +++ b/crates/nvisy-codec/src/handler/tabular/xlsx_handler.rs @@ -1,4 +1,4 @@ -//! XLSX handler (stub — awaiting migration to Loader/Handler pattern). +//! XLSX handler (stub — awaiting full spreadsheet support). use nvisy_core::error::Error; use nvisy_core::fs::DocumentType; diff --git a/crates/nvisy-codec/src/handler/tabular/xlsx_loader.rs b/crates/nvisy-codec/src/handler/tabular/xlsx_loader.rs new file mode 100644 index 0000000..f181587 --- /dev/null +++ b/crates/nvisy-codec/src/handler/tabular/xlsx_loader.rs @@ -0,0 +1,33 @@ +//! XLSX loader (stub — awaiting full spreadsheet support). + +use nvisy_core::error::Error; +use nvisy_core::io::ContentData; + +use crate::document::Document; +use crate::handler::{Loader, XlsxHandler}; + +/// Parameters for [`XlsxLoader`]. 
+#[derive(Debug, Default)] +pub struct XlsxParams; + +/// Loader that parses XLSX spreadsheets. +/// +/// Produces a single [`Document`] per input. +#[derive(Debug)] +pub struct XlsxLoader; + +#[async_trait::async_trait] +impl Loader for XlsxLoader { + type Handler = XlsxHandler; + type Params = XlsxParams; + + async fn load( + &self, + _content: &ContentData, + _params: &Self::Params, + ) -> Result>, Error> { + let handler = XlsxHandler; + let doc = Document::new(handler).with_parent(_content); + Ok(vec![doc]) + } +} diff --git a/crates/nvisy-codec/src/handler/text/mod.rs b/crates/nvisy-codec/src/handler/text/mod.rs index 22b6542..366aa9e 100644 --- a/crates/nvisy-codec/src/handler/text/mod.rs +++ b/crates/nvisy-codec/src/handler/text/mod.rs @@ -1,10 +1,19 @@ //! Text-based format handlers. -pub mod txt_handler; -pub mod txt_loader; -pub mod csv_handler; -pub mod csv_loader; -pub mod json_handler; -pub mod json_loader; +mod txt_handler; +mod txt_loader; +mod csv_handler; +mod csv_loader; +mod json_handler; +mod json_loader; #[cfg(feature = "html")] -pub mod html; +mod html; + +pub use txt_handler::{TxtData, TxtHandler, TxtSpan}; +pub use txt_loader::{TxtLoader, TxtParams}; +pub use csv_handler::{CsvData, CsvHandler, CsvSpan}; +pub use csv_loader::{CsvLoader, CsvParams}; +pub use json_handler::{JsonData, JsonHandler, JsonIndent, JsonPath}; +pub use json_loader::{JsonLoader, JsonParams}; +#[cfg(feature = "html")] +pub use html::HtmlHandler; diff --git a/crates/nvisy-codec/src/handler/text/txt_handler.rs b/crates/nvisy-codec/src/handler/text/txt_handler.rs index 8213945..abe05ed 100644 --- a/crates/nvisy-codec/src/handler/text/txt_handler.rs +++ b/crates/nvisy-codec/src/handler/text/txt_handler.rs @@ -22,7 +22,7 @@ use nvisy_core::fs::DocumentType; use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; use crate::handler::{Handler, Span}; -use crate::render::text::AsText; +use crate::render::TextHandler; /// 0-based line index 
identifying a span within a plain-text document. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -109,24 +109,7 @@ impl TxtHandler { } } -impl AsText for TxtHandler { - fn content(&self) -> String { - let mut s = self.data.lines.join("\n"); - if self.data.trailing_newline { - s.push('\n'); - } - s - } - - fn replace_content(&self, content: &str) -> Result { - let trailing_newline = content.ends_with('\n'); - let lines: Vec = content.lines().map(String::from).collect(); - Ok(Self::new(TxtData { - lines, - trailing_newline, - })) - } -} +impl TextHandler for TxtHandler {} /// Iterator over lines of a plain-text document. struct TxtSpanIter<'a> { diff --git a/crates/nvisy-codec/src/render/audio/mod.rs b/crates/nvisy-codec/src/render/audio/mod.rs new file mode 100644 index 0000000..783f207 --- /dev/null +++ b/crates/nvisy-codec/src/render/audio/mod.rs @@ -0,0 +1,28 @@ +//! Audio redaction output type and rendering primitives. + +mod output; + +pub use output::AudioRedactionOutput; + +use crate::handler::Handler; +use nvisy_core::error::Error; + +/// A located audio redaction: pairs a time range with an +/// [`AudioRedactionOutput`] that carries the method-specific parameters. +pub struct AudioRedaction { + /// Start of the redacted segment in seconds. + pub start_secs: f64, + /// End of the redacted segment in seconds. + pub end_secs: f64, + /// The redaction output that determines the rendering method. + pub output: AudioRedactionOutput, +} + +/// Trait for handlers that support audio redaction. +/// +/// Extends [`Handler`] with a single [`redact_spans`](Self::redact_spans) +/// method that applies a batch of time-range audio redactions. +pub trait AudioHandler: Handler + Sized { + /// Apply a batch of audio redactions, returning a new handler. 
+ fn redact_spans(&self, redactions: &[AudioRedaction]) -> Result; +} diff --git a/crates/nvisy-codec/src/render/audio/output.rs b/crates/nvisy-codec/src/render/audio/output.rs new file mode 100644 index 0000000..dcf2942 --- /dev/null +++ b/crates/nvisy-codec/src/render/audio/output.rs @@ -0,0 +1,16 @@ +//! Audio redaction output type. + +use serde::{Deserialize, Serialize}; + +/// Audio redaction output — records the method used. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(schemars::JsonSchema)] +#[serde(tag = "method", rename_all = "snake_case")] +pub enum AudioRedactionOutput { + /// Segment replaced with silence. + Silence, + /// Segment removed entirely. + Remove, + /// Segment replaced with synthetic audio. + Synthesize, +} diff --git a/crates/nvisy-codec/src/render/image/mod.rs b/crates/nvisy-codec/src/render/image/mod.rs index ceb8f0c..1ad8855 100644 --- a/crates/nvisy-codec/src/render/image/mod.rs +++ b/crates/nvisy-codec/src/render/image/mod.rs @@ -1,19 +1,8 @@ -//! Image rendering primitives for redaction overlays. -//! -//! Provides gaussian blur, solid-color block overlay, and pixelation -//! functions that operate on [`DynamicImage`] values using bounding-box -//! regions. -//! -//! # Traits -//! -//! [`AsImage`] is the codec extension point: image format handlers -//! implement [`decode`](AsImage::decode) and [`encode`](AsImage::encode) -//! to round-trip through [`DynamicImage`]. -//! -//! [`AsRedactableImage`] adds a [`redact`](AsRedactableImage::redact) -//! convenience method that dispatches [`ImageRedactionOutput`] variants -//! to the appropriate rendering primitive. It is automatically -//! implemented for every type that implements [`AsImage`]. +//! Image redaction output type and rendering primitives. 
+ +mod output; + +pub use output::ImageRedactionOutput; mod blur; mod block; @@ -23,10 +12,13 @@ use blur::apply_gaussian_blur; use block::apply_block_overlay; use pixelate::apply_pixelate; -use ::image::DynamicImage; +use image::DynamicImage; +use futures::StreamExt; + +use crate::document::edit_stream::SpanEditStream; +use crate::handler::{Handler, SpanEdit}; use nvisy_core::error::Error; use nvisy_core::math::{BoundingBox, BoundingBoxU32}; -use crate::render::output::ImageRedactionOutput; /// A located image redaction: pairs a bounding box with an /// [`ImageRedactionOutput`] that carries the method-specific parameters. @@ -37,37 +29,34 @@ pub struct ImageRedaction { pub output: ImageRedactionOutput, } -/// Trait for handlers that wrap a raster image. -/// -/// Handlers implement [`decode`](Self::decode) and [`encode`](Self::encode) -/// to round-trip through [`DynamicImage`]. See [`AsRedactableImage`] for -/// the higher-level redaction API. -pub trait AsImage: Sized { - /// Decode the handler's raw bytes into a [`DynamicImage`]. - fn decode(&self) -> Result; - - /// Encode a [`DynamicImage`] back into a new handler instance. - fn encode(image: &DynamicImage) -> Result; -} - -/// Extension trait that adds [`ImageRedactionOutput`]-driven redaction -/// to any [`AsImage`] implementor. +/// Trait for handlers that support image redaction. /// -/// This trait is automatically implemented for every type that implements -/// [`AsImage`] — handler authors only need to implement [`AsImage`]. -pub trait AsRedactableImage: AsImage { - /// Apply a batch of image redactions, returning a new handler. - /// - /// Each [`ImageRedaction`] identifies a bounding box and an - /// [`ImageRedactionOutput`] that determines the rendering method - /// (blur, block, pixelate). The image is decoded once, all - /// redactions are applied in order, and then re-encoded. 
- fn redact(&self, redactions: &[ImageRedaction]) -> Result { +/// Extends [`Handler`] with [`redact_spans`](Self::redact_spans) which +/// applies a batch of bounding-box image redactions. The provided +/// default implementation reads the image via [`view_spans`](Handler::view_spans), +/// applies all redactions, and writes back via [`edit_spans`](Handler::edit_spans). +#[async_trait::async_trait] +pub trait ImageHandler: Handler +where + Self::SpanData: Into + From, +{ + /// Apply a batch of image redactions, mutating in place. + async fn redact_spans( + &mut self, + redactions: &[ImageRedaction], + ) -> Result<(), Error> { if redactions.is_empty() { - return Self::encode(&self.decode()?); + return Ok(()); } - let mut img = self.decode()?; + // Get the current image from the single span. + let spans: Vec<_> = self.view_spans().await.collect().await; + let span = match spans.into_iter().next() { + Some(s) => s, + None => return Ok(()), + }; + + let mut img: DynamicImage = span.data.into(); for r in redactions { let region = BoundingBoxU32::from(&r.bounding_box); @@ -88,9 +77,14 @@ pub trait AsRedactableImage: AsImage { } } - Self::encode(&img) + self.edit_spans(SpanEditStream::new(futures::stream::iter( + std::iter::once(SpanEdit { + id: span.id, + data: Self::SpanData::from(img), + }), + ))) + .await?; + + Ok(()) } } - -/// Blanket implementation: every [`AsImage`] type gets [`AsRedactableImage`] for free. -impl AsRedactableImage for T {} diff --git a/crates/nvisy-codec/src/render/image/output.rs b/crates/nvisy-codec/src/render/image/output.rs new file mode 100644 index 0000000..206787a --- /dev/null +++ b/crates/nvisy-codec/src/render/image/output.rs @@ -0,0 +1,18 @@ +//! Image redaction output type. + +use serde::{Deserialize, Serialize}; + +/// Image redaction output — records the method used and its parameters. 
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(schemars::JsonSchema)] +#[serde(tag = "method", rename_all = "snake_case")] +pub enum ImageRedactionOutput { + /// Gaussian blur applied to the region. + Blur { sigma: f32 }, + /// Opaque block overlay on the region. + Block { color: [u8; 4] }, + /// Pixelation (mosaic) applied to the region. + Pixelate { block_size: u32 }, + /// Region replaced with a synthetic image. + Synthesize, +} diff --git a/crates/nvisy-codec/src/render/mod.rs b/crates/nvisy-codec/src/render/mod.rs index b6c9b1b..22b1093 100644 --- a/crates/nvisy-codec/src/render/mod.rs +++ b/crates/nvisy-codec/src/render/mod.rs @@ -1,11 +1,11 @@ //! Rendering primitives for redaction overlays. -/// Redaction output types recording what was done. -pub mod output; +mod audio; +mod image; +mod output; +mod text; -/// Image rendering: blur and block overlay for bounding-box regions. -#[cfg(any(feature = "png", feature = "jpeg"))] -pub mod image; - -/// Text rendering: byte-offset replacement engine and cell-level masking. -pub mod text; +pub use audio::{AudioHandler, AudioRedaction, AudioRedactionOutput}; +pub use image::{ImageHandler, ImageRedaction, ImageRedactionOutput}; +pub use output::RedactionOutput; +pub use text::{TextHandler, TextRedaction, TextRedactionOutput}; diff --git a/crates/nvisy-codec/src/render/output.rs b/crates/nvisy-codec/src/render/output.rs index 3498a38..3764539 100644 --- a/crates/nvisy-codec/src/render/output.rs +++ b/crates/nvisy-codec/src/render/output.rs @@ -1,145 +1,11 @@ -//! Data-carrying redaction output enums recording what was done. -//! -//! A [`RedactionOutput`] records the method that was applied and its result -//! data (replacement string, ciphertext, blur sigma, etc.). +//! Unified redaction output enum combining all modality-specific outputs. 
use derive_more::From; use serde::{Deserialize, Serialize}; -use nvisy_core::redaction::{ - AudioRedactionMethod, ImageRedactionMethod, RedactionMethod, TextRedactionMethod, -}; - -/// Text redaction output — records the method used and its replacement data. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -#[serde(tag = "method", rename_all = "snake_case")] -pub enum TextRedactionOutput { - /// Characters replaced with a mask character. - Mask { - replacement: String, - mask_char: char, - }, - /// Substituted with a fixed placeholder string. - Replace { replacement: String }, - /// Replaced with a one-way hash. - Hash { hash_value: String }, - /// Encrypted; recoverable with the referenced key. - Encrypt { ciphertext: String, key_id: String }, - /// Removed entirely from the output. - Remove, - /// Replaced with a synthetically generated value. - Synthesize { replacement: String }, - /// Replaced with a consistent pseudonym. - Pseudonymize { pseudonym: String }, - /// Replaced with a vault-backed reversible token. - Tokenize { - token: String, - vault_id: Option, - }, - /// Aggregated into a range or bucket. - Aggregate { replacement: String }, - /// Generalized to a less precise value. - Generalize { - replacement: String, - level: Option, - }, - /// Date shifted by a consistent offset. - DateShift { - replacement: String, - offset_days: i64, - }, -} - -/// Image redaction output — records the method used and its parameters. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -#[serde(tag = "method", rename_all = "snake_case")] -pub enum ImageRedactionOutput { - /// Gaussian blur applied to the region. - Blur { sigma: f32 }, - /// Opaque block overlay on the region. - Block { color: [u8; 4] }, - /// Pixelation (mosaic) applied to the region. - Pixelate { block_size: u32 }, - /// Region replaced with a synthetic image. 
- Synthesize, -} - -/// Audio redaction output — records the method used. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -#[serde(tag = "method", rename_all = "snake_case")] -pub enum AudioRedactionOutput { - /// Segment replaced with silence. - Silence, - /// Segment removed entirely. - Remove, - /// Segment replaced with synthetic audio. - Synthesize, -} - -impl TextRedactionOutput { - /// Returns the [`TextRedactionMethod`] tag this output corresponds to. - pub fn method(&self) -> TextRedactionMethod { - match self { - Self::Mask { .. } => TextRedactionMethod::Mask, - Self::Replace { .. } => TextRedactionMethod::Replace, - Self::Hash { .. } => TextRedactionMethod::Hash, - Self::Encrypt { .. } => TextRedactionMethod::Encrypt, - Self::Remove => TextRedactionMethod::Remove, - Self::Synthesize { .. } => TextRedactionMethod::Synthesize, - Self::Pseudonymize { .. } => TextRedactionMethod::Pseudonymize, - Self::Tokenize { .. } => TextRedactionMethod::Tokenize, - Self::Aggregate { .. } => TextRedactionMethod::Aggregate, - Self::Generalize { .. } => TextRedactionMethod::Generalize, - Self::DateShift { .. } => TextRedactionMethod::DateShift, - } - } - - /// Returns the text replacement string, regardless of specific method. - /// - /// Returns `None` for [`Remove`](Self::Remove) — the caller should - /// treat that as an empty string (span deleted). - pub fn replacement_value(&self) -> Option<&str> { - match self { - Self::Mask { replacement, .. } => Some(replacement), - Self::Replace { replacement } => Some(replacement), - Self::Hash { hash_value } => Some(hash_value), - Self::Encrypt { ciphertext, .. } => Some(ciphertext), - Self::Remove => None, - Self::Synthesize { replacement } => Some(replacement), - Self::Pseudonymize { pseudonym } => Some(pseudonym), - Self::Tokenize { token, .. } => Some(token), - Self::Aggregate { replacement } => Some(replacement), - Self::Generalize { replacement, .. 
} => Some(replacement), - Self::DateShift { replacement, .. } => Some(replacement), - } - } -} - -impl ImageRedactionOutput { - /// Returns the [`ImageRedactionMethod`] tag this output corresponds to. - pub fn method(&self) -> ImageRedactionMethod { - match self { - Self::Blur { .. } => ImageRedactionMethod::Blur, - Self::Block { .. } => ImageRedactionMethod::Block, - Self::Pixelate { .. } => ImageRedactionMethod::Pixelate, - Self::Synthesize => ImageRedactionMethod::Synthesize, - } - } -} - -impl AudioRedactionOutput { - /// Returns the [`AudioRedactionMethod`] tag this output corresponds to. - pub fn method(&self) -> AudioRedactionMethod { - match self { - Self::Silence => AudioRedactionMethod::Silence, - Self::Remove => AudioRedactionMethod::Remove, - Self::Synthesize => AudioRedactionMethod::Synthesize, - } - } -} +use crate::render::audio::AudioRedactionOutput; +use crate::render::image::ImageRedactionOutput; +use crate::render::text::TextRedactionOutput; /// Unified redaction output that wraps modality-specific output variants. /// @@ -156,25 +22,3 @@ pub enum RedactionOutput { /// Audio redaction output. Audio(AudioRedactionOutput), } - -impl RedactionOutput { - /// Returns the text replacement string, regardless of specific method. - /// - /// Used by apply actions that just need to know "what string goes here". - /// Returns `None` for image and audio outputs, or text `Remove`. - pub fn replacement_value(&self) -> Option<&str> { - match self { - Self::Text(t) => t.replacement_value(), - Self::Image(_) | Self::Audio(_) => None, - } - } - - /// Returns the [`RedactionMethod`] tag this output corresponds to. 
- pub fn method(&self) -> RedactionMethod { - match self { - Self::Text(t) => RedactionMethod::Text(t.method()), - Self::Image(i) => RedactionMethod::Image(i.method()), - Self::Audio(a) => RedactionMethod::Audio(a.method()), - } - } -} diff --git a/crates/nvisy-codec/src/render/text/mask.rs b/crates/nvisy-codec/src/render/text/mask.rs index 1682878..46808c9 100644 --- a/crates/nvisy-codec/src/render/text/mask.rs +++ b/crates/nvisy-codec/src/render/text/mask.rs @@ -1,47 +1,46 @@ //! Cell-level masking and hashing utilities. -//! -//! These functions are used by tabular redaction actions to transform -//! individual cell values according to a [`TextRedactionOutput`] variant. -use crate::render::output::TextRedactionOutput; +use super::TextRedactionOutput; -/// Redact a single cell value according to `output`. -/// -/// Dispatches on the [`TextRedactionOutput`] variant: -/// - **Mask**: preserve the last 4 characters, replacing the rest with the -/// mask character from the output. -/// - **Remove**: return an empty string. -/// - **Hash**: return `[HASH:{hex}]` using [`hash_string`]. -/// - **Other variants**: use the output's replacement value directly. -pub fn mask_cell(cell: &str, output: &TextRedactionOutput) -> String { - match output { - TextRedactionOutput::Mask { mask_char, .. } => { - let char_count = cell.chars().count(); - if char_count > 4 { - let masked: String = cell - .chars() - .take(char_count - 4) - .map(|_| *mask_char) - .collect(); - let tail: String = cell.chars().skip(char_count - 4).collect(); - format!("{masked}{tail}") - } else { - mask_char.to_string().repeat(char_count) +impl TextRedactionOutput { + /// Redact a single cell value according to `self`. + /// + /// Dispatches on the variant: + /// - **Mask**: preserve the last 4 characters, replacing the rest with the + /// mask character from the output. + /// - **Remove**: return an empty string. + /// - **Hash**: return `[HASH:{hex}]` using a deterministic hash. 
+ /// - **Other variants**: use the output's replacement value directly. + pub fn mask_cell(&self, cell: &str) -> String { + match self { + Self::Mask { mask_char, .. } => { + let char_count = cell.chars().count(); + if char_count > 4 { + let masked: String = cell + .chars() + .take(char_count - 4) + .map(|_| *mask_char) + .collect(); + let tail: String = cell.chars().skip(char_count - 4).collect(); + format!("{masked}{tail}") + } else { + mask_char.to_string().repeat(char_count) + } } + Self::Remove => String::new(), + Self::Hash { .. } => { + format!("[HASH:{:x}]", hash_string(cell)) + } + _ => self + .replacement_value() + .unwrap_or_default() + .to_string(), } - TextRedactionOutput::Remove => String::new(), - TextRedactionOutput::Hash { .. } => { - format!("[HASH:{:x}]", hash_string(cell)) - } - _ => output - .replacement_value() - .unwrap_or_default() - .to_string(), } } /// Compute a deterministic 64-bit hash of `s` using [`DefaultHasher`](std::collections::hash_map::DefaultHasher). -pub fn hash_string(s: &str) -> u64 { +fn hash_string(s: &str) -> u64 { use std::hash::{Hash, Hasher}; let mut hasher = std::collections::hash_map::DefaultHasher::new(); s.hash(&mut hasher); diff --git a/crates/nvisy-codec/src/render/text/mod.rs b/crates/nvisy-codec/src/render/text/mod.rs index 17ded0a..c28b276 100644 --- a/crates/nvisy-codec/src/render/text/mod.rs +++ b/crates/nvisy-codec/src/render/text/mod.rs @@ -1,92 +1,113 @@ //! Text rendering and redaction primitives. //! -//! Provides byte-offset replacement, cell-level masking, and the -//! [`AsText`] / [`AsRedactableText`] traits that text-bearing handlers -//! implement to support redaction in a single call. -//! -//! # Traits -//! -//! [`AsText`] is the codec extension point: text format handlers -//! implement [`content`](AsText::content) and -//! [`replace_content`](AsText::replace_content) to read and write their -//! backing text. -//! -//! [`AsRedactableText`] adds a [`redact`](AsRedactableText::redact) -//! 
convenience method that resolves [`TextRedaction`] items into -//! byte-offset replacements. It is automatically implemented for every -//! type that implements [`AsText`]. +//! Provides the [`TextHandler`] async trait that text-bearing handlers +//! implement to support span-aware redaction. The default implementation +//! groups redactions by [`SpanId`](Handler::SpanId), reads current content +//! via [`Handler::view_spans`], applies intra-span byte-offset replacements +//! right-to-left, and writes the results back via [`Handler::edit_spans`]. mod mask; -mod replace; +mod output; -pub use mask::mask_cell; +pub use output::TextRedactionOutput; -use replace::{apply_replacements, PendingReplacement}; +use std::collections::HashMap; +use std::hash::Hash; +use futures::StreamExt; + +use crate::document::edit_stream::SpanEditStream; +use crate::handler::{Handler, SpanEdit}; use nvisy_core::error::Error; -use crate::render::output::TextRedactionOutput; -/// A located text redaction: pairs a byte range with a -/// [`TextRedactionOutput`] that carries the already-resolved replacement. -pub struct TextRedaction { - /// Byte offset where the redacted span starts in the content. +/// A located text redaction: pairs a span identifier and intra-span byte +/// range with a [`TextRedactionOutput`] that carries the replacement. +pub struct TextRedaction { + /// Which span this redaction targets. + pub span_id: S, + /// Byte offset where the redacted region starts within the span. pub start: usize, - /// Byte offset where the redacted span ends (exclusive) in the content. + /// Byte offset where the redacted region ends (exclusive) within the span. pub end: usize, /// The redaction output that carries the replacement value. pub output: TextRedactionOutput, } -/// Trait for handlers that wrap text content. +/// Trait for handlers that support text redaction. 
/// -/// Handlers implement [`content`](Self::content) and -/// [`replace_content`](Self::replace_content) to round-trip through -/// plain text. See [`AsRedactableText`] for the higher-level redaction -/// API. -pub trait AsText: Sized { - /// Return the handler's full text content as a single string. - fn content(&self) -> String; - - /// Build a new handler instance with the given text content. - fn replace_content(&self, content: &str) -> Result; -} - -/// Extension trait that adds [`TextRedactionOutput`]-driven redaction -/// to any [`AsText`] implementor. -/// -/// This trait is automatically implemented for every type that implements -/// [`AsText`] — handler authors only need to implement [`AsText`]. -pub trait AsRedactableText: AsText { - /// Apply a batch of text redactions, returning a new handler. +/// Extends [`Handler`] with [`redact_spans`](Self::redact_spans) which +/// applies a batch of span-aware text redactions. The provided default +/// implementation groups redactions by span, reads content via +/// [`view_spans`](Handler::view_spans), applies byte-offset replacements +/// right-to-left per span, and writes back via +/// [`edit_spans`](Handler::edit_spans). +#[async_trait::async_trait] +pub trait TextHandler: Handler +where + Self::SpanId: Eq + Hash, + Self::SpanData: AsRef + From, +{ + /// Apply a batch of text redactions, mutating in place. /// - /// Each [`TextRedaction`] identifies a byte range and a - /// [`TextRedactionOutput`] whose replacement value is written into - /// the content. Replacements are applied right-to-left so that byte - /// offsets remain valid. - fn redact(&self, redactions: &[TextRedaction]) -> Result { + /// Each [`TextRedaction`] identifies a span and an intra-span byte + /// range together with a [`TextRedactionOutput`] whose replacement + /// value is written into the content. Replacements within each span + /// are applied right-to-left so that byte offsets remain valid. 
+ async fn redact_spans( + &mut self, + redactions: &[TextRedaction], + ) -> Result<(), Error> { if redactions.is_empty() { - return self.replace_content(&self.content()); + return Ok(()); + } + + // Group redactions by span id. + let mut by_span: HashMap<&Self::SpanId, Vec<(usize, usize, String)>> = HashMap::new(); + for r in redactions { + let value = r + .output + .replacement_value() + .unwrap_or_default() + .to_string(); + by_span + .entry(&r.span_id) + .or_default() + .push((r.start, r.end, value)); } - let content = self.content(); - let mut pending: Vec = redactions - .iter() - .map(|r| { - let value = r.output.replacement_value() - .unwrap_or_default() - .to_string(); - PendingReplacement { - start: r.start, - end: r.end, - value, + // Read current content for affected spans. + let all_spans: Vec<_> = self.view_spans().await.collect().await; + + let mut edits: Vec> = Vec::new(); + for span in &all_spans { + if let Some(replacements) = by_span.get_mut(&span.id) { + let content = span.data.as_ref(); + + // Sort right-to-left so earlier byte offsets stay valid. + replacements.sort_by(|a, b| b.0.cmp(&a.0)); + + let mut result = content.to_string(); + for (start, end, value) in replacements.iter() { + let s = (*start).min(result.len()); + let e = (*end).min(result.len()); + if s >= e { + continue; + } + result = format!("{}{}{}", &result[..s], value, &result[e..]); } - }) - .collect(); - let result = apply_replacements(&content, &mut pending); - self.replace_content(&result) + edits.push(SpanEdit { + id: span.id.clone(), + data: Self::SpanData::from(result), + }); + } + } + + if !edits.is_empty() { + self.edit_spans(SpanEditStream::new(futures::stream::iter(edits))) + .await?; + } + + Ok(()) } } - -/// Blanket implementation: every [`AsText`] type gets [`AsRedactableText`] for free. 
-impl AsRedactableText for T {} diff --git a/crates/nvisy-codec/src/render/text/output.rs b/crates/nvisy-codec/src/render/text/output.rs new file mode 100644 index 0000000..88dc413 --- /dev/null +++ b/crates/nvisy-codec/src/render/text/output.rs @@ -0,0 +1,66 @@ +//! Text redaction output type. + +use serde::{Deserialize, Serialize}; + +/// Text redaction output — records the method used and its replacement data. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(schemars::JsonSchema)] +#[serde(tag = "method", rename_all = "snake_case")] +pub enum TextRedactionOutput { + /// Characters replaced with a mask character. + Mask { + replacement: String, + mask_char: char, + }, + /// Substituted with a fixed placeholder string. + Replace { replacement: String }, + /// Replaced with a one-way hash. + Hash { hash_value: String }, + /// Encrypted; recoverable with the referenced key. + Encrypt { ciphertext: String, key_id: String }, + /// Removed entirely from the output. + Remove, + /// Replaced with a synthetically generated value. + Synthesize { replacement: String }, + /// Replaced with a consistent pseudonym. + Pseudonymize { pseudonym: String }, + /// Replaced with a vault-backed reversible token. + Tokenize { + token: String, + vault_id: Option, + }, + /// Aggregated into a range or bucket. + Aggregate { replacement: String }, + /// Generalized to a less precise value. + Generalize { + replacement: String, + level: Option, + }, + /// Date shifted by a consistent offset. + DateShift { + replacement: String, + offset_days: i64, + }, +} + +impl TextRedactionOutput { + /// Returns the text replacement string, regardless of specific method. + /// + /// Returns `None` for [`Remove`](Self::Remove) — the caller should + /// treat that as an empty string (span deleted). + pub fn replacement_value(&self) -> Option<&str> { + match self { + Self::Mask { replacement, .. 
} => Some(replacement), + Self::Replace { replacement } => Some(replacement), + Self::Hash { hash_value } => Some(hash_value), + Self::Encrypt { ciphertext, .. } => Some(ciphertext), + Self::Remove => None, + Self::Synthesize { replacement } => Some(replacement), + Self::Pseudonymize { pseudonym } => Some(pseudonym), + Self::Tokenize { token, .. } => Some(token), + Self::Aggregate { replacement } => Some(replacement), + Self::Generalize { replacement, .. } => Some(replacement), + Self::DateShift { replacement, .. } => Some(replacement), + } + } +} diff --git a/crates/nvisy-codec/src/render/text/replace.rs b/crates/nvisy-codec/src/render/text/replace.rs deleted file mode 100644 index 733c328..0000000 --- a/crates/nvisy-codec/src/render/text/replace.rs +++ /dev/null @@ -1,44 +0,0 @@ -//! Byte-offset text replacement engine. -//! -//! Provides a simple but correct algorithm for applying multiple -//! non-overlapping replacements to a string by processing them -//! right-to-left (descending start offset). This ensures that each -//! substitution does not invalidate the byte offsets of earlier -//! (leftward) replacements. - -/// A single text replacement that has been resolved but not yet applied. -pub struct PendingReplacement { - /// Byte offset where the replacement starts in the original text. - pub start: usize, - /// Byte offset where the replacement ends (exclusive) in the original text. - pub end: usize, - /// The string that will replace the original span. - pub value: String, -} - -/// Apply a set of pending replacements to `text`, returning the result. -/// -/// Replacements are applied right-to-left (descending start offset) so that -/// earlier byte offsets remain valid after each substitution. Out-of-range -/// offsets are clamped to the text length and empty spans are skipped. 
-pub fn apply_replacements(text: &str, pending: &mut [PendingReplacement]) -> String { - // Sort by start offset descending (right-to-left) to preserve positions - pending.sort_by(|a, b| b.start.cmp(&a.start)); - - let mut result = text.to_string(); - for replacement in pending.iter() { - let start = replacement.start.min(result.len()); - let end = replacement.end.min(result.len()); - if start >= end { - continue; - } - - result = format!( - "{}{}{}", - &result[..start], - replacement.value, - &result[end..] - ); - } - result -} diff --git a/crates/nvisy-core/src/lib.rs b/crates/nvisy-core/src/lib.rs index 9c381be..adaa7ce 100644 --- a/crates/nvisy-core/src/lib.rs +++ b/crates/nvisy-core/src/lib.rs @@ -8,7 +8,5 @@ pub mod fs; pub mod io; pub mod math; pub mod path; -pub mod redaction; - #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-core/src/redaction/method.rs b/crates/nvisy-core/src/redaction/method.rs deleted file mode 100644 index 15cfa7f..0000000 --- a/crates/nvisy-core/src/redaction/method.rs +++ /dev/null @@ -1,112 +0,0 @@ -//! Plain-tag redaction method enums. -//! -//! These are lightweight identifiers that name a redaction algorithm without -//! carrying any configuration data. For a data-carrying request see -//! [`RedactionSpec`](super::RedactionSpec); for a data-carrying result see -//! [`RedactionOutput`](super::RedactionOutput). - -use derive_more::From; -use serde::{Deserialize, Serialize}; - -/// Redaction strategies for text and tabular content. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -#[serde(rename_all = "snake_case")] -#[strum(serialize_all = "snake_case")] -pub enum TextRedactionMethod { - /// Replace characters with a mask character (e.g. `***-**-1234`). - Mask, - /// Substitute with a fixed placeholder string. - Replace, - /// Replace with a one-way hash of the original value. 
- Hash, - /// Encrypt the value so it can be recovered later with a key. - Encrypt, - /// Remove the value entirely from the output. - Remove, - /// Replace with a synthetically generated realistic value. - Synthesize, - /// Replace with a consistent pseudonym across the document. - Pseudonymize, - /// Replace with a vault-backed reversible token (e.g. `USER_001`). - Tokenize, - /// Aggregate value into a range or bucket (e.g. age 34 → 30-39). - Aggregate, - /// Generalize to a less precise value (e.g. street → city → country). - Generalize, - /// Shift dates by a random but consistent offset, preserving intervals. - DateShift, -} - -/// Redaction strategies for image and video regions. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -#[serde(rename_all = "snake_case")] -#[strum(serialize_all = "snake_case")] -pub enum ImageRedactionMethod { - /// Apply a gaussian blur to the region. - Blur, - /// Overlay an opaque block over the region. - Block, - /// Apply pixelation to the region (mosaic effect). - Pixelate, - /// Replace with a synthetically generated region. - Synthesize, -} - -/// Redaction strategies for audio segments. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::Display, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -#[serde(rename_all = "snake_case")] -#[strum(serialize_all = "snake_case")] -pub enum AudioRedactionMethod { - /// Replace the audio segment with silence. - Silence, - /// Remove the audio segment entirely. - Remove, - /// Replace with synthetically generated audio. - Synthesize, -} - -/// Unified redaction strategy tag that wraps modality-specific methods. -/// -/// This is a lightweight identifier — it names the algorithm but carries no -/// configuration data. For a data-carrying request use [`RedactionSpec`](super::RedactionSpec); -/// for a data-carrying result use [`RedactionOutput`](super::RedactionOutput). 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, From, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -#[serde(rename_all = "snake_case")] -pub enum RedactionMethod { - /// Text/tabular redaction strategy. - Text(TextRedactionMethod), - /// Image/video redaction strategy. - Image(ImageRedactionMethod), - /// Audio redaction strategy. - Audio(AudioRedactionMethod), -} - -impl RedactionMethod { - /// Returns the text redaction method if this is a text variant. - pub fn as_text(&self) -> Option { - match self { - Self::Text(m) => Some(*m), - _ => None, - } - } - - /// Returns the image redaction method if this is an image variant. - pub fn as_image(&self) -> Option { - match self { - Self::Image(m) => Some(*m), - _ => None, - } - } - - /// Returns the audio redaction method if this is an audio variant. - pub fn as_audio(&self) -> Option { - match self { - Self::Audio(m) => Some(*m), - _ => None, - } - } -} diff --git a/crates/nvisy-core/src/redaction/mod.rs b/crates/nvisy-core/src/redaction/mod.rs deleted file mode 100644 index 647b9d6..0000000 --- a/crates/nvisy-core/src/redaction/mod.rs +++ /dev/null @@ -1,15 +0,0 @@ -//! Redaction method tag enums. -//! -//! Lightweight identifiers that name a redaction algorithm without -//! carrying any configuration data. -//! -//! - [`TextRedactionMethod`] — text/tabular strategies (mask, replace, hash, etc.) -//! - [`ImageRedactionMethod`] — image/video strategies (blur, block, pixelate) -//! - [`AudioRedactionMethod`] — audio strategies (silence, remove) -//! 
- [`RedactionMethod`] — unified wrapper - -mod method; - -pub use method::{ - AudioRedactionMethod, ImageRedactionMethod, RedactionMethod, TextRedactionMethod, -}; diff --git a/crates/nvisy-pipeline/Cargo.toml b/crates/nvisy-pipeline/Cargo.toml index 5c1d1cb..4317448 100644 --- a/crates/nvisy-pipeline/Cargo.toml +++ b/crates/nvisy-pipeline/Cargo.toml @@ -17,17 +17,6 @@ repository = { workspace = true } homepage = { workspace = true } documentation = { workspace = true } -[package.metadata.docs.rs] -all-features = true -rustdoc-args = ["--cfg", "docsrs"] - -[features] -default = ["image-redaction", "audio-redaction"] -# Image redaction (blur, block); enables nvisy-codec image handlers and rendering -image-redaction = ["nvisy-codec/png", "nvisy-codec/jpeg"] -# Audio redaction pass-through; enables nvisy-codec/wav for WavHandler -audio-redaction = ["nvisy-codec/wav"] - [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } diff --git a/crates/nvisy-pipeline/src/detection/ner.rs b/crates/nvisy-pipeline/src/detection/ner.rs index d74c938..275a994 100644 --- a/crates/nvisy-pipeline/src/detection/ner.rs +++ b/crates/nvisy-pipeline/src/detection/ner.rs @@ -4,15 +4,12 @@ use serde::Deserialize; use serde_json::Value; use nvisy_codec::document::Document; -use nvisy_codec::handler::TxtHandler; +use nvisy_codec::handler::{TxtHandler, PngHandler}; use nvisy_core::entity::EntityCategory; use nvisy_core::error::Error; use crate::ontology::entity::{DetectionMethod, Entity, TextLocation}; -#[cfg(feature = "image-redaction")] -use nvisy_codec::handler::PngHandler; - fn default_confidence() -> f64 { 0.5 } @@ -69,8 +66,7 @@ pub struct DetectNerParams { pub struct DetectNerInput { /// Text documents to scan for named entities. pub text_docs: Vec>, - /// Image documents to scan for named entities (feature-gated). - #[cfg(feature = "image-redaction")] + /// Image documents to scan for named entities. 
pub image_docs: Vec>, } @@ -94,7 +90,7 @@ impl DetectNerAction { } } - /// Execute NER detection on text documents and (optionally) image documents. + /// Execute NER detection on text documents and image documents. pub async fn run(&self, input: DetectNerInput) -> Result, Error> { let config = self.config(); let mut entities = Vec::new(); @@ -105,11 +101,11 @@ impl DetectNerAction { entities.extend(parse_ner_entities(&raw)?); } - #[cfg(feature = "image-redaction")] for doc in &input.image_docs { + let png_bytes = doc.handler().encode_bytes()?; let raw = self .backend - .detect_image(doc.handler().bytes(), "image/png", &config) + .detect_image(&png_bytes, "image/png", &config) .await?; entities.extend(parse_ner_entities(&raw)?); } diff --git a/crates/nvisy-pipeline/src/generation/mod.rs b/crates/nvisy-pipeline/src/generation/mod.rs index be6c759..ce71906 100644 --- a/crates/nvisy-pipeline/src/generation/mod.rs +++ b/crates/nvisy-pipeline/src/generation/mod.rs @@ -5,10 +5,8 @@ //! from documents. /// OCR text extraction from image documents. -#[cfg(feature = "image-redaction")] pub mod ocr; /// Synthetic replacement value generation for Synthesize redactions. pub mod synthetic; /// Speech-to-text transcription from audio documents. 
-#[cfg(feature = "audio-redaction")] pub mod transcribe; diff --git a/crates/nvisy-pipeline/src/generation/ocr.rs b/crates/nvisy-pipeline/src/generation/ocr.rs index 3fde8fe..c3c5b4a 100644 --- a/crates/nvisy-pipeline/src/generation/ocr.rs +++ b/crates/nvisy-pipeline/src/generation/ocr.rs @@ -107,9 +107,10 @@ impl GenerateOcrAction { let mut all_ocr_text = Vec::new(); for doc in &input.image_docs { + let png_bytes = doc.handler().encode_bytes()?; let raw = self .backend - .detect_ocr(doc.handler().bytes(), "image/png", &config) + .detect_ocr(&png_bytes, "image/png", &config) .await?; let entities = parse_ocr_entities(&raw)?; for entity in &entities { diff --git a/crates/nvisy-pipeline/src/ontology/redaction/mod.rs b/crates/nvisy-pipeline/src/ontology/redaction/mod.rs index 2abe0e4..22c67dc 100644 --- a/crates/nvisy-pipeline/src/ontology/redaction/mod.rs +++ b/crates/nvisy-pipeline/src/ontology/redaction/mod.rs @@ -22,7 +22,7 @@ pub use trait_::Redactable; use serde::{Deserialize, Serialize}; use uuid::Uuid; -use nvisy_codec::render::output::RedactionOutput; +use nvisy_codec::render::RedactionOutput; use nvisy_core::path::ContentSource; /// A redaction decision recording how a specific entity was (or will be) redacted. diff --git a/crates/nvisy-pipeline/src/ontology/redaction/spec.rs b/crates/nvisy-pipeline/src/ontology/redaction/spec.rs index e2f0ed3..e8c692b 100644 --- a/crates/nvisy-pipeline/src/ontology/redaction/spec.rs +++ b/crates/nvisy-pipeline/src/ontology/redaction/spec.rs @@ -7,10 +7,6 @@ use derive_more::From; use serde::{Deserialize, Serialize}; -use nvisy_core::redaction::{ - AudioRedactionMethod, ImageRedactionMethod, RedactionMethod, TextRedactionMethod, -}; - /// Text redaction specification with method-specific configuration. 
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] #[derive(schemars::JsonSchema)] @@ -129,48 +125,6 @@ pub enum AudioRedactionSpec { Synthesize, } -impl TextRedactionSpec { - /// Returns the [`TextRedactionMethod`] tag this spec corresponds to. - pub fn method(&self) -> TextRedactionMethod { - match self { - Self::Mask { .. } => TextRedactionMethod::Mask, - Self::Replace { .. } => TextRedactionMethod::Replace, - Self::Hash => TextRedactionMethod::Hash, - Self::Encrypt { .. } => TextRedactionMethod::Encrypt, - Self::Remove => TextRedactionMethod::Remove, - Self::Synthesize => TextRedactionMethod::Synthesize, - Self::Pseudonymize => TextRedactionMethod::Pseudonymize, - Self::Tokenize { .. } => TextRedactionMethod::Tokenize, - Self::Aggregate => TextRedactionMethod::Aggregate, - Self::Generalize { .. } => TextRedactionMethod::Generalize, - Self::DateShift { .. } => TextRedactionMethod::DateShift, - } - } -} - -impl ImageRedactionSpec { - /// Returns the [`ImageRedactionMethod`] tag this spec corresponds to. - pub fn method(&self) -> ImageRedactionMethod { - match self { - Self::Blur { .. } => ImageRedactionMethod::Blur, - Self::Block { .. } => ImageRedactionMethod::Block, - Self::Pixelate { .. } => ImageRedactionMethod::Pixelate, - Self::Synthesize => ImageRedactionMethod::Synthesize, - } - } -} - -impl AudioRedactionSpec { - /// Returns the [`AudioRedactionMethod`] tag this spec corresponds to. - pub fn method(&self) -> AudioRedactionMethod { - match self { - Self::Silence => AudioRedactionMethod::Silence, - Self::Remove => AudioRedactionMethod::Remove, - Self::Synthesize => AudioRedactionMethod::Synthesize, - } - } -} - /// Unified redaction specification submitted to the engine. /// /// Carries the method to apply and its configuration parameters. @@ -185,14 +139,3 @@ pub enum RedactionSpec { /// Audio redaction specification. Audio(AudioRedactionSpec), } - -impl RedactionSpec { - /// Returns the [`RedactionMethod`] tag this spec corresponds to. 
- pub fn method(&self) -> RedactionMethod { - match self { - Self::Text(t) => RedactionMethod::Text(t.method()), - Self::Image(i) => RedactionMethod::Image(i.method()), - Self::Audio(a) => RedactionMethod::Audio(a.method()), - } - } -} diff --git a/crates/nvisy-pipeline/src/prelude.rs b/crates/nvisy-pipeline/src/prelude.rs index 2c45862..b2a4c19 100644 --- a/crates/nvisy-pipeline/src/prelude.rs +++ b/crates/nvisy-pipeline/src/prelude.rs @@ -18,14 +18,10 @@ pub use crate::redaction::emit_audit::{EmitAuditAction, EmitAuditParams}; pub use crate::generation::synthetic::{ GenerateSyntheticAction, GenerateSyntheticParams, GenerateSyntheticInput, }; - -#[cfg(feature = "image-redaction")] pub use crate::generation::ocr::{ GenerateOcrAction, GenerateOcrParams, GenerateOcrInput, GenerateOcrOutput, OcrBackend, OcrConfig, }; - -#[cfg(feature = "audio-redaction")] pub use crate::generation::transcribe::{ GenerateTranscribeAction, GenerateTranscribeParams, GenerateTranscribeInput, GenerateTranscribeOutput, diff --git a/crates/nvisy-pipeline/src/redaction/apply.rs b/crates/nvisy-pipeline/src/redaction/apply.rs index c8696ff..657690d 100644 --- a/crates/nvisy-pipeline/src/redaction/apply.rs +++ b/crates/nvisy-pipeline/src/redaction/apply.rs @@ -4,22 +4,15 @@ use std::collections::HashMap; use uuid::Uuid; use serde::Deserialize; -use nvisy_codec::handler::{TxtHandler, CsvHandler}; +use nvisy_codec::handler::{TxtHandler, CsvHandler, PngHandler, WavHandler}; use nvisy_codec::document::Document; -use nvisy_codec::render::text::{TextRedaction, AsRedactableText, mask_cell}; -use nvisy_codec::render::output::RedactionOutput; +use nvisy_codec::handler::TxtSpan; +use nvisy_codec::render::{TextRedaction, TextRedactionOutput, TextHandler, RedactionOutput}; +use nvisy_codec::render::{ImageRedaction, ImageHandler}; use crate::ontology::redaction::Redaction; use crate::ontology::entity::Entity; use nvisy_core::error::Error; -#[cfg(feature = "image-redaction")] -use 
nvisy_codec::handler::PngHandler; -#[cfg(feature = "image-redaction")] -use nvisy_codec::render::image::{ImageRedaction, AsRedactableImage}; - -#[cfg(feature = "audio-redaction")] -use nvisy_codec::handler::WavHandler; - use crate::action::Action; /// Typed parameters for [`ApplyRedactionAction`]. @@ -27,12 +20,10 @@ use crate::action::Action; #[serde(rename_all = "camelCase")] pub struct ApplyRedactionParams { /// Duration in seconds to crossfade at silence boundaries (audio redaction). - #[cfg(feature = "audio-redaction")] #[serde(default = "default_crossfade_secs")] pub crossfade_secs: f64, } -#[cfg(feature = "audio-redaction")] fn default_crossfade_secs() -> f64 { 0.05 } @@ -41,11 +32,9 @@ fn default_crossfade_secs() -> f64 { pub struct ApplyRedactionInput { /// Text documents to redact. pub text_docs: Vec>, - /// Image documents to redact (feature-gated). - #[cfg(feature = "image-redaction")] + /// Image documents to redact. pub image_docs: Vec>, - /// Audio documents to redact (feature-gated). - #[cfg(feature = "audio-redaction")] + /// Audio documents to redact. pub audio_docs: Vec>, /// Tabular documents to redact. pub tabular_docs: Vec>, @@ -59,11 +48,9 @@ pub struct ApplyRedactionInput { pub struct ApplyRedactionOutput { /// Redacted text documents. pub text_docs: Vec>, - /// Redacted image documents (feature-gated). - #[cfg(feature = "image-redaction")] + /// Redacted image documents. pub image_docs: Vec>, - /// Redacted audio documents (feature-gated). - #[cfg(feature = "audio-redaction")] + /// Redacted audio documents. pub audio_docs: Vec>, /// Redacted tabular documents. 
pub tabular_docs: Vec>, @@ -73,8 +60,8 @@ pub struct ApplyRedactionOutput { /// /// Dispatches per-document based on content type: /// - **Text documents**: byte-offset replacement -/// - **Image documents**: blur/block overlay (feature-gated) -/// - **Audio documents**: stub pass-through (feature-gated) +/// - **Image documents**: blur/block overlay +/// - **Audio documents**: stub pass-through /// - **Tabular documents**: cell-level redaction pub struct ApplyRedactionAction { #[allow(dead_code)] @@ -110,23 +97,19 @@ impl Action for ApplyRedactionAction { // Text documents let mut result_text = Vec::new(); for doc in &input.text_docs { - let redacted = apply_text_doc(doc, &entity_map, &redaction_map)?; + let redacted = apply_text_doc(doc, &entity_map, &redaction_map).await?; result_text.push(redacted); } // Image documents - #[cfg(feature = "image-redaction")] let mut result_image = Vec::new(); - #[cfg(feature = "image-redaction")] for doc in &input.image_docs { - let redacted = apply_image_doc(doc, &input.entities, &redaction_map)?; + let redacted = apply_image_doc(doc, &input.entities, &redaction_map).await?; result_image.push(redacted); } // Audio documents - #[cfg(feature = "audio-redaction")] let mut result_audio = Vec::new(); - #[cfg(feature = "audio-redaction")] for doc in &input.audio_docs { let redacted = apply_audio_doc(doc); result_audio.push(redacted); @@ -141,9 +124,7 @@ impl Action for ApplyRedactionAction { Ok(ApplyRedactionOutput { text_docs: result_text, - #[cfg(feature = "image-redaction")] image_docs: result_image, - #[cfg(feature = "audio-redaction")] audio_docs: result_audio, tabular_docs: result_tabular, }) @@ -154,12 +135,13 @@ impl Action for ApplyRedactionAction { // Text redaction // --------------------------------------------------------------------------- -fn apply_text_doc( +async fn apply_text_doc( doc: &Document, entity_map: &HashMap, redaction_map: &HashMap, ) -> Result, Error> { - let mut redactions: Vec = Vec::new(); + // Collect 
global-offset redactions for this document. + let mut global_redactions: Vec<(usize, usize, TextRedactionOutput)> = Vec::new(); for (entity_id, redaction) in redaction_map { let entity = match entity_map.get(entity_id) { @@ -182,25 +164,84 @@ fn apply_text_doc( _ => continue, }; - redactions.push(TextRedaction { start, end, output }); + global_redactions.push((start, end, output)); } - if redactions.is_empty() { + if global_redactions.is_empty() { return Ok(doc.clone()); } - let handler = doc.handler().redact(&redactions)?; - let mut result = Document::new(handler); + // Build cumulative byte-offset map from lines so we can convert + // global offsets to (TxtSpan, intra-line start, intra-line end). + let lines = doc.handler().lines(); + // Each line contributes `line.len()` bytes plus 1 for the '\n' separator. + let mut line_starts: Vec = Vec::with_capacity(lines.len()); + let mut offset = 0usize; + for line in lines { + line_starts.push(offset); + // +1 for the '\n' that separates lines in the flat representation + offset += line.len() + 1; + } + + // Map each global-offset redaction to per-span redactions, splitting + // across line boundaries when necessary. + let mut redactions: Vec> = Vec::new(); + for (g_start, g_end, output) in &global_redactions { + let g_start = *g_start; + let g_end = *g_end; + + for (i, &line_start) in line_starts.iter().enumerate() { + let line_end = line_start + lines[i].len(); // exclusive, before '\n' + + // Skip lines entirely before or after this redaction range. 
+ if g_end <= line_start || g_start >= line_end + 1 { + continue; + } + + let intra_start = if g_start > line_start { + g_start - line_start + } else { + 0 + }; + let intra_end = if g_end < line_end { + g_end - line_start + } else { + lines[i].len() + }; + + if intra_start >= intra_end { + continue; + } + + // Only the first segment of a cross-line redaction carries the + // replacement value; subsequent segments are removals so that + // the original text is deleted without duplicating the replacement. + let seg_output = if line_start <= g_start { + output.clone() + } else { + TextRedactionOutput::Remove + }; + + redactions.push(TextRedaction { + span_id: TxtSpan(i), + start: intra_start, + end: intra_end, + output: seg_output, + }); + } + } + + let mut result = doc.clone(); + result.handler_mut().redact_spans(&redactions).await?; result.source.set_parent_id(Some(doc.source.as_uuid())); Ok(result) } // --------------------------------------------------------------------------- -// Image redaction (feature-gated) +// Image redaction // --------------------------------------------------------------------------- -#[cfg(feature = "image-redaction")] -fn apply_image_doc( +async fn apply_image_doc( doc: &Document, entities: &[Entity], redaction_map: &HashMap, @@ -226,17 +267,16 @@ fn apply_image_doc( return Ok(doc.clone()); } - let handler = doc.handler().redact(&redactions)?; - let mut result = Document::new(handler); + let mut result = doc.clone(); + result.handler_mut().redact_spans(&redactions).await?; result.source.set_parent_id(Some(doc.source.as_uuid())); Ok(result) } // --------------------------------------------------------------------------- -// Audio redaction (feature-gated) +// Audio redaction // --------------------------------------------------------------------------- -#[cfg(feature = "audio-redaction")] fn apply_audio_doc(doc: &Document) -> Document { tracing::warn!("audio redaction not yet implemented"); doc.clone() @@ -263,7 +303,7 @@ fn 
apply_tabular_doc( }; if let Some(row) = result.handler_mut().rows_mut().get_mut(row_idx) { if let Some(cell) = row.get_mut(col_idx) { - *cell = mask_cell(cell, output); + *cell = output.mask_cell(cell); } } } @@ -272,4 +312,3 @@ fn apply_tabular_doc( result } - diff --git a/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs b/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs index aea0d21..59a1a9c 100644 --- a/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs +++ b/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs @@ -2,7 +2,7 @@ use serde::Deserialize; -use nvisy_codec::render::output::{ +use nvisy_codec::render::{ AudioRedactionOutput, ImageRedactionOutput, RedactionOutput, TextRedactionOutput, }; use crate::ontology::redaction::{ diff --git a/crates/nvisy-python/Cargo.toml b/crates/nvisy-python/Cargo.toml index 57e35d2..f5041f0 100644 --- a/crates/nvisy-python/Cargo.toml +++ b/crates/nvisy-python/Cargo.toml @@ -21,11 +21,6 @@ documentation = { workspace = true } all-features = true rustdoc-args = ["--cfg", "docsrs"] -[features] -default = ["png"] -# Image-based OCR actions (requires PngHandler via pipeline) -png = ["nvisy-pipeline/image-redaction"] - [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } diff --git a/crates/nvisy-python/src/ocr/mod.rs b/crates/nvisy-python/src/ocr/mod.rs index b793133..c6137fe 100644 --- a/crates/nvisy-python/src/ocr/mod.rs +++ b/crates/nvisy-python/src/ocr/mod.rs @@ -13,13 +13,11 @@ use nvisy_core::error::Error; use crate::bridge::PythonBridge; use crate::error::from_pyerr; -#[cfg(feature = "png")] use nvisy_pipeline::generation::ocr::{OcrBackend, OcrConfig}; /// Call Python `detect_ocr()` via GIL + `spawn_blocking`. /// /// Returns raw JSON dicts — no domain-type construction. -#[cfg(feature = "png")] pub async fn detect_ocr( bridge: &PythonBridge, image_data: &[u8], @@ -58,7 +56,6 @@ pub async fn detect_ocr( /// [`OcrBackend`] implementation for [`PythonBridge`]. 
/// /// Delegates to the `detect_ocr` function above. -#[cfg(feature = "png")] #[async_trait::async_trait] impl OcrBackend for PythonBridge { async fn detect_ocr( From 1ad1d8fccd07bd8443f09932336f99b68f23b274 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Tue, 17 Feb 2026 23:58:31 +0100 Subject: [PATCH 08/11] chore: add docs.rs metadata to all crate Cargo.toml files Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-core/Cargo.toml | 4 ++++ crates/nvisy-engine/Cargo.toml | 4 ++++ crates/nvisy-pipeline/Cargo.toml | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/crates/nvisy-core/Cargo.toml b/crates/nvisy-core/Cargo.toml index 9f49039..5400aef 100644 --- a/crates/nvisy-core/Cargo.toml +++ b/crates/nvisy-core/Cargo.toml @@ -17,6 +17,10 @@ repository = { workspace = true } homepage = { workspace = true } documentation = { workspace = true } +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + [dependencies] # JSON Schema generation schemars = { workspace = true, features = [] } diff --git a/crates/nvisy-engine/Cargo.toml b/crates/nvisy-engine/Cargo.toml index ef870c0..6bf0ec5 100644 --- a/crates/nvisy-engine/Cargo.toml +++ b/crates/nvisy-engine/Cargo.toml @@ -17,6 +17,10 @@ repository = { workspace = true } homepage = { workspace = true } documentation = { workspace = true } +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } diff --git a/crates/nvisy-pipeline/Cargo.toml b/crates/nvisy-pipeline/Cargo.toml index 4317448..75d4a29 100644 --- a/crates/nvisy-pipeline/Cargo.toml +++ b/crates/nvisy-pipeline/Cargo.toml @@ -17,6 +17,10 @@ repository = { workspace = true } homepage = { workspace = true } documentation = { workspace = true } +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } From 
2d8bc027c72fd005f517cd3f9311d15d437cae93 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Wed, 18 Feb 2026 00:00:01 +0100 Subject: [PATCH 09/11] refactor: move lib.rs doc comments to README.md, use include_str! everywhere Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-engine/src/lib.rs | 6 ------ crates/nvisy-object/src/lib.rs | 5 ----- crates/nvisy-pattern/README.md | 3 +++ crates/nvisy-pattern/src/lib.rs | 7 +------ crates/nvisy-pipeline/README.md | 3 +++ crates/nvisy-pipeline/src/lib.rs | 9 +-------- crates/nvisy-python/src/lib.rs | 8 -------- 7 files changed, 8 insertions(+), 33 deletions(-) create mode 100644 crates/nvisy-pattern/README.md create mode 100644 crates/nvisy-pipeline/README.md diff --git a/crates/nvisy-engine/src/lib.rs b/crates/nvisy-engine/src/lib.rs index 04a86b0..b47ec42 100644 --- a/crates/nvisy-engine/src/lib.rs +++ b/crates/nvisy-engine/src/lib.rs @@ -1,9 +1,3 @@ -//! DAG execution engine for nvisy pipelines. -//! -//! This crate compiles pipeline definitions into directed acyclic graphs (DAGs), -//! plans topologically-ordered execution, and runs nodes concurrently with -//! retry and timeout policies. - #![forbid(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] diff --git a/crates/nvisy-object/src/lib.rs b/crates/nvisy-object/src/lib.rs index 806462b..78d073e 100644 --- a/crates/nvisy-object/src/lib.rs +++ b/crates/nvisy-object/src/lib.rs @@ -1,8 +1,3 @@ -//! Object storage providers and streams for the nvisy pipeline. -//! -//! This crate provides an abstraction layer over cloud object stores (currently S3) -//! and exposes streaming read/write interfaces that plug into the nvisy engine. 
- #![forbid(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] diff --git a/crates/nvisy-pattern/README.md b/crates/nvisy-pattern/README.md new file mode 100644 index 0000000..1067fd8 --- /dev/null +++ b/crates/nvisy-pattern/README.md @@ -0,0 +1,3 @@ +# nvisy-pattern + +Built-in regex patterns and dictionaries for PII/PHI detection in the Nvisy runtime. Provides embedded pattern definitions and dictionary data (first names, last names, medical terms) used by the pipeline's detection actions. diff --git a/crates/nvisy-pattern/src/lib.rs b/crates/nvisy-pattern/src/lib.rs index e927e45..8a1ac89 100644 --- a/crates/nvisy-pattern/src/lib.rs +++ b/crates/nvisy-pattern/src/lib.rs @@ -1,11 +1,6 @@ -//! Built-in regex patterns and dictionaries for PII/PHI detection. -//! -//! This crate provides the embedded pattern definitions (compiled from -//! `assets/patterns.json`) and dictionary data (first names, last names, -//! medical terms) used by the nvisy pipeline's detection actions. - #![forbid(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] /// Built-in regex pattern definitions and validation helpers. pub mod patterns; diff --git a/crates/nvisy-pipeline/README.md b/crates/nvisy-pipeline/README.md new file mode 100644 index 0000000..0661b3b --- /dev/null +++ b/crates/nvisy-pipeline/README.md @@ -0,0 +1,3 @@ +# nvisy-pipeline + +Pipeline action and provider traits with detection, redaction, and generation actions for the Nvisy runtime. Provides entity detection (regex, dictionary, checksum, tabular, manual, NER), policy evaluation, content redaction (text/image/tabular/audio), content generation (OCR, transcription, synthetic data), and audit-trail emission. diff --git a/crates/nvisy-pipeline/src/lib.rs b/crates/nvisy-pipeline/src/lib.rs index e46646d..9fb85cb 100644 --- a/crates/nvisy-pipeline/src/lib.rs +++ b/crates/nvisy-pipeline/src/lib.rs @@ -1,13 +1,6 @@ -//! 
Pipeline action/provider traits with detection, redaction, and generation actions. -//! -//! This crate consolidates the processing pipeline: the [`Action`] and -//! [`Provider`] traits, entity detection (regex, dictionary, checksum, -//! tabular, manual, NER), policy evaluation, content redaction -//! (text/image/tabular/audio), content generation (OCR, transcription, -//! synthetic data), and audit-trail emission. - #![forbid(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] /// The `Action` trait — the fundamental processing unit in a pipeline. pub mod action; diff --git a/crates/nvisy-python/src/lib.rs b/crates/nvisy-python/src/lib.rs index a630b23..679459c 100644 --- a/crates/nvisy-python/src/lib.rs +++ b/crates/nvisy-python/src/lib.rs @@ -1,11 +1,3 @@ -//! Python/PyO3 bridge for AI-powered NER and OCR detection. -//! -//! This crate embeds a CPython interpreter via PyO3 and delegates named-entity -//! recognition (NER) and OCR to a Python module (`nvisy_ai`). It implements -//! the [`NerBackend`](nvisy_pipeline::detection::ner::NerBackend) and -//! [`OcrBackend`](nvisy_pipeline::generation::ocr::OcrBackend) traits for -//! [`PythonBridge`](bridge::PythonBridge), returning raw JSON to the pipeline. 
- #![deny(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] From babc203d484b82271653778738af35d944b5e5ca Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Wed, 18 Feb 2026 00:03:32 +0100 Subject: [PATCH 10/11] fix: resolve clippy int_plus_one and implicit_saturating_sub warnings Co-Authored-By: Claude Opus 4.6 --- crates/nvisy-pipeline/src/redaction/apply.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/crates/nvisy-pipeline/src/redaction/apply.rs b/crates/nvisy-pipeline/src/redaction/apply.rs index 657690d..4e0ec11 100644 --- a/crates/nvisy-pipeline/src/redaction/apply.rs +++ b/crates/nvisy-pipeline/src/redaction/apply.rs @@ -194,15 +194,11 @@ async fn apply_text_doc( let line_end = line_start + lines[i].len(); // exclusive, before '\n' // Skip lines entirely before or after this redaction range. - if g_end <= line_start || g_start >= line_end + 1 { + if g_end <= line_start || g_start > line_end { continue; } - let intra_start = if g_start > line_start { - g_start - line_start - } else { - 0 - }; + let intra_start = g_start.saturating_sub(line_start); let intra_end = if g_end < line_end { g_end - line_start } else { From e38cae5601d0f8ca454a6743702f162eb052cdcc Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Wed, 18 Feb 2026 00:08:32 +0100 Subject: [PATCH 11/11] fix(security): upgrade pyo3 0.23 -> 0.24, pythonize 0.23 -> 0.24 Fixes RUSTSEC-2025-0020: buffer overflow in PyString::from_object. 
Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 28 ++++++++++++++-------------- Cargo.toml | 4 ++-- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ce47d04..6534724 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2678,9 +2678,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.23.5" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872" +checksum = "e5203598f366b11a02b13aa20cab591229ff0a89fd121a308a5df751d5fc9219" dependencies = [ "cfg-if", "indoc", @@ -2696,9 +2696,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.23.5" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb" +checksum = "99636d423fa2ca130fa5acde3059308006d46f98caac629418e53f7ebb1e9999" dependencies = [ "once_cell", "target-lexicon", @@ -2706,9 +2706,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.23.5" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d" +checksum = "78f9cf92ba9c409279bc3305b5409d90db2d2c22392d443a87df3a1adad59e33" dependencies = [ "libc", "pyo3-build-config", @@ -2716,9 +2716,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.23.5" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da" +checksum = "0b999cb1a6ce21f9a6b147dcf1be9ffedf02e0043aec74dc390f3007047cecd9" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -2728,9 +2728,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.23.5" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028" +checksum = "822ece1c7e1012745607d5cf0bcb2874769f0f7cb34c4cde03b9358eb9ef911a" dependencies = [ "heck", "proc-macro2", @@ -2741,9 +2741,9 @@ dependencies = [ [[package]] name = "pythonize" -version = "0.23.0" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91a6ee7a084f913f98d70cdc3ebec07e852b735ae3059a1500db2661265da9ff" +checksum = "d5bcac0d0b71821f0d69e42654f1e15e5c94b85196446c4de9588951a2117e7b" dependencies = [ "pyo3", "serde", @@ -3537,9 +3537,9 @@ dependencies = [ [[package]] name = "target-lexicon" -version = "0.12.16" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" +checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" [[package]] name = "tempfile" diff --git a/Cargo.toml b/Cargo.toml index 706a90d..a7fbe59 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -79,8 +79,8 @@ petgraph = { version = "0.8", features = [] } infer = { version = "0.19", features = [] } # Python interop -pyo3 = { version = "0.23", features = [] } -pythonize = { version = "0.23", features = [] } +pyo3 = { version = "0.24", features = [] } +pythonize = { version = "0.24", features = [] } # S3-compatible object storage minio = { version = "0.3", features = [] }