diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5cad8a8..74d9dae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -60,11 +60,14 @@ jobs: with: key: msrv-1.88 - - name: cargo build (MSRV) - run: cargo build --workspace --all-targets - - - name: cargo test (MSRV) - run: cargo test --workspace --all-targets + # MSRV is guaranteed only for the lean build. The model2vec `embed` + # feature (on by default) pulls tokenizers/hf-hub, which track a newer + # toolchain; it's exercised by the stable `test` jobs instead. + - name: cargo build (MSRV, lean) + run: cargo build --workspace --all-targets --no-default-features + + - name: cargo test (MSRV, lean) + run: cargo test --workspace --all-targets --no-default-features audit: name: cargo-audit (security advisories) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3929a1b..f41848a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,37 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.15.0] - 2026-06-12 + +### Added +- **Semantic memory — Pillar A** (epic to make the journal a drop-in + replacement for claude-mem/mem0). The journal can now retrieve events by + *meaning*, not just keyword. + - `task-journal ask "QUERY" [--k N]` — semantic search over the project's + journal. Embeds the query, embeds any new events on the fly (so the index + self-maintains), and returns the most relevant events by score. + - `task-journal embed [--backfill]` — vectorise new events, or the whole + project history. + - **model2vec backend, on by default.** A pure-Rust static embedding model + (`minishlab/potion-multilingual-128M`, multilingual so RU/EN both work, no + onnxruntime) gives true paraphrase/morphology-robust recall. The model is + downloaded once from HuggingFace and cached. Override the model with + `TJ_EMBED_MODEL`.
+ - **Always-works fallback.** If the model can't load — offline first run, + download failure, or `TJ_EMBED=hash` — the journal falls back to a + dependency-free lexical embedder, so retrieval never breaks. Build with + `--no-default-features` for the lean, lexical-only configuration. + - Schema **v008**: an additive `embeddings` table (one little-endian f32 BLOB + per event, tagged with model + dim so a model change re-embeds cleanly) and + an `events_index.memory_tier` column for the tiers landing in a later phase. + +### Internal +- `tj-core::embed` — `Embedder` trait, cosine, f32↔BLOB codec, `HashEmbedder` + (lexical default/fallback), `Model2VecEmbedder` (behind the default `embed` + feature), and `db::semantic_search` / `embed_pending`. +- The MSRV (1.88) guarantee now covers the lean build; the `embed` feature + tracks a newer toolchain and is exercised by the stable CI jobs. + ## [0.14.4] - 2026-06-12 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index 08188b1..2801a4d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -15,7 +15,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", + "getrandom 0.3.4", "once_cell", + "serde", "version_check", "zerocopy", ] @@ -158,12 +160,33 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitflags" version = "2.11.1" @@ -196,6 +219,12 @@ version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.11.1" @@ -349,6 +378,34 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "compact_str" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dfdd1c2274d9aa354115b09dc9a901d6c5576818cdf70d14cae2bdb47df00ab" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] + +[[package]] +name = "console" +version = "0.15.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width 0.2.0", + "windows-sys 0.59.0", +] + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -475,6 +532,16 @@ dependencies = [ "typenum", ] +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + [[package]] name = "darling" version = "0.21.3" @@ -495,6 +562,20 @@ dependencies = [ "darling_macro 0.23.0", ] +[[package]] +name = 
"darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + [[package]] name = "darling_core" version = "0.21.3" @@ -522,6 +603,17 @@ dependencies = [ "syn", ] +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core 0.20.11", + "quote", + "syn", +] + [[package]] name = "darling_macro" version = "0.21.3" @@ -544,6 +636,46 @@ dependencies = [ "syn", ] +[[package]] +name = "dary_heap" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b1e3a325bc115f096c8b77bbf027a7c2592230e70be2d985be950d3d5e60ebe" +dependencies = [ + "serde", +] + +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling 0.20.11", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn", +] + [[package]] name = "difflib" version = "0.4.0" @@ -566,7 +698,16 @@ version = "5.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"9a49173b84e034382284f27f1af4dcbbd231ffa358c0fe316541a7337f376a35" dependencies = [ - "dirs-sys", + "dirs-sys 0.4.1", +] + +[[package]] +name = "dirs" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e" +dependencies = [ + "dirs-sys 0.5.0", ] [[package]] @@ -577,10 +718,22 @@ checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" dependencies = [ "libc", "option-ext", - "redox_users", + "redox_users 0.4.6", "windows-sys 0.48.0", ] +[[package]] +name = "dirs-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" +dependencies = [ + "libc", + "option-ext", + "redox_users 0.5.2", + "windows-sys 0.61.2", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -616,6 +769,12 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + [[package]] name = "equivalent" version = "1.0.2" @@ -632,6 +791,15 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "esaxx-rs" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" +dependencies = [ + "cc", +] + [[package]] name = "fallible-iterator" version = "0.3.0" @@ -644,6 +812,17 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" +[[package]] +name = "fancy-regex" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + [[package]] name = "fastrand" version = "2.4.1" @@ -942,6 +1121,25 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" +[[package]] +name = "hf-hub" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97" +dependencies = [ + "dirs", + "http", + "indicatif", + "libc", + "log", + "rand", + "serde", + "serde_json", + "thiserror 2.0.18", + "ureq", + "windows-sys 0.60.2", +] + [[package]] name = "http" version = "1.4.0" @@ -1189,6 +1387,19 @@ dependencies = [ "serde_core", ] +[[package]] +name = "indicatif" +version = "0.17.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width 0.2.0", + "web-time", +] + [[package]] name = "indoc" version = "2.0.7" @@ -1246,6 +1457,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.18" @@ -1344,6 +1564,22 @@ dependencies = [ "hashbrown 0.15.5", ] +[[package]] +name = "macro_rules_attribute" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65049d7923698040cd0b1ddcced9b0eb14dd22c5f86ae59c3740eab64a676520" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" + [[package]] name = "matchers" version = "0.2.0" @@ -1353,12 +1589,28 @@ dependencies = [ "regex-automata", ] +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + [[package]] name = "memchr" version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.8.9" @@ -1406,6 +1658,69 @@ dependencies = [ "tokio", ] +[[package]] +name = "model2vec-rs" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cbb465c6997e85d6bcb0e9fabedb51cc8a0919d2a3de083157abe83dccbde54" +dependencies = [ + "anyhow", + "clap", + "half", + "hf-hub", + "ndarray", + "safetensors", + "serde", + "serde_json", + "tokenizers", + "ureq", +] + +[[package]] +name = "monostate" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3341a273f6c9d5bef1908f17b7267bbab0e95c9bf69a0d4dcf8e9e1b2c76ef67" +dependencies = [ + "monostate-impl", + "serde", + "serde_core", +] + +[[package]] +name = "monostate-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "ndarray" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "rawpointer", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -1421,6 +1736,24 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1430,6 +1763,12 @@ dependencies = [ "autocfg", ] +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + [[package]] name = "once_cell" version = "1.21.4" @@ -1529,6 +1868,12 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + [[package]] name = "potential_utf" version = "0.1.5" @@ -1654,7 +1999,7 @@ checksum = "eabd94c2f37801c20583fc49dd5cd6b0ba68c716787c2dd6ed18571e1e63117b" dependencies = [ "bitflags", "cassowary", - "compact_str", + "compact_str 0.8.1", "crossterm", "indoc", "instability", @@ -1667,6 +2012,12 @@ dependencies = [ "unicode-width 0.2.0", ] 
+[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + [[package]] name = "rayon" version = "1.12.0" @@ -1677,6 +2028,17 @@ dependencies = [ "rayon-core", ] +[[package]] +name = "rayon-cond" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f" +dependencies = [ + "either", + "itertools 0.14.0", + "rayon", +] + [[package]] name = "rayon-core" version = "1.13.0" @@ -1707,6 +2069,17 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "redox_users" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" +dependencies = [ + "getrandom 0.2.17", + "libredox", + "thiserror 2.0.18", +] + [[package]] name = "ref-cast" version = "1.0.25" @@ -1776,7 +2149,7 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "824daba0a34f8c5c5392295d381e0800f88fd986ba291699f8785f05fa344c1e" dependencies = [ - "base64", + "base64 0.22.1", "chrono", "futures", "paste", @@ -1892,6 +2265,16 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +[[package]] +name = "safetensors" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc0cdb7198d738a111f6df8fef42cb175412c311d0c4ac9126ff4e550ad1a0e8" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "same-file" version = "1.0.6" @@ -2096,6 +2479,29 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "socks" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b" +dependencies = [ + "byteorder", + "libc", + "winapi", +] + +[[package]] +name = "spm_precompiled" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" +dependencies = [ + "base64 0.13.1", + "nom", + "serde", + "unicode-segmentation", +] + [[package]] name = "stable_deref_trait" version = "1.2.1" @@ -2166,7 +2572,7 @@ dependencies = [ [[package]] name = "task-journal-cli" -version = "0.14.4" +version = "0.15.0" dependencies = [ "anyhow", "assert_cmd", @@ -2189,7 +2595,7 @@ dependencies = [ [[package]] name = "task-journal-core" -version = "0.14.4" +version = "0.15.0" dependencies = [ "anyhow", "chrono", @@ -2198,6 +2604,7 @@ dependencies = [ "dunce", "fd-lock", "mockito", + "model2vec-rs", "regex", "rusqlite", "schemars", @@ -2213,7 +2620,7 @@ dependencies = [ [[package]] name = "task-journal-mcp" -version = "0.14.4" +version = "0.15.0" dependencies = [ "anyhow", "clap", @@ -2318,6 +2725,40 @@ dependencies = [ "serde_json", ] +[[package]] +name = "tokenizers" +version = "0.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a620b996116a59e184c2fa2dfd8251ea34a36d0a514758c6f966386bd2e03476" +dependencies = [ + "ahash", + "aho-corasick", + "compact_str 0.9.1", + "dary_heap", + "derive_builder", + "esaxx-rs", + "fancy-regex", + "getrandom 0.3.4", + "indicatif", + "itertools 0.14.0", + "log", + "macro_rules_attribute", + "monostate", + "paste", + "rand", + "rayon", + "rayon-cond", + "regex", + "regex-syntax", + "serde", + "serde_json", + "spm_precompiled", + "thiserror 2.0.18", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", +] + [[package]] name = "tokio" version = "1.52.1" @@ -2454,6 +2895,15 @@ version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-normalization-alignments" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +dependencies = [ + "smallvec", +] + [[package]] name = "unicode-segmentation" version = "1.13.2" @@ -2489,6 +2939,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + [[package]] name = "untrusted" version = "0.9.0" @@ -2501,7 +2957,7 @@ version = "2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" dependencies = [ - "base64", + "base64 0.22.1", "flate2", "log", "once_cell", @@ -2509,6 +2965,7 @@ dependencies = [ "rustls-pki-types", "serde", "serde_json", + "socks", "url", "webpki-roots 0.26.11", ] @@ -2823,6 +3280,24 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + [[package]] name = "windows-sys" version = "0.61.2" @@ -2856,13 +3331,30 @@ dependencies = [ "windows_aarch64_gnullvm 0.52.6", "windows_aarch64_msvc 0.52.6", "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm", + 
"windows_i686_gnullvm 0.52.6", "windows_i686_msvc 0.52.6", "windows_x86_64_gnu 0.52.6", "windows_x86_64_gnullvm 0.52.6", "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" @@ -2875,6 +3367,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" @@ -2887,6 +3385,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + [[package]] name = "windows_i686_gnu" version = "0.48.5" @@ -2899,12 +3403,24 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + [[package]] name = "windows_i686_msvc" version = "0.48.5" @@ -2917,6 +3433,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" @@ -2929,6 +3451,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" @@ -2941,6 +3469,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" @@ -2953,6 +3487,12 @@ version = "0.52.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + [[package]] name = "wit-bindgen" version = "0.51.0" diff --git a/Cargo.toml b/Cargo.toml index e9bf9d5..aab1968 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ members = [ ] [workspace.package] -version = "0.14.4" +version = "0.15.0" edition = "2021" rust-version = "1.88" license = "MIT" diff --git a/crates/tj-cli/Cargo.toml b/crates/tj-cli/Cargo.toml index a5cab55..36db2a8 100644 --- a/crates/tj-cli/Cargo.toml +++ b/crates/tj-cli/Cargo.toml @@ -15,8 +15,15 @@ readme = "../../README.md" name = "task-journal" path = "src/main.rs" +[features] +# `embed` (on by default) turns on tj-core's model2vec semantic backend. The +# lean build (`--no-default-features`) keeps the dependency-free lexical +# fallback and a lower MSRV — the CI msrv job builds that configuration. +default = ["embed"] +embed = ["tj-core/embed"] + [dependencies] -tj-core = { package = "task-journal-core", version = "0.14.2", path = "../tj-core" } +tj-core = { package = "task-journal-core", version = "0.15.0", path = "../tj-core", default-features = false } anyhow = { workspace = true } clap = { workspace = true } tracing = { workspace = true } diff --git a/crates/tj-cli/src/main.rs b/crates/tj-cli/src/main.rs index 596e2df..6fa1e93 100644 --- a/crates/tj-cli/src/main.rs +++ b/crates/tj-cli/src/main.rs @@ -613,6 +613,25 @@ enum Commands { }, /// Rebuild SQLite state from the JSONL log. RebuildState, + /// Embed events for semantic search (Pillar A). Computes a vector per event + /// and stores it in the v008 `embeddings` table. `--backfill` drains the + /// whole project; without it, only newly-unembedded events are processed. 
+ /// Uses model2vec by default; `TJ_EMBED=hash` forces the offline hash embedder. + Embed { + /// Vectorise the entire project history, not just new events. + #[arg(long)] + backfill: bool, + }, + /// Semantic search over this project's journal (Pillar A). Embeds the query + /// and returns the most relevant events by meaning, not keyword. New events + /// are embedded on the fly, so the index stays current with zero setup. + Ask { + /// The question or topic to search for. + query: String, + /// Maximum number of results. + #[arg(long, default_value_t = 5)] + k: usize, + }, /// Render and print the resume pack for a task. Pack { /// Task id (e.g. tj-7f3a). @@ -1113,6 +1132,74 @@ fn main() -> Result<()> { let n = tj_core::db::rebuild_state(&conn, &events_path, &project_hash)?; println!("rebuilt {n} events into {state_path:?}"); } + Commands::Embed { backfill } => { + let cwd = std::env::current_dir()?; + let project_hash = tj_core::project_hash::from_path(&cwd)?; + let events_path = tj_core::paths::events_dir()?.join(format!("{project_hash}.jsonl")); + let state_path = tj_core::paths::state_dir()?.join(format!("{project_hash}.sqlite")); + if !events_path.exists() { + anyhow::bail!("no events file at {events_path:?}"); + } + let conn = tj_core::db::open(&state_path)?; + // search_fts must be current before we embed from it. + tj_core::db::ingest_new_events(&conn, &events_path, &project_hash)?; + + let embedder = tj_core::embed::default_embedder(); + let now = chrono::Utc::now().to_rfc3339(); + let batch = if backfill { 256 } else { 64 }; + let mut total = 0usize; + loop { + let n = tj_core::db::embed_pending( + &conn, + &project_hash, + embedder.as_ref(), + &now, + batch, + )?; + total += n; + // Without --backfill, one batch of newly-unembedded events is enough.
+ if n == 0 || !backfill { + break; + } + } + println!( + "embedded {total} event(s) with model {} ({} dim)", + embedder.model_id(), + embedder.dim() + ); + } + Commands::Ask { query, k } => { + let cwd = std::env::current_dir()?; + let project_hash = tj_core::project_hash::from_path(&cwd)?; + let events_path = tj_core::paths::events_dir()?.join(format!("{project_hash}.jsonl")); + let state_path = tj_core::paths::state_dir()?.join(format!("{project_hash}.sqlite")); + if !events_path.exists() { + anyhow::bail!("no events file at {events_path:?}"); + } + let conn = tj_core::db::open(&state_path)?; + tj_core::db::ingest_new_events(&conn, &events_path, &project_hash)?; + + let embedder = tj_core::embed::default_embedder(); + // Embed-on-ask: vectorise anything new so the answer reflects the + // latest events without the user running `embed` first. + let now = chrono::Utc::now().to_rfc3339(); + tj_core::db::embed_pending(&conn, &project_hash, embedder.as_ref(), &now, 512)?; + + let qv = embedder.embed_one(&query)?; + let hits = + tj_core::db::semantic_search(&conn, &project_hash, &qv, embedder.model_id(), k)?; + if hits.is_empty() { + println!("no matches"); + } else { + for h in hits { + let snippet: String = h.text.chars().take(100).collect(); + println!( + "{:.3} [{}] {} ({})", + h.score, h.event_type, snippet, h.task_id + ); + } + } + } Commands::Event { task_id, r#type, diff --git a/crates/tj-cli/tests/cli.rs b/crates/tj-cli/tests/cli.rs index 406684c..38a8d49 100644 --- a/crates/tj-cli/tests/cli.rs +++ b/crates/tj-cli/tests/cli.rs @@ -4743,3 +4743,191 @@ fn export_memory_missing_task_exits_1() { .code(1) .stderr(contains("task not found")); } + +#[test] +fn embed_backfill_vectorises_events_then_idempotent() { + // Pillar A / Phase 0: `embed --backfill` computes a vector per event using + // the dependency-free hash embedder and stores it; a second run finds + // nothing new. Fully offline — no model, no network. 
+ let dir = assert_fs::TempDir::new().unwrap(); + + let task_id = String::from_utf8( + Command::cargo_bin("task-journal") + .unwrap() + .env("XDG_DATA_HOME", dir.path()) + .args(["create", "Implement semantic memory retrieval"]) + .assert() + .success() + .get_output() + .stdout + .clone(), + ) + .unwrap() + .trim() + .to_string(); + + Command::cargo_bin("task-journal") + .unwrap() + .env("XDG_DATA_HOME", dir.path()) + .args([ + "event", + &task_id, + "--type", + "decision", + "--text", + "Use model2vec static embeddings for offline semantic recall.", + ]) + .assert() + .success(); + + // First backfill: embeds the open + decision events. TJ_EMBED=hash forces + // the deterministic lexical embedder so the assertion is model-independent. + Command::cargo_bin("task-journal") + .unwrap() + .env("XDG_DATA_HOME", dir.path()) + .env("TJ_EMBED", "hash") + .args(["embed", "--backfill"]) + .assert() + .success() + .stdout(contains("hash-v1")) + .stdout(contains("embedded 2")); + + // Second run without --backfill: nothing new to embed. + Command::cargo_bin("task-journal") + .unwrap() + .env("XDG_DATA_HOME", dir.path()) + .env("TJ_EMBED", "hash") + .args(["embed"]) + .assert() + .success() + .stdout(contains("embedded 0")); +} + +#[test] +fn ask_ranks_semantically_relevant_event_first() { + // Pillar A / Phase 1: `ask` embeds the query and returns events by meaning. + // The query's terms overlap one event strongly and the others not at all, + // so vector ranking must surface it first. (The hash embedder is lexical; + // true paraphrase/morphology robustness is the model2vec backend's job.) 
+ let dir = assert_fs::TempDir::new().unwrap(); + let task_id = String::from_utf8( + Command::cargo_bin("task-journal") + .unwrap() + .env("XDG_DATA_HOME", dir.path()) + .args(["create", "Payments hardening"]) + .assert() + .success() + .get_output() + .stdout + .clone(), + ) + .unwrap() + .trim() + .to_string(); + + for (ty, text) in [ + ( + "decision", + "Route refunds through the idempotent payment ledger to stop double writes.", + ), + ( + "finding", + "The frontend button hover color is wrong in dark mode.", + ), + ( + "finding", + "Added a composite index on users email and tenant for lookup speed.", + ), + ] { + Command::cargo_bin("task-journal") + .unwrap() + .env("XDG_DATA_HOME", dir.path()) + .args(["event", &task_id, "--type", ty, "--text", text]) + .assert() + .success(); + } + + let out = String::from_utf8( + Command::cargo_bin("task-journal") + .unwrap() + .env("XDG_DATA_HOME", dir.path()) + .env("TJ_EMBED", "hash") + .args(["ask", "idempotent refunds ledger double writes", "--k", "3"]) + .assert() + .success() + .get_output() + .stdout + .clone(), + ) + .unwrap(); + + let first = out.lines().next().unwrap_or(""); + assert!( + first.contains("refund") || first.contains("ledger"), + "top hit must be the refund decision; got first line: {first:?}\nfull:\n{out}" + ); +} + +#[test] +#[ignore = "downloads the model2vec model from HuggingFace; run manually with --ignored"] +fn ask_with_model2vec_handles_paraphrase() { + // True semantic recall: a paraphrase that shares NO exact term with the + // target event must still rank it first. The lexical hash embedder fails + // this; the model2vec backend (default) passes. Needs network on first run. 
+ let dir = assert_fs::TempDir::new().unwrap(); + let task_id = String::from_utf8( + Command::cargo_bin("task-journal") + .unwrap() + .env("XDG_DATA_HOME", dir.path()) + .args(["create", "Payments hardening"]) + .assert() + .success() + .get_output() + .stdout + .clone(), + ) + .unwrap() + .trim() + .to_string(); + + for (ty, text) in [ + ( + "decision", + "Route refunds through the idempotent payment ledger to stop double writes.", + ), + ( + "finding", + "The frontend button hover color is wrong in dark mode.", + ), + ( + "finding", + "Added a composite index on users email and tenant for lookup speed.", + ), + ] { + Command::cargo_bin("task-journal") + .unwrap() + .env("XDG_DATA_HOME", dir.path()) + .args(["event", &task_id, "--type", ty, "--text", text]) + .assert() + .success(); + } + + // "duplicate refund payments" shares no exact token with the ledger event. + let out = String::from_utf8( + Command::cargo_bin("task-journal") + .unwrap() + .env("XDG_DATA_HOME", dir.path()) + .args(["ask", "duplicate refund payments", "--k", "3"]) + .assert() + .success() + .get_output() + .stdout + .clone(), + ) + .unwrap(); + let first = out.lines().next().unwrap_or(""); + assert!( + first.contains("refund") || first.contains("ledger"), + "model2vec must rank the refund decision first for a paraphrase; got: {first:?}" + ); +} diff --git a/crates/tj-core/Cargo.toml b/crates/tj-core/Cargo.toml index ed1df7f..0f11a8c 100644 --- a/crates/tj-core/Cargo.toml +++ b/crates/tj-core/Cargo.toml @@ -15,6 +15,15 @@ readme = "../../README.md" name = "tj_core" path = "src/lib.rs" +[features] +# `embed` pulls the real semantic backend (model2vec, pure-Rust). It is ON by +# default so out-of-the-box `ask` is genuinely semantic; build with +# `--no-default-features` for the lean, dependency-free lexical fallback. We +# pin model2vec-rs to fancy-regex (pure Rust) instead of its default `onig` +# (native oniguruma C lib) so the build stays C-toolchain-free. 
+default = ["embed"] +embed = ["dep:model2vec-rs"] + [dependencies] anyhow = { workspace = true } thiserror = { workspace = true } @@ -31,6 +40,10 @@ ureq = { workspace = true } tracing = { workspace = true } fd-lock = { workspace = true } regex = { workspace = true } +model2vec-rs = { version = "0.2", optional = true, default-features = false, features = [ + "hf-hub", + "fancy-regex", +] } [dev-dependencies] tempfile = { workspace = true } diff --git a/crates/tj-core/src/db.rs b/crates/tj-core/src/db.rs index b03ab7f..7748b8e 100644 --- a/crates/tj-core/src/db.rs +++ b/crates/tj-core/src/db.rs @@ -128,6 +128,28 @@ ALTER TABLE decisions ADD COLUMN alternatives TEXT; DELETE FROM task_pack_cache; "#; +/// v0.15.0 semantic-memory substrate (Pillar A). `embeddings` stores one +/// vector per event as a little-endian f32 BLOB, tagged with the model id + +/// dim so we never compare across models and can re-embed on a model change. +/// `memory_tier` is denormalised onto `events_index` for cheap tier filtering +/// (episodic by default; semantic/procedural/preference added in Phase 3). +/// Purely additive — existing rows default to `episodic`, the append-only log +/// is untouched, and an absent embedder simply leaves `embeddings` empty. +const MIGRATION_008: &str = r#" +CREATE TABLE IF NOT EXISTS embeddings ( + event_id TEXT PRIMARY KEY, + task_id TEXT NOT NULL, + project_hash TEXT NOT NULL, + tier TEXT NOT NULL DEFAULT 'episodic', + model TEXT NOT NULL, + dim INTEGER NOT NULL, + vec BLOB NOT NULL, + created_at TEXT NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_emb_project_tier ON embeddings(project_hash, tier); +ALTER TABLE events_index ADD COLUMN memory_tier TEXT NOT NULL DEFAULT 'episodic'; +"#; + /// All schema migrations in version order. Append new entries here; never /// edit a published migration's `sql` — write a new one instead. 
const MIGRATIONS: &[Migration] = &[ @@ -159,6 +181,10 @@ const MIGRATIONS: &[Migration] = &[ version: 7, sql: MIGRATION_007, }, + Migration { + version: 8, + sql: MIGRATION_008, + }, ]; fn apply_migrations(conn: &Connection) -> anyhow::Result<()> { @@ -1038,9 +1064,196 @@ pub fn invalidate_pack_cascade(conn: &Connection, task_id: &str) -> anyhow::Resu Ok(()) } +// --------------------------------------------------------------------------- +// Semantic-memory substrate (Pillar A / schema v008). +// --------------------------------------------------------------------------- + +/// One event awaiting an embedding: its id, task, and the text to embed. +pub struct PendingEmbed { + pub event_id: String, + pub task_id: String, + pub text: String, +} + +/// Events that have no up-to-date embedding for `model` — either never embedded +/// or embedded by a different model. Pulls the text straight from `search_fts`. +/// `limit` bounds the batch; pass a large value to drain. +pub fn events_needing_embedding( + conn: &Connection, + model: &str, + limit: usize, +) -> anyhow::Result> { + let mut stmt = conn.prepare( + "SELECT f.event_id, f.task_id, f.text + FROM search_fts f + LEFT JOIN embeddings e ON e.event_id = f.event_id AND e.model = ?1 + WHERE e.event_id IS NULL + LIMIT ?2", + )?; + let rows = stmt.query_map(rusqlite::params![model, limit as i64], |r| { + Ok(PendingEmbed { + event_id: r.get(0)?, + task_id: r.get(1)?, + text: r.get(2)?, + }) + })?; + let mut out = Vec::new(); + for r in rows { + out.push(r?); + } + Ok(out) +} + +/// Upsert one vector. Keyed on `event_id`, so re-embedding (e.g. after a model +/// change) replaces the prior row idempotently across `rebuild_state` replays. 
+#[allow(clippy::too_many_arguments)] +pub fn upsert_embedding( + conn: &Connection, + event_id: &str, + task_id: &str, + project_hash: &str, + tier: &str, + model: &str, + dim: usize, + vec: &[f32], + created_at: &str, +) -> anyhow::Result<()> { + conn.execute( + "INSERT OR REPLACE INTO embeddings(event_id, task_id, project_hash, tier, model, dim, vec, created_at) + VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)", + rusqlite::params![ + event_id, + task_id, + project_hash, + tier, + model, + dim as i64, + crate::embed::to_blob(vec), + created_at + ], + )?; + Ok(()) +} + +/// Number of stored embeddings for a project (test/stats helper). +pub fn count_embeddings(conn: &Connection, project_hash: &str) -> anyhow::Result { + let n: i64 = conn.query_row( + "SELECT COUNT(*) FROM embeddings WHERE project_hash = ?1", + rusqlite::params![project_hash], + |r| r.get(0), + )?; + Ok(n as usize) +} + +/// Embed up to `limit` events that still need a vector for the embedder's model, +/// and store them. Returns how many were embedded this call. Shared by +/// embed-on-ingest (small batch after `ingest_new_events`) and +/// `embed --backfill` (looped until it returns 0). Every pending text gets a +/// vector — including short boilerplate — so nothing is re-scanned next pass; +/// retrieval-side filtering ([`crate::embed::is_embeddable`]) decides what's +/// worth surfacing. 
+pub fn embed_pending( + conn: &Connection, + project_hash: &str, + embedder: &dyn crate::embed::Embedder, + created_at: &str, + limit: usize, +) -> anyhow::Result { + let pending = events_needing_embedding(conn, embedder.model_id(), limit)?; + if pending.is_empty() { + return Ok(0); + } + let texts: Vec<&str> = pending.iter().map(|p| p.text.as_str()).collect(); + let vecs = embedder.embed(&texts)?; + let mut done = 0usize; + for (p, v) in pending.iter().zip(vecs.iter()) { + upsert_embedding( + conn, + &p.event_id, + &p.task_id, + project_hash, + "episodic", + embedder.model_id(), + embedder.dim(), + v, + created_at, + )?; + done += 1; + } + Ok(done) +} + +/// A retrieval hit: the event, its task, and the relevance score. +pub struct ScoredHit { + pub event_id: String, + pub task_id: String, + pub task_title: String, + pub event_type: String, + pub tier: String, + pub text: String, + pub score: f32, +} + +/// Semantic search over a project's embeddings. Scores every stored vector for +/// `model` against `query_vec` by cosine, returns the top `k` by score. The +/// caller embeds the query with the same embedder so the model ids match. +/// Pure vector ranking for now; recency / tier / contradiction weighting layer +/// on top in later phases. 
+pub fn semantic_search( + conn: &Connection, + project_hash: &str, + query_vec: &[f32], + model: &str, + k: usize, +) -> anyhow::Result> { + let mut stmt = conn.prepare( + "SELECT e.event_id, e.task_id, e.tier, e.vec, f.text, f.type, + COALESCE(t.title, '') + FROM embeddings e + JOIN search_fts f ON f.event_id = e.event_id + LEFT JOIN tasks t ON t.task_id = e.task_id + WHERE e.project_hash = ?1 AND e.model = ?2", + )?; + let rows = stmt.query_map(rusqlite::params![project_hash, model], |r| { + let blob: Vec = r.get(3)?; + Ok(( + r.get::<_, String>(0)?, // event_id + r.get::<_, String>(1)?, // task_id + r.get::<_, String>(2)?, // tier + blob, + r.get::<_, String>(4)?, // text + r.get::<_, String>(5)?, // type + r.get::<_, String>(6)?, // title + )) + })?; + + let mut hits: Vec = Vec::new(); + for row in rows { + let (event_id, task_id, tier, blob, text, event_type, task_title) = row?; + let score = crate::embed::cosine(query_vec, &crate::embed::from_blob(&blob)); + hits.push(ScoredHit { + event_id, + task_id, + task_title, + event_type, + tier, + text, + score, + }); + } + hits.sort_by(|a, b| { + b.score + .partial_cmp(&a.score) + .unwrap_or(std::cmp::Ordering::Equal) + }); + hits.truncate(k); + Ok(hits) +} + #[cfg(test)] mod tests { use super::*; + use crate::embed::Embedder; use tempfile::TempDir; #[test] @@ -1096,6 +1309,80 @@ mod tests { ); } + fn make_text_event(text: &str) -> crate::event::Event { + crate::event::Event::new( + "tj-x", + crate::event::EventType::Finding, + crate::event::Author::User, + crate::event::Source::Cli, + text.into(), + ) + } + + #[test] + fn embed_pending_embeds_all_then_is_idempotent() { + let d = TempDir::new().unwrap(); + let conn = open(d.path().join("s.sqlite")).unwrap(); + let ph = "feedfacefeedface"; + + for text in [ + "implement payment refund deduplication", + "add validation for negative order amounts", + ] { + index_event(&conn, &make_text_event(text)).unwrap(); + } + + let emb = crate::embed::HashEmbedder::new(64); + let 
at = "2026-06-12T00:00:00Z"; + + let n = embed_pending(&conn, ph, &emb, at, 100).unwrap(); + assert_eq!(n, 2, "both events embedded on first pass"); + assert_eq!(count_embeddings(&conn, ph).unwrap(), 2); + + // Idempotent: nothing left for this model on a second pass. + assert_eq!(embed_pending(&conn, ph, &emb, at, 100).unwrap(), 0); + + // Model-scoped: a different model id sees them as un-embedded + // (so a model change triggers a re-embed). + assert_eq!( + events_needing_embedding(&conn, "other-model", 100) + .unwrap() + .len(), + 2 + ); + } + + #[test] + fn semantic_search_ranks_relevant_event_first() { + let d = TempDir::new().unwrap(); + let conn = open(d.path().join("s.sqlite")).unwrap(); + let ph = "feedfacefeedface"; + + for text in [ + "fix duplicate payment refund write on partial refund", + "update the frontend button hover color", + "add a database index for faster user lookup", + ] { + index_event(&conn, &make_text_event(text)).unwrap(); + } + let emb = crate::embed::HashEmbedder::new(256); + embed_pending(&conn, ph, &emb, "t", 100).unwrap(); + + let q = emb.embed_one("payment refund duplicated").unwrap(); + let hits = semantic_search(&conn, ph, &q, emb.model_id(), 3).unwrap(); + + assert_eq!(hits.len(), 3); + assert!( + hits[0].text.contains("refund"), + "the refund event must rank first, got: {}", + hits[0].text + ); + assert!( + hits[0].score >= hits[1].score, + "hits must be sorted by score desc" + ); + } + #[test] fn open_creates_all_tables() { let d = TempDir::new().unwrap(); diff --git a/crates/tj-core/src/embed.rs b/crates/tj-core/src/embed.rs new file mode 100644 index 0000000..9a0c2a7 --- /dev/null +++ b/crates/tj-core/src/embed.rs @@ -0,0 +1,282 @@ +//! Embedding substrate for semantic memory (Pillar A). +//! +//! An [`Embedder`] turns text into a fixed-dimension vector so events can be +//! retrieved by *meaning*, not just keyword (FTS5). The real semantic backend is +//! 
a pure-Rust static model (model2vec, behind the `embed` feature); when it is +//! absent every caller falls back to FTS5, so the journal's zero-cost, +//! offline-by-default behaviour is preserved. +//! +//! This module is dependency-free on purpose: the trait, the cosine/recency +//! math, the SQLite blob codec, and a deterministic [`HashEmbedder`] all build +//! and test without pulling a model. The model2vec backend is added as an +//! isolated, feature-gated step on top. + +/// A text embedder. Implementations return exactly one vector per input, all of +/// the same [`dim`](Embedder::dim), produced by the model named by +/// [`model_id`](Embedder::model_id). +pub trait Embedder: Send + Sync { + /// Embed a batch of texts. `out[i]` corresponds to `texts[i]`. + fn embed(&self, texts: &[&str]) -> anyhow::Result>>; + /// Stable identifier of the model (stored per vector so a model change can + /// trigger a re-embed and we never compare vectors across models). + fn model_id(&self) -> &str; + /// Output dimensionality. + fn dim(&self) -> usize; + + /// Convenience: embed a single text. + fn embed_one(&self, text: &str) -> anyhow::Result> { + let mut v = self.embed(&[text])?; + Ok(v.pop().unwrap_or_default()) + } +} + +/// Cosine similarity of two vectors. Returns `0.0` on a length mismatch or a +/// zero-norm input — callers *rank* with this, they don't assert on it, so it +/// must never panic. +pub fn cosine(a: &[f32], b: &[f32]) -> f32 { + if a.len() != b.len() || a.is_empty() { + return 0.0; + } + let mut dot = 0.0f32; + let mut na = 0.0f32; + let mut nb = 0.0f32; + for i in 0..a.len() { + dot += a[i] * b[i]; + na += a[i] * a[i]; + nb += b[i] * b[i]; + } + if na == 0.0 || nb == 0.0 { + return 0.0; + } + dot / (na.sqrt() * nb.sqrt()) +} + +/// Encode an `f32` vector as a little-endian byte blob for SQLite `BLOB` +/// storage. Round-trips with [`from_blob`]. 
+pub fn to_blob(v: &[f32]) -> Vec { + let mut out = Vec::with_capacity(v.len() * 4); + for f in v { + out.extend_from_slice(&f.to_le_bytes()); + } + out +} + +/// Decode a little-endian byte blob back into an `f32` vector. Trailing bytes +/// that don't form a full `f32` are ignored (defensive; should never happen for +/// blobs produced by [`to_blob`]). +pub fn from_blob(b: &[u8]) -> Vec { + b.chunks_exact(4) + .map(|c| f32::from_le_bytes([c[0], c[1], c[2], c[3]])) + .collect() +} + +/// Whether an event's text is worth embedding. Skips empties and very short +/// boilerplate (e.g. the `[open]` marker) that carry no retrievable meaning. +pub fn is_embeddable(text: &str) -> bool { + text.trim().chars().count() >= 12 +} + +/// A deterministic, dependency-free embedder using the feature-hashing trick: +/// each token is hashed into one of `dim` buckets and the resulting bag-of-words +/// vector is L2-normalised. It is **lexical**, not semantic — its job is to make +/// the trait, storage, ingest and ranking code testable without a model, and to +/// serve as a crude offline fallback. The real semantic quality comes from the +/// model2vec backend. +pub struct HashEmbedder { + dim: usize, +} + +impl HashEmbedder { + pub fn new(dim: usize) -> Self { + Self { dim: dim.max(1) } + } + + fn hash_token(tok: &str) -> u64 { + // FNV-1a — small, deterministic, no deps. 
+ let mut h: u64 = 0xcbf29ce484222325; + for b in tok.bytes() { + h ^= b as u64; + h = h.wrapping_mul(0x100000001b3); + } + h + } +} + +impl Default for HashEmbedder { + fn default() -> Self { + Self::new(64) + } +} + +impl Embedder for HashEmbedder { + fn embed(&self, texts: &[&str]) -> anyhow::Result>> { + let mut out = Vec::with_capacity(texts.len()); + for t in texts { + let mut v = vec![0.0f32; self.dim]; + for tok in t + .split(|c: char| !c.is_alphanumeric()) + .filter(|s| !s.is_empty()) + { + let lower = tok.to_lowercase(); + let bucket = (Self::hash_token(&lower) as usize) % self.dim; + v[bucket] += 1.0; + } + // L2-normalise so cosine == dot product and lengths don't bias. + let norm: f32 = v.iter().map(|x| x * x).sum::().sqrt(); + if norm > 0.0 { + for x in &mut v { + *x /= norm; + } + } + out.push(v); + } + Ok(out) + } + + fn model_id(&self) -> &str { + "hash-v1" + } + + fn dim(&self) -> usize { + self.dim + } +} + +/// Default model2vec repo — multilingual so RU/EN prose both embed well. +/// Overridable via `TJ_EMBED_MODEL`. +#[cfg(feature = "embed")] +pub const DEFAULT_EMBED_MODEL: &str = "minishlab/potion-multilingual-128M"; + +/// The embedder the journal uses unless overridden. With the `embed` feature +/// (on by default) it loads the model2vec static model for true semantic +/// recall; if that can't load — offline first run, download failure, or +/// `TJ_EMBED=hash` — it falls back to the dependency-free lexical +/// [`HashEmbedder`] so the journal never breaks. +pub fn default_embedder() -> Box { + // Test/escape hatch: force the deterministic lexical embedder. 
+ if std::env::var("TJ_EMBED").as_deref() == Ok("hash") { + return Box::new(HashEmbedder::default()); + } + #[cfg(feature = "embed")] + { + let repo = + std::env::var("TJ_EMBED_MODEL").unwrap_or_else(|_| DEFAULT_EMBED_MODEL.to_string()); + match Model2VecEmbedder::load(&repo) { + Ok(m) => return Box::new(m), + Err(e) => { + tracing::warn!("model2vec load failed ({e:#}); using hash embedder fallback"); + } + } + } + Box::new(HashEmbedder::default()) +} + +/// True semantic embedder backed by a model2vec static model (pure-Rust, no +/// onnxruntime). The model is downloaded once via the HuggingFace hub and +/// cached locally; later loads read the cache. Behind the `embed` feature. +#[cfg(feature = "embed")] +pub struct Model2VecEmbedder { + model: model2vec_rs::model::StaticModel, + model_id: String, + dim: usize, +} + +#[cfg(feature = "embed")] +impl Model2VecEmbedder { + /// Load `repo` (a HuggingFace model id or a local directory). Probes the + /// model once to discover its output dimension. 
+ pub fn load(repo: &str) -> anyhow::Result { + let model = model2vec_rs::model::StaticModel::from_pretrained( + repo, + None, // no auth token + Some(true), // L2-normalise outputs + None, // no subfolder + )?; + let dim = model.encode_single("probe").len(); + anyhow::ensure!( + dim > 0, + "model2vec model {repo} produced a zero-dim embedding" + ); + Ok(Self { + model, + model_id: format!("model2vec:{repo}"), + dim, + }) + } +} + +#[cfg(feature = "embed")] +impl Embedder for Model2VecEmbedder { + fn embed(&self, texts: &[&str]) -> anyhow::Result>> { + let owned: Vec = texts.iter().map(|s| s.to_string()).collect(); + Ok(self.model.encode(&owned)) + } + + fn model_id(&self) -> &str { + &self.model_id + } + + fn dim(&self) -> usize { + self.dim + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn cosine_identical_is_one() { + let v = vec![1.0, 2.0, 3.0]; + assert!((cosine(&v, &v) - 1.0).abs() < 1e-6); + } + + #[test] + fn cosine_orthogonal_is_zero() { + assert_eq!(cosine(&[1.0, 0.0], &[0.0, 1.0]), 0.0); + } + + #[test] + fn cosine_mismatch_or_zero_norm_is_zero() { + assert_eq!(cosine(&[1.0, 2.0], &[1.0]), 0.0); + assert_eq!(cosine(&[0.0, 0.0], &[1.0, 1.0]), 0.0); + } + + #[test] + fn blob_round_trips() { + let v = vec![0.5, -1.25, 3.0, 0.0]; + assert_eq!(from_blob(&to_blob(&v)), v); + } + + #[test] + fn is_embeddable_skips_short_boilerplate() { + assert!(!is_embeddable("")); + assert!(!is_embeddable("[open]")); + assert!(is_embeddable("Fix the auth bug in middleware")); + } + + #[test] + fn hash_embedder_is_deterministic_and_normalised() { + let e = HashEmbedder::new(32); + let a = e.embed_one("payment gateway dedup").unwrap(); + let b = e.embed_one("payment gateway dedup").unwrap(); + assert_eq!(a, b); + assert_eq!(a.len(), 32); + let norm: f32 = a.iter().map(|x| x * x).sum::().sqrt(); + assert!((norm - 1.0).abs() < 1e-5); + } + + #[test] + fn hash_embedder_overlap_ranks_above_disjoint() { + let e = HashEmbedder::new(256); + let q = 
e.embed_one("payment refund duplicate write").unwrap(); + let near = e.embed_one("duplicate refund write on payment").unwrap(); + let far = e.embed_one("frontend button color tweak").unwrap(); + assert!( + cosine(&q, &near) > cosine(&q, &far), + "lexical overlap must score higher: near={} far={}", + cosine(&q, &near), + cosine(&q, &far) + ); + } +} diff --git a/crates/tj-core/src/lib.rs b/crates/tj-core/src/lib.rs index 2ec5d50..27ace2e 100644 --- a/crates/tj-core/src/lib.rs +++ b/crates/tj-core/src/lib.rs @@ -52,6 +52,7 @@ pub mod classifier; pub mod completeness; pub mod db; pub mod dream; +pub mod embed; pub mod event; pub mod frontmatter; pub mod fts; diff --git a/crates/tj-mcp/Cargo.toml b/crates/tj-mcp/Cargo.toml index 6edce88..d67cbab 100644 --- a/crates/tj-mcp/Cargo.toml +++ b/crates/tj-mcp/Cargo.toml @@ -16,7 +16,8 @@ name = "task-journal-mcp" path = "src/main.rs" [dependencies] -tj-core = { package = "task-journal-core", version = "0.14.2", path = "../tj-core" } +# Lean: the MCP server doesn't embed yet, so it skips the model2vec backend. +tj-core = { package = "task-journal-core", version = "0.15.0", path = "../tj-core", default-features = false } anyhow = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } diff --git a/plugin/.claude-plugin/plugin.json b/plugin/.claude-plugin/plugin.json index 0572dc8..8577ef5 100644 --- a/plugin/.claude-plugin/plugin.json +++ b/plugin/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "task-journal", - "version": "0.14.4", + "version": "0.15.0", "description": "Append-only journal of AI-coding task reasoning chains: hypotheses, decisions, rejections, evidence. Renders compact resume packs so an agent can pick up a 2-week-old task with full context.", "author": { "name": "Mher Shahinyan"