diff --git a/internal/vector/hybrid/engine.go b/internal/vector/hybrid/engine.go index 02b67e3d..18313bf7 100644 --- a/internal/vector/hybrid/engine.go +++ b/internal/vector/hybrid/engine.go @@ -6,7 +6,10 @@ import ( "errors" "fmt" "math" + "strings" + "unicode" + "go.kenn.io/msgvault/internal/query" "go.kenn.io/msgvault/internal/search" "go.kenn.io/msgvault/internal/vector" ) @@ -145,8 +148,21 @@ func (e *Engine) Search(ctx context.Context, req SearchRequest) ([]vector.FusedH if !ok { return nil, ResultMeta{}, errors.New("hybrid mode requires a FusingBackend; non-fusing fallback not wired in MVP") } + // FTSQuery is contractually a pre-tokenized FTS5 MATCH expression + // (see vector.FusedRequest). An explicit override passes through + // verbatim; an empty one falls back to the raw FreeText, which must + // be tokenized and quote-escaped before it reaches the fused CTE's + // `messages_fts MATCH` — otherwise FTS5 metacharacters in a natural- + // language query (",", "?", ...) reach the parser raw and raise a + // syntax error (issue #366). Only the hybrid path was affected: + // --mode fts sanitizes via Store.SearchMessages, and --mode vector + // has no BM25 branch at all. + ftsQuery := req.FTSQuery + if ftsQuery == "" { + ftsQuery = buildFTSMatch(req.FreeText) + } fReq := vector.FusedRequest{ - FTSQuery: firstNonEmpty(req.FTSQuery, req.FreeText), + FTSQuery: ftsQuery, QueryVec: queryVec, Generation: active.ID, KPerSignal: e.cfg.KPerSignal, @@ -186,9 +202,41 @@ func vectorHitsToFused(hits []vector.Hit) []vector.FusedHit { return out } -func firstNonEmpty(a, b string) string { - if a != "" { - return a +// buildFTSMatch turns a raw free-text query into an FTS5 MATCH +// expression for the hybrid BM25 branch, mirroring the --mode fts path +// (Store.SearchMessages → strings.Fields → dialect.BuildFTSArg): each +// whitespace-separated term is quote-escaped and prefix-matched, and +// terms the FTS5 tokenizer would drop entirely (punctuation-only) are +// skipped. Returns "" when nothing usable remains, which the fused +// query treats as "skip BM25" (vector-only) rather than dispatching a +// malformed MATCH. Quoting each term is what neutralizes embedded +// metacharacters like "," and "?" that otherwise crash FTS5 (#366). +func buildFTSMatch(freeText string) string { + terms := strings.Fields(freeText) + kept := terms[:0] + for _, t := range terms { + if hasFTSToken(t) { + kept = append(kept, t) + } + } + if len(kept) == 0 { + return "" + } + _, arg := query.SQLiteQueryDialect{}.BuildFTSTerm(kept) + return arg +} + +// hasFTSToken reports whether s contains a rune the default FTS5 +// tokenizer (unicode61) emits as part of a token — a Unicode letter or +// digit. Punctuation-only terms tokenize to nothing, so a MATCH built +// from them is a syntax error; callers drop them. Mirrors the helper of +// the same name in internal/store (the two packages keep parallel +// minimal abstractions rather than share a dependency). +func hasFTSToken(s string) bool { + for _, r := range s { + if unicode.IsLetter(r) || unicode.IsDigit(r) { + return true + } } - return b + return false } diff --git a/internal/vector/hybrid/engine_test.go b/internal/vector/hybrid/engine_test.go index 381f8278..b1128e19 100644 --- a/internal/vector/hybrid/engine_test.go +++ b/internal/vector/hybrid/engine_test.go @@ -163,6 +163,73 @@ func TestEngine_Hybrid_HappyPath(t *testing.T) { assert.Equal(len(results), meta.ReturnedCount) } +// TestBuildFTSMatch covers the FreeText → FTS5 MATCH sanitization +// directly (no DB needed). Every term is quote-wrapped with a "*" +// prefix, embedded double-quotes are doubled, stray "*" is stripped, +// and punctuation-only terms (which the FTS5 tokenizer would drop) are +// removed — yielding "" when nothing usable remains so the caller skips +// the BM25 branch instead of dispatching a malformed MATCH. +func TestBuildFTSMatch(t *testing.T) { + cases := []struct { + name string + in string + want string + }{ + {"plain terms", "budget review", `"budget"* "review"*`}, + {"trailing question mark", "budget?", `"budget?"*`}, + {"comma and question (issue #366)", "what's the budget, roughly?", `"what's"* "the"* "budget,"* "roughly?"*`}, + {"embedded double quote doubled", `say "hi"`, `"say"* """hi"""*`}, + {"stray star stripped", "bud*get", `"budget"*`}, + {"punctuation-only dropped", "??? ,,, !!!", ""}, + {"empty", "", ""}, + {"mixed tokenless dropped", "--- budget ???", `"budget"*`}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assertpkg.Equal(t, tc.want, buildFTSMatch(tc.in)) + }) + } +} + +// TestEngine_Hybrid_PunctuationQuery is the regression test for #366: a +// --mode hybrid query containing FTS5 metacharacters (",", "?", "(", +// ")", ":") used to reach `messages_fts MATCH` unescaped and crash the +// fused query with "fts5: syntax error near ...". The engine now +// tokenizes and quote-escapes FreeText into a valid MATCH expression, +// so these queries succeed — and a metacharacter-laden query still +// matches on its real terms via the BM25 branch. +func TestEngine_Hybrid_PunctuationQuery(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + ctx := context.Background() + f := newEngineFixture(t) + + for _, q := range []string{ + "what's the budget, roughly?", + "meeting, tomorrow?", + "review (Q3): costs, etc.", + } { + _, _, err := f.Engine.Search(ctx, SearchRequest{ + Mode: ModeHybrid, + FreeText: q, + Limit: 5, + }) + require.NoErrorf(err, "hybrid search must not raise an FTS5 syntax error for %q", q) + } + + // Punctuation is neutralized, not dropped: "meeting, tomorrow?" + // must still surface the "meeting tomorrow" message (id 1) through + // the BM25 branch. + results, _, err := f.Engine.Search(ctx, SearchRequest{ + Mode: ModeHybrid, + FreeText: "meeting, tomorrow?", + Limit: 5, + }) + require.NoError(err, "Search") + require.NotEmpty(results, "expected hits for 'meeting, tomorrow?'") + assert.Equal(int64(1), results[0].MessageID, "top hit") +} + func TestEngine_Vector_HappyPath(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t)