Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 53 additions & 5 deletions internal/vector/hybrid/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@ import (
"errors"
"fmt"
"math"
"strings"
"unicode"

"go.kenn.io/msgvault/internal/query"
"go.kenn.io/msgvault/internal/search"
"go.kenn.io/msgvault/internal/vector"
)
Expand Down Expand Up @@ -145,8 +148,21 @@ func (e *Engine) Search(ctx context.Context, req SearchRequest) ([]vector.FusedH
if !ok {
return nil, ResultMeta{}, errors.New("hybrid mode requires a FusingBackend; non-fusing fallback not wired in MVP")
}
// FTSQuery is contractually a pre-tokenized FTS5 MATCH expression
// (see vector.FusedRequest). An explicit override passes through
// verbatim; an empty one falls back to the raw FreeText, which must
// be tokenized and quote-escaped before it reaches the fused CTE's
// `messages_fts MATCH` — otherwise FTS5 metacharacters in a natural-
// language query (",", "?", ...) reach the parser raw and raise a
// syntax error (issue #366). Only the hybrid path was affected:
// --mode fts sanitizes via Store.SearchMessages, and --mode vector
// has no BM25 branch at all.
ftsQuery := req.FTSQuery
if ftsQuery == "" {
ftsQuery = buildFTSMatch(req.FreeText)
}
fReq := vector.FusedRequest{
FTSQuery: firstNonEmpty(req.FTSQuery, req.FreeText),
FTSQuery: ftsQuery,
QueryVec: queryVec,
Generation: active.ID,
KPerSignal: e.cfg.KPerSignal,
Expand Down Expand Up @@ -186,9 +202,41 @@ func vectorHitsToFused(hits []vector.Hit) []vector.FusedHit {
return out
}

func firstNonEmpty(a, b string) string {
if a != "" {
return a
// buildFTSMatch turns a raw free-text query into an FTS5 MATCH
// expression for the hybrid BM25 branch, mirroring the --mode fts path
// (Store.SearchMessages → strings.Fields → dialect.BuildFTSArg): each
// whitespace-separated term is quote-escaped and prefix-matched, and
// terms the FTS5 tokenizer would drop entirely (punctuation-only) are
// skipped. Returns "" when nothing usable remains, which the fused
// query treats as "skip BM25" (vector-only) rather than dispatching a
// malformed MATCH. Quoting each term is what neutralizes embedded
// metacharacters like "," and "?" that otherwise crash FTS5 (#366).
func buildFTSMatch(freeText string) string {
terms := strings.Fields(freeText)
kept := terms[:0]
for _, t := range terms {
if hasFTSToken(t) {
kept = append(kept, t)
}
}
if len(kept) == 0 {
return ""
}
_, arg := query.SQLiteQueryDialect{}.BuildFTSTerm(kept)
return arg
}

// hasFTSToken reports whether s contains a rune the default FTS5
// tokenizer (unicode61) emits as part of a token — a Unicode letter or
// digit. Punctuation-only terms tokenize to nothing, so a MATCH built
// from them is a syntax error; callers drop them. Mirrors the helper of
// the same name in internal/store (the two packages keep parallel
// minimal abstractions rather than share a dependency).
func hasFTSToken(s string) bool {
for _, r := range s {
if unicode.IsLetter(r) || unicode.IsDigit(r) {
return true
}
}
return b
return false
}
67 changes: 67 additions & 0 deletions internal/vector/hybrid/engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,73 @@ func TestEngine_Hybrid_HappyPath(t *testing.T) {
assert.Equal(len(results), meta.ReturnedCount)
}

// TestBuildFTSMatch covers the FreeText → FTS5 MATCH sanitization
// directly (no DB needed). Every term is quote-wrapped with a "*"
// prefix, embedded double-quotes are doubled, stray "*" is stripped,
// and punctuation-only terms (which the FTS5 tokenizer would drop) are
// removed — yielding "" when nothing usable remains so the caller skips
// the BM25 branch instead of dispatching a malformed MATCH.
func TestBuildFTSMatch(t *testing.T) {
cases := []struct {
name string
in string
want string
}{
{"plain terms", "budget review", `"budget"* "review"*`},
{"trailing question mark", "budget?", `"budget?"*`},
{"comma and question (issue #366)", "what's the budget, roughly?", `"what's"* "the"* "budget,"* "roughly?"*`},
{"embedded double quote doubled", `say "hi"`, `"say"* """hi"""*`},
{"stray star stripped", "bud*get", `"budget"*`},
{"punctuation-only dropped", "??? ,,, !!!", ""},
{"empty", "", ""},
{"mixed tokenless dropped", "--- budget ???", `"budget"*`},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
assertpkg.Equal(t, tc.want, buildFTSMatch(tc.in))
})
}
}

// TestEngine_Hybrid_PunctuationQuery is the regression test for #366: a
// --mode hybrid query containing FTS5 metacharacters (",", "?", "(",
// ")", ":") used to reach `messages_fts MATCH` unescaped and crash the
// fused query with "fts5: syntax error near ...". The engine now
// tokenizes and quote-escapes FreeText into a valid MATCH expression,
// so these queries succeed — and a metacharacter-laden query still
// matches on its real terms via the BM25 branch.
func TestEngine_Hybrid_PunctuationQuery(t *testing.T) {
require := requirepkg.New(t)
assert := assertpkg.New(t)
ctx := context.Background()
f := newEngineFixture(t)

for _, q := range []string{
"what's the budget, roughly?",
"meeting, tomorrow?",
"review (Q3): costs, etc.",
} {
_, _, err := f.Engine.Search(ctx, SearchRequest{
Mode: ModeHybrid,
FreeText: q,
Limit: 5,
})
require.NoErrorf(err, "hybrid search must not raise an FTS5 syntax error for %q", q)
}

// Punctuation is neutralized, not dropped: "meeting, tomorrow?"
// must still surface the "meeting tomorrow" message (id 1) through
// the BM25 branch.
results, _, err := f.Engine.Search(ctx, SearchRequest{
Mode: ModeHybrid,
FreeText: "meeting, tomorrow?",
Limit: 5,
})
require.NoError(err, "Search")
require.NotEmpty(results, "expected hits for 'meeting, tomorrow?'")
assert.Equal(int64(1), results[0].MessageID, "top hit")
}

func TestEngine_Vector_HappyPath(t *testing.T) {
require := requirepkg.New(t)
assert := assertpkg.New(t)
Expand Down
Loading