From 3bfd84976a38e95f5638c2581983b040c6765483 Mon Sep 17 00:00:00 2001 From: ponbac Date: Thu, 5 Feb 2026 22:19:12 +0100 Subject: [PATCH] feat(search): implement super search with hybrid BM25+vector search --- ...99b2b2e778224716fe7b7fee470d2ff80fcf.json} | 10 +- ...3e638ffd210367b0b90dae1c6a2ad18b5bbab.json | 20 + ...471903db0038c24868853b508b1028522ef9a.json | 14 + ...723fcb0511b89b85e9d1f39f01a9c15120aee.json | 129 ++++ ...e7c48d4044c2929203700e9bb172ba694757.json} | 10 +- ...b55732bfba4a8ebc7265bfbd2f26abf17cad9.json | 52 ++ ...1257c8c069f9be7d4634db3b87011c85493a2.json | 135 ++++ ...634e3cd48eaa3243ffcb1c67278d821365f49.json | 25 + ...668f0de12583faac70c14028b8fb34fb6a6a7.json | 52 ++ ...8ca99faf3d626ff3e0eb1cf836f66e16a385.json} | 10 +- ...971907e894858a3f4e2b33b3ee66099bdcdc6.json | 175 ++++++ ...9c17fc948460a49060203e63836940ae97829.json | 32 + Cargo.lock | 595 +++++++++++++++++- app/src/components/cmd-k.tsx | 123 +++- app/src/hooks/useDebounce.ts | 17 + app/src/lib/api/mutations/differs.ts | 20 +- app/src/lib/api/mutations/repositories.ts | 8 +- app/src/lib/api/queries/queries.ts | 12 +- app/src/lib/api/queries/search.ts | 39 ++ app/src/routes/_layout/prs/$prId/route.tsx | 4 +- app/src/routes/_layout/prs/route.tsx | 8 +- app/src/routes/_layout/repositories/route.tsx | 8 +- az-devops/Cargo.toml | 2 +- docs/implementation-plan.md | 228 +++++++ docs/super-search-design.md | 468 ++++++++++++++ milltime/Cargo.toml | 2 +- ...3e638ffd210367b0b90dae1c6a2ad18b5bbab.json | 20 + ...66c3cc3fec0a12d9a1aa9530a320e1cc9899f.json | 164 +++++ ...1002071aedcf109f58e62082784071c368368.json | 135 ++++ ...634e3cd48eaa3243ffcb1c67278d821365f49.json | 25 + ...668f0de12583faac70c14028b8fb34fb6a6a7.json | 52 ++ ...eb4a157cae880eb8a56c3bfb83623f1391e46.json | 175 ++++++ ...9c17fc948460a49060203e63836940ae97829.json | 32 + ...7f62625ac39629a29430dff51c1501f3e7eb2.json | 52 ++ toki-api/Cargo.toml | 5 +- ...20260205220000_create_search_documents.sql | 82 +++ toki-api/scripts/init_db.sh 
| 2 +- toki-api/src/app_state.rs | 56 ++ toki-api/src/domain/mod.rs | 1 + toki-api/src/domain/search/embedder/gemini.rs | 146 +++++ toki-api/src/domain/search/embedder/mock.rs | 151 +++++ toki-api/src/domain/search/embedder/mod.rs | 9 + toki-api/src/domain/search/index_worker.rs | 99 +++ toki-api/src/domain/search/indexer.rs | 470 ++++++++++++++ toki-api/src/domain/search/mod.rs | 66 ++ toki-api/src/domain/search/parser.rs | 416 ++++++++++++ toki-api/src/domain/search/repository/mock.rs | 345 ++++++++++ toki-api/src/domain/search/repository/mod.rs | 9 + .../src/domain/search/repository/postgres.rs | 515 +++++++++++++++ toki-api/src/domain/search/service.rs | 331 ++++++++++ toki-api/src/domain/search/source/ado.rs | 211 +++++++ toki-api/src/domain/search/source/mod.rs | 5 + toki-api/src/domain/search/traits.rs | 162 +++++ toki-api/src/domain/search/types.rs | 225 +++++++ toki-api/src/router.rs | 7 +- toki-api/src/routes/mod.rs | 1 + toki-api/src/routes/search.rs | 41 ++ 57 files changed, 6134 insertions(+), 74 deletions(-) rename .sqlx/{query-ed0cf3603af8311bafbccf7828c74062321d73b4d100014b4173b7778cda7f20.json => query-04f8354faff05b84b0ed028d139b99b2b2e778224716fe7b7fee470d2ff80fcf.json} (78%) create mode 100644 .sqlx/query-05dc742cf5195c7d8452fc64af73e638ffd210367b0b90dae1c6a2ad18b5bbab.json create mode 100644 .sqlx/query-194acd0f52892c25a2d4c3e4405471903db0038c24868853b508b1028522ef9a.json create mode 100644 .sqlx/query-36b083362a76c963febf90ab0b2723fcb0511b89b85e9d1f39f01a9c15120aee.json rename .sqlx/{query-6247813c0b5c62947b577514ca1569b32ede7c85b1f9a2f301c2582d58068d6b.json => query-38a9e8c389152859cb16f1bc4088e7c48d4044c2929203700e9bb172ba694757.json} (74%) create mode 100644 .sqlx/query-4a7de0d7a42ce7b4c995a92928bb55732bfba4a8ebc7265bfbd2f26abf17cad9.json create mode 100644 .sqlx/query-70978b14ccbabaa4c78c3bc1fd11257c8c069f9be7d4634db3b87011c85493a2.json create mode 100644 .sqlx/query-8557872e0bb4ec41241c5027536634e3cd48eaa3243ffcb1c67278d821365f49.json 
create mode 100644 .sqlx/query-91b62aaf67f424e6cc413380dd7668f0de12583faac70c14028b8fb34fb6a6a7.json rename .sqlx/{query-1dbbf9fd1e30024c4a0d0cfc7767b3650725f936f62e32f07ea53271f1fbd2fa.json => query-a17d79ee056495d4ddc1e5b03ad68ca99faf3d626ff3e0eb1cf836f66e16a385.json} (83%) create mode 100644 .sqlx/query-a72a103d262ec8aa1335754cb3c971907e894858a3f4e2b33b3ee66099bdcdc6.json create mode 100644 .sqlx/query-c3e92392cdeee40bef2116569c39c17fc948460a49060203e63836940ae97829.json create mode 100644 app/src/hooks/useDebounce.ts create mode 100644 app/src/lib/api/queries/search.ts create mode 100644 docs/implementation-plan.md create mode 100644 docs/super-search-design.md create mode 100644 toki-api/.sqlx/query-05dc742cf5195c7d8452fc64af73e638ffd210367b0b90dae1c6a2ad18b5bbab.json create mode 100644 toki-api/.sqlx/query-0d2de90b227d0282c0e040cfaa266c3cc3fec0a12d9a1aa9530a320e1cc9899f.json create mode 100644 toki-api/.sqlx/query-66404001aaa432bb201713792ad1002071aedcf109f58e62082784071c368368.json create mode 100644 toki-api/.sqlx/query-8557872e0bb4ec41241c5027536634e3cd48eaa3243ffcb1c67278d821365f49.json create mode 100644 toki-api/.sqlx/query-91b62aaf67f424e6cc413380dd7668f0de12583faac70c14028b8fb34fb6a6a7.json create mode 100644 toki-api/.sqlx/query-c2ab51c6047a03b187539718cf8eb4a157cae880eb8a56c3bfb83623f1391e46.json create mode 100644 toki-api/.sqlx/query-c3e92392cdeee40bef2116569c39c17fc948460a49060203e63836940ae97829.json create mode 100644 toki-api/.sqlx/query-f8b004aabe43de431ecf2fa90417f62625ac39629a29430dff51c1501f3e7eb2.json create mode 100644 toki-api/migrations/20260205220000_create_search_documents.sql create mode 100644 toki-api/src/domain/search/embedder/gemini.rs create mode 100644 toki-api/src/domain/search/embedder/mock.rs create mode 100644 toki-api/src/domain/search/embedder/mod.rs create mode 100644 toki-api/src/domain/search/index_worker.rs create mode 100644 toki-api/src/domain/search/indexer.rs create mode 100644 toki-api/src/domain/search/mod.rs 
create mode 100644 toki-api/src/domain/search/parser.rs create mode 100644 toki-api/src/domain/search/repository/mock.rs create mode 100644 toki-api/src/domain/search/repository/mod.rs create mode 100644 toki-api/src/domain/search/repository/postgres.rs create mode 100644 toki-api/src/domain/search/service.rs create mode 100644 toki-api/src/domain/search/source/ado.rs create mode 100644 toki-api/src/domain/search/source/mod.rs create mode 100644 toki-api/src/domain/search/traits.rs create mode 100644 toki-api/src/domain/search/types.rs create mode 100644 toki-api/src/routes/search.rs diff --git a/.sqlx/query-ed0cf3603af8311bafbccf7828c74062321d73b4d100014b4173b7778cda7f20.json b/.sqlx/query-04f8354faff05b84b0ed028d139b99b2b2e778224716fe7b7fee470d2ff80fcf.json similarity index 78% rename from .sqlx/query-ed0cf3603af8311bafbccf7828c74062321d73b4d100014b4173b7778cda7f20.json rename to .sqlx/query-04f8354faff05b84b0ed028d139b99b2b2e778224716fe7b7fee470d2ff80fcf.json index a72ba43b..611d0173 100644 --- a/.sqlx/query-ed0cf3603af8311bafbccf7828c74062321d73b4d100014b4173b7778cda7f20.json +++ b/.sqlx/query-04f8354faff05b84b0ed028d139b99b2b2e778224716fe7b7fee470d2ff80fcf.json @@ -1,6 +1,6 @@ { "db_name": "PostgreSQL", - "query": "SELECT id, email, full_name, picture, access_token, roles FROM users", + "query": "SELECT id, email, full_name, picture, access_token, roles, session_auth_hash FROM users", "describe": { "columns": [ { @@ -32,6 +32,11 @@ "ordinal": 5, "name": "roles", "type_info": "TextArray" + }, + { + "ordinal": 6, + "name": "session_auth_hash", + "type_info": "Text" } ], "parameters": { @@ -43,8 +48,9 @@ false, false, false, + false, false ] }, - "hash": "ed0cf3603af8311bafbccf7828c74062321d73b4d100014b4173b7778cda7f20" + "hash": "04f8354faff05b84b0ed028d139b99b2b2e778224716fe7b7fee470d2ff80fcf" } diff --git a/.sqlx/query-05dc742cf5195c7d8452fc64af73e638ffd210367b0b90dae1c6a2ad18b5bbab.json 
b/.sqlx/query-05dc742cf5195c7d8452fc64af73e638ffd210367b0b90dae1c6a2ad18b5bbab.json new file mode 100644 index 00000000..1ad60466 --- /dev/null +++ b/.sqlx/query-05dc742cf5195c7d8452fc64af73e638ffd210367b0b90dae1c6a2ad18b5bbab.json @@ -0,0 +1,20 @@ +{ + "db_name": "PostgreSQL", + "query": "SELECT COUNT(*) as \"count!\" FROM search_documents", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "count!", + "type_info": "Int8" + } + ], + "parameters": { + "Left": [] + }, + "nullable": [ + null + ] + }, + "hash": "05dc742cf5195c7d8452fc64af73e638ffd210367b0b90dae1c6a2ad18b5bbab" +} diff --git a/.sqlx/query-194acd0f52892c25a2d4c3e4405471903db0038c24868853b508b1028522ef9a.json b/.sqlx/query-194acd0f52892c25a2d4c3e4405471903db0038c24868853b508b1028522ef9a.json new file mode 100644 index 00000000..13e87bb3 --- /dev/null +++ b/.sqlx/query-194acd0f52892c25a2d4c3e4405471903db0038c24868853b508b1028522ef9a.json @@ -0,0 +1,14 @@ +{ + "db_name": "PostgreSQL", + "query": "\n DELETE FROM search_documents\n WHERE indexed_at < $1\n ", + "describe": { + "columns": [], + "parameters": { + "Left": [ + "Timestamptz" + ] + }, + "nullable": [] + }, + "hash": "194acd0f52892c25a2d4c3e4405471903db0038c24868853b508b1028522ef9a" +} diff --git a/.sqlx/query-36b083362a76c963febf90ab0b2723fcb0511b89b85e9d1f39f01a9c15120aee.json b/.sqlx/query-36b083362a76c963febf90ab0b2723fcb0511b89b85e9d1f39f01a9c15120aee.json new file mode 100644 index 00000000..031bbc38 --- /dev/null +++ b/.sqlx/query-36b083362a76c963febf90ab0b2723fcb0511b89b85e9d1f39f01a9c15120aee.json @@ -0,0 +1,129 @@ +{ + "db_name": "PostgreSQL", + "query": "\n SELECT\n id,\n source_type as \"source_type: SearchSource\",\n source_id,\n external_id,\n title,\n description,\n status,\n priority,\n item_type,\n author_name,\n url,\n created_at,\n updated_at,\n ts_rank_cd(search_vector, websearch_to_tsquery('english', $1))::float8 as score\n FROM search_documents\n WHERE ($1 = '' OR search_vector @@ websearch_to_tsquery('english', 
$1))\n AND ($2::search_source IS NULL OR source_type = $2)\n AND ($3::text IS NULL OR organization = $3)\n AND ($4::text IS NULL OR project = $4)\n AND ($5::text[] IS NULL OR status = ANY($5))\n AND ($6::int[] IS NULL OR priority = ANY($6))\n AND ($7::text[] IS NULL OR item_type = ANY($7))\n AND ($8::bool IS NULL OR is_draft = $8)\n AND ($9::timestamptz IS NULL OR updated_at >= $9)\n ORDER BY score DESC\n LIMIT $10\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "id", + "type_info": "Int4" + }, + { + "ordinal": 1, + "name": "source_type: SearchSource", + "type_info": { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + } + }, + { + "ordinal": 2, + "name": "source_id", + "type_info": "Text" + }, + { + "ordinal": 3, + "name": "external_id", + "type_info": "Int4" + }, + { + "ordinal": 4, + "name": "title", + "type_info": "Text" + }, + { + "ordinal": 5, + "name": "description", + "type_info": "Text" + }, + { + "ordinal": 6, + "name": "status", + "type_info": "Text" + }, + { + "ordinal": 7, + "name": "priority", + "type_info": "Int4" + }, + { + "ordinal": 8, + "name": "item_type", + "type_info": "Text" + }, + { + "ordinal": 9, + "name": "author_name", + "type_info": "Text" + }, + { + "ordinal": 10, + "name": "url", + "type_info": "Text" + }, + { + "ordinal": 11, + "name": "created_at", + "type_info": "Timestamptz" + }, + { + "ordinal": 12, + "name": "updated_at", + "type_info": "Timestamptz" + }, + { + "ordinal": 13, + "name": "score", + "type_info": "Float8" + } + ], + "parameters": { + "Left": [ + "Text", + { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + }, + "Text", + "Text", + "TextArray", + "Int4Array", + "TextArray", + "Bool", + "Timestamptz", + "Int8" + ] + }, + "nullable": [ + false, + false, + false, + false, + false, + true, + false, + true, + true, + true, + false, + false, + false, + null + ] + }, + "hash": 
"36b083362a76c963febf90ab0b2723fcb0511b89b85e9d1f39f01a9c15120aee" +} diff --git a/.sqlx/query-6247813c0b5c62947b577514ca1569b32ede7c85b1f9a2f301c2582d58068d6b.json b/.sqlx/query-38a9e8c389152859cb16f1bc4088e7c48d4044c2929203700e9bb172ba694757.json similarity index 74% rename from .sqlx/query-6247813c0b5c62947b577514ca1569b32ede7c85b1f9a2f301c2582d58068d6b.json rename to .sqlx/query-38a9e8c389152859cb16f1bc4088e7c48d4044c2929203700e9bb172ba694757.json index f2346178..e099f9f2 100644 --- a/.sqlx/query-6247813c0b5c62947b577514ca1569b32ede7c85b1f9a2f301c2582d58068d6b.json +++ b/.sqlx/query-38a9e8c389152859cb16f1bc4088e7c48d4044c2929203700e9bb172ba694757.json @@ -1,6 +1,6 @@ { "db_name": "PostgreSQL", - "query": "\n SELECT id, email, full_name, picture, access_token, roles\n FROM users\n WHERE id = $1\n ", + "query": "\n SELECT id, email, full_name, picture, access_token, roles, session_auth_hash\n FROM users\n WHERE id = $1\n ", "describe": { "columns": [ { @@ -32,6 +32,11 @@ "ordinal": 5, "name": "roles", "type_info": "TextArray" + }, + { + "ordinal": 6, + "name": "session_auth_hash", + "type_info": "Text" } ], "parameters": { @@ -45,8 +50,9 @@ false, false, false, + false, false ] }, - "hash": "6247813c0b5c62947b577514ca1569b32ede7c85b1f9a2f301c2582d58068d6b" + "hash": "38a9e8c389152859cb16f1bc4088e7c48d4044c2929203700e9bb172ba694757" } diff --git a/.sqlx/query-4a7de0d7a42ce7b4c995a92928bb55732bfba4a8ebc7265bfbd2f26abf17cad9.json b/.sqlx/query-4a7de0d7a42ce7b4c995a92928bb55732bfba4a8ebc7265bfbd2f26abf17cad9.json new file mode 100644 index 00000000..0d7524b8 --- /dev/null +++ b/.sqlx/query-4a7de0d7a42ce7b4c995a92928bb55732bfba4a8ebc7265bfbd2f26abf17cad9.json @@ -0,0 +1,52 @@ +{ + "db_name": "PostgreSQL", + "query": "\n INSERT INTO search_documents (\n source_type, source_id, external_id, title, description, content,\n organization, project, repo_name, status,\n author_id, author_name, assigned_to_id, assigned_to_name,\n priority, item_type, is_draft,\n created_at, 
updated_at, closed_at,\n url, parent_id, linked_work_items, embedding\n ) VALUES (\n $1, $2, $3, $4, $5, $6, $7, $8, $9, $10,\n $11, $12, $13, $14, $15, $16, $17, $18, $19, $20,\n $21, $22, $23, $24\n )\n ON CONFLICT (source_type, source_id) DO UPDATE SET\n title = EXCLUDED.title,\n description = EXCLUDED.description,\n content = EXCLUDED.content,\n status = EXCLUDED.status,\n author_id = EXCLUDED.author_id,\n author_name = EXCLUDED.author_name,\n assigned_to_id = EXCLUDED.assigned_to_id,\n assigned_to_name = EXCLUDED.assigned_to_name,\n priority = EXCLUDED.priority,\n item_type = EXCLUDED.item_type,\n is_draft = EXCLUDED.is_draft,\n updated_at = EXCLUDED.updated_at,\n closed_at = EXCLUDED.closed_at,\n linked_work_items = EXCLUDED.linked_work_items,\n embedding = EXCLUDED.embedding,\n indexed_at = NOW()\n ", + "describe": { + "columns": [], + "parameters": { + "Left": [ + { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + }, + "Text", + "Int4", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Int4", + "Text", + "Bool", + "Timestamptz", + "Timestamptz", + "Timestamptz", + "Text", + "Int4", + "Int4Array", + { + "Custom": { + "name": "vector", + "kind": "Simple" + } + } + ] + }, + "nullable": [] + }, + "hash": "4a7de0d7a42ce7b4c995a92928bb55732bfba4a8ebc7265bfbd2f26abf17cad9" +} diff --git a/.sqlx/query-70978b14ccbabaa4c78c3bc1fd11257c8c069f9be7d4634db3b87011c85493a2.json b/.sqlx/query-70978b14ccbabaa4c78c3bc1fd11257c8c069f9be7d4634db3b87011c85493a2.json new file mode 100644 index 00000000..d8325c47 --- /dev/null +++ b/.sqlx/query-70978b14ccbabaa4c78c3bc1fd11257c8c069f9be7d4634db3b87011c85493a2.json @@ -0,0 +1,135 @@ +{ + "db_name": "PostgreSQL", + "query": "\n WITH bm25_results AS (\n SELECT\n id,\n ts_rank_cd(search_vector, websearch_to_tsquery('english', $1)) as score,\n ROW_NUMBER() OVER (\n ORDER BY ts_rank_cd(search_vector, 
websearch_to_tsquery('english', $1)) DESC\n ) as rank\n FROM search_documents\n WHERE ($1 = '' OR search_vector @@ websearch_to_tsquery('english', $1))\n AND ($2::search_source IS NULL OR source_type = $2)\n AND ($3::text IS NULL OR organization = $3)\n AND ($4::text IS NULL OR project = $4)\n AND ($5::text[] IS NULL OR status = ANY($5))\n AND ($6::int[] IS NULL OR priority = ANY($6))\n AND ($7::text[] IS NULL OR item_type = ANY($7))\n AND ($8::bool IS NULL OR is_draft = $8)\n AND ($9::timestamptz IS NULL OR updated_at >= $9)\n LIMIT 100\n ),\n vector_results AS (\n SELECT\n id,\n 1 - (embedding <=> $10) as score,\n ROW_NUMBER() OVER (\n ORDER BY embedding <=> $10\n ) as rank\n FROM search_documents\n WHERE embedding IS NOT NULL\n AND ($2::search_source IS NULL OR source_type = $2)\n AND ($3::text IS NULL OR organization = $3)\n AND ($4::text IS NULL OR project = $4)\n AND ($5::text[] IS NULL OR status = ANY($5))\n AND ($6::int[] IS NULL OR priority = ANY($6))\n AND ($7::text[] IS NULL OR item_type = ANY($7))\n AND ($8::bool IS NULL OR is_draft = $8)\n AND ($9::timestamptz IS NULL OR updated_at >= $9)\n LIMIT 100\n ),\n rrf_combined AS (\n SELECT\n COALESCE(b.id, v.id) as id,\n (COALESCE(1.0 / (60 + b.rank), 0) + COALESCE(1.0 / (60 + v.rank), 0))::float8 as rrf_score\n FROM bm25_results b\n FULL OUTER JOIN vector_results v ON b.id = v.id\n )\n SELECT\n d.id,\n d.source_type as \"source_type: SearchSource\",\n d.source_id,\n d.external_id,\n d.title,\n d.description,\n d.status,\n d.priority,\n d.item_type,\n d.author_name,\n d.url,\n d.created_at,\n d.updated_at,\n r.rrf_score as score\n FROM rrf_combined r\n JOIN search_documents d ON d.id = r.id\n ORDER BY r.rrf_score DESC\n LIMIT $11\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "id", + "type_info": "Int4" + }, + { + "ordinal": 1, + "name": "source_type: SearchSource", + "type_info": { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + } + }, 
+ { + "ordinal": 2, + "name": "source_id", + "type_info": "Text" + }, + { + "ordinal": 3, + "name": "external_id", + "type_info": "Int4" + }, + { + "ordinal": 4, + "name": "title", + "type_info": "Text" + }, + { + "ordinal": 5, + "name": "description", + "type_info": "Text" + }, + { + "ordinal": 6, + "name": "status", + "type_info": "Text" + }, + { + "ordinal": 7, + "name": "priority", + "type_info": "Int4" + }, + { + "ordinal": 8, + "name": "item_type", + "type_info": "Text" + }, + { + "ordinal": 9, + "name": "author_name", + "type_info": "Text" + }, + { + "ordinal": 10, + "name": "url", + "type_info": "Text" + }, + { + "ordinal": 11, + "name": "created_at", + "type_info": "Timestamptz" + }, + { + "ordinal": 12, + "name": "updated_at", + "type_info": "Timestamptz" + }, + { + "ordinal": 13, + "name": "score", + "type_info": "Float8" + } + ], + "parameters": { + "Left": [ + "Text", + { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + }, + "Text", + "Text", + "TextArray", + "Int4Array", + "TextArray", + "Bool", + "Timestamptz", + { + "Custom": { + "name": "vector", + "kind": "Simple" + } + }, + "Int8" + ] + }, + "nullable": [ + false, + false, + false, + false, + false, + true, + false, + true, + true, + true, + false, + false, + false, + null + ] + }, + "hash": "70978b14ccbabaa4c78c3bc1fd11257c8c069f9be7d4634db3b87011c85493a2" +} diff --git a/.sqlx/query-8557872e0bb4ec41241c5027536634e3cd48eaa3243ffcb1c67278d821365f49.json b/.sqlx/query-8557872e0bb4ec41241c5027536634e3cd48eaa3243ffcb1c67278d821365f49.json new file mode 100644 index 00000000..7d112371 --- /dev/null +++ b/.sqlx/query-8557872e0bb4ec41241c5027536634e3cd48eaa3243ffcb1c67278d821365f49.json @@ -0,0 +1,25 @@ +{ + "db_name": "PostgreSQL", + "query": "\n DELETE FROM search_documents\n WHERE source_type = $1 AND source_id = $2\n ", + "describe": { + "columns": [], + "parameters": { + "Left": [ + { + "Custom": { + "name": "search_source", + "kind": { + "Enum": 
[ + "pr", + "work_item" + ] + } + } + }, + "Text" + ] + }, + "nullable": [] + }, + "hash": "8557872e0bb4ec41241c5027536634e3cd48eaa3243ffcb1c67278d821365f49" +} diff --git a/.sqlx/query-91b62aaf67f424e6cc413380dd7668f0de12583faac70c14028b8fb34fb6a6a7.json b/.sqlx/query-91b62aaf67f424e6cc413380dd7668f0de12583faac70c14028b8fb34fb6a6a7.json new file mode 100644 index 00000000..40106741 --- /dev/null +++ b/.sqlx/query-91b62aaf67f424e6cc413380dd7668f0de12583faac70c14028b8fb34fb6a6a7.json @@ -0,0 +1,52 @@ +{ + "db_name": "PostgreSQL", + "query": "\n INSERT INTO search_documents (\n source_type, source_id, external_id, title, description, content,\n organization, project, repo_name, status,\n author_id, author_name, assigned_to_id, assigned_to_name,\n priority, item_type, is_draft,\n created_at, updated_at, closed_at,\n url, parent_id, linked_work_items, embedding\n ) VALUES (\n $1, $2, $3, $4, $5, $6, $7, $8, $9, $10,\n $11, $12, $13, $14, $15, $16, $17, $18, $19, $20,\n $21, $22, $23, $24\n )\n ON CONFLICT (source_type, source_id) DO UPDATE SET\n title = EXCLUDED.title,\n description = EXCLUDED.description,\n content = EXCLUDED.content,\n status = EXCLUDED.status,\n author_id = EXCLUDED.author_id,\n author_name = EXCLUDED.author_name,\n assigned_to_id = EXCLUDED.assigned_to_id,\n assigned_to_name = EXCLUDED.assigned_to_name,\n priority = EXCLUDED.priority,\n item_type = EXCLUDED.item_type,\n is_draft = EXCLUDED.is_draft,\n updated_at = EXCLUDED.updated_at,\n closed_at = EXCLUDED.closed_at,\n linked_work_items = EXCLUDED.linked_work_items,\n embedding = EXCLUDED.embedding,\n indexed_at = NOW()\n ", + "describe": { + "columns": [], + "parameters": { + "Left": [ + { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + }, + "Text", + "Int4", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Int4", + "Text", + "Bool", + "Timestamptz", + "Timestamptz", + 
"Timestamptz", + "Text", + "Int4", + "Int4Array", + { + "Custom": { + "name": "vector", + "kind": "Simple" + } + } + ] + }, + "nullable": [] + }, + "hash": "91b62aaf67f424e6cc413380dd7668f0de12583faac70c14028b8fb34fb6a6a7" +} diff --git a/.sqlx/query-1dbbf9fd1e30024c4a0d0cfc7767b3650725f936f62e32f07ea53271f1fbd2fa.json b/.sqlx/query-a17d79ee056495d4ddc1e5b03ad68ca99faf3d626ff3e0eb1cf836f66e16a385.json similarity index 83% rename from .sqlx/query-1dbbf9fd1e30024c4a0d0cfc7767b3650725f936f62e32f07ea53271f1fbd2fa.json rename to .sqlx/query-a17d79ee056495d4ddc1e5b03ad68ca99faf3d626ff3e0eb1cf836f66e16a385.json index cccfce61..f380d116 100644 --- a/.sqlx/query-1dbbf9fd1e30024c4a0d0cfc7767b3650725f936f62e32f07ea53271f1fbd2fa.json +++ b/.sqlx/query-a17d79ee056495d4ddc1e5b03ad68ca99faf3d626ff3e0eb1cf836f66e16a385.json @@ -1,6 +1,6 @@ { "db_name": "PostgreSQL", - "query": "\n INSERT INTO users (email, full_name, picture, access_token, roles)\n VALUES ($1, $2, $3, $4, $5)\n ON CONFLICT(email) DO UPDATE\n SET full_name = EXCLUDED.full_name,\n picture = EXCLUDED.picture,\n access_token = EXCLUDED.access_token\n RETURNING id, email, full_name, picture, access_token, roles\n ", + "query": "\n INSERT INTO users (email, full_name, picture, access_token, roles)\n VALUES ($1, $2, $3, $4, $5)\n ON CONFLICT(email) DO UPDATE\n SET full_name = EXCLUDED.full_name,\n picture = EXCLUDED.picture,\n access_token = EXCLUDED.access_token\n RETURNING id, email, full_name, picture, access_token, roles, session_auth_hash\n ", "describe": { "columns": [ { @@ -32,6 +32,11 @@ "ordinal": 5, "name": "roles", "type_info": "TextArray" + }, + { + "ordinal": 6, + "name": "session_auth_hash", + "type_info": "Text" } ], "parameters": { @@ -49,8 +54,9 @@ false, false, false, + false, false ] }, - "hash": "1dbbf9fd1e30024c4a0d0cfc7767b3650725f936f62e32f07ea53271f1fbd2fa" + "hash": "a17d79ee056495d4ddc1e5b03ad68ca99faf3d626ff3e0eb1cf836f66e16a385" } diff --git 
a/.sqlx/query-a72a103d262ec8aa1335754cb3c971907e894858a3f4e2b33b3ee66099bdcdc6.json b/.sqlx/query-a72a103d262ec8aa1335754cb3c971907e894858a3f4e2b33b3ee66099bdcdc6.json new file mode 100644 index 00000000..5f36bd1c --- /dev/null +++ b/.sqlx/query-a72a103d262ec8aa1335754cb3c971907e894858a3f4e2b33b3ee66099bdcdc6.json @@ -0,0 +1,175 @@ +{ + "db_name": "PostgreSQL", + "query": "\n SELECT\n source_type as \"source_type: SearchSource\",\n source_id,\n external_id,\n title,\n description,\n content,\n organization,\n project,\n repo_name,\n status,\n author_id,\n author_name,\n assigned_to_id,\n assigned_to_name,\n priority,\n item_type,\n is_draft,\n created_at,\n updated_at,\n closed_at,\n url,\n parent_id,\n linked_work_items\n FROM search_documents\n WHERE source_type = $1 AND source_id = $2\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "source_type: SearchSource", + "type_info": { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + } + }, + { + "ordinal": 1, + "name": "source_id", + "type_info": "Text" + }, + { + "ordinal": 2, + "name": "external_id", + "type_info": "Int4" + }, + { + "ordinal": 3, + "name": "title", + "type_info": "Text" + }, + { + "ordinal": 4, + "name": "description", + "type_info": "Text" + }, + { + "ordinal": 5, + "name": "content", + "type_info": "Text" + }, + { + "ordinal": 6, + "name": "organization", + "type_info": "Text" + }, + { + "ordinal": 7, + "name": "project", + "type_info": "Text" + }, + { + "ordinal": 8, + "name": "repo_name", + "type_info": "Text" + }, + { + "ordinal": 9, + "name": "status", + "type_info": "Text" + }, + { + "ordinal": 10, + "name": "author_id", + "type_info": "Text" + }, + { + "ordinal": 11, + "name": "author_name", + "type_info": "Text" + }, + { + "ordinal": 12, + "name": "assigned_to_id", + "type_info": "Text" + }, + { + "ordinal": 13, + "name": "assigned_to_name", + "type_info": "Text" + }, + { + "ordinal": 14, + "name": "priority", + 
"type_info": "Int4" + }, + { + "ordinal": 15, + "name": "item_type", + "type_info": "Text" + }, + { + "ordinal": 16, + "name": "is_draft", + "type_info": "Bool" + }, + { + "ordinal": 17, + "name": "created_at", + "type_info": "Timestamptz" + }, + { + "ordinal": 18, + "name": "updated_at", + "type_info": "Timestamptz" + }, + { + "ordinal": 19, + "name": "closed_at", + "type_info": "Timestamptz" + }, + { + "ordinal": 20, + "name": "url", + "type_info": "Text" + }, + { + "ordinal": 21, + "name": "parent_id", + "type_info": "Int4" + }, + { + "ordinal": 22, + "name": "linked_work_items", + "type_info": "Int4Array" + } + ], + "parameters": { + "Left": [ + { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + }, + "Text" + ] + }, + "nullable": [ + false, + false, + false, + false, + true, + true, + false, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + false, + true, + true + ] + }, + "hash": "a72a103d262ec8aa1335754cb3c971907e894858a3f4e2b33b3ee66099bdcdc6" +} diff --git a/.sqlx/query-c3e92392cdeee40bef2116569c39c17fc948460a49060203e63836940ae97829.json b/.sqlx/query-c3e92392cdeee40bef2116569c39c17fc948460a49060203e63836940ae97829.json new file mode 100644 index 00000000..bf33f4bd --- /dev/null +++ b/.sqlx/query-c3e92392cdeee40bef2116569c39c17fc948460a49060203e63836940ae97829.json @@ -0,0 +1,32 @@ +{ + "db_name": "PostgreSQL", + "query": "SELECT COUNT(*) as \"count!\" FROM search_documents WHERE source_type = $1", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "count!", + "type_info": "Int8" + } + ], + "parameters": { + "Left": [ + { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + } + ] + }, + "nullable": [ + null + ] + }, + "hash": "c3e92392cdeee40bef2116569c39c17fc948460a49060203e63836940ae97829" +} diff --git a/Cargo.lock b/Cargo.lock index 748096dc..7f374391 100644 --- a/Cargo.lock +++ 
b/Cargo.lock @@ -157,6 +157,28 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "aws-lc-rs" +version = "1.15.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b7b6141e96a8c160799cc2d5adecd5cbbe5054cb8c7c4af53da0f83bb7ad256" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.37.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c34dda4df7017c8db52132f0f8a2e0f8161649d15723ed63fc00c82d0f2081a" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] + [[package]] name = "axum" version = "0.7.9" @@ -436,15 +458,29 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6354c81bbfd62d9cfa9cb3c773c2b7b2a3a482d569de977fd0e961f6e7c00583" dependencies = [ "find-msvc-tools", + "jobserver", + "libc", "shlex", ] +[[package]] +name = "cesu8" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" + [[package]] name = "cfg-if" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "chrono" version = "0.4.43" @@ -469,6 +505,15 @@ dependencies = [ "inout", ] +[[package]] +name = "cmake" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +dependencies = [ + "cc", +] + [[package]] name = "coarsetime" version = "0.1.37" @@ -480,6 +525,16 @@ dependencies = [ 
"wasm-bindgen", ] +[[package]] +name = "combine" +version = "4.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" +dependencies = [ + "bytes", + "memchr", +] + [[package]] name = "compression-codecs" version = "0.4.36" @@ -513,7 +568,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68578f196d2a33ff61b27fae256c3164f65e36382648e30666dde05b8cc9dfdf" dependencies = [ "async-trait", - "convert_case", + "convert_case 0.6.0", "json5", "nom", "pathdiff", @@ -566,6 +621,15 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "convert_case" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "633458d4ef8c78b72454de2d54fd6ab2e60f9e02be22f3c6104cdc8a4e0fceb9" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "cookie" version = "0.17.0" @@ -615,6 +679,16 @@ dependencies = [ "libc", ] +[[package]] +name = "core-foundation" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -723,7 +797,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" dependencies = [ "generic-array", - "rand_core", + "rand_core 0.6.4", "subtle", "zeroize", ] @@ -735,7 +809,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" dependencies = [ "generic-array", - "rand_core", + "rand_core 0.6.4", "typenum", ] @@ -762,7 +836,7 @@ checksum = "79fc3b6dd0b87ba36e565715bf9a2ced221311db47bd18011676f24a6066edbc" dependencies = [ "curl-sys", "libc", - "openssl-probe", + "openssl-probe 0.1.6", 
"openssl-sys", "schannel", "socket2 0.6.2", @@ -880,6 +954,49 @@ dependencies = [ "serde_core", ] +[[package]] +name = "derive_more" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05" +dependencies = [ + "derive_more-impl 1.0.0", +] + +[[package]] +name = "derive_more" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134" +dependencies = [ + "derive_more-impl 2.1.1", +] + +[[package]] +name = "derive_more-impl" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "derive_more-impl" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb" +dependencies = [ + "convert_case 0.10.0", + "proc-macro2", + "quote", + "rustc_version", + "syn 2.0.114", + "unicode-xid", +] + [[package]] name = "digest" version = "0.10.7" @@ -918,6 +1035,12 @@ version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "dyn-clone" version = "1.0.20" @@ -990,7 +1113,7 @@ dependencies = [ "hkdf", "pem-rfc7468 0.7.0", "pkcs8 0.10.2", - "rand_core", + "rand_core 0.6.4", "sec1", "subtle", "zeroize", @@ -1059,6 +1182,17 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "eventsource-stream" +version = "0.2.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "74fef4569247a5f429d9156b9d0a2599914385dd189c539334c625d8099d90ab" +dependencies = [ + "futures-core", + "nom", + "pin-project-lite", +] + [[package]] name = "fastrand" version = "1.9.0" @@ -1080,7 +1214,7 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0b50bfb653653f9ca9095b427bed08ab8d75a137839d9ad64eb11810d5b6393" dependencies = [ - "rand_core", + "rand_core 0.6.4", "subtle", ] @@ -1147,6 +1281,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "futures" version = "0.3.31" @@ -1275,6 +1415,28 @@ dependencies = [ "slab", ] +[[package]] +name = "genai" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98221799e645192a607502de51a6423ebb6f36729938a9351f1ca488a1a62d7e" +dependencies = [ + "base64 0.22.1", + "bytes", + "derive_more 2.1.1", + "eventsource-stream", + "futures", + "mime_guess", + "reqwest 0.13.1", + "serde", + "serde_json", + "serde_with", + "tokio", + "tokio-stream", + "tracing", + "value-ext", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -1330,7 +1492,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" dependencies = [ "ff", - "rand_core", + "rand_core 0.6.4", "subtle", ] @@ -1950,6 +2112,38 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +[[package]] +name = "jni" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" +dependencies = [ + "cesu8", 
+ "cfg-if", + "combine", + "jni-sys", + "log", + "thiserror 1.0.69", + "walkdir", + "windows-sys 0.45.0", +] + +[[package]] +name = "jni-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + [[package]] name = "js-sys" version = "0.3.85" @@ -1988,7 +2182,7 @@ dependencies = [ "k256", "p256", "p384", - "rand", + "rand 0.8.5", "rsa 0.7.2", "serde", "serde_json", @@ -2103,6 +2297,12 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + [[package]] name = "matchers" version = "0.2.0" @@ -2240,10 +2440,10 @@ dependencies = [ "libc", "log", "openssl", - "openssl-probe", + "openssl-probe 0.1.6", "openssl-sys", "schannel", - "security-framework", + "security-framework 2.11.1", "security-framework-sys", "tempfile", ] @@ -2278,7 +2478,7 @@ dependencies = [ "num-integer", "num-iter", "num-traits", - "rand", + "rand 0.8.5", "smallvec", "zeroize", ] @@ -2338,7 +2538,7 @@ dependencies = [ "chrono", "getrandom 0.2.17", "http 0.2.12", - "rand", + "rand 0.8.5", "reqwest 0.11.27", "serde", "serde_json", @@ -2392,6 +2592,12 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" +[[package]] +name = "openssl-probe" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" + [[package]] name = "openssl-sys" version = "0.9.111" @@ -2561,6 +2767,15 @@ dependencies = [ "sha2", ] +[[package]] +name = "pgvector" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc58e2d255979a31caa7cabfa7aac654af0354220719ab7a68520ae7a91e8c0b" +dependencies = [ + "sqlx", +] + [[package]] name = "pin-project" version = "1.1.10" @@ -2734,6 +2949,62 @@ dependencies = [ "psl-types", ] +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls 0.23.36", + "socket2 0.6.2", + "thiserror 2.0.18", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" +dependencies = [ + "aws-lc-rs", + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand 0.9.2", + "ring", + "rustc-hash", + "rustls 0.23.36", + "rustls-pki-types", + "slab", + "thiserror 2.0.18", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2 0.6.2", + "tracing", + "windows-sys 0.60.2", +] + [[package]] name = "quote" version = "1.0.44" @@ -2756,8 +3027,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + 
+[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", ] [[package]] @@ -2767,7 +3048,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", ] [[package]] @@ -2779,6 +3070,15 @@ dependencies = [ "getrandom 0.2.17", ] +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -2936,6 +3236,49 @@ dependencies = [ "web-sys", ] +[[package]] +name = "reqwest" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e9018c9d814e5f30cc16a0f03271aeab3571e609612d9fe78c1aa8d11c2f62" +dependencies = [ + "base64 0.22.1", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2 0.4.13", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "hyper 1.8.1", + "hyper-rustls 0.27.7", + "hyper-util", + "js-sys", + "log", + "mime", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls 0.23.36", + "rustls-pki-types", + "rustls-platform-verifier", + "serde", + "serde_json", + "sync_wrapper 1.0.2", + "tokio", + "tokio-rustls 0.26.4", + "tokio-util", + "tower", + "tower-http 0.6.8", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + 
"web-sys", +] + [[package]] name = "rfc6979" version = "0.4.0" @@ -3005,7 +3348,7 @@ dependencies = [ "num-traits", "pkcs1 0.4.1", "pkcs8 0.9.0", - "rand_core", + "rand_core 0.6.4", "signature 1.6.4", "smallvec", "subtle", @@ -3025,7 +3368,7 @@ dependencies = [ "num-traits", "pkcs1 0.7.5", "pkcs8 0.10.2", - "rand_core", + "rand_core 0.6.4", "signature 2.2.0", "spki 0.7.3", "subtle", @@ -3042,6 +3385,12 @@ dependencies = [ "ordered-multimap", ] +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + [[package]] name = "rustc_version" version = "0.4.1" @@ -3082,6 +3431,7 @@ version = "0.23.36" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" dependencies = [ + "aws-lc-rs", "once_cell", "ring", "rustls-pki-types", @@ -3090,6 +3440,18 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rustls-native-certs" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +dependencies = [ + "openssl-probe 0.2.1", + "rustls-pki-types", + "schannel", + "security-framework 3.5.1", +] + [[package]] name = "rustls-pemfile" version = "1.0.4" @@ -3105,9 +3467,37 @@ version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" dependencies = [ + "web-time", "zeroize", ] +[[package]] +name = "rustls-platform-verifier" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d99feebc72bae7ab76ba994bb5e121b8d83d910ca40b36e0921f53becc41784" +dependencies = [ + "core-foundation 0.10.1", + "core-foundation-sys", + "jni", + "log", + "once_cell", + "rustls 0.23.36", + "rustls-native-certs", + 
"rustls-platform-verifier-android", + "rustls-webpki 0.103.9", + "security-framework 3.5.1", + "security-framework-sys", + "webpki-root-certs", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls-platform-verifier-android" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f" + [[package]] name = "rustls-webpki" version = "0.101.7" @@ -3124,6 +3514,7 @@ version = "0.103.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" dependencies = [ + "aws-lc-rs", "ring", "rustls-pki-types", "untrusted", @@ -3141,6 +3532,15 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a50f4cf475b65d88e057964e0e9bb1f0aa9bbb2036dc65c64596b42932536984" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "schannel" version = "0.1.28" @@ -3222,7 +3622,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ "bitflags 2.10.0", - "core-foundation", + "core-foundation 0.9.4", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework" +version = "3.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" +dependencies = [ + "bitflags 2.10.0", + "core-foundation 0.10.1", "core-foundation-sys", "libc", "security-framework-sys", @@ -3404,7 +3817,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74233d3b3b2f6d4b006dc19dee745e73e2a6bfb6f93607cd3b02bd5b00797d7c" 
dependencies = [ "digest", - "rand_core", + "rand_core 0.6.4", ] [[package]] @@ -3414,7 +3827,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" dependencies = [ "digest", - "rand_core", + "rand_core 0.6.4", ] [[package]] @@ -3615,7 +4028,7 @@ dependencies = [ "memchr", "once_cell", "percent-encoding", - "rand", + "rand 0.8.5", "rsa 0.9.10", "serde", "sha1", @@ -3654,7 +4067,7 @@ dependencies = [ "md-5", "memchr", "once_cell", - "rand", + "rand 0.8.5", "serde", "serde_json", "sha2", @@ -3810,7 +4223,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" dependencies = [ "bitflags 1.3.2", - "core-foundation", + "core-foundation 0.9.4", "system-configuration-sys 0.5.0", ] @@ -3821,7 +4234,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" dependencies = [ "bitflags 2.10.0", - "core-foundation", + "core-foundation 0.9.4", "system-configuration-sys 0.6.0", ] @@ -3915,9 +4328,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.46" +version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9da98b7d9b7dad93488a84b8248efc35352b0b2657397d4167e7ad67e5d535e5" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", "itoa", @@ -3939,9 +4352,9 @@ checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" [[package]] name = "time-macros" -version = "0.2.26" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78cc610bac2dcee56805c99642447d4c5dbde4d01f752ffea0199aee1f601dc4" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" dependencies = [ "num-conv", "time-core", @@ -3998,9 +4411,12 @@ dependencies = [ 
"dotenvy", "futures", "futures-util", + "genai", "itertools", "milltime", "oauth2", + "pgvector", + "regex", "reqwest 0.12.28", "serde", "serde_json", @@ -4271,7 +4687,7 @@ dependencies = [ "futures", "http 1.4.0", "parking_lot", - "rand", + "rand 0.8.5", "serde", "serde_json", "thiserror 1.0.69", @@ -4409,7 +4825,7 @@ dependencies = [ "http 1.4.0", "httparse", "log", - "rand", + "rand 0.8.5", "sha1", "thiserror 1.0.69", "utf-8", @@ -4458,7 +4874,7 @@ dependencies = [ "futures", "getrandom 0.2.17", "pin-project", - "rand", + "rand 0.8.5", "reqwest 0.12.28", "serde", "serde_json", @@ -4483,7 +4899,7 @@ dependencies = [ "futures", "getrandom 0.2.17", "pin-project", - "rand", + "rand 0.8.5", "reqwest 0.12.28", "serde", "serde_json", @@ -4624,6 +5040,17 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "value-ext" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f2d566183ea18900e7ad5b91ec41c661db4e4140d56ee5405df0cafbefab72" +dependencies = [ + "derive_more 1.0.0", + "serde", + "serde_json", +] + [[package]] name = "vcpkg" version = "0.2.15" @@ -4642,6 +5069,16 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "317211a0dc0ceedd78fb2ca9a44aed3d7b9b26f81870d485c07122b4350673b7" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -4785,6 +5222,25 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + 
"js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-root-certs" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "webpki-roots" version = "0.25.4" @@ -4819,6 +5275,15 @@ dependencies = [ "wasite", ] +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "windows-core" version = "0.62.2" @@ -4889,6 +5354,15 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets 0.42.2", +] + [[package]] name = "windows-sys" version = "0.48.0" @@ -4934,6 +5408,21 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + [[package]] name = "windows-targets" version = "0.48.5" @@ -4982,6 +5471,12 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" @@ -5000,6 +5495,12 @@ version = 
"0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" @@ -5018,6 +5519,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" +[[package]] +name = "windows_i686_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + [[package]] name = "windows_i686_gnu" version = "0.48.5" @@ -5048,6 +5555,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" +[[package]] +name = "windows_i686_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + [[package]] name = "windows_i686_msvc" version = "0.48.5" @@ -5066,6 +5579,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" @@ -5084,6 +5603,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" +[[package]] +name = 
"windows_x86_64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" @@ -5102,6 +5627,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" diff --git a/app/src/components/cmd-k.tsx b/app/src/components/cmd-k.tsx index 147e562a..59a7c261 100644 --- a/app/src/components/cmd-k.tsx +++ b/app/src/components/cmd-k.tsx @@ -15,7 +15,9 @@ import { DrumIcon, FolderGit2, GitPullRequestIcon, + Loader2, TimerIcon, + BriefcaseIcon, } from "lucide-react"; import { ListPullRequest } from "@/lib/api/queries/pullRequests"; import { Tooltip, TooltipContent, TooltipTrigger } from "./ui/tooltip"; @@ -35,9 +37,13 @@ import { rememberLastProjectAtom, buildRememberedTimerParams, } from "@/lib/milltime-preferences"; +import { SearchResult } from "@/lib/api/queries/search"; +import { useDebounce } from "@/hooks/useDebounce"; export function CmdK() { const [open, setOpen] = React.useState(false); + const [searchInput, setSearchInput] = React.useState(""); + const debouncedSearchInput = useDebounce(searchInput, 300); const close = () => setOpen(false); @@ -53,13 +59,28 @@ export function CmdK() { return () => document.removeEventListener("keydown", down); }, []); + React.useEffect(() => { + if (!open) { + setSearchInput(""); + } + }, [open]); + return ( - + No results found. 
+ @@ -218,7 +239,7 @@ function ActionsCommandGroup(props: { close: () => void }) { function PRCommandGroup(props: { close: () => void }) { const navigate = useNavigate(); - const { data: pullRequests } = useQuery(queries.listPullRequests()); + const { data: pullRequests } = useQuery(queries.pullRequests.listPullRequests()); return ( @@ -262,3 +283,101 @@ function PRCommandGroup(props: { close: () => void }) { function pullRequestValue(pr: ListPullRequest) { return `!${pr.id} ${pr.title} ${pr.repoName} ${pr.createdBy.displayName} ${pr.workItems.map((wi) => `#${wi.id}`).join(" ")}`; } + +function SearchCommandGroup(props: { + close: () => void; + searchQuery: string; + isDebouncing: boolean; +}) { + const navigate = useNavigate(); + + const { data: searchResults, isLoading, isFetching, isError } = useQuery( + queries.search.search(props.searchQuery, 10) + ); + + if (!props.searchQuery && !props.isDebouncing) { + return null; + } + + if (isLoading || props.isDebouncing) { + return ( + + +
+ + Searching... +
+
+
+ ); + } + + if (isError) { + return ( + + + Search failed. Try again. + + + ); + } + + if (!searchResults || searchResults.length === 0) { + return null; + } + + return ( + + Search results + {isFetching && ( + + )} + + } + > + {searchResults.map((result) => ( + { + if (result.sourceType === "Pr") { + navigate({ + to: "/prs/$prId", + params: { prId: result.externalId.toString() }, + }); + } else { + window.open(result.url, "_blank", "noopener,noreferrer"); + } + props.close(); + }} + > +
+
+ {result.sourceType === "Pr" ? ( + + ) : ( + + )} + + {result.sourceType === "Pr" ? "!" : "#"} + {result.externalId} + + {result.title} +
+ {result.authorName && ( + + {result.authorName} + + )} +
+
+ ))} +
+ ); +} + +function searchResultValue(result: SearchResult) { + return `${result.sourceType === "Pr" ? "!" : "#"}${result.externalId} ${result.title} ${result.authorName ?? ""}`; +} diff --git a/app/src/hooks/useDebounce.ts b/app/src/hooks/useDebounce.ts new file mode 100644 index 00000000..22e8fea7 --- /dev/null +++ b/app/src/hooks/useDebounce.ts @@ -0,0 +1,17 @@ +import { useEffect, useState } from "react"; + +export function useDebounce(value: T, delay: number): T { + const [debouncedValue, setDebouncedValue] = useState(value); + + useEffect(() => { + const timer = setTimeout(() => { + setDebouncedValue(value); + }, delay); + + return () => { + clearTimeout(timer); + }; + }, [value, delay]); + + return debouncedValue; +} diff --git a/app/src/lib/api/mutations/differs.ts b/app/src/lib/api/mutations/differs.ts index ae3c71dd..5e4ac522 100644 --- a/app/src/lib/api/mutations/differs.ts +++ b/app/src/lib/api/mutations/differs.ts @@ -18,10 +18,10 @@ function useStartDiffers(options?: DefaultMutationOptions) { }), ...options, onMutate: (repoKey) => { - queryClient.cancelQueries(queries.differs()); - const previous = queryClient.getQueryData(queries.differs().queryKey); + queryClient.cancelQueries(queries.differs.differs()); + const previous = queryClient.getQueryData(queries.differs.differs().queryKey); - queryClient.setQueryData(queries.differs().queryKey, (old) => { + queryClient.setQueryData(queries.differs.differs().queryKey, (old) => { if (!old) return undefined; const differ = old.find( @@ -45,11 +45,11 @@ function useStartDiffers(options?: DefaultMutationOptions) { return { previous }; }, onError: (err, vars, ctx) => { - queryClient.setQueryData(queries.differs().queryKey, ctx?.previous); + queryClient.setQueryData(queries.differs.differs().queryKey, ctx?.previous); options?.onError?.(err, vars, ctx); }, onSuccess: (data, vars, ctx) => { - queryClient.invalidateQueries(queries.differs()); + queryClient.invalidateQueries(queries.differs.differs()); 
queryClient.invalidateQueries({ queryKey: pullRequestsQueries.baseKey, }); @@ -69,10 +69,10 @@ function useStopDiffers(options?: DefaultMutationOptions) { }), ...options, onMutate: (repoKey) => { - queryClient.cancelQueries(queries.differs()); - const previous = queryClient.getQueryData(queries.differs().queryKey); + queryClient.cancelQueries(queries.differs.differs()); + const previous = queryClient.getQueryData(queries.differs.differs().queryKey); - queryClient.setQueryData(queries.differs().queryKey, (old) => { + queryClient.setQueryData(queries.differs.differs().queryKey, (old) => { if (!old) return undefined; const differ = old.find( @@ -96,11 +96,11 @@ function useStopDiffers(options?: DefaultMutationOptions) { return { previous }; }, onError: (err, vars, ctx) => { - queryClient.setQueryData(queries.differs().queryKey, ctx?.previous); + queryClient.setQueryData(queries.differs.differs().queryKey, ctx?.previous); options?.onError?.(err, vars, ctx); }, onSuccess: (data, vars, ctx) => { - queryClient.invalidateQueries(queries.differs()); + queryClient.invalidateQueries(queries.differs.differs()); options?.onSuccess?.(data, vars, ctx); }, }); diff --git a/app/src/lib/api/mutations/repositories.ts b/app/src/lib/api/mutations/repositories.ts index 89740eba..aa89eb90 100644 --- a/app/src/lib/api/mutations/repositories.ts +++ b/app/src/lib/api/mutations/repositories.ts @@ -22,7 +22,7 @@ function useAddRepository(options?: DefaultMutationOptions) { }), ...options, onSuccess: (data, vars, ctx) => { - queryClient.invalidateQueries(queries.differs()); + queryClient.invalidateQueries(queries.differs.differs()); options?.onSuccess?.(data, vars, ctx); }, }); @@ -43,7 +43,7 @@ function useFollowRepository( }), ...options, onMutate: (vars) => { - queryClient.setQueryData(queries.differs().queryKey, (old) => { + queryClient.setQueryData(queries.differs.differs().queryKey, (old) => { if (!old) return old; const idx = old.findIndex( @@ -62,7 +62,7 @@ function 
useFollowRepository( options?.onMutate?.(vars); }, onSettled: (data, err, vars, ctx) => { - queryClient.invalidateQueries(queries.differs()); + queryClient.invalidateQueries(queries.differs.differs()); queryClient.invalidateQueries({ queryKey: pullRequestsQueries.baseKey, }); @@ -84,7 +84,7 @@ function useDeleteRepository( }), ...options, onSuccess: (data, vars, ctx) => { - queryClient.invalidateQueries(queries.differs()); + queryClient.invalidateQueries(queries.differs.differs()); queryClient.invalidateQueries({ queryKey: pullRequestsQueries.baseKey, }); diff --git a/app/src/lib/api/queries/queries.ts b/app/src/lib/api/queries/queries.ts index 515971e9..dc3a815d 100644 --- a/app/src/lib/api/queries/queries.ts +++ b/app/src/lib/api/queries/queries.ts @@ -3,13 +3,15 @@ import { pullRequestsQueries } from "./pullRequests"; import { commitsQueries } from "./commits"; import { milltimeQueries } from "./milltime"; import { userQueries } from "./user"; +import { searchQueries } from "./search"; export const queries = { - ...userQueries, - ...differsQueries, - ...pullRequestsQueries, - ...commitsQueries, - ...milltimeQueries, + user: userQueries, + differs: differsQueries, + pullRequests: pullRequestsQueries, + commits: commitsQueries, + milltime: milltimeQueries, + search: searchQueries, }; export type RepoKey = T & { diff --git a/app/src/lib/api/queries/search.ts b/app/src/lib/api/queries/search.ts new file mode 100644 index 00000000..5912c357 --- /dev/null +++ b/app/src/lib/api/queries/search.ts @@ -0,0 +1,39 @@ +import { queryOptions } from "@tanstack/react-query"; +import { api } from "../api"; + +export const searchQueries = { + baseKey: ["search"], + search: (query: string, limit?: number) => { + const trimmedQuery = query.trim(); + return queryOptions({ + queryKey: [...searchQueries.baseKey, trimmedQuery, limit], + queryFn: async () => { + const params = new URLSearchParams({ q: trimmedQuery }); + if (limit) { + params.set("limit", limit.toString()); + } + return 
api.get(`search?${params}`).json>(); + }, + enabled: trimmedQuery.length >= 2, + }); + }, +}; + +export type SearchSource = "Pr" | "WorkItem"; + +export type SearchResult = { + id: number; + sourceType: SearchSource; + sourceId: string; + externalId: number; + title: string; + description: string | null; + status: string; + priority: number | null; + itemType: string | null; + authorName: string | null; + url: string; + createdAt: string; + updatedAt: string; + score: number; +}; diff --git a/app/src/routes/_layout/prs/$prId/route.tsx b/app/src/routes/_layout/prs/$prId/route.tsx index 93dea1cf..7c6f0ed7 100644 --- a/app/src/routes/_layout/prs/$prId/route.tsx +++ b/app/src/routes/_layout/prs/$prId/route.tsx @@ -41,7 +41,7 @@ import { milltimeMutations } from "@/lib/api/mutations/milltime"; export const Route = createFileRoute("/_layout/prs/$prId")({ loader: ({ context }) => - context.queryClient.ensureQueryData(queries.listPullRequests()), + context.queryClient.ensureQueryData(queries.pullRequests.listPullRequests()), component: PRDetailsDialog, }); @@ -51,7 +51,7 @@ function PRDetailsDialog() { const navigate = useNavigate({ from: Route.fullPath }); const { data: pr } = useSuspenseQuery({ - ...queries.listPullRequests(), + ...queries.pullRequests.listPullRequests(), select: (data) => data.find((pr) => pr.id === +prId), }); diff --git a/app/src/routes/_layout/prs/route.tsx b/app/src/routes/_layout/prs/route.tsx index 955b3646..d9598968 100644 --- a/app/src/routes/_layout/prs/route.tsx +++ b/app/src/routes/_layout/prs/route.tsx @@ -28,8 +28,8 @@ export const Route = createFileRoute("/_layout/prs")({ shouldReload: false, loader: async ({ context }) => { await Promise.all([ - context.queryClient.ensureQueryData(queries.me()), - context.queryClient.ensureQueryData(queries.listPullRequests()), + context.queryClient.ensureQueryData(queries.user.me()), + context.queryClient.ensureQueryData(queries.pullRequests.listPullRequests()), ]); }, component: PrsComponent, @@ -40,9 
+40,9 @@ function PrsComponent() { const { searchString, filterAuthor, filterReviewer, filterBlocking } = Route.useSearch(); - const { data: user } = useSuspenseQuery(queries.me()); + const { data: user } = useSuspenseQuery(queries.user.me()); const { data: pullRequests } = useSuspenseQuery({ - ...queries.listPullRequests(), + ...queries.pullRequests.listPullRequests(), refetchInterval: 60 * 1000, }); diff --git a/app/src/routes/_layout/repositories/route.tsx b/app/src/routes/_layout/repositories/route.tsx index 646cbdb5..bae0f497 100644 --- a/app/src/routes/_layout/repositories/route.tsx +++ b/app/src/routes/_layout/repositories/route.tsx @@ -22,8 +22,8 @@ const repositoriesSearchSchema = z.object({ export const Route = createFileRoute("/_layout/repositories")({ validateSearch: repositoriesSearchSchema, loader: ({ context }) => { - context.queryClient.ensureQueryData(queries.me()); - context.queryClient.ensureQueryData(queries.differs()); + context.queryClient.ensureQueryData(queries.user.me()); + context.queryClient.ensureQueryData(queries.differs.differs()); }, component: RepositoriesComponent, }); @@ -32,11 +32,11 @@ function RepositoriesComponent() { const { searchString } = Route.useSearch(); const { data: isAdmin } = useSuspenseQuery({ - ...queries.me(), + ...queries.user.me(), select: (data) => data.roles.includes("Admin"), }); const { data, dataUpdatedAt } = useSuspenseQuery({ - ...queries.differs(), + ...queries.differs.differs(), refetchInterval: 15 * 1000, }); diff --git a/az-devops/Cargo.toml b/az-devops/Cargo.toml index ca9b5441..18978103 100644 --- a/az-devops/Cargo.toml +++ b/az-devops/Cargo.toml @@ -16,7 +16,7 @@ azure_devops_rust_api = { version = "0.28.0", features = [ serde = { version = "1.0.193", features = ["derive"] } serde_json = "1.0.108" thiserror = "2.0.12" -time = { version = "0.3.31", features = ["serde", "parsing"] } +time = { version = "0.3.47", features = ["serde", "parsing"] } tokio = { version = "1.35.1", features = ["full"] } 
tracing = { version = "0.1.40", features = ["attributes"] } typespec = "0.4.0" diff --git a/docs/implementation-plan.md b/docs/implementation-plan.md new file mode 100644 index 00000000..2598dbfe --- /dev/null +++ b/docs/implementation-plan.md @@ -0,0 +1,228 @@ +# Super Search Implementation Plan + +## Architecture Principles + +1. **Trait-based abstractions** for all external dependencies +2. **Dependency injection** via generics with trait bounds +3. **Pure business logic** separated from I/O +4. **Easy mocking** for unit tests + +## Trait Definitions + +### `Embedder` - Text embedding abstraction + +```rust +#[async_trait] +pub trait Embedder: Send + Sync { + async fn embed(&self, text: &str) -> Result<Vec<f32>>; + async fn embed_batch(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>>; +} +``` + +Implementations: +- `GeminiEmbedder` - Production (calls Gemini API) +- `MockEmbedder` - Tests (returns fixed vectors) + +### `SearchRepository` - Database abstraction + +```rust +#[async_trait] +pub trait SearchRepository: Send + Sync { + async fn search(&self, query: &ParsedQuery, embedding: &[f32], limit: i32) -> Result<Vec<SearchResult>>; + async fn upsert_document(&self, doc: &SearchDocument) -> Result<()>; + async fn upsert_documents(&self, docs: &[SearchDocument]) -> Result<usize>; + async fn delete_document(&self, source_type: SearchSource, source_id: &str) -> Result<bool>; + async fn get_document(&self, source_type: SearchSource, source_id: &str) -> Result<Option<SearchDocument>>; + async fn get_stale_documents(&self, older_than: OffsetDateTime) -> Result<Vec<SearchDocument>>; +} +``` + +Implementations: +- `PgSearchRepository` - Production (SQLx + pgvector) +- `MockSearchRepository` - Tests (in-memory HashMap) + +### `DocumentSource` - ADO data fetching abstraction + +```rust +#[async_trait] +pub trait DocumentSource: Send + Sync { + async fn fetch_pull_requests(&self, org: &str, project: &str) -> Result<Vec<PullRequest>>; + async fn fetch_work_items(&self, org: &str, project: &str, since: Option<OffsetDateTime>) -> Result<Vec<WorkItem>>; +} +``` + +Implementations: +- `AdoDocumentSource` - Production 
(uses existing ADO client) +- `MockDocumentSource` - Tests (returns test fixtures) + +## File Structure + +``` +toki-api/src/domain/search/ +├── mod.rs # Module exports +├── types.rs # SearchSource, SearchDocument, SearchResult, ParsedQuery, SearchFilters +├── traits.rs # Embedder, SearchRepository, DocumentSource traits +├── parser.rs # parse_query() function +├── embedder/ +│ ├── mod.rs +│ ├── gemini.rs # GeminiEmbedder +│ └── mock.rs # MockEmbedder (cfg test) +├── repository/ +│ ├── mod.rs +│ ├── postgres.rs # PgSearchRepository +│ └── mock.rs # MockSearchRepository (cfg test) +├── source/ +│ ├── mod.rs +│ ├── ado.rs # AdoDocumentSource +│ └── mock.rs # MockDocumentSource (cfg test) +├── service.rs # SearchService +└── indexer.rs # SearchIndexer +``` + +## Services with Generics + +### SearchService + +```rust +pub struct SearchService<E, R> +where + E: Embedder, + R: SearchRepository, +{ + embedder: E, + repository: R, +} + +impl<E, R> SearchService<E, R> +where + E: Embedder, + R: SearchRepository, +{ + pub fn new(embedder: E, repository: R) -> Self { + Self { embedder, repository } + } + + pub async fn search(&self, query: &str, limit: i32) -> Result<Vec<SearchResult>> { + let parsed = parse_query(query); + let embedding = self.embedder.embed(&parsed.search_text).await?; + self.repository.search(&parsed, &embedding, limit).await + } +} +``` + +### SearchIndexer + +```rust +pub struct SearchIndexer<E, R, S> +where + E: Embedder, + R: SearchRepository, + S: DocumentSource, +{ + embedder: E, + repository: R, + source: S, +} + +impl<E, R, S> SearchIndexer<E, R, S> +where + E: Embedder, + R: SearchRepository, + S: DocumentSource, +{ + pub async fn sync_project(&self, org: &str, project: &str) -> Result<SyncStats> { + // Fetch from source, embed, upsert to repository + } +} +``` + +## Implementation Order + +1. **Migration** - `migrations/YYYYMMDDHHMMSS_create_search_documents.sql` +2. **Types** - `types.rs` (all data structures) +3. **Traits** - `traits.rs` (all trait definitions) +4. 
**Parser** - `parser.rs` (query parsing, pure function, easy to test) +5. **Mock implementations** - For testing the service layer +6. **SearchService** - `service.rs` (with unit tests using mocks) +7. **Embedder** - `embedder/gemini.rs` (Gemini API client) +8. **Repository** - `repository/postgres.rs` (SQLx implementation) +9. **DocumentSource** - `source/ado.rs` (ADO client wrapper) +10. **Indexer** - `indexer.rs` (with unit tests using mocks) +11. **API Route** - `routes/search.rs` +12. **Integration tests** - Full stack with test database + +## Testing Strategy + +### Unit Tests (fast, no I/O) + +```rust +#[cfg(test)] +mod tests { + use super::*; + use crate::domain::search::embedder::MockEmbedder; + use crate::domain::search::repository::MockSearchRepository; + + #[tokio::test] + async fn search_returns_results_sorted_by_score() { + let embedder = MockEmbedder::returning(vec![0.1; 3072]); + let repo = MockSearchRepository::new() + .with_document(SearchDocument { title: "Auth PR".into(), .. }) + .with_document(SearchDocument { title: "Other PR".into(), .. 
}); + + let service = SearchService::new(embedder, repo); + let results = service.search("authentication", 10).await.unwrap(); + + assert_eq!(results.len(), 2); + assert!(results[0].score >= results[1].score); + } + + #[tokio::test] + async fn search_applies_filters() { + // Test that status/priority/project filters work + } + + #[tokio::test] + async fn parser_extracts_priority_filter() { + let parsed = parse_query("priority 1 bugs"); + assert_eq!(parsed.filters.priority, Some(vec![1])); + assert_eq!(parsed.filters.item_type, Some(vec!["Bug".to_string()])); + assert_eq!(parsed.search_text, ""); + } +} +``` + +### Integration Tests (with test DB) + +```rust +#[sqlx::test] +async fn search_integration(pool: PgPool) { + let embedder = GeminiEmbedder::new(test_api_key()); + let repo = PgSearchRepository::new(pool); + let service = SearchService::new(embedder, repo); + + // Insert test documents, run search, verify results +} +``` + +## Dependencies + +```toml +[dependencies] +pgvector = "0.4" +async-trait = "0.1" + +[dev-dependencies] +mockall = "0.13" # Optional: for derive-based mocking +``` + +## Config + +```yaml +search: + enabled: true + gemini_api_key: ${GEMINI_API_KEY} + embedding_model: "gemini-embedding-001" + embedding_dimensions: 3072 + sync_interval_minutes: 15 + batch_size: 10 +``` diff --git a/docs/super-search-design.md b/docs/super-search-design.md new file mode 100644 index 00000000..f351b891 --- /dev/null +++ b/docs/super-search-design.md @@ -0,0 +1,468 @@ +# Super Search Design + +Hybrid search over PRs and Work Items using Gemini embeddings + Postgres full-text search. 
+ +## Goals + +- **Semantic search**: "PRs about authentication" finds PRs even if they don't contain the word +- **Keyword search**: Exact matches, IDs, names +- **Metadata filtering**: Status, priority, author, date ranges, repo +- **Unified results**: PRs and work items in one search + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Search Flow │ +└─────────────────────────────────────────────────────────────────┘ + + User Query: "authentication issues in Lerum, priority 1" + │ + ▼ + ┌────────────────────────┐ + │ Query Parser │ + │ - Extract filters │ + │ - Extract search text │ + └───────────┬────────────┘ + │ + ┌─────────────────┼─────────────────┐ + ▼ ▼ ▼ + ┌──────────┐ ┌──────────────┐ ┌──────────┐ + │ BM25 │ │ Vector │ │ Metadata │ + │(tsvector)│ │ Search │ │ Filter │ + └────┬─────┘ └──────┬───────┘ └────┬─────┘ + │ │ │ + └─────────────────┬┴────────────────┘ + ▼ + ┌────────────────────────┐ + │ Reciprocal Rank │ + │ Fusion (RRF) │ + └───────────┬────────────┘ + ▼ + ┌────────────────────────┐ + │ Results + Snippets │ + └────────────────────────┘ +``` + +## Data Model + +### `search_documents` Table + +```sql +CREATE EXTENSION IF NOT EXISTS vector; + +CREATE TYPE search_source AS ENUM ('pr', 'work_item'); + +CREATE TABLE search_documents ( + id SERIAL PRIMARY KEY, + + -- Source identification + source_type search_source NOT NULL, + source_id TEXT NOT NULL, -- "org/project/repo/123" for PR, "org/project/123" for WI + external_id INT NOT NULL, -- PR number or work item ID + + -- Searchable content + title TEXT NOT NULL, + description TEXT, + content TEXT, -- Combined: description + comments + commit messages + + -- Metadata (filterable) + organization TEXT NOT NULL, + project TEXT NOT NULL, + repo_name TEXT, -- NULL for work items + status TEXT NOT NULL, -- 'active', 'completed', 'abandoned' / 'New', 'Active', 'Closed' + author_id TEXT, + author_name TEXT, + assigned_to_id TEXT, + assigned_to_name TEXT, + 
priority INT, -- 1-4 for work items, NULL for PRs + item_type TEXT, -- 'Bug', 'Task', 'User Story' for WI; 'pr' for PRs + is_draft BOOLEAN DEFAULT FALSE, + + -- Timestamps + created_at TIMESTAMPTZ NOT NULL, + updated_at TIMESTAMPTZ NOT NULL, + closed_at TIMESTAMPTZ, + indexed_at TIMESTAMPTZ DEFAULT NOW(), + + -- Search vectors + search_vector TSVECTOR GENERATED ALWAYS AS ( + setweight(to_tsvector('english', coalesce(title, '')), 'A') || + setweight(to_tsvector('english', coalesce(description, '')), 'B') || + setweight(to_tsvector('english', coalesce(content, '')), 'C') + ) STORED, + embedding vector(1536), -- Gemini gemini-embedding-001 outputs 1536 dims + + -- Links + url TEXT NOT NULL, + parent_id INT, -- For work items with parent + linked_work_items INT[], -- Work items linked to a PR + + UNIQUE(source_type, source_id) +); + +-- Indexes +CREATE INDEX idx_search_docs_search_vector ON search_documents USING GIN(search_vector); +CREATE INDEX idx_search_docs_embedding ON search_documents USING hnsw(embedding vector_cosine_ops); +CREATE INDEX idx_search_docs_org_project ON search_documents(organization, project); +CREATE INDEX idx_search_docs_status ON search_documents(status); +CREATE INDEX idx_search_docs_priority ON search_documents(priority) WHERE priority IS NOT NULL; +CREATE INDEX idx_search_docs_created ON search_documents(created_at DESC); +CREATE INDEX idx_search_docs_updated ON search_documents(updated_at DESC); +``` + +## Gemini Embedding Integration + +Using the `genai` Rust crate for clean provider abstraction: + +```rust +// toki-api/src/domain/search/embedder/gemini.rs +use genai::embed::EmbedOptions; + +pub const GEMINI_MODEL: &str = "gemini-embedding-001"; +pub const GEMINI_DIMENSIONS: usize = 1536; + +pub struct GeminiEmbedder { + client: genai::Client, + model: String, + options: EmbedOptions, +} + +impl GeminiEmbedder { + pub fn new() -> Result { + let client = genai::Client::default(); + let options = 
EmbedOptions::new().with_embedding_type("RETRIEVAL_QUERY"); + Ok(Self { client, model: GEMINI_MODEL.into(), options }) + } +} + +#[async_trait] +impl Embedder for GeminiEmbedder { + async fn embed(&self, text: &str) -> Result> { + let response = self.client.embed(&self.model, text, Some(&self.options)).await?; + Ok(response.first_embedding().unwrap().vector().iter().map(|&v| v as f32).collect()) + } + + async fn embed_batch(&self, texts: &[&str]) -> Result>> { + let response = self.client.embed_batch(&self.model, texts, Some(&self.options)).await?; + // ... map embeddings back to results + } +} +``` + +Benefits of `genai` crate: +- Unified interface across providers (easy to swap to OpenAI, Cohere, etc.) +- Built-in batching with `embed_batch` +- Reads `GEMINI_API_KEY` from environment automatically +- Handles retries and rate limiting + +## Search Query Parser + +Parse natural language into filters + search text: + +```rust +// "priority 1 bugs in Lerum closed last week" +// → filters: {priority: 1, item_type: "Bug", project: "Lerums Djursjukhus", status: "Closed", date_range: last_week} +// → search_text: "" + +// "authentication PRs" +// → filters: {source_type: "pr"} +// → search_text: "authentication" + +pub struct ParsedQuery { + pub search_text: String, + pub filters: SearchFilters, +} + +#[derive(Default)] +pub struct SearchFilters { + pub source_type: Option, + pub organization: Option, + pub project: Option, + pub repo_name: Option, + pub status: Option>, + pub priority: Option>, + pub item_type: Option>, + pub author: Option, + pub assigned_to: Option, + pub is_draft: Option, + pub created_after: Option, + pub created_before: Option, + pub updated_after: Option, +} +``` + +## Hybrid Search Implementation + +```rust +pub struct SearchService { + pool: PgPool, + embedder: GeminiEmbedder, +} + +impl SearchService { + pub async fn search(&self, query: &str, limit: i32) -> anyhow::Result> { + let parsed = parse_query(query); + + // Generate embedding for 
semantic search + let query_embedding = self.embedder.embed_query(&parsed.search_text).await?; + + // Hybrid search with RRF (Reciprocal Rank Fusion) + let results = sqlx::query_as!( + SearchResult, + r#" + WITH bm25_results AS ( + SELECT id, + ts_rank_cd(search_vector, websearch_to_tsquery('english', $1)) as score, + ROW_NUMBER() OVER (ORDER BY ts_rank_cd(search_vector, websearch_to_tsquery('english', $1)) DESC) as rank + FROM search_documents + WHERE ($1 = '' OR search_vector @@ websearch_to_tsquery('english', $1)) + AND ($2::text IS NULL OR organization = $2) + AND ($3::text IS NULL OR project = $3) + AND ($4::text[] IS NULL OR status = ANY($4)) + AND ($5::int[] IS NULL OR priority = ANY($5)) + AND ($6::search_source IS NULL OR source_type = $6) + LIMIT 100 + ), + vector_results AS ( + SELECT id, + 1 - (embedding <=> $7::vector) as score, + ROW_NUMBER() OVER (ORDER BY embedding <=> $7::vector) as rank + FROM search_documents + WHERE embedding IS NOT NULL + AND ($2::text IS NULL OR organization = $2) + AND ($3::text IS NULL OR project = $3) + AND ($4::text[] IS NULL OR status = ANY($4)) + AND ($5::int[] IS NULL OR priority = ANY($5)) + AND ($6::search_source IS NULL OR source_type = $6) + LIMIT 100 + ), + rrf_combined AS ( + SELECT + COALESCE(b.id, v.id) as id, + COALESCE(1.0 / (60 + b.rank), 0) + COALESCE(1.0 / (60 + v.rank), 0) as rrf_score + FROM bm25_results b + FULL OUTER JOIN vector_results v ON b.id = v.id + ) + SELECT + d.id, + d.source_type as "source_type: SearchSource", + d.source_id, + d.external_id, + d.title, + d.description, + d.status, + d.priority, + d.item_type, + d.author_name, + d.url, + d.created_at, + d.updated_at, + r.rrf_score as score + FROM rrf_combined r + JOIN search_documents d ON d.id = r.id + ORDER BY r.rrf_score DESC + LIMIT $8 + "#, + parsed.search_text, + parsed.filters.organization, + parsed.filters.project, + parsed.filters.status.as_deref(), + parsed.filters.priority.as_deref(), + parsed.filters.source_type as Option, + 
&query_embedding as &[f32], + limit + ) + .fetch_all(&self.pool) + .await?; + + Ok(results) + } +} +``` + +## Sync Service + +Background job to sync from Azure DevOps to search index: + +```rust +pub struct SearchIndexer { + pool: PgPool, + embedder: GeminiEmbedder, + ado_client: AzureDevOpsClient, +} + +impl SearchIndexer { + /// Full sync - run periodically or on demand + pub async fn sync_all(&self, org: &str, project: &str) -> anyhow::Result { + let mut stats = SyncStats::default(); + + // Sync PRs (active + recently closed) + let prs = self.ado_client.list_pull_requests(org, project, PrStatus::All).await?; + for pr in prs { + self.index_pull_request(&pr).await?; + stats.prs_indexed += 1; + } + + // Sync work items (query for recent changes) + let work_items = self.ado_client.query_work_items(org, project, + "SELECT [System.Id] FROM WorkItems WHERE [System.ChangedDate] >= @Today - 30" + ).await?; + for wi in work_items { + self.index_work_item(org, project, &wi).await?; + stats.work_items_indexed += 1; + } + + Ok(stats) + } + + async fn index_pull_request(&self, pr: &PullRequest) -> anyhow::Result<()> { + // Combine content for embedding + let content = format!( + "{}\n\n{}\n\nCommits:\n{}\n\nComments:\n{}", + pr.pull_request_base.title, + pr.pull_request_base.description.as_deref().unwrap_or(""), + pr.commits.iter().map(|c| &c.comment).collect::>().join("\n"), + pr.threads.iter() + .flat_map(|t| t.comments.iter()) + .filter(|c| !c.is_system_comment()) + .map(|c| c.content.as_deref().unwrap_or("")) + .collect::>() + .join("\n") + ); + + let embedding = self.embedder.embed(&content).await?; + + sqlx::query!( + r#" + INSERT INTO search_documents ( + source_type, source_id, external_id, title, description, content, + organization, project, repo_name, status, author_id, author_name, + is_draft, created_at, updated_at, url, linked_work_items, embedding + ) VALUES ( + 'pr', $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17 + ) + ON CONFLICT 
(source_type, source_id) DO UPDATE SET + title = EXCLUDED.title, + description = EXCLUDED.description, + content = EXCLUDED.content, + status = EXCLUDED.status, + is_draft = EXCLUDED.is_draft, + updated_at = EXCLUDED.updated_at, + linked_work_items = EXCLUDED.linked_work_items, + embedding = EXCLUDED.embedding, + indexed_at = NOW() + "#, + // ... parameters + ).execute(&self.pool).await?; + + Ok(()) + } +} +``` + +## API Endpoints + +```rust +// GET /api/search?q=authentication+PRs&limit=20 +// GET /api/search?q=priority+1+bugs&project=Lerums+Djursjukhus + +#[derive(Deserialize)] +pub struct SearchQuery { + q: String, + #[serde(default = "default_limit")] + limit: i32, + // Optional explicit filters (override parsed) + project: Option, + status: Option, + priority: Option, + source_type: Option, +} + +pub async fn search( + State(state): State, + Query(params): Query, +) -> Result, AppError> { + let results = state.search_service.search(¶ms.q, params.limit).await?; + Ok(Json(SearchResponse { results })) +} +``` + +## Frontend Component + +```tsx +// app/src/components/SuperSearch.tsx +function SuperSearch() { + const [query, setQuery] = useState('') + const { data, isLoading } = useQuery({ + queryKey: ['search', query], + queryFn: () => api.search(query), + enabled: query.length > 2, + }) + + return ( + + + + {data?.results.map((result) => ( + + + + ))} + + + ) +} +``` + +## Implementation Plan + +1. **Database setup** (1h) + - Add pgvector extension + - Create migration for `search_documents` table + +2. **Gemini embeddings** (2h) + - Add `GEMINI_API_KEY` to config + - Implement embedder client + +3. **Indexer service** (4h) + - PR indexing with full content + - Work item indexing + - Background sync job + +4. **Search service** (3h) + - Query parser + - Hybrid search with RRF + - Result formatting + +5. **API endpoint** (1h) + - Search route + - Response types + +6. 
**Frontend** (3h) + - Command palette component + - Result cards with metadata + - Filters UI + +## Config + +```yaml +# config/base.yaml +search: + enabled: true + gemini_api_key: ${GEMINI_API_KEY} + sync_interval_minutes: 15 + embedding_batch_size: 10 +``` + +## Future Enhancements + +- **Query suggestions** - Autocomplete based on recent searches +- **Saved searches** - Pin common queries +- **Search within PR** - Drill down into specific PR content +- **AI summaries** - Generate summary of search results using Gemini +- **Related items** - "Similar PRs" based on embedding distance diff --git a/milltime/Cargo.toml b/milltime/Cargo.toml index 1561bfa1..c740889f 100644 --- a/milltime/Cargo.toml +++ b/milltime/Cargo.toml @@ -10,7 +10,7 @@ serde = { version = "1.0.193", features = ["derive"] } serde_json = "1.0.108" tokio = { version = "1.35.1", features = ["full"] } dotenvy = "0.15.7" -time = { version = "0.3.31", features = ["serde", "parsing"] } +time = { version = "0.3.47", features = ["serde", "parsing"] } tracing = { version = "0.1.40", features = ["attributes"] } thiserror = "1.0.56" reqwest = { version = "0.11.23", features = ["json", "cookies", "multipart"] } diff --git a/toki-api/.sqlx/query-05dc742cf5195c7d8452fc64af73e638ffd210367b0b90dae1c6a2ad18b5bbab.json b/toki-api/.sqlx/query-05dc742cf5195c7d8452fc64af73e638ffd210367b0b90dae1c6a2ad18b5bbab.json new file mode 100644 index 00000000..1ad60466 --- /dev/null +++ b/toki-api/.sqlx/query-05dc742cf5195c7d8452fc64af73e638ffd210367b0b90dae1c6a2ad18b5bbab.json @@ -0,0 +1,20 @@ +{ + "db_name": "PostgreSQL", + "query": "SELECT COUNT(*) as \"count!\" FROM search_documents", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "count!", + "type_info": "Int8" + } + ], + "parameters": { + "Left": [] + }, + "nullable": [ + null + ] + }, + "hash": "05dc742cf5195c7d8452fc64af73e638ffd210367b0b90dae1c6a2ad18b5bbab" +} diff --git 
a/toki-api/.sqlx/query-0d2de90b227d0282c0e040cfaa266c3cc3fec0a12d9a1aa9530a320e1cc9899f.json b/toki-api/.sqlx/query-0d2de90b227d0282c0e040cfaa266c3cc3fec0a12d9a1aa9530a320e1cc9899f.json new file mode 100644 index 00000000..7b4a19d9 --- /dev/null +++ b/toki-api/.sqlx/query-0d2de90b227d0282c0e040cfaa266c3cc3fec0a12d9a1aa9530a320e1cc9899f.json @@ -0,0 +1,164 @@ +{ + "db_name": "PostgreSQL", + "query": "\n SELECT \n source_type as \"source_type: SearchSource\",\n source_id,\n external_id,\n title,\n description,\n content,\n organization,\n project,\n repo_name,\n status,\n author_id,\n author_name,\n assigned_to_id,\n assigned_to_name,\n priority,\n item_type,\n is_draft,\n created_at,\n updated_at,\n closed_at,\n url,\n parent_id,\n linked_work_items\n FROM search_documents\n WHERE indexed_at < $1\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "source_type: SearchSource", + "type_info": { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + } + }, + { + "ordinal": 1, + "name": "source_id", + "type_info": "Text" + }, + { + "ordinal": 2, + "name": "external_id", + "type_info": "Int4" + }, + { + "ordinal": 3, + "name": "title", + "type_info": "Text" + }, + { + "ordinal": 4, + "name": "description", + "type_info": "Text" + }, + { + "ordinal": 5, + "name": "content", + "type_info": "Text" + }, + { + "ordinal": 6, + "name": "organization", + "type_info": "Text" + }, + { + "ordinal": 7, + "name": "project", + "type_info": "Text" + }, + { + "ordinal": 8, + "name": "repo_name", + "type_info": "Text" + }, + { + "ordinal": 9, + "name": "status", + "type_info": "Text" + }, + { + "ordinal": 10, + "name": "author_id", + "type_info": "Text" + }, + { + "ordinal": 11, + "name": "author_name", + "type_info": "Text" + }, + { + "ordinal": 12, + "name": "assigned_to_id", + "type_info": "Text" + }, + { + "ordinal": 13, + "name": "assigned_to_name", + "type_info": "Text" + }, + { + "ordinal": 14, + "name": "priority", + 
"type_info": "Int4" + }, + { + "ordinal": 15, + "name": "item_type", + "type_info": "Text" + }, + { + "ordinal": 16, + "name": "is_draft", + "type_info": "Bool" + }, + { + "ordinal": 17, + "name": "created_at", + "type_info": "Timestamptz" + }, + { + "ordinal": 18, + "name": "updated_at", + "type_info": "Timestamptz" + }, + { + "ordinal": 19, + "name": "closed_at", + "type_info": "Timestamptz" + }, + { + "ordinal": 20, + "name": "url", + "type_info": "Text" + }, + { + "ordinal": 21, + "name": "parent_id", + "type_info": "Int4" + }, + { + "ordinal": 22, + "name": "linked_work_items", + "type_info": "Int4Array" + } + ], + "parameters": { + "Left": [ + "Timestamptz" + ] + }, + "nullable": [ + false, + false, + false, + false, + true, + true, + false, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + false, + true, + true + ] + }, + "hash": "0d2de90b227d0282c0e040cfaa266c3cc3fec0a12d9a1aa9530a320e1cc9899f" +} diff --git a/toki-api/.sqlx/query-66404001aaa432bb201713792ad1002071aedcf109f58e62082784071c368368.json b/toki-api/.sqlx/query-66404001aaa432bb201713792ad1002071aedcf109f58e62082784071c368368.json new file mode 100644 index 00000000..be2d65d3 --- /dev/null +++ b/toki-api/.sqlx/query-66404001aaa432bb201713792ad1002071aedcf109f58e62082784071c368368.json @@ -0,0 +1,135 @@ +{ + "db_name": "PostgreSQL", + "query": "\n WITH bm25_results AS (\n SELECT \n id,\n ts_rank_cd(search_vector, websearch_to_tsquery('english', $1)) as score,\n ROW_NUMBER() OVER (\n ORDER BY ts_rank_cd(search_vector, websearch_to_tsquery('english', $1)) DESC\n ) as rank\n FROM search_documents\n WHERE ($1 = '' OR search_vector @@ websearch_to_tsquery('english', $1))\n AND ($2::search_source IS NULL OR source_type = $2)\n AND ($3::text IS NULL OR organization = $3)\n AND ($4::text IS NULL OR project = $4)\n AND ($5::text[] IS NULL OR status = ANY($5))\n AND ($6::int[] IS NULL OR priority = ANY($6))\n AND ($7::text[] IS NULL OR item_type = 
ANY($7))\n AND ($8::bool IS NULL OR is_draft = $8)\n AND ($9::timestamptz IS NULL OR updated_at >= $9)\n LIMIT 100\n ),\n vector_results AS (\n SELECT \n id,\n 1 - (embedding <=> $10) as score,\n ROW_NUMBER() OVER (\n ORDER BY embedding <=> $10\n ) as rank\n FROM search_documents\n WHERE embedding IS NOT NULL\n AND ($2::search_source IS NULL OR source_type = $2)\n AND ($3::text IS NULL OR organization = $3)\n AND ($4::text IS NULL OR project = $4)\n AND ($5::text[] IS NULL OR status = ANY($5))\n AND ($6::int[] IS NULL OR priority = ANY($6))\n AND ($7::text[] IS NULL OR item_type = ANY($7))\n AND ($8::bool IS NULL OR is_draft = $8)\n AND ($9::timestamptz IS NULL OR updated_at >= $9)\n LIMIT 100\n ),\n rrf_combined AS (\n SELECT\n COALESCE(b.id, v.id) as id,\n (COALESCE(1.0 / (60 + b.rank), 0) + COALESCE(1.0 / (60 + v.rank), 0))::float8 as rrf_score\n FROM bm25_results b\n FULL OUTER JOIN vector_results v ON b.id = v.id\n )\n SELECT \n d.id,\n d.source_type as \"source_type: SearchSource\",\n d.source_id,\n d.external_id,\n d.title,\n d.description,\n d.status,\n d.priority,\n d.item_type,\n d.author_name,\n d.url,\n d.created_at,\n d.updated_at,\n r.rrf_score as score\n FROM rrf_combined r\n JOIN search_documents d ON d.id = r.id\n ORDER BY r.rrf_score DESC\n LIMIT $11\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "id", + "type_info": "Int4" + }, + { + "ordinal": 1, + "name": "source_type: SearchSource", + "type_info": { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + } + }, + { + "ordinal": 2, + "name": "source_id", + "type_info": "Text" + }, + { + "ordinal": 3, + "name": "external_id", + "type_info": "Int4" + }, + { + "ordinal": 4, + "name": "title", + "type_info": "Text" + }, + { + "ordinal": 5, + "name": "description", + "type_info": "Text" + }, + { + "ordinal": 6, + "name": "status", + "type_info": "Text" + }, + { + "ordinal": 7, + "name": "priority", + "type_info": "Int4" + }, + { + 
"ordinal": 8, + "name": "item_type", + "type_info": "Text" + }, + { + "ordinal": 9, + "name": "author_name", + "type_info": "Text" + }, + { + "ordinal": 10, + "name": "url", + "type_info": "Text" + }, + { + "ordinal": 11, + "name": "created_at", + "type_info": "Timestamptz" + }, + { + "ordinal": 12, + "name": "updated_at", + "type_info": "Timestamptz" + }, + { + "ordinal": 13, + "name": "score", + "type_info": "Float8" + } + ], + "parameters": { + "Left": [ + "Text", + { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + }, + "Text", + "Text", + "TextArray", + "Int4Array", + "TextArray", + "Bool", + "Timestamptz", + { + "Custom": { + "name": "vector", + "kind": "Simple" + } + }, + "Int8" + ] + }, + "nullable": [ + false, + false, + false, + false, + false, + true, + false, + true, + true, + true, + false, + false, + false, + null + ] + }, + "hash": "66404001aaa432bb201713792ad1002071aedcf109f58e62082784071c368368" +} diff --git a/toki-api/.sqlx/query-8557872e0bb4ec41241c5027536634e3cd48eaa3243ffcb1c67278d821365f49.json b/toki-api/.sqlx/query-8557872e0bb4ec41241c5027536634e3cd48eaa3243ffcb1c67278d821365f49.json new file mode 100644 index 00000000..7d112371 --- /dev/null +++ b/toki-api/.sqlx/query-8557872e0bb4ec41241c5027536634e3cd48eaa3243ffcb1c67278d821365f49.json @@ -0,0 +1,25 @@ +{ + "db_name": "PostgreSQL", + "query": "\n DELETE FROM search_documents\n WHERE source_type = $1 AND source_id = $2\n ", + "describe": { + "columns": [], + "parameters": { + "Left": [ + { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + }, + "Text" + ] + }, + "nullable": [] + }, + "hash": "8557872e0bb4ec41241c5027536634e3cd48eaa3243ffcb1c67278d821365f49" +} diff --git a/toki-api/.sqlx/query-91b62aaf67f424e6cc413380dd7668f0de12583faac70c14028b8fb34fb6a6a7.json b/toki-api/.sqlx/query-91b62aaf67f424e6cc413380dd7668f0de12583faac70c14028b8fb34fb6a6a7.json new file mode 100644 index 
00000000..40106741 --- /dev/null +++ b/toki-api/.sqlx/query-91b62aaf67f424e6cc413380dd7668f0de12583faac70c14028b8fb34fb6a6a7.json @@ -0,0 +1,52 @@ +{ + "db_name": "PostgreSQL", + "query": "\n INSERT INTO search_documents (\n source_type, source_id, external_id, title, description, content,\n organization, project, repo_name, status,\n author_id, author_name, assigned_to_id, assigned_to_name,\n priority, item_type, is_draft,\n created_at, updated_at, closed_at,\n url, parent_id, linked_work_items, embedding\n ) VALUES (\n $1, $2, $3, $4, $5, $6, $7, $8, $9, $10,\n $11, $12, $13, $14, $15, $16, $17, $18, $19, $20,\n $21, $22, $23, $24\n )\n ON CONFLICT (source_type, source_id) DO UPDATE SET\n title = EXCLUDED.title,\n description = EXCLUDED.description,\n content = EXCLUDED.content,\n status = EXCLUDED.status,\n author_id = EXCLUDED.author_id,\n author_name = EXCLUDED.author_name,\n assigned_to_id = EXCLUDED.assigned_to_id,\n assigned_to_name = EXCLUDED.assigned_to_name,\n priority = EXCLUDED.priority,\n item_type = EXCLUDED.item_type,\n is_draft = EXCLUDED.is_draft,\n updated_at = EXCLUDED.updated_at,\n closed_at = EXCLUDED.closed_at,\n linked_work_items = EXCLUDED.linked_work_items,\n embedding = EXCLUDED.embedding,\n indexed_at = NOW()\n ", + "describe": { + "columns": [], + "parameters": { + "Left": [ + { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + }, + "Text", + "Int4", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Int4", + "Text", + "Bool", + "Timestamptz", + "Timestamptz", + "Timestamptz", + "Text", + "Int4", + "Int4Array", + { + "Custom": { + "name": "vector", + "kind": "Simple" + } + } + ] + }, + "nullable": [] + }, + "hash": "91b62aaf67f424e6cc413380dd7668f0de12583faac70c14028b8fb34fb6a6a7" +} diff --git a/toki-api/.sqlx/query-c2ab51c6047a03b187539718cf8eb4a157cae880eb8a56c3bfb83623f1391e46.json 
b/toki-api/.sqlx/query-c2ab51c6047a03b187539718cf8eb4a157cae880eb8a56c3bfb83623f1391e46.json new file mode 100644 index 00000000..7da0c2ed --- /dev/null +++ b/toki-api/.sqlx/query-c2ab51c6047a03b187539718cf8eb4a157cae880eb8a56c3bfb83623f1391e46.json @@ -0,0 +1,175 @@ +{ + "db_name": "PostgreSQL", + "query": "\n SELECT \n source_type as \"source_type: SearchSource\",\n source_id,\n external_id,\n title,\n description,\n content,\n organization,\n project,\n repo_name,\n status,\n author_id,\n author_name,\n assigned_to_id,\n assigned_to_name,\n priority,\n item_type,\n is_draft,\n created_at,\n updated_at,\n closed_at,\n url,\n parent_id,\n linked_work_items\n FROM search_documents\n WHERE source_type = $1 AND source_id = $2\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "source_type: SearchSource", + "type_info": { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + } + }, + { + "ordinal": 1, + "name": "source_id", + "type_info": "Text" + }, + { + "ordinal": 2, + "name": "external_id", + "type_info": "Int4" + }, + { + "ordinal": 3, + "name": "title", + "type_info": "Text" + }, + { + "ordinal": 4, + "name": "description", + "type_info": "Text" + }, + { + "ordinal": 5, + "name": "content", + "type_info": "Text" + }, + { + "ordinal": 6, + "name": "organization", + "type_info": "Text" + }, + { + "ordinal": 7, + "name": "project", + "type_info": "Text" + }, + { + "ordinal": 8, + "name": "repo_name", + "type_info": "Text" + }, + { + "ordinal": 9, + "name": "status", + "type_info": "Text" + }, + { + "ordinal": 10, + "name": "author_id", + "type_info": "Text" + }, + { + "ordinal": 11, + "name": "author_name", + "type_info": "Text" + }, + { + "ordinal": 12, + "name": "assigned_to_id", + "type_info": "Text" + }, + { + "ordinal": 13, + "name": "assigned_to_name", + "type_info": "Text" + }, + { + "ordinal": 14, + "name": "priority", + "type_info": "Int4" + }, + { + "ordinal": 15, + "name": "item_type", + 
"type_info": "Text" + }, + { + "ordinal": 16, + "name": "is_draft", + "type_info": "Bool" + }, + { + "ordinal": 17, + "name": "created_at", + "type_info": "Timestamptz" + }, + { + "ordinal": 18, + "name": "updated_at", + "type_info": "Timestamptz" + }, + { + "ordinal": 19, + "name": "closed_at", + "type_info": "Timestamptz" + }, + { + "ordinal": 20, + "name": "url", + "type_info": "Text" + }, + { + "ordinal": 21, + "name": "parent_id", + "type_info": "Int4" + }, + { + "ordinal": 22, + "name": "linked_work_items", + "type_info": "Int4Array" + } + ], + "parameters": { + "Left": [ + { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + }, + "Text" + ] + }, + "nullable": [ + false, + false, + false, + false, + true, + true, + false, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + false, + true, + true + ] + }, + "hash": "c2ab51c6047a03b187539718cf8eb4a157cae880eb8a56c3bfb83623f1391e46" +} diff --git a/toki-api/.sqlx/query-c3e92392cdeee40bef2116569c39c17fc948460a49060203e63836940ae97829.json b/toki-api/.sqlx/query-c3e92392cdeee40bef2116569c39c17fc948460a49060203e63836940ae97829.json new file mode 100644 index 00000000..bf33f4bd --- /dev/null +++ b/toki-api/.sqlx/query-c3e92392cdeee40bef2116569c39c17fc948460a49060203e63836940ae97829.json @@ -0,0 +1,32 @@ +{ + "db_name": "PostgreSQL", + "query": "SELECT COUNT(*) as \"count!\" FROM search_documents WHERE source_type = $1", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "count!", + "type_info": "Int8" + } + ], + "parameters": { + "Left": [ + { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + } + ] + }, + "nullable": [ + null + ] + }, + "hash": "c3e92392cdeee40bef2116569c39c17fc948460a49060203e63836940ae97829" +} diff --git a/toki-api/.sqlx/query-f8b004aabe43de431ecf2fa90417f62625ac39629a29430dff51c1501f3e7eb2.json 
b/toki-api/.sqlx/query-f8b004aabe43de431ecf2fa90417f62625ac39629a29430dff51c1501f3e7eb2.json new file mode 100644 index 00000000..3fbac9f9 --- /dev/null +++ b/toki-api/.sqlx/query-f8b004aabe43de431ecf2fa90417f62625ac39629a29430dff51c1501f3e7eb2.json @@ -0,0 +1,52 @@ +{ + "db_name": "PostgreSQL", + "query": "\n INSERT INTO search_documents (\n source_type, source_id, external_id, title, description, content,\n organization, project, repo_name, status, \n author_id, author_name, assigned_to_id, assigned_to_name,\n priority, item_type, is_draft,\n created_at, updated_at, closed_at,\n url, parent_id, linked_work_items, embedding\n ) VALUES (\n $1, $2, $3, $4, $5, $6, $7, $8, $9, $10,\n $11, $12, $13, $14, $15, $16, $17, $18, $19, $20,\n $21, $22, $23, $24\n )\n ON CONFLICT (source_type, source_id) DO UPDATE SET\n title = EXCLUDED.title,\n description = EXCLUDED.description,\n content = EXCLUDED.content,\n status = EXCLUDED.status,\n author_id = EXCLUDED.author_id,\n author_name = EXCLUDED.author_name,\n assigned_to_id = EXCLUDED.assigned_to_id,\n assigned_to_name = EXCLUDED.assigned_to_name,\n priority = EXCLUDED.priority,\n item_type = EXCLUDED.item_type,\n is_draft = EXCLUDED.is_draft,\n updated_at = EXCLUDED.updated_at,\n closed_at = EXCLUDED.closed_at,\n linked_work_items = EXCLUDED.linked_work_items,\n embedding = EXCLUDED.embedding,\n indexed_at = NOW()\n ", + "describe": { + "columns": [], + "parameters": { + "Left": [ + { + "Custom": { + "name": "search_source", + "kind": { + "Enum": [ + "pr", + "work_item" + ] + } + } + }, + "Text", + "Int4", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Text", + "Int4", + "Text", + "Bool", + "Timestamptz", + "Timestamptz", + "Timestamptz", + "Text", + "Int4", + "Int4Array", + { + "Custom": { + "name": "vector", + "kind": "Simple" + } + } + ] + }, + "nullable": [] + }, + "hash": "f8b004aabe43de431ecf2fa90417f62625ac39629a29430dff51c1501f3e7eb2" +} diff --git 
a/toki-api/Cargo.toml b/toki-api/Cargo.toml index fc410e3b..e25a0c3e 100644 --- a/toki-api/Cargo.toml +++ b/toki-api/Cargo.toml @@ -14,7 +14,7 @@ serde = { version = "1.0.193", features = ["derive"] } serde_json = "1.0.108" serde_with = "3.4.0" tokio = { version = "1.35.1", features = ["full"] } -time = { version = "0.3.31", features = ["serde", "parsing"] } +time = { version = "0.3.47", features = ["serde", "parsing"] } chrono = { version = "0.4.38", features = ["serde"] } axum = { version = "0.7.3", features = ["ws", "macros"] } axum-extra = { version = "0.9.1", features = ["typed-header", "cookie"] } @@ -42,9 +42,11 @@ sqlx = { version = "0.8.0", features = [ ] } axum-login = "0.16.0" thiserror = "1.0" +regex = "1.10" oauth2 = "4.4.2" reqwest = { version = "0.12.5", features = ["json"] } async-trait = "0.1.77" +pgvector = { version = "0.4", features = ["sqlx"] } crossbeam = { version = "0.8.4", features = ["crossbeam-channel"] } web-push = "0.10.1" url = "2.5.0" @@ -53,3 +55,4 @@ base64 = "0.22.1" strum_macros = "0.26.4" tower-sessions-moka-store = "0.14" tower-sessions-sqlx-store = { version = "0.14.2", features = ["postgres"] } +genai = "0.5.3" diff --git a/toki-api/migrations/20260205220000_create_search_documents.sql b/toki-api/migrations/20260205220000_create_search_documents.sql new file mode 100644 index 00000000..c29cdcf1 --- /dev/null +++ b/toki-api/migrations/20260205220000_create_search_documents.sql @@ -0,0 +1,82 @@ +-- Super Search: Hybrid semantic + full-text search for PRs and Work Items +-- Requires pgvector extension for vector similarity search + +CREATE EXTENSION IF NOT EXISTS vector; + +-- Source type enum (idempotent: Postgres lacks CREATE TYPE IF NOT EXISTS for enums) +DO $$ BEGIN + CREATE TYPE search_source AS ENUM ('pr', 'work_item'); +EXCEPTION + WHEN duplicate_object THEN NULL; +END $$; + +-- Main search documents table +CREATE TABLE search_documents ( + id SERIAL PRIMARY KEY, + + -- Source identification + source_type search_source NOT 
NULL, + source_id TEXT NOT NULL, -- "org/project/repo/123" for PR, "org/project/123" for WI + external_id INT NOT NULL, -- PR number or work item ID + + -- Searchable content + title TEXT NOT NULL, + description TEXT, + content TEXT, -- Combined: description + comments + commit messages + + -- Metadata (filterable) + organization TEXT NOT NULL, + project TEXT NOT NULL, + repo_name TEXT, -- NULL for work items + status TEXT NOT NULL, -- 'active', 'completed', 'abandoned' / 'New', 'Active', 'Closed' + author_id TEXT, + author_name TEXT, + assigned_to_id TEXT, + assigned_to_name TEXT, + priority INT, -- 1-4 for work items, NULL for PRs + item_type TEXT, -- 'Bug', 'Task', 'User Story' for WI; NULL for PRs + is_draft BOOLEAN DEFAULT FALSE, + + -- Timestamps + created_at TIMESTAMPTZ NOT NULL, + updated_at TIMESTAMPTZ NOT NULL, + closed_at TIMESTAMPTZ, + indexed_at TIMESTAMPTZ DEFAULT NOW(), + + -- Full-text search vector (weighted: A=title, B=description, C=content) + search_vector TSVECTOR GENERATED ALWAYS AS ( + setweight(to_tsvector('english', coalesce(title, '')), 'A') || + setweight(to_tsvector('english', coalesce(description, '')), 'B') || + setweight(to_tsvector('english', coalesce(content, '')), 'C') + ) STORED, + + -- Embedding vector for semantic search (Gemini: 1536 dimensions, reduced via outputDimensionality) + embedding vector(1536), + + -- Links + url TEXT NOT NULL, + parent_id INT, -- For work items with parent + linked_work_items INT[], -- Work items linked to a PR + + -- Unique constraint on source + UNIQUE(source_type, source_id) +); + +-- Indexes for search performance + +-- Full-text search (GIN for tsvector) +CREATE INDEX idx_search_docs_search_vector ON search_documents USING GIN(search_vector); + +-- Vector similarity search (HNSW for approximate nearest neighbor) +CREATE INDEX idx_search_docs_embedding ON search_documents USING hnsw(embedding vector_cosine_ops); + +-- Metadata filtering (B-tree indexes) +CREATE INDEX idx_search_docs_org_project 
ON search_documents(organization, project); +CREATE INDEX idx_search_docs_source_type ON search_documents(source_type); +CREATE INDEX idx_search_docs_status ON search_documents(status); +CREATE INDEX idx_search_docs_priority ON search_documents(priority) WHERE priority IS NOT NULL; +CREATE INDEX idx_search_docs_item_type ON search_documents(item_type) WHERE item_type IS NOT NULL; +CREATE INDEX idx_search_docs_author ON search_documents(author_id) WHERE author_id IS NOT NULL; +CREATE INDEX idx_search_docs_created ON search_documents(created_at DESC); +CREATE INDEX idx_search_docs_updated ON search_documents(updated_at DESC); +CREATE INDEX idx_search_docs_indexed ON search_documents(indexed_at DESC); diff --git a/toki-api/scripts/init_db.sh b/toki-api/scripts/init_db.sh index 8cdbd65a..c6b8da6e 100755 --- a/toki-api/scripts/init_db.sh +++ b/toki-api/scripts/init_db.sh @@ -39,7 +39,7 @@ then -e POSTGRES_PASSWORD=${DB_PASSWORD} \ -e POSTGRES_DB=${DB_NAME} \ -p "${DB_PORT}":5432 \ - -d postgres \ + -d pgvector/pgvector:pg17 \ postgres -N 1000 # ^ Increased maximum number of connections for testing purposes fi diff --git a/toki-api/src/app_state.rs b/toki-api/src/app_state.rs index 5dd37711..67d9af97 100644 --- a/toki-api/src/app_state.rs +++ b/toki-api/src/app_state.rs @@ -16,6 +16,10 @@ use web_push::{IsahcWebPushClient, WebPushClient, WebPushMessage}; use crate::{ domain::{ + search::{ + embedder::GeminiEmbedder, repository::PgSearchRepository, + run_search_index_worker, IndexerConfig, SearchService, + }, CachedIdentities, NotificationHandler, PullRequest, RepoConfig, RepoDiffer, RepoDifferMessage, RepoKey, }, @@ -61,6 +65,8 @@ pub struct AppState { differ_txs: Arc>>>, web_push_client: IsahcWebPushClient, notification_handler: Arc, + search_service: Option>>, + embedder: Option, } impl AppState { @@ -101,6 +107,26 @@ impl AppState { web_push_client.clone(), )); + // Initialize embedder and search service if GEMINI_API_KEY is set + let embedder = match 
GeminiEmbedder::try_from_env() { + Some(Ok(embedder)) => { + tracing::info!("GEMINI_API_KEY found, initializing search service"); + Some(embedder) + } + Some(Err(e)) => { + tracing::error!("Failed to create Gemini embedder: {}", e); + None + } + None => { + tracing::warn!("GEMINI_API_KEY not set, search service disabled"); + None + } + }; + let search_service = embedder.as_ref().map(|embedder| { + let repository = PgSearchRepository::new(db_pool.clone()); + Arc::new(SearchService::with_defaults(embedder.clone(), repository)) + }); + let mut differs = HashMap::new(); let differ_txs = clients .iter() @@ -137,6 +163,8 @@ impl AppState { differs: Arc::new(RwLock::new(differs)), web_push_client, notification_handler, + search_service, + embedder, } } @@ -264,4 +292,32 @@ impl AppState { pub fn host_domain(&self) -> String { self.api_url.host_str().unwrap_or("localhost").to_string() } + + #[allow(dead_code)] + pub fn start_search_indexer(&self) { + let embedder = match &self.embedder { + Some(e) => e.clone(), + None => { + tracing::info!("Embedder not configured, skipping search indexer"); + return; + } + }; + + let db_pool = self.db_pool.clone(); + let repo_clients = self.repo_clients.clone(); + let interval = Duration::from_secs(3600); // 1 hour + let config = IndexerConfig::default(); + + tokio::spawn(run_search_index_worker( + db_pool, + repo_clients, + embedder, + interval, + config, + )); + } + + pub fn search_service(&self) -> Option<&SearchService> { + self.search_service.as_ref().map(|s| s.as_ref()) + } } diff --git a/toki-api/src/domain/mod.rs b/toki-api/src/domain/mod.rs index d838a48e..d8351ce8 100644 --- a/toki-api/src/domain/mod.rs +++ b/toki-api/src/domain/mod.rs @@ -10,6 +10,7 @@ mod repo_config; mod repo_differ; mod repo_key; mod repository; +pub mod search; mod user; pub use email::*; diff --git a/toki-api/src/domain/search/embedder/gemini.rs b/toki-api/src/domain/search/embedder/gemini.rs new file mode 100644 index 00000000..1bad954a --- /dev/null +++ 
b/toki-api/src/domain/search/embedder/gemini.rs @@ -0,0 +1,146 @@ +//! Gemini embedder implementation using the genai crate. + +use async_trait::async_trait; +use genai::embed::EmbedOptions; + +use crate::domain::search::traits::{Embedder, Result, SearchError}; + +/// Gemini embedding model configuration. +pub const GEMINI_MODEL: &str = "gemini-embedding-001"; +pub const GEMINI_DIMENSIONS: usize = 1536; + +/// Embedder implementation using Google's Gemini API via the `genai` crate. +/// +/// The genai client automatically reads `GEMINI_API_KEY` from the environment. +/// +/// # Example +/// +/// ```ignore +/// let embedder = GeminiEmbedder::new()?; +/// let embedding = embedder.embed("authentication system").await?; +/// assert_eq!(embedding.len(), 1536); +/// ``` +#[derive(Clone)] +pub struct GeminiEmbedder { + client: genai::Client, + model: String, + options: EmbedOptions, +} + +impl GeminiEmbedder { + /// Create a new Gemini embedder with the default model. + /// + /// Returns an error if the genai client cannot be created. + pub fn new() -> Result { + Self::with_model(GEMINI_MODEL) + } + + /// Create a new Gemini embedder with a specific model. + pub fn with_model(model: impl Into) -> Result { + let client = genai::Client::default(); + let options = EmbedOptions::new().with_embedding_type("RETRIEVAL_QUERY"); + + Ok(Self { + client, + model: model.into(), + options, + }) + } + + /// Try to create from environment variable. + /// + /// Returns `None` if `GEMINI_API_KEY` is not set, or `Some(Err)` if + /// the client can't be created for another reason. 
+ pub fn try_from_env() -> Option> { + if std::env::var("GEMINI_API_KEY").is_err() { + return None; + } + Some(Self::new()) + } +} + +#[async_trait] +impl Embedder for GeminiEmbedder { + async fn embed(&self, text: &str) -> Result> { + if text.is_empty() { + return Ok(vec![0.0; GEMINI_DIMENSIONS]); + } + + let response = self + .client + .embed(&self.model, text, Some(&self.options)) + .await + .map_err(|e| SearchError::EmbeddingError(e.to_string()))?; + + let embedding = response + .first_embedding() + .ok_or_else(|| SearchError::EmbeddingError("No embedding in response".into()))?; + + Ok(embedding.vector().to_vec()) + } + + async fn embed_batch(&self, texts: &[&str]) -> Result>> { + if texts.is_empty() { + return Ok(vec![]); + } + + // Filter empty strings and track their indices + let mut results = vec![vec![0.0f32; GEMINI_DIMENSIONS]; texts.len()]; + let non_empty: Vec<(usize, String)> = texts + .iter() + .enumerate() + .filter(|(_, t)| !t.is_empty()) + .map(|(i, t)| (i, t.to_string())) + .collect(); + + if non_empty.is_empty() { + return Ok(results); + } + + let batch_texts: Vec = non_empty.iter().map(|(_, t)| t.clone()).collect(); + + let response = self + .client + .embed_batch(&self.model, batch_texts, Some(&self.options)) + .await + .map_err(|e| SearchError::EmbeddingError(e.to_string()))?; + + for (batch_idx, (original_idx, _)) in non_empty.iter().enumerate() { + if let Some(embedding) = response.embeddings.get(batch_idx) { + results[*original_idx] = embedding.vector().to_vec(); + } + } + + Ok(results) + } + + fn dimensions(&self) -> usize { + GEMINI_DIMENSIONS + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn embedder_dimensions() { + if std::env::var("GEMINI_API_KEY").is_err() { + // Can't test without API key + return; + } + let embedder = GeminiEmbedder::new().unwrap(); + assert_eq!(embedder.dimensions(), 1536); + } + + #[tokio::test] + async fn embed_empty_returns_zeros() { + if std::env::var("GEMINI_API_KEY").is_err() { + return; 
+ } + let embedder = GeminiEmbedder::new().unwrap(); + let result = embedder.embed("").await.unwrap(); + assert_eq!(result.len(), GEMINI_DIMENSIONS); + assert!(result.iter().all(|&x| x == 0.0)); + } +} diff --git a/toki-api/src/domain/search/embedder/mock.rs b/toki-api/src/domain/search/embedder/mock.rs new file mode 100644 index 00000000..d1f14a68 --- /dev/null +++ b/toki-api/src/domain/search/embedder/mock.rs @@ -0,0 +1,151 @@ +//! Mock embedder implementation for testing. + +use async_trait::async_trait; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +use crate::domain::search::traits::{Embedder, Result}; + +/// Mock embedder that returns configurable vectors. +/// +/// # Examples +/// +/// ``` +/// use toki_api::domain::search::embedder::MockEmbedder; +/// +/// // Return a fixed vector +/// let embedder = MockEmbedder::returning(vec![0.1; 1536]); +/// +/// // Return different vectors for each call +/// let embedder = MockEmbedder::with_sequence(vec![ +/// vec![0.1; 1536], +/// vec![0.2; 1536], +/// ]); +/// ``` +#[derive(Clone)] +pub struct MockEmbedder { + responses: Arc>>, + call_count: Arc, + dimensions: usize, +} + +impl MockEmbedder { + /// Create a mock that always returns the same vector. + pub fn returning(vector: Vec) -> Self { + let dims = vector.len(); + Self { + responses: Arc::new(vec![vector]), + call_count: Arc::new(AtomicUsize::new(0)), + dimensions: dims, + } + } + + /// Create a mock that returns vectors in sequence. + /// + /// Wraps around if more calls are made than vectors provided. + pub fn with_sequence(vectors: Vec>) -> Self { + let dims = vectors.first().map(|v| v.len()).unwrap_or(1536); + Self { + responses: Arc::new(vectors), + call_count: Arc::new(AtomicUsize::new(0)), + dimensions: dims, + } + } + + /// Create a mock with default 1536-dimensional zero vectors. + pub fn default_dims() -> Self { + Self::returning(vec![0.0; 1536]) + } + + /// Get the number of times `embed` or `embed_batch` was called. 
+ pub fn call_count(&self) -> usize { + self.call_count.load(Ordering::SeqCst) + } + + /// Reset the call counter. + pub fn reset(&self) { + self.call_count.store(0, Ordering::SeqCst); + } +} + +impl Default for MockEmbedder { + fn default() -> Self { + Self::default_dims() + } +} + +#[async_trait] +impl Embedder for MockEmbedder { + async fn embed(&self, _text: &str) -> Result> { + let idx = self.call_count.fetch_add(1, Ordering::SeqCst); + let response_idx = idx % self.responses.len(); + Ok(self.responses[response_idx].clone()) + } + + async fn embed_batch(&self, texts: &[&str]) -> Result>> { + let mut results = Vec::with_capacity(texts.len()); + for _ in texts { + let idx = self.call_count.fetch_add(1, Ordering::SeqCst); + let response_idx = idx % self.responses.len(); + results.push(self.responses[response_idx].clone()); + } + Ok(results) + } + + fn dimensions(&self) -> usize { + self.dimensions + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn mock_returns_fixed_vector() { + let embedder = MockEmbedder::returning(vec![1.0, 2.0, 3.0]); + + let result = embedder.embed("test").await.unwrap(); + assert_eq!(result, vec![1.0, 2.0, 3.0]); + + let result = embedder.embed("another").await.unwrap(); + assert_eq!(result, vec![1.0, 2.0, 3.0]); + } + + #[tokio::test] + async fn mock_returns_sequence() { + let embedder = MockEmbedder::with_sequence(vec![ + vec![1.0], + vec![2.0], + vec![3.0], + ]); + + assert_eq!(embedder.embed("a").await.unwrap(), vec![1.0]); + assert_eq!(embedder.embed("b").await.unwrap(), vec![2.0]); + assert_eq!(embedder.embed("c").await.unwrap(), vec![3.0]); + // Wraps around + assert_eq!(embedder.embed("d").await.unwrap(), vec![1.0]); + } + + #[tokio::test] + async fn mock_tracks_call_count() { + let embedder = MockEmbedder::default(); + + assert_eq!(embedder.call_count(), 0); + embedder.embed("a").await.unwrap(); + assert_eq!(embedder.call_count(), 1); + embedder.embed("b").await.unwrap(); + 
assert_eq!(embedder.call_count(), 2); + + embedder.reset(); + assert_eq!(embedder.call_count(), 0); + } + + #[tokio::test] + async fn mock_batch_increments_count_per_item() { + let embedder = MockEmbedder::default(); + + embedder.embed_batch(&["a", "b", "c"]).await.unwrap(); + assert_eq!(embedder.call_count(), 3); + } +} diff --git a/toki-api/src/domain/search/embedder/mod.rs b/toki-api/src/domain/search/embedder/mod.rs new file mode 100644 index 00000000..ae065b03 --- /dev/null +++ b/toki-api/src/domain/search/embedder/mod.rs @@ -0,0 +1,9 @@ +//! Embedding generation implementations. + +mod gemini; +#[cfg(test)] +mod mock; + +pub use gemini::GeminiEmbedder; +#[cfg(test)] +pub use mock::MockEmbedder; diff --git a/toki-api/src/domain/search/index_worker.rs b/toki-api/src/domain/search/index_worker.rs new file mode 100644 index 00000000..ee8d88a5 --- /dev/null +++ b/toki-api/src/domain/search/index_worker.rs @@ -0,0 +1,99 @@ +//! Background worker for periodic search index syncing. + +use std::{collections::HashMap, sync::Arc, time::Duration}; + +use az_devops::RepoClient; +use sqlx::PgPool; +use tokio::sync::RwLock; +use tracing::{error, info}; + +use super::{ + embedder::GeminiEmbedder, + repository::PgSearchRepository, + source::AdoDocumentSource, + IndexerConfig, SearchIndexer, +}; +use crate::domain::RepoKey; + +/// Runs a periodic search index sync across all configured repositories. +/// +/// Each cycle iterates repos sequentially to avoid Gemini API rate limits. +/// Errors for individual repos are logged and skipped (non-fatal). 
+pub async fn run_search_index_worker( + db_pool: Arc, + repo_clients: Arc>>, + embedder: GeminiEmbedder, + interval: Duration, + config: IndexerConfig, +) { + info!( + interval_secs = interval.as_secs(), + "Search indexer background task started" + ); + + let mut ticker = tokio::time::interval(interval); + + // Skip the first immediate tick to let the app fully start + ticker.tick().await; + + loop { + ticker.tick().await; + + info!("Starting search index sync cycle"); + + // Snapshot repo_clients under a brief read lock + let clients: Vec<(RepoKey, RepoClient)> = { + let guard = repo_clients.read().await; + guard + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect() + }; + + if clients.is_empty() { + info!("No repositories configured, skipping sync cycle"); + continue; + } + + let mut total_prs = 0usize; + let mut total_work_items = 0usize; + let mut total_errors = 0usize; + + for (key, client) in &clients { + let source = AdoDocumentSource::new(client.clone(), key.repo_name.clone()); + let repository = PgSearchRepository::new((*db_pool).clone()); + let indexer = + SearchIndexer::new(embedder.clone(), repository, source, config.clone()); + + match indexer + .sync_project(&key.organization, &key.project) + .await + { + Ok(stats) => { + info!( + repo = %key, + prs = stats.prs_indexed, + work_items = stats.work_items_indexed, + deleted = stats.documents_deleted, + "Repo sync completed" + ); + total_prs += stats.prs_indexed; + total_work_items += stats.work_items_indexed; + total_errors += stats.errors; + } + Err(e) => { + error!(repo = %key, error = %e, "Repo sync failed"); + total_errors += 1; + } + } + } + + info!( + repos = clients.len(), + prs = total_prs, + work_items = total_work_items, + errors = total_errors, + "Search index sync cycle completed" + ); + } +} diff --git a/toki-api/src/domain/search/indexer.rs b/toki-api/src/domain/search/indexer.rs new file mode 100644 index 00000000..aa6b599f --- /dev/null +++ b/toki-api/src/domain/search/indexer.rs 
@@ -0,0 +1,470 @@ +//! Search indexer for syncing data from Azure DevOps to the search index. + +use time::{Duration, OffsetDateTime}; +use tracing::{info, warn}; + +use super::traits::{DocumentSource, Embedder, Result, SearchRepository}; +use super::types::{ + PullRequestDocument, SearchDocument, SearchSource, SyncStats, WorkItemDocument, +}; + +/// Configuration for the search indexer. +#[derive(Debug, Clone)] +pub struct IndexerConfig { + /// Batch size for embedding generation + pub embedding_batch_size: usize, + /// How old documents can be before being considered stale (hours) + pub stale_threshold_hours: i64, + /// Whether to delete stale documents during sync + pub cleanup_stale: bool, +} + +impl Default for IndexerConfig { + fn default() -> Self { + Self { + embedding_batch_size: 10, + stale_threshold_hours: 48, + cleanup_stale: true, + } + } +} + +/// Indexer for syncing Azure DevOps data to the search index. +/// +/// # Type Parameters +/// +/// * `E` - Embedder implementation for generating document embeddings +/// * `R` - SearchRepository implementation for database operations +/// * `S` - DocumentSource implementation for fetching from ADO +/// +/// # Example +/// +/// ```ignore +/// let indexer = SearchIndexer::new(embedder, repository, source, IndexerConfig::default()); +/// let stats = indexer.sync_project("org", "project").await?; +/// println!("Indexed {} PRs and {} work items", stats.prs_indexed, stats.work_items_indexed); +/// ``` +pub struct SearchIndexer +where + E: Embedder, + R: SearchRepository, + S: DocumentSource, +{ + embedder: E, + repository: R, + source: S, + config: IndexerConfig, +} + +impl SearchIndexer +where + E: Embedder, + R: SearchRepository, + S: DocumentSource, +{ + /// Create a new search indexer. + pub fn new(embedder: E, repository: R, source: S, config: IndexerConfig) -> Self { + Self { + embedder, + repository, + source, + config, + } + } + + /// Create an indexer with default configuration. 
+ #[allow(dead_code)] + pub fn with_defaults(embedder: E, repository: R, source: S) -> Self { + Self::new(embedder, repository, source, IndexerConfig::default()) + } + + /// Sync all data from a project to the search index. + pub async fn sync_project(&self, org: &str, project: &str) -> Result { + let mut stats = SyncStats::default(); + let sync_start = OffsetDateTime::now_utc(); + + info!(org, project, "Starting search index sync"); + + // Sync PRs + match self.sync_pull_requests(org, project).await { + Ok(count) => { + stats.prs_indexed = count; + info!(org, project, count, "Synced pull requests"); + } + Err(e) => { + warn!(org, project, error = %e, "Failed to sync pull requests"); + stats.errors += 1; + } + } + + // Sync work items + match self.sync_work_items(org, project, None).await { + Ok(count) => { + stats.work_items_indexed = count; + info!(org, project, count, "Synced work items"); + } + Err(e) => { + warn!(org, project, error = %e, "Failed to sync work items"); + stats.errors += 1; + } + } + + // Cleanup stale documents + if self.config.cleanup_stale { + let stale_threshold = + sync_start - Duration::hours(self.config.stale_threshold_hours); + match self.cleanup_stale_documents(stale_threshold).await { + Ok(count) => { + stats.documents_deleted = count; + if count > 0 { + info!(count, "Cleaned up stale documents"); + } + } + Err(e) => { + warn!(error = %e, "Failed to cleanup stale documents"); + stats.errors += 1; + } + } + } + + info!( + org, + project, + prs = stats.prs_indexed, + work_items = stats.work_items_indexed, + deleted = stats.documents_deleted, + errors = stats.errors, + "Sync completed" + ); + + Ok(stats) + } + + /// Sync pull requests from ADO to the search index. 
+ async fn sync_pull_requests(&self, org: &str, project: &str) -> Result { + let prs = self.source.fetch_pull_requests(org, project).await?; + let mut indexed = 0; + + // Process in batches for embedding + for batch in prs.chunks(self.config.embedding_batch_size) { + let docs = self.prepare_pr_documents(org, batch).await?; + indexed += self.repository.upsert_documents(&docs).await?; + } + + Ok(indexed) + } + + /// Sync work items from ADO to the search index. + async fn sync_work_items( + &self, + org: &str, + project: &str, + since: Option, + ) -> Result { + let work_items = self.source.fetch_work_items(org, project, since).await?; + let mut indexed = 0; + + // Process in batches for embedding + for batch in work_items.chunks(self.config.embedding_batch_size) { + let docs = self.prepare_work_item_documents(org, batch).await?; + indexed += self.repository.upsert_documents(&docs).await?; + } + + Ok(indexed) + } + + /// Prepare PR documents with embeddings. + async fn prepare_pr_documents( + &self, + org: &str, + prs: &[PullRequestDocument], + ) -> Result> { + let mut documents = Vec::with_capacity(prs.len()); + + // Prepare content for batch embedding + let contents: Vec = prs + .iter() + .map(|pr| self.prepare_pr_content(pr)) + .collect(); + + let content_refs: Vec<&str> = contents.iter().map(|s| s.as_str()).collect(); + + // Generate embeddings in batch + let embeddings = self.embedder.embed_batch(&content_refs).await?; + + // Build documents + for (pr, embedding) in prs.iter().zip(embeddings) { + let source_id = format!("{}/{}/{}/{}", org, pr.project, pr.repo_name, pr.id); + + documents.push(SearchDocument { + source_type: SearchSource::Pr, + source_id, + external_id: pr.id, + title: pr.title.clone(), + description: pr.description.clone(), + content: Some(pr.additional_content.clone()), + organization: org.to_string(), + project: pr.project.clone(), + repo_name: Some(pr.repo_name.clone()), + status: pr.status.clone(), + author_id: pr.author_id.clone(), + 
author_name: pr.author_name.clone(), + assigned_to_id: None, + assigned_to_name: None, + priority: None, + item_type: None, + is_draft: pr.is_draft, + created_at: pr.created_at, + updated_at: pr.updated_at, + closed_at: pr.closed_at, + url: pr.url.clone(), + parent_id: None, + linked_work_items: pr.linked_work_items.clone(), + embedding: Some(embedding), + }); + } + + Ok(documents) + } + + /// Prepare work item documents with embeddings. + async fn prepare_work_item_documents( + &self, + org: &str, + work_items: &[WorkItemDocument], + ) -> Result> { + let mut documents = Vec::with_capacity(work_items.len()); + + // Prepare content for batch embedding + let contents: Vec = work_items + .iter() + .map(|wi| self.prepare_work_item_content(wi)) + .collect(); + + let content_refs: Vec<&str> = contents.iter().map(|s| s.as_str()).collect(); + + // Generate embeddings in batch + let embeddings = self.embedder.embed_batch(&content_refs).await?; + + // Build documents + for (wi, embedding) in work_items.iter().zip(embeddings) { + let source_id = format!("{}/{}/{}", org, wi.project, wi.id); + + documents.push(SearchDocument { + source_type: SearchSource::WorkItem, + source_id, + external_id: wi.id, + title: wi.title.clone(), + description: wi.description.clone(), + content: Some(wi.additional_content.clone()), + organization: org.to_string(), + project: wi.project.clone(), + repo_name: None, + status: wi.status.clone(), + author_id: wi.author_id.clone(), + author_name: wi.author_name.clone(), + assigned_to_id: wi.assigned_to_id.clone(), + assigned_to_name: wi.assigned_to_name.clone(), + priority: wi.priority, + item_type: Some(wi.item_type.clone()), + is_draft: false, + created_at: wi.created_at, + updated_at: wi.updated_at, + closed_at: wi.closed_at, + url: wi.url.clone(), + parent_id: wi.parent_id, + linked_work_items: vec![], + embedding: Some(embedding), + }); + } + + Ok(documents) + } + + /// Prepare content for embedding from a PR. 
    /// Prepare content for embedding from a PR.
    ///
    /// Joins title, optional description and any extra text (e.g. commit
    /// messages) with blank lines so the embedder sees distinct sections.
    fn prepare_pr_content(&self, pr: &PullRequestDocument) -> String {
        let mut parts = vec![pr.title.clone()];

        if let Some(ref desc) = pr.description {
            parts.push(desc.clone());
        }

        if !pr.additional_content.is_empty() {
            parts.push(pr.additional_content.clone());
        }

        parts.join("\n\n")
    }

    /// Prepare content for embedding from a work item.
    ///
    /// Same layout as [`Self::prepare_pr_content`]: title, optional
    /// description, then additional content (e.g. comments).
    fn prepare_work_item_content(&self, wi: &WorkItemDocument) -> String {
        let mut parts = vec![wi.title.clone()];

        if let Some(ref desc) = wi.description {
            parts.push(desc.clone());
        }

        if !wi.additional_content.is_empty() {
            parts.push(wi.additional_content.clone());
        }

        parts.join("\n\n")
    }

    /// Remove documents that haven't been updated since the threshold.
    /// Returns the number of documents deleted.
    async fn cleanup_stale_documents(&self, older_than: OffsetDateTime) -> Result<usize> {
        self.repository.delete_stale_documents(older_than).await
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::search::embedder::MockEmbedder;
    use crate::domain::search::repository::MockSearchRepository;
    use async_trait::async_trait;
    use std::sync::{Arc, RwLock};

    // Mock document source for testing; holds canned PRs/work items behind
    // RwLocks so the builder methods can fill them in after construction.
    struct MockDocumentSource {
        prs: Arc<RwLock<Vec<PullRequestDocument>>>,
        work_items: Arc<RwLock<Vec<WorkItemDocument>>>,
    }

    impl MockDocumentSource {
        fn new() -> Self {
            Self {
                prs: Arc::new(RwLock::new(vec![])),
                work_items: Arc::new(RwLock::new(vec![])),
            }
        }

        // Builder: replace the canned pull requests.
        fn with_prs(self, prs: Vec<PullRequestDocument>) -> Self {
            *self.prs.write().unwrap() = prs;
            self
        }

        // Builder: replace the canned work items.
        fn with_work_items(self, work_items: Vec<WorkItemDocument>) -> Self {
            *self.work_items.write().unwrap() = work_items;
            self
        }
    }

    #[async_trait]
    impl DocumentSource for MockDocumentSource {
        async fn fetch_pull_requests(
            &self,
            _org: &str,
            _project: &str,
        ) -> Result<Vec<PullRequestDocument>> {
            Ok(self.prs.read().unwrap().clone())
        }

        async fn fetch_work_items(
            &self,
            _org: &str,
            _project: &str,
            _since: Option<OffsetDateTime>,
        ) -> Result<Vec<WorkItemDocument>> {
            Ok(self.work_items.read().unwrap().clone())
        }
    }

    // Fixture: minimal PR document with a deterministic URL per id.
    fn make_pr(id: i32, title: &str) -> PullRequestDocument {
        PullRequestDocument {
            id,
            title: title.to_string(),
            description: Some("Description".to_string()),
            organization: "org".to_string(),
            project: "project".to_string(),
            repo_name: "repo".to_string(),
            status: "active".to_string(),
            author_id: Some("author".to_string()),
            author_name: Some("Author Name".to_string()),
            is_draft: false,
            created_at: OffsetDateTime::now_utc(),
            updated_at: OffsetDateTime::now_utc(),
            closed_at: None,
            url: format!("https://dev.azure.com/org/project/_git/repo/pullrequest/{}", id),
            additional_content: "Commit messages".to_string(),
            linked_work_items: vec![],
        }
    }

    // Fixture: minimal work item document of the given type.
    fn make_work_item(id: i32, title: &str, item_type: &str) -> WorkItemDocument {
        WorkItemDocument {
            id,
            title: title.to_string(),
            description: Some("Description".to_string()),
            organization: "org".to_string(),
            project: "project".to_string(),
            status: "Active".to_string(),
            author_id: Some("author".to_string()),
            author_name: Some("Author Name".to_string()),
            assigned_to_id: Some("assignee".to_string()),
            assigned_to_name: Some("Assignee Name".to_string()),
            priority: Some(2),
            item_type: item_type.to_string(),
            created_at: OffsetDateTime::now_utc(),
            updated_at: OffsetDateTime::now_utc(),
            closed_at: None,
            url: format!("https://dev.azure.com/org/project/_workitems/edit/{}", id),
            parent_id: None,
            additional_content: "Comments".to_string(),
        }
    }

    #[tokio::test]
    async fn sync_indexes_prs_and_work_items() {
        let embedder = MockEmbedder::default();
        let repository = MockSearchRepository::new();
        let source = MockDocumentSource::new()
            .with_prs(vec![make_pr(1, "Auth PR"), make_pr(2, "DB PR")])
            .with_work_items(vec![
                make_work_item(100, "Auth Bug", "Bug"),
                make_work_item(101, "DB Task", "Task"),
            ]);

        // Disable stale cleanup so the assertion only covers indexing.
        let config = IndexerConfig {
            cleanup_stale: false,
            ..Default::default()
        };

        let indexer = SearchIndexer::new(embedder, repository.clone(), source, config);
        let stats = indexer.sync_project("org", "project").await.unwrap();

        assert_eq!(stats.prs_indexed, 2);
        assert_eq!(stats.work_items_indexed, 2);
        assert_eq!(stats.errors, 0);
        assert_eq!(repository.len(), 4);
    }

    #[tokio::test]
    async fn sync_generates_embeddings() {
        let embedder = MockEmbedder::default();
        let repository = MockSearchRepository::new();
        let source = MockDocumentSource::new()
            .with_prs(vec![make_pr(1, "Test PR")]);

        let indexer = SearchIndexer::with_defaults(embedder.clone(), repository, source);
        indexer.sync_project("org", "project").await.unwrap();

        // Should have called embed_batch once with 1 text
        assert_eq!(embedder.call_count(), 1);
    }

    #[tokio::test]
    async fn sync_handles_empty_source() {
        let embedder = MockEmbedder::default();
        let repository = MockSearchRepository::new();
        let source = MockDocumentSource::new();

        let indexer = SearchIndexer::with_defaults(embedder, repository.clone(), source);
        let stats = indexer.sync_project("org", "project").await.unwrap();

        assert_eq!(stats.prs_indexed, 0);
        assert_eq!(stats.work_items_indexed, 0);
        assert_eq!(stats.total_indexed(), 0);
        assert!(repository.is_empty());
    }
}
diff --git a/toki-api/src/domain/search/mod.rs b/toki-api/src/domain/search/mod.rs
new file mode 100644
index 00000000..0b409f96
--- /dev/null
+++ b/toki-api/src/domain/search/mod.rs
@@ -0,0 +1,66 @@
//! Super Search - Hybrid semantic + full-text search over PRs and Work Items.
//!
//! This module provides a search system that combines:
//! - **BM25 full-text search** via PostgreSQL tsvector
//! - **Semantic vector search** via pgvector + Gemini embeddings
//! - **Reciprocal Rank Fusion (RRF)** for combining results
//!
//! # Architecture
//!
//! The search system is built around trait abstractions for testability:
//!
//! (Plain code spans below: these traits live in the private `traits`
//! module and are not re-exported, so intra-doc links would not resolve.)
//!
//! - `Embedder` - Text embedding generation (Gemini, mocks)
//! - `SearchRepository` - Database operations (PostgreSQL, mocks)
//!
//! - `DocumentSource` - Data fetching from Azure DevOps
//!
//! # Example
//!
//! ```ignore
//! use toki_api::domain::search::{SearchService, SearchConfig};
//! use toki_api::domain::search::embedder::GeminiEmbedder;
//! use toki_api::domain::search::repository::PgSearchRepository;
//!
//! let embedder = GeminiEmbedder::new(api_key);
//! let repository = PgSearchRepository::new(pool);
//! let service = SearchService::new(embedder, repository, SearchConfig::default());
//!
//! let results = service.search("authentication PRs", Some(20)).await?;
//! ```
//!
//! # Indexing
//!
//! Use [`SearchIndexer`] to sync data from Azure DevOps:
//!
//! ```ignore
//! use toki_api::domain::search::{SearchIndexer, IndexerConfig};
//!
//! let indexer = SearchIndexer::new(embedder, repository, ado_source, IndexerConfig::default());
//! let stats = indexer.sync_project("org", "project").await?;
//! ```
//!
//! # Query Syntax
//!
//! The search parser supports natural language queries with filter extraction:
//!
//! - `"authentication PRs"` → source_type: PR, search: "authentication"
//! - `"priority 1 bugs"` → priority: [1], item_type: ["Bug"]
//! - `"bugs in Lerum last week"` → project: "Lerums Djursjukhus", date filter
//!
//! See `parser::parse_query` (module-private, so not linkable here) for
//! full filter support.

mod index_worker;
mod indexer;
mod parser;
mod service;
mod traits;
mod types;

pub mod embedder;
pub mod repository;
pub mod source;

// Re-export main types
pub use index_worker::run_search_index_worker;
pub use indexer::{IndexerConfig, SearchIndexer};
pub use service::SearchService;
pub use types::SearchResult;
diff --git a/toki-api/src/domain/search/parser.rs b/toki-api/src/domain/search/parser.rs
new file mode 100644
index 00000000..8cbc7be9
--- /dev/null
+++ b/toki-api/src/domain/search/parser.rs
@@ -0,0 +1,416 @@
//! Query parser for extracting filters from natural language search queries.
//!
//!
Transforms queries like "priority 1 bugs in Lerum" into structured filters. + +use regex::Regex; +use std::sync::LazyLock; +use time::{Duration, OffsetDateTime}; + +use super::types::{ParsedQuery, SearchFilters, SearchSource}; + +/// Parse a natural language search query into structured filters and search text. +/// +/// # Examples +/// +/// ``` +/// use toki_api::domain::search::{parse_query, SearchSource}; +/// +/// let parsed = parse_query("priority 1 bugs"); +/// assert_eq!(parsed.filters.priority, Some(vec![1])); +/// assert_eq!(parsed.filters.item_type, Some(vec!["Bug".to_string()])); +/// +/// let parsed = parse_query("authentication PRs"); +/// assert_eq!(parsed.filters.source_type, Some(SearchSource::Pr)); +/// assert_eq!(parsed.search_text, "authentication"); +/// ``` +pub fn parse_query(query: &str) -> ParsedQuery { + let mut filters = SearchFilters::default(); + let mut remaining = query.to_string(); + + // Extract source type + remaining = extract_source_type(&remaining, &mut filters); + + // Extract priority + remaining = extract_priority(&remaining, &mut filters); + + // Extract item type + remaining = extract_item_type(&remaining, &mut filters); + + // Extract status + remaining = extract_status(&remaining, &mut filters); + + // Extract date ranges + remaining = extract_date_range(&remaining, &mut filters); + + // Extract draft filter + remaining = extract_draft(&remaining, &mut filters); + + // Extract project (known projects) + remaining = extract_project(&remaining, &mut filters); + + // Clean up remaining text + let search_text = cleanup_search_text(&remaining); + + ParsedQuery { + search_text, + filters, + } +} + +// Regex patterns compiled once +static PRIORITY_PATTERN: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)\bpriority\s*(\d+)\b").unwrap()); +static PRIORITY_SHORT_PATTERN: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)\bp([1-4])\b").unwrap()); +static PR_PATTERN: LazyLock = + LazyLock::new(|| 
Regex::new(r"(?i)\b(PRs?|pull\s*requests?)\b").unwrap()); +static WORK_ITEM_PATTERN: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)\b(work\s*items?|WIs?)\b").unwrap()); +static BUG_PATTERN: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)\b(bugs?)\b").unwrap()); +static TASK_PATTERN: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)\b(tasks?)\b").unwrap()); +static STORY_PATTERN: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)\b(user\s*stor(?:y|ies)|stor(?:y|ies))\b").unwrap()); +static STATUS_PATTERN: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)\b(active|completed|closed|resolved|abandoned|new|open)\b").unwrap() +}); +static DATE_RANGE_PATTERN: LazyLock = LazyLock::new(|| { + Regex::new(r"(?i)\b(last|past)\s+(week|month|year|(\d+)\s*(days?|weeks?|months?))\b").unwrap() +}); +static DRAFT_PATTERN: LazyLock = + LazyLock::new(|| Regex::new(r"(?i)\b(drafts?|draft\s+PRs?)\b").unwrap()); + +fn extract_source_type(query: &str, filters: &mut SearchFilters) -> String { + let mut result = query.to_string(); + + if PR_PATTERN.is_match(query) { + filters.source_type = Some(SearchSource::Pr); + result = PR_PATTERN.replace_all(&result, "").to_string(); + } else if WORK_ITEM_PATTERN.is_match(query) { + filters.source_type = Some(SearchSource::WorkItem); + result = WORK_ITEM_PATTERN.replace_all(&result, "").to_string(); + } + + result +} + +fn extract_priority(query: &str, filters: &mut SearchFilters) -> String { + let mut result = query.to_string(); + let mut priorities = Vec::new(); + + // Match "priority 1" or "priority 2" + for cap in PRIORITY_PATTERN.captures_iter(query) { + if let Ok(p) = cap[1].parse::() { + if (1..=4).contains(&p) { + priorities.push(p); + } + } + } + result = PRIORITY_PATTERN.replace_all(&result, "").to_string(); + + // Match "p1" or "p2" + for cap in PRIORITY_SHORT_PATTERN.captures_iter(query) { + if let Ok(p) = cap[1].parse::() { + priorities.push(p); + } + } + result = PRIORITY_SHORT_PATTERN.replace_all(&result, "").to_string(); + + if 
!priorities.is_empty() { + priorities.sort(); + priorities.dedup(); + filters.priority = Some(priorities); + } + + result +} + +fn extract_item_type(query: &str, filters: &mut SearchFilters) -> String { + let mut result = query.to_string(); + let mut types = Vec::new(); + + if BUG_PATTERN.is_match(query) { + types.push("Bug".to_string()); + result = BUG_PATTERN.replace_all(&result, "").to_string(); + } + + if TASK_PATTERN.is_match(query) { + types.push("Task".to_string()); + result = TASK_PATTERN.replace_all(&result, "").to_string(); + } + + if STORY_PATTERN.is_match(query) { + types.push("User Story".to_string()); + result = STORY_PATTERN.replace_all(&result, "").to_string(); + } + + if !types.is_empty() { + filters.item_type = Some(types); + // If we found work item types, assume work items unless explicitly PRs + if filters.source_type.is_none() { + filters.source_type = Some(SearchSource::WorkItem); + } + } + + result +} + +fn extract_status(query: &str, filters: &mut SearchFilters) -> String { + let mut result = query.to_string(); + let mut statuses = Vec::new(); + + for cap in STATUS_PATTERN.captures_iter(query) { + let status = normalize_status(&cap[1]); + if !statuses.contains(&status) { + statuses.push(status); + } + } + result = STATUS_PATTERN.replace_all(&result, "").to_string(); + + if !statuses.is_empty() { + filters.status = Some(statuses); + } + + result +} + +fn normalize_status(status: &str) -> String { + match status.to_lowercase().as_str() { + "active" | "open" => "active".to_string(), + "completed" | "closed" | "resolved" => "completed".to_string(), + "abandoned" => "abandoned".to_string(), + "new" => "new".to_string(), + other => other.to_string(), + } +} + +fn extract_date_range(query: &str, filters: &mut SearchFilters) -> String { + let mut result = query.to_string(); + + if let Some(cap) = DATE_RANGE_PATTERN.captures(query) { + let now = OffsetDateTime::now_utc(); + let duration = match cap.get(2).map(|m| 
m.as_str().to_lowercase()).as_deref() { + Some("week") => Some(Duration::weeks(1)), + Some("month") => Some(Duration::days(30)), + Some("year") => Some(Duration::days(365)), + _ => { + // Parse "N days/weeks/months" + if let (Some(num), Some(unit)) = (cap.get(3), cap.get(4)) { + if let Ok(n) = num.as_str().parse::() { + let unit_str = unit.as_str().to_lowercase(); + if unit_str.starts_with("day") { + Some(Duration::days(n)) + } else if unit_str.starts_with("week") { + Some(Duration::weeks(n)) + } else if unit_str.starts_with("month") { + Some(Duration::days(n * 30)) + } else { + None + } + } else { + None + } + } else { + None + } + } + }; + + if let Some(d) = duration { + filters.updated_after = Some(now - d); + result = DATE_RANGE_PATTERN.replace(&result, "").to_string(); + } + } + + result +} + +fn extract_draft(query: &str, filters: &mut SearchFilters) -> String { + let mut result = query.to_string(); + + if DRAFT_PATTERN.is_match(query) { + filters.is_draft = Some(true); + filters.source_type = Some(SearchSource::Pr); + result = DRAFT_PATTERN.replace_all(&result, "").to_string(); + } + + result +} + +static PROJECT_PATTERNS: LazyLock> = LazyLock::new(|| { + vec![ + (Regex::new(r"(?i)\b(in\s+)?lerum\b").unwrap(), "Lerums Djursjukhus"), + (Regex::new(r"(?i)\b(in\s+)?evidensia\b").unwrap(), "Evidensia"), + ] +}); + +fn extract_project(query: &str, filters: &mut SearchFilters) -> String { + let mut result = query.to_string(); + + for (re, full_name) in PROJECT_PATTERNS.iter() { + if re.is_match(&result) { + filters.project = Some(full_name.to_string()); + result = re.replace_all(&result, "").to_string(); + break; + } + } + + result +} + +fn cleanup_search_text(text: &str) -> String { + // Remove extra whitespace and common noise words + let noise_words = ["in", "the", "for", "with", "from", "about"]; + + let words: Vec<&str> = text + .split_whitespace() + .filter(|w| !noise_words.contains(&w.to_lowercase().as_str())) + .collect(); + + words.join(" ") +} + 

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_empty_query() {
        let parsed = parse_query("");
        assert_eq!(parsed.search_text, "");
        assert!(parsed.filters.source_type.is_none());
    }

    #[test]
    fn parse_simple_text() {
        let parsed = parse_query("authentication");
        assert_eq!(parsed.search_text, "authentication");
        assert!(parsed.filters.source_type.is_none());
    }

    #[test]
    fn parse_pr_filter() {
        let parsed = parse_query("authentication PRs");
        assert_eq!(parsed.search_text, "authentication");
        assert_eq!(parsed.filters.source_type, Some(SearchSource::Pr));

        let parsed = parse_query("pull requests about auth");
        assert_eq!(parsed.filters.source_type, Some(SearchSource::Pr));
    }

    #[test]
    fn parse_work_item_filter() {
        let parsed = parse_query("authentication work items");
        assert_eq!(parsed.filters.source_type, Some(SearchSource::WorkItem));
    }

    #[test]
    fn parse_priority_filter() {
        let parsed = parse_query("priority 1 bugs");
        assert_eq!(parsed.filters.priority, Some(vec![1]));

        let parsed = parse_query("p2 issues");
        assert_eq!(parsed.filters.priority, Some(vec![2]));

        // Multiple priorities are collected, sorted, and deduplicated.
        let parsed = parse_query("priority 1 and priority 2");
        assert_eq!(parsed.filters.priority, Some(vec![1, 2]));
    }

    #[test]
    fn parse_item_type_filter() {
        let parsed = parse_query("priority 1 bugs");
        assert_eq!(
            parsed.filters.item_type,
            Some(vec!["Bug".to_string()])
        );
        // A work-item type implies the work-item source.
        assert_eq!(parsed.filters.source_type, Some(SearchSource::WorkItem));

        let parsed = parse_query("tasks in Lerum");
        assert_eq!(
            parsed.filters.item_type,
            Some(vec!["Task".to_string()])
        );

        let parsed = parse_query("user stories");
        assert_eq!(
            parsed.filters.item_type,
            Some(vec!["User Story".to_string()])
        );
    }

    #[test]
    fn parse_status_filter() {
        let parsed = parse_query("active PRs");
        assert_eq!(parsed.filters.status, Some(vec!["active".to_string()]));

        // "closed" normalizes to "completed"
        let parsed = parse_query("closed bugs");
        assert_eq!(parsed.filters.status, Some(vec!["completed".to_string()]));

        // "resolved" normalizes to "completed"
        let parsed = parse_query("resolved work items");
        assert_eq!(parsed.filters.status, Some(vec!["completed".to_string()]));

        // "open" normalizes to "active"
        let parsed = parse_query("open PRs");
        assert_eq!(parsed.filters.status, Some(vec!["active".to_string()]));
    }

    #[test]
    fn parse_date_range_filter() {
        let parsed = parse_query("last week");
        assert!(parsed.filters.updated_after.is_some());
        let since = parsed.filters.updated_after.unwrap();
        let now = OffsetDateTime::now_utc();
        let diff = now - since;
        // Loose 6..=8 day window keeps the test robust to clock skew.
        assert!(diff.whole_days() >= 6 && diff.whole_days() <= 8);

        let parsed = parse_query("past 30 days");
        assert!(parsed.filters.updated_after.is_some());
    }

    #[test]
    fn parse_draft_filter() {
        let parsed = parse_query("draft PRs");
        assert_eq!(parsed.filters.is_draft, Some(true));
        assert_eq!(parsed.filters.source_type, Some(SearchSource::Pr));
    }

    #[test]
    fn parse_project_filter() {
        let parsed = parse_query("bugs in Lerum");
        assert_eq!(
            parsed.filters.project,
            Some("Lerums Djursjukhus".to_string())
        );
    }

    #[test]
    fn parse_complex_query() {
        let parsed = parse_query("priority 1 bugs in Lerum closed last week");
        assert_eq!(parsed.filters.priority, Some(vec![1]));
        assert_eq!(
            parsed.filters.item_type,
            Some(vec!["Bug".to_string()])
        );
        assert_eq!(
            parsed.filters.project,
            Some("Lerums Djursjukhus".to_string())
        );
        assert_eq!(parsed.filters.status, Some(vec!["completed".to_string()]));
        assert!(parsed.filters.updated_after.is_some());
        assert_eq!(parsed.search_text, ""); // All tokens were filters
    }

    #[test]
    fn parse_preserves_search_text() {
        let parsed = parse_query("authentication PRs");
        assert_eq!(parsed.search_text, "authentication");

        let parsed = parse_query("fix login bug in authentication service");
        assert!(parsed.search_text.contains("fix"));
        assert!(parsed.search_text.contains("login"));
        assert!(parsed.search_text.contains("authentication"));
        assert!(parsed.search_text.contains("service"));
    }
}
diff --git a/toki-api/src/domain/search/repository/mock.rs b/toki-api/src/domain/search/repository/mock.rs
new file mode 100644
index 00000000..dccc610e
--- /dev/null
+++ b/toki-api/src/domain/search/repository/mock.rs
@@ -0,0 +1,345 @@
//! Mock repository implementation for testing.

use async_trait::async_trait;
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use time::OffsetDateTime;

use crate::domain::search::traits::{Result, SearchRepository};
use crate::domain::search::types::{ParsedQuery, SearchDocument, SearchResult, SearchSource};

/// Mock search repository backed by an in-memory HashMap.
///
/// # Examples
///
/// Ignored: this type is `#[cfg(test)]`-only and the example's `doc1`/`doc2`
/// are not defined, so it cannot compile as a doctest.
///
/// ```ignore
/// use toki_api::domain::search::repository::MockSearchRepository;
///
/// let repo = MockSearchRepository::new();
/// // or with initial documents:
/// let repo = MockSearchRepository::new().with_documents(vec![doc1, doc2]);
/// ```
#[derive(Clone, Default)]
pub struct MockSearchRepository {
    // Documents keyed by (source_type, source_id) — the same uniqueness key
    // the PostgreSQL table enforces via ON CONFLICT.
    documents: Arc<RwLock<HashMap<(SearchSource, String), SearchDocument>>>,
    /// Custom search results to return (overrides default behavior)
    custom_results: Arc<RwLock<Option<Vec<SearchResult>>>>,
}

#[allow(dead_code)]
impl MockSearchRepository {
    pub fn new() -> Self {
        Self::default()
    }

    /// Add initial documents to the repository.
    pub fn with_documents(self, docs: Vec<SearchDocument>) -> Self {
        {
            let mut documents = self.documents.write().unwrap();
            for doc in docs {
                let key = (doc.source_type, doc.source_id.clone());
                documents.insert(key, doc);
            }
        }
        self
    }

    /// Configure custom search results to return.
    pub fn with_search_results(self, results: Vec<SearchResult>) -> Self {
        {
            let mut custom = self.custom_results.write().unwrap();
            *custom = Some(results);
        }
        self
    }

    /// Get the current number of documents.
    pub fn len(&self) -> usize {
        self.documents.read().unwrap().len()
    }

    /// Check if empty.
    pub fn is_empty(&self) -> bool {
        self.documents.read().unwrap().is_empty()
    }

    /// Get all documents (for test assertions).
    pub fn all_documents(&self) -> Vec<SearchDocument> {
        self.documents.read().unwrap().values().cloned().collect()
    }
}

#[async_trait]
impl SearchRepository for MockSearchRepository {
    /// In-memory stand-in for the hybrid search: applies the parsed filters
    /// and a case-insensitive substring match on title/description, then
    /// ranks by `updated_at` descending. If custom results were configured
    /// via `with_search_results`, those are returned verbatim (truncated to
    /// `limit`) and all filtering is skipped.
    async fn search(
        &self,
        query: &ParsedQuery,
        _embedding: Option<&[f32]>,
        limit: i32,
    ) -> Result<Vec<SearchResult>> {
        // Return custom results if configured
        if let Some(results) = self.custom_results.read().unwrap().as_ref() {
            return Ok(results.clone().into_iter().take(limit as usize).collect());
        }

        // Simple mock: filter documents and convert to results
        let documents = self.documents.read().unwrap();
        let mut results: Vec<SearchResult> = documents
            .values()
            .filter(|doc| {
                // Apply basic filters — each unset filter passes everything.
                if let Some(ref source_type) = query.filters.source_type {
                    if doc.source_type != *source_type {
                        return false;
                    }
                }
                if let Some(ref project) = query.filters.project {
                    if &doc.project != project {
                        return false;
                    }
                }
                if let Some(ref priorities) = query.filters.priority {
                    // A document with no priority never matches a priority filter.
                    match doc.priority {
                        Some(p) if priorities.contains(&p) => {}
                        _ => return false,
                    }
                }
                if let Some(ref statuses) = query.filters.status {
                    if !statuses.iter().any(|s| s.eq_ignore_ascii_case(&doc.status)) {
                        return false;
                    }
                }
                if let Some(ref item_types) = query.filters.item_type {
                    match &doc.item_type {
                        Some(t) if item_types.iter().any(|it| it.eq_ignore_ascii_case(t)) => {}
                        _ => return false,
                    }
                }
                if let Some(ref org) = query.filters.organization {
                    if &doc.organization != org {
                        return false;
                    }
                }
                if let Some(is_draft) = query.filters.is_draft {
                    if doc.is_draft != is_draft {
                        return false;
                    }
                }
                if let Some(updated_after) = query.filters.updated_after {
                    if doc.updated_at < updated_after {
                        return false;
                    }
                }

                // Simple text matching
                if !query.search_text.is_empty() {
                    let search_lower = query.search_text.to_lowercase();
                    let title_match = doc.title.to_lowercase().contains(&search_lower);
                    let desc_match = doc
                        .description
                        .as_ref()
                        .map(|d| d.to_lowercase().contains(&search_lower))
                        .unwrap_or(false);
                    if !title_match && !desc_match {
                        return false;
                    }
                }

                true
            })
            .map(|doc| SearchResult {
                id: 0, // Mock doesn't have real IDs
                source_type: doc.source_type,
                source_id: doc.source_id.clone(),
                external_id: doc.external_id,
                title: doc.title.clone(),
                description: doc.description.clone(),
                status: doc.status.clone(),
                priority: doc.priority,
                item_type: doc.item_type.clone(),
                author_name: doc.author_name.clone(),
                url: doc.url.clone(),
                created_at: doc.created_at,
                updated_at: doc.updated_at,
                score: 1.0, // Mock score
            })
            .collect();

        // Sort by updated_at descending as a simple ranking
        results.sort_by(|a, b| b.updated_at.cmp(&a.updated_at));

        Ok(results.into_iter().take(limit as usize).collect())
    }

    /// Insert or overwrite a single document by (source_type, source_id).
    async fn upsert_document(&self, doc: &SearchDocument) -> Result<()> {
        let key = (doc.source_type, doc.source_id.clone());
        self.documents.write().unwrap().insert(key, doc.clone());
        Ok(())
    }

    /// Batch upsert; returns the number of documents written.
    async fn upsert_documents(&self, docs: &[SearchDocument]) -> Result<usize> {
        let mut documents = self.documents.write().unwrap();
        let mut count = 0;
        for doc in docs {
            let key = (doc.source_type, doc.source_id.clone());
            documents.insert(key, doc.clone());
            count += 1;
        }
        Ok(count)
    }

    /// Returns true if a document was actually removed.
    async fn delete_document(&self, source_type: SearchSource, source_id: &str) -> Result<bool> {
        let key = (source_type, source_id.to_string());
        let removed = self.documents.write().unwrap().remove(&key);
        Ok(removed.is_some())
    }

    async fn get_document(
        &self,
        source_type: SearchSource,
        source_id: &str,
    ) -> Result<Option<SearchDocument>> {
        let key = (source_type, source_id.to_string());
        let doc = self.documents.read().unwrap().get(&key).cloned();
        Ok(doc)
    }

    /// Remove every document whose `updated_at` is strictly older than the
    /// threshold; returns how many were removed. Keys are collected first so
    /// removal doesn't alias the iteration borrow.
    async fn delete_stale_documents(&self, older_than: OffsetDateTime) -> Result<usize> {
        let mut documents = self.documents.write().unwrap();
        let stale_keys: Vec<_> = documents
            .iter()
            .filter(|(_, doc)| doc.updated_at < older_than)
            .map(|(k, _)| k.clone())
            .collect();
        let count = stale_keys.len();
        for key in stale_keys {
            documents.remove(&key);
        }
        Ok(count)
    }

    /// Count all documents, or only those of the given source type.
    async fn count(&self, source_type: Option<SearchSource>) -> Result<i64> {
        let documents = self.documents.read().unwrap();
        let count = match source_type {
            Some(st) => documents.values().filter(|d| d.source_type == st).count(),
            None => documents.len(),
        };
        Ok(count as i64)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::search::types::SearchFilters;

    // Fixture: minimal document with the given identity; everything else
    // defaulted/None.
    fn make_document(source_type: SearchSource, id: &str, title: &str) -> SearchDocument {
        SearchDocument {
            source_type,
            source_id: id.to_string(),
            external_id: 1,
            title: title.to_string(),
            description: None,
            content: None,
            organization: "org".to_string(),
            project: "project".to_string(),
            repo_name: None,
            status: "active".to_string(),
            author_id: None,
            author_name: None,
            assigned_to_id: None,
            assigned_to_name: None,
            priority: None,
            item_type: None,
            is_draft: false,
            created_at: OffsetDateTime::now_utc(),
            updated_at: OffsetDateTime::now_utc(),
            closed_at: None,
            url: "https://example.com".to_string(),
            parent_id: None,
            linked_work_items: vec![],
            embedding: None,
        }
    }

    #[tokio::test]
    async fn upsert_and_get() {
        let repo = MockSearchRepository::new();
        let doc = make_document(SearchSource::Pr, "org/proj/repo/1", "Test PR");

        repo.upsert_document(&doc).await.unwrap();

        let retrieved = repo
            .get_document(SearchSource::Pr, "org/proj/repo/1")
            .await
            .unwrap();
        assert!(retrieved.is_some());
        assert_eq!(retrieved.unwrap().title, "Test PR");
    }

    #[tokio::test]
    async fn delete_document() {
        let repo = MockSearchRepository::new();
        let doc = make_document(SearchSource::Pr, "org/proj/repo/1", "Test PR");

        repo.upsert_document(&doc).await.unwrap();
        assert_eq!(repo.len(), 1);

        let deleted = repo
            .delete_document(SearchSource::Pr, "org/proj/repo/1")
            .await
            .unwrap();
        assert!(deleted);
        assert!(repo.is_empty());
    }

    #[tokio::test]
    async fn search_filters_by_source_type() {
        let pr = make_document(SearchSource::Pr, "pr/1", "PR Title");
        let wi = make_document(SearchSource::WorkItem, "wi/1", "Work Item");

        let repo = MockSearchRepository::new().with_documents(vec![pr, wi]);

        let query = ParsedQuery {
            search_text: String::new(),
            filters: SearchFilters {
                source_type: Some(SearchSource::Pr),
                ..Default::default()
            },
        };

        let results = repo.search(&query, None, 10).await.unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].source_type, SearchSource::Pr);
    }

    #[tokio::test]
    async fn search_filters_by_text() {
        let doc1 = make_document(SearchSource::Pr, "pr/1", "Authentication fix");
        let doc2 = make_document(SearchSource::Pr, "pr/2", "Database migration");

        let repo = MockSearchRepository::new().with_documents(vec![doc1, doc2]);

        let query = ParsedQuery {
            search_text: "auth".to_string(),
            filters: SearchFilters::default(),
        };

        let results = repo.search(&query, None, 10).await.unwrap();
        assert_eq!(results.len(), 1);
        assert!(results[0].title.contains("Authentication"));
    }

    #[tokio::test]
    async fn count_by_source_type() {
        let pr1 = make_document(SearchSource::Pr, "pr/1", "PR 1");
        let pr2 = make_document(SearchSource::Pr, "pr/2", "PR 2");
        let wi = make_document(SearchSource::WorkItem, "wi/1", "Work Item");

        let repo = MockSearchRepository::new().with_documents(vec![pr1, pr2, wi]);

        assert_eq!(repo.count(None).await.unwrap(), 3);
        assert_eq!(repo.count(Some(SearchSource::Pr)).await.unwrap(), 2);
        assert_eq!(repo.count(Some(SearchSource::WorkItem)).await.unwrap(), 1);
    }
}
diff --git a/toki-api/src/domain/search/repository/mod.rs b/toki-api/src/domain/search/repository/mod.rs
new file mode 100644
index 00000000..a63fe58f
---
/dev/null
+++ b/toki-api/src/domain/search/repository/mod.rs
@@ -0,0 +1,9 @@
//! Search repository implementations.

// The mock is only compiled for unit tests; production code always uses
// the PostgreSQL implementation.
#[cfg(test)]
mod mock;
mod postgres;

#[cfg(test)]
pub use mock::MockSearchRepository;
pub use postgres::PgSearchRepository;
diff --git a/toki-api/src/domain/search/repository/postgres.rs b/toki-api/src/domain/search/repository/postgres.rs
new file mode 100644
index 00000000..416109e0
--- /dev/null
+++ b/toki-api/src/domain/search/repository/postgres.rs
@@ -0,0 +1,515 @@
//! PostgreSQL repository implementation with pgvector support.

use async_trait::async_trait;
use pgvector::Vector;
use sqlx::PgPool;
use time::OffsetDateTime;

use crate::domain::search::traits::{Result, SearchRepository};
use crate::domain::search::types::{ParsedQuery, SearchDocument, SearchResult, SearchSource};

/// PostgreSQL-backed search repository using pgvector for similarity search.
///
/// Implements hybrid search combining:
/// - BM25 full-text search via PostgreSQL tsvector
/// - Vector similarity search via pgvector HNSW index
/// - Reciprocal Rank Fusion (RRF) for result combination
#[derive(Clone)]
pub struct PgSearchRepository {
    pool: PgPool,
}

impl PgSearchRepository {
    pub fn new(pool: PgPool) -> Self {
        Self { pool }
    }

    /// Execute hybrid BM25 + vector search with RRF fusion.
    ///
    /// Both candidate pools are capped at 100 rows; ranks are fused with
    /// the standard RRF formula `1 / (60 + rank)` (k = 60) summed over both
    /// lists. The same structured filters are applied to both CTEs so the
    /// fused sets are comparable. An empty `search_text` makes the BM25 CTE
    /// match everything (the `$1 = ''` branch) and ranking is then driven by
    /// the vector side.
    async fn search_hybrid(
        &self,
        query: &ParsedQuery,
        embedding: &[f32],
        limit: i32,
    ) -> Result<Vec<SearchResult>> {
        let results = sqlx::query_as!(
            SearchResultRow,
            r#"
            WITH bm25_results AS (
                SELECT
                    id,
                    ts_rank_cd(search_vector, websearch_to_tsquery('english', $1)) as score,
                    ROW_NUMBER() OVER (
                        ORDER BY ts_rank_cd(search_vector, websearch_to_tsquery('english', $1)) DESC
                    ) as rank
                FROM search_documents
                WHERE ($1 = '' OR search_vector @@ websearch_to_tsquery('english', $1))
                    AND ($2::search_source IS NULL OR source_type = $2)
                    AND ($3::text IS NULL OR organization = $3)
                    AND ($4::text IS NULL OR project = $4)
                    AND ($5::text[] IS NULL OR status = ANY($5))
                    AND ($6::int[] IS NULL OR priority = ANY($6))
                    AND ($7::text[] IS NULL OR item_type = ANY($7))
                    AND ($8::bool IS NULL OR is_draft = $8)
                    AND ($9::timestamptz IS NULL OR updated_at >= $9)
                LIMIT 100
            ),
            vector_results AS (
                SELECT
                    id,
                    1 - (embedding <=> $10) as score,
                    ROW_NUMBER() OVER (
                        ORDER BY embedding <=> $10
                    ) as rank
                FROM search_documents
                WHERE embedding IS NOT NULL
                    AND ($2::search_source IS NULL OR source_type = $2)
                    AND ($3::text IS NULL OR organization = $3)
                    AND ($4::text IS NULL OR project = $4)
                    AND ($5::text[] IS NULL OR status = ANY($5))
                    AND ($6::int[] IS NULL OR priority = ANY($6))
                    AND ($7::text[] IS NULL OR item_type = ANY($7))
                    AND ($8::bool IS NULL OR is_draft = $8)
                    AND ($9::timestamptz IS NULL OR updated_at >= $9)
                LIMIT 100
            ),
            rrf_combined AS (
                SELECT
                    COALESCE(b.id, v.id) as id,
                    (COALESCE(1.0 / (60 + b.rank), 0) + COALESCE(1.0 / (60 + v.rank), 0))::float8 as rrf_score
                FROM bm25_results b
                FULL OUTER JOIN vector_results v ON b.id = v.id
            )
            SELECT
                d.id,
                d.source_type as "source_type: SearchSource",
                d.source_id,
                d.external_id,
                d.title,
                d.description,
                d.status,
                d.priority,
                d.item_type,
                d.author_name,
                d.url,
                d.created_at,
                d.updated_at,
                r.rrf_score as score
            FROM rrf_combined r
            JOIN search_documents d ON d.id = r.id
            ORDER BY r.rrf_score DESC
            LIMIT $11
            "#,
            query.search_text,
            query.filters.source_type as Option<SearchSource>,
            query.filters.organization.as_deref(),
            query.filters.project.as_deref(),
            query.filters.status.as_deref(),
            query.filters.priority.as_deref(),
            query.filters.item_type.as_deref(),
            query.filters.is_draft,
            query.filters.updated_after,
            Vector::from(embedding.to_vec()) as Vector,
            limit as i64
        )
        .fetch_all(&self.pool)
        .await?;

        Ok(map_result_rows(results))
    }

    /// Execute BM25-only full-text search (no vector component).
    ///
    /// Used when no query embedding is available; applies the same filter
    /// set as the hybrid path and ranks purely by `ts_rank_cd`.
    async fn search_bm25_only(
        &self,
        query: &ParsedQuery,
        limit: i32,
    ) -> Result<Vec<SearchResult>> {
        let results = sqlx::query_as!(
            SearchResultRow,
            r#"
            SELECT
                id,
                source_type as "source_type: SearchSource",
                source_id,
                external_id,
                title,
                description,
                status,
                priority,
                item_type,
                author_name,
                url,
                created_at,
                updated_at,
                ts_rank_cd(search_vector, websearch_to_tsquery('english', $1))::float8 as score
            FROM search_documents
            WHERE ($1 = '' OR search_vector @@ websearch_to_tsquery('english', $1))
                AND ($2::search_source IS NULL OR source_type = $2)
                AND ($3::text IS NULL OR organization = $3)
                AND ($4::text IS NULL OR project = $4)
                AND ($5::text[] IS NULL OR status = ANY($5))
                AND ($6::int[] IS NULL OR priority = ANY($6))
                AND ($7::text[] IS NULL OR item_type = ANY($7))
                AND ($8::bool IS NULL OR is_draft = $8)
                AND ($9::timestamptz IS NULL OR updated_at >= $9)
            ORDER BY score DESC
            LIMIT $10
            "#,
            query.search_text,
            query.filters.source_type as Option<SearchSource>,
            query.filters.organization.as_deref(),
            query.filters.project.as_deref(),
            query.filters.status.as_deref(),
            query.filters.priority.as_deref(),
            query.filters.item_type.as_deref(),
            query.filters.is_draft,
            query.filters.updated_after,
            limit as i64
        )
        .fetch_all(&self.pool)
        .await?;

        Ok(map_result_rows(results))
    }
}

/// Map raw query rows into the public [`SearchResult`] type; a NULL score
/// (possible for the computed column) defaults to 0.0.
fn map_result_rows(rows: Vec<SearchResultRow>) -> Vec<SearchResult> {
    rows.into_iter()
        .map(|row| SearchResult {
            id: row.id,
            source_type: row.source_type,
            source_id: row.source_id,
            external_id: row.external_id,
            title: row.title,
            description: row.description,
            status: row.status,
            priority: row.priority,
            item_type: row.item_type,
            author_name: row.author_name,
            url: row.url,
            created_at: row.created_at,
            updated_at: row.updated_at,
            score: row.score.unwrap_or(0.0),
        })
        .collect()
}

/// Convert an optional embedding into the pgvector `Vector` wrapper.
/// Note: this does copy the float data (`to_vec`), since `Vector::from`
/// takes an owned `Vec<f32>` — only the `Option`/outer `Vec` are not cloned.
fn to_pg_vector(embedding: &Option<Vec<f32>>) -> Option<Vector> {
    embedding.as_deref().map(|e| Vector::from(e.to_vec()))
}

#[async_trait]
impl SearchRepository for PgSearchRepository {
    /// Dispatch to hybrid (BM25 + vector) search when a query embedding is
    /// supplied, otherwise fall back to BM25-only full-text search.
    async fn search(
        &self,
        query: &ParsedQuery,
        embedding: Option<&[f32]>,
        limit: i32,
    ) -> Result<Vec<SearchResult>> {
        match embedding {
            Some(emb) => self.search_hybrid(query, emb, limit).await,
            None => self.search_bm25_only(query, limit).await,
        }
    }

    /// Insert or update a single document, keyed on (source_type, source_id).
    /// On conflict the mutable fields (including the embedding) are replaced
    /// and `indexed_at` is bumped to NOW().
    async fn upsert_document(&self, doc: &SearchDocument) -> Result<()> {
        sqlx::query!(
            r#"
            INSERT INTO search_documents (
                source_type, source_id, external_id, title, description, content,
                organization, project, repo_name, status,
                author_id, author_name, assigned_to_id, assigned_to_name,
                priority, item_type, is_draft,
                created_at, updated_at, closed_at,
                url, parent_id, linked_work_items, embedding
            ) VALUES (
                $1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
                $11, $12, $13, $14, $15, $16, $17, $18, $19, $20,
                $21, $22, $23, $24
            )
            ON CONFLICT (source_type, source_id) DO UPDATE SET
                title = EXCLUDED.title,
                description = EXCLUDED.description,
                content = EXCLUDED.content,
                status = EXCLUDED.status,
                author_id = EXCLUDED.author_id,
                author_name = EXCLUDED.author_name,
                assigned_to_id = EXCLUDED.assigned_to_id,
                assigned_to_name = EXCLUDED.assigned_to_name,
                priority = EXCLUDED.priority,
                item_type = EXCLUDED.item_type,
                is_draft = EXCLUDED.is_draft,
                updated_at = EXCLUDED.updated_at,
                closed_at = EXCLUDED.closed_at,
                linked_work_items = EXCLUDED.linked_work_items,
                embedding = EXCLUDED.embedding,
                indexed_at = NOW()
            "#,
            doc.source_type as SearchSource,
            doc.source_id,
            doc.external_id,
            doc.title,
            doc.description.as_deref(),
            doc.content.as_deref(),
            doc.organization,
            doc.project,
            doc.repo_name.as_deref(),
            doc.status,
            doc.author_id.as_deref(),
            doc.author_name.as_deref(),
            doc.assigned_to_id.as_deref(),
            doc.assigned_to_name.as_deref(),
            doc.priority,
            doc.item_type.as_deref(),
            doc.is_draft,
            doc.created_at,
            doc.updated_at,
            doc.closed_at,
            doc.url,
            doc.parent_id,
            &doc.linked_work_items,
            to_pg_vector(&doc.embedding) as Option<Vector>,
        )
        .execute(&self.pool)
        .await?;

        Ok(())
    }

    /// Batch upsert inside a single transaction so a partial failure rolls
    /// back all writes; returns the number of documents written.
    async fn upsert_documents(&self, docs: &[SearchDocument]) -> Result<usize> {
        let mut tx = self.pool.begin().await?;
        let mut count = 0;

        for doc in docs {
            sqlx::query!(
                r#"
                INSERT INTO search_documents (
                    source_type, source_id, external_id, title, description, content,
                    organization, project, repo_name, status,
                    author_id, author_name, assigned_to_id, assigned_to_name,
                    priority, item_type, is_draft,
                    created_at, updated_at, closed_at,
                    url, parent_id, linked_work_items, embedding
                ) VALUES (
                    $1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
                    $11, $12, $13, $14, $15, $16, $17, $18, $19, $20,
                    $21, $22, $23, $24
                )
                ON CONFLICT (source_type, source_id) DO UPDATE SET
                    title = EXCLUDED.title,
                    description = EXCLUDED.description,
                    content = EXCLUDED.content,
                    status = EXCLUDED.status,
                    author_id = EXCLUDED.author_id,
                    author_name = EXCLUDED.author_name,
                    assigned_to_id = EXCLUDED.assigned_to_id,
                    assigned_to_name = EXCLUDED.assigned_to_name,
                    priority = EXCLUDED.priority,
                    item_type = EXCLUDED.item_type,
                    is_draft = EXCLUDED.is_draft,
                    updated_at = EXCLUDED.updated_at,
                    closed_at = EXCLUDED.closed_at,
                    linked_work_items = EXCLUDED.linked_work_items,
                    embedding = EXCLUDED.embedding,
                    indexed_at = NOW()
                "#,
                doc.source_type as SearchSource,
                doc.source_id,
+ doc.external_id, + doc.title, + doc.description.as_deref(), + doc.content.as_deref(), + doc.organization, + doc.project, + doc.repo_name.as_deref(), + doc.status, + doc.author_id.as_deref(), + doc.author_name.as_deref(), + doc.assigned_to_id.as_deref(), + doc.assigned_to_name.as_deref(), + doc.priority, + doc.item_type.as_deref(), + doc.is_draft, + doc.created_at, + doc.updated_at, + doc.closed_at, + doc.url, + doc.parent_id, + &doc.linked_work_items, + to_pg_vector(&doc.embedding) as Option, + ) + .execute(&mut *tx) + .await?; + count += 1; + } + + tx.commit().await?; + Ok(count) + } + + async fn delete_document(&self, source_type: SearchSource, source_id: &str) -> Result { + let result = sqlx::query!( + r#" + DELETE FROM search_documents + WHERE source_type = $1 AND source_id = $2 + "#, + source_type as SearchSource, + source_id + ) + .execute(&self.pool) + .await?; + + Ok(result.rows_affected() > 0) + } + + async fn delete_stale_documents(&self, older_than: OffsetDateTime) -> Result { + let rows_affected = sqlx::query!( + r#" + DELETE FROM search_documents + WHERE indexed_at < $1 + "#, + older_than + ) + .execute(&self.pool) + .await? 
+ .rows_affected(); + + Ok(rows_affected as usize) + } + + async fn get_document( + &self, + source_type: SearchSource, + source_id: &str, + ) -> Result> { + let row = sqlx::query_as!( + SearchDocumentRow, + r#" + SELECT + source_type as "source_type: SearchSource", + source_id, + external_id, + title, + description, + content, + organization, + project, + repo_name, + status, + author_id, + author_name, + assigned_to_id, + assigned_to_name, + priority, + item_type, + is_draft, + created_at, + updated_at, + closed_at, + url, + parent_id, + linked_work_items + FROM search_documents + WHERE source_type = $1 AND source_id = $2 + "#, + source_type as SearchSource, + source_id + ) + .fetch_optional(&self.pool) + .await?; + + Ok(row.map(|r| SearchDocument { + source_type: r.source_type, + source_id: r.source_id, + external_id: r.external_id, + title: r.title, + description: r.description, + content: r.content, + organization: r.organization, + project: r.project, + repo_name: r.repo_name, + status: r.status, + author_id: r.author_id, + author_name: r.author_name, + assigned_to_id: r.assigned_to_id, + assigned_to_name: r.assigned_to_name, + priority: r.priority, + item_type: r.item_type, + is_draft: r.is_draft.unwrap_or(false), + created_at: r.created_at, + updated_at: r.updated_at, + closed_at: r.closed_at, + url: r.url, + parent_id: r.parent_id, + linked_work_items: r.linked_work_items.unwrap_or_default(), + embedding: None, // Don't fetch embedding by default (large) + })) + } + + async fn count(&self, source_type: Option) -> Result { + let count = match source_type { + Some(st) => { + sqlx::query_scalar!( + r#"SELECT COUNT(*) as "count!" FROM search_documents WHERE source_type = $1"#, + st as SearchSource + ) + .fetch_one(&self.pool) + .await? + } + None => { + sqlx::query_scalar!(r#"SELECT COUNT(*) as "count!" FROM search_documents"#) + .fetch_one(&self.pool) + .await? 
+ } + }; + + Ok(count) + } +} + +// Row types for sqlx queries + +#[allow(dead_code)] +struct SearchResultRow { + id: i32, + source_type: SearchSource, + source_id: String, + external_id: i32, + title: String, + description: Option, + status: String, + priority: Option, + item_type: Option, + author_name: Option, + url: String, + created_at: OffsetDateTime, + updated_at: OffsetDateTime, + score: Option, +} + +#[allow(dead_code)] +struct SearchDocumentRow { + source_type: SearchSource, + source_id: String, + external_id: i32, + title: String, + description: Option, + content: Option, + organization: String, + project: String, + repo_name: Option, + status: String, + author_id: Option, + author_name: Option, + assigned_to_id: Option, + assigned_to_name: Option, + priority: Option, + item_type: Option, + is_draft: Option, + created_at: OffsetDateTime, + updated_at: OffsetDateTime, + closed_at: Option, + url: String, + parent_id: Option, + linked_work_items: Option>, +} diff --git a/toki-api/src/domain/search/service.rs b/toki-api/src/domain/search/service.rs new file mode 100644 index 00000000..1f6448e5 --- /dev/null +++ b/toki-api/src/domain/search/service.rs @@ -0,0 +1,331 @@ +//! Search service combining embedding generation and hybrid search. + +use super::parser::parse_query; +use super::traits::{Embedder, Result, SearchRepository}; +use super::types::SearchResult; + +/// Configuration for the search service. +#[derive(Debug, Clone)] +pub struct SearchConfig { + /// Default number of results to return + pub default_limit: i32, + /// Maximum number of results allowed + pub max_limit: i32, + /// Minimum query length for semantic search + pub min_query_length: usize, +} + +impl Default for SearchConfig { + fn default() -> Self { + Self { + default_limit: 20, + max_limit: 100, + min_query_length: 2, + } + } +} + +/// Search service that combines embedding generation with hybrid search. 
+/// +/// # Type Parameters +/// +/// * `E` - Embedder implementation for generating query embeddings +/// * `R` - SearchRepository implementation for database operations +/// +/// # Examples +/// +/// ```ignore +/// let service = SearchService::new(embedder, repository, SearchConfig::default()); +/// let results = service.search("authentication PRs", 10).await?; +/// ``` +pub struct SearchService +where + E: Embedder, + R: SearchRepository, +{ + embedder: E, + repository: R, + config: SearchConfig, +} + +impl SearchService +where + E: Embedder, + R: SearchRepository, +{ + /// Create a new search service. + pub fn new(embedder: E, repository: R, config: SearchConfig) -> Self { + Self { + embedder, + repository, + config, + } + } + + /// Create a search service with default configuration. + pub fn with_defaults(embedder: E, repository: R) -> Self { + Self::new(embedder, repository, SearchConfig::default()) + } + + /// Execute a search query. + /// + /// Parses the query to extract filters, generates an embedding for semantic search, + /// and performs hybrid BM25 + vector search with RRF fusion. + /// + /// # Arguments + /// + /// * `query` - Natural language search query + /// * `limit` - Maximum number of results (None uses default, capped at max_limit) + /// + /// # Returns + /// + /// Results sorted by combined relevance score (higher is better). + pub async fn search(&self, query: &str, limit: Option) -> Result> { + let query = query.trim(); + if query.is_empty() { + return Ok(vec![]); + } + + // Parse query into filters and search text + let parsed = parse_query(query); + + // Determine effective limit + let limit = limit + .unwrap_or(self.config.default_limit) + .min(self.config.max_limit) + .max(1); + + // Generate embedding for semantic search (skip for filter-only queries) + let embedding = if parsed.search_text.len() >= self.config.min_query_length { + Some(self.embedder.embed(&parsed.search_text).await?) 
+ } else { + None + }; + + // Execute hybrid search (or BM25-only when no embedding) + self.repository + .search(&parsed, embedding.as_deref(), limit) + .await + } + + /// Get document counts by source type. + #[allow(dead_code)] + pub async fn stats(&self) -> Result { + let total = self.repository.count(None).await?; + let prs = self + .repository + .count(Some(super::types::SearchSource::Pr)) + .await?; + let work_items = self + .repository + .count(Some(super::types::SearchSource::WorkItem)) + .await?; + + Ok(SearchStats { + total, + prs, + work_items, + }) + } +} + +/// Statistics about the search index. +#[allow(dead_code)] +#[derive(Debug, Clone)] +pub struct SearchStats { + pub total: i64, + pub prs: i64, + pub work_items: i64, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::domain::search::embedder::MockEmbedder; + use crate::domain::search::repository::MockSearchRepository; + use crate::domain::search::types::{SearchDocument, SearchSource}; + use time::OffsetDateTime; + + fn make_pr(id: &str, title: &str) -> SearchDocument { + SearchDocument { + source_type: SearchSource::Pr, + source_id: id.to_string(), + external_id: 1, + title: title.to_string(), + description: Some("Description".to_string()), + content: None, + organization: "org".to_string(), + project: "project".to_string(), + repo_name: Some("repo".to_string()), + status: "active".to_string(), + author_id: None, + author_name: Some("Author".to_string()), + assigned_to_id: None, + assigned_to_name: None, + priority: None, + item_type: None, + is_draft: false, + created_at: OffsetDateTime::now_utc(), + updated_at: OffsetDateTime::now_utc(), + closed_at: None, + url: "https://dev.azure.com/org/project/_git/repo/pullrequest/1".to_string(), + parent_id: None, + linked_work_items: vec![], + embedding: None, + } + } + + fn make_bug(id: &str, title: &str, priority: i32) -> SearchDocument { + SearchDocument { + source_type: SearchSource::WorkItem, + source_id: id.to_string(), + external_id: 1, + 
title: title.to_string(), + description: Some("Bug description".to_string()), + content: None, + organization: "org".to_string(), + project: "Lerums Djursjukhus".to_string(), + repo_name: None, + status: "active".to_string(), + author_id: None, + author_name: Some("Reporter".to_string()), + assigned_to_id: None, + assigned_to_name: None, + priority: Some(priority), + item_type: Some("Bug".to_string()), + is_draft: false, + created_at: OffsetDateTime::now_utc(), + updated_at: OffsetDateTime::now_utc(), + closed_at: None, + url: "https://dev.azure.com/org/project/_workitems/edit/1".to_string(), + parent_id: None, + linked_work_items: vec![], + embedding: None, + } + } + + #[tokio::test] + async fn search_empty_query_returns_empty() { + let embedder = MockEmbedder::default(); + let repo = MockSearchRepository::new(); + let service = SearchService::with_defaults(embedder, repo); + + let results = service.search("", None).await.unwrap(); + assert!(results.is_empty()); + + let results = service.search(" ", None).await.unwrap(); + assert!(results.is_empty()); + } + + #[tokio::test] + async fn search_returns_matching_documents() { + let embedder = MockEmbedder::default(); + let repo = MockSearchRepository::new().with_documents(vec![ + make_pr("pr/1", "Authentication fix"), + make_pr("pr/2", "Database migration"), + ]); + let service = SearchService::with_defaults(embedder, repo); + + let results = service.search("authentication", None).await.unwrap(); + assert_eq!(results.len(), 1); + assert!(results[0].title.contains("Authentication")); + } + + #[tokio::test] + async fn search_applies_source_type_filter() { + let embedder = MockEmbedder::default(); + let repo = MockSearchRepository::new().with_documents(vec![ + make_pr("pr/1", "Auth PR"), + make_bug("wi/1", "Auth Bug", 1), + ]); + let service = SearchService::with_defaults(embedder, repo); + + let results = service.search("auth PRs", None).await.unwrap(); + assert_eq!(results.len(), 1); + 
assert_eq!(results[0].source_type, SearchSource::Pr); + } + + #[tokio::test] + async fn search_applies_priority_filter() { + let embedder = MockEmbedder::default(); + let repo = MockSearchRepository::new().with_documents(vec![ + make_bug("wi/1", "Critical bug", 1), + make_bug("wi/2", "Minor bug", 3), + ]); + let service = SearchService::with_defaults(embedder, repo); + + let results = service.search("priority 1 bugs", None).await.unwrap(); + assert_eq!(results.len(), 1); + assert_eq!(results[0].priority, Some(1)); + } + + #[tokio::test] + async fn search_applies_project_filter() { + let embedder = MockEmbedder::default(); + let lerum_bug = make_bug("wi/1", "Lerum bug", 1); + let mut other_bug = make_bug("wi/2", "Other bug", 1); + other_bug.project = "Other Project".to_string(); + + let repo = MockSearchRepository::new().with_documents(vec![lerum_bug, other_bug]); + let service = SearchService::with_defaults(embedder, repo); + + let results = service.search("bugs in Lerum", None).await.unwrap(); + assert_eq!(results.len(), 1); + assert!(results[0].source_id.contains("wi/1")); + } + + #[tokio::test] + async fn search_respects_limit() { + let embedder = MockEmbedder::default(); + let repo = MockSearchRepository::new().with_documents(vec![ + make_pr("pr/1", "PR 1"), + make_pr("pr/2", "PR 2"), + make_pr("pr/3", "PR 3"), + ]); + let service = SearchService::with_defaults(embedder, repo); + + let results = service.search("PR", Some(2)).await.unwrap(); + assert_eq!(results.len(), 2); + } + + #[tokio::test] + async fn search_generates_embedding_for_semantic_search() { + let embedder = MockEmbedder::default(); + let repo = MockSearchRepository::new(); + let service = SearchService::with_defaults(embedder.clone(), repo); + + service.search("authentication", None).await.unwrap(); + + // Embedder should have been called once + assert_eq!(embedder.call_count(), 1); + } + + #[tokio::test] + async fn search_skips_embedding_for_short_queries() { + let embedder = 
MockEmbedder::default(); + let repo = MockSearchRepository::new(); + let service = SearchService::with_defaults(embedder.clone(), repo); + + // Single character query + service.search("a", None).await.unwrap(); + + // Embedder should NOT have been called + assert_eq!(embedder.call_count(), 0); + } + + #[tokio::test] + async fn stats_returns_counts() { + let embedder = MockEmbedder::default(); + let repo = MockSearchRepository::new().with_documents(vec![ + make_pr("pr/1", "PR 1"), + make_pr("pr/2", "PR 2"), + make_bug("wi/1", "Bug 1", 1), + ]); + let service = SearchService::with_defaults(embedder, repo); + + let stats = service.stats().await.unwrap(); + assert_eq!(stats.total, 3); + assert_eq!(stats.prs, 2); + assert_eq!(stats.work_items, 1); + } +} diff --git a/toki-api/src/domain/search/source/ado.rs b/toki-api/src/domain/search/source/ado.rs new file mode 100644 index 00000000..e10b459f --- /dev/null +++ b/toki-api/src/domain/search/source/ado.rs @@ -0,0 +1,211 @@ +//! Azure DevOps document source implementation. +//! +//! Wraps the az-devops crate to fetch PRs and work items for search indexing. + +use async_trait::async_trait; +use futures::{stream, StreamExt, TryStreamExt}; +use time::OffsetDateTime; + +use az_devops::RepoClient; + +use crate::domain::search::traits::{DocumentSource, Result, SearchError}; +use crate::domain::search::types::{PullRequestDocument, WorkItemDocument}; + +/// Document source that fetches from Azure DevOps. +/// +/// # Example +/// +/// ```ignore +/// let client = RepoClient::new("repo", "org", "project", "pat").await?; +/// let source = AdoDocumentSource::new(client, "repo".to_string()); +/// let prs = source.fetch_pull_requests("org", "project").await?; +/// ``` +pub struct AdoDocumentSource { + client: RepoClient, + repo_name: String, +} + +impl AdoDocumentSource { + /// Create a new ADO document source. 
+ pub fn new(client: RepoClient, repo_name: String) -> Self { + Self { client, repo_name } + } +} + +#[async_trait] +impl DocumentSource for AdoDocumentSource { + async fn fetch_pull_requests( + &self, + org: &str, + project: &str, + ) -> Result> { + // Fetch all PRs (open and closed) + let prs = self + .client + .get_all_pull_requests(None) + .await + .map_err(|e| SearchError::SourceError(e.to_string()))?; + + let org = org.to_string(); + let project = project.to_string(); + let repo_name = self.repo_name.clone(); + + // Process PRs in parallel, fetching threads, commits, and work items concurrently + let documents: Vec = stream::iter(prs) + .map(|pr| { + let client = &self.client; + let org = org.clone(); + let project = project.clone(); + let repo_name = repo_name.clone(); + + async move { + // Fetch threads, commits, and work items in parallel for this PR + let (threads, commits, work_item_ids) = tokio::try_join!( + client.get_threads_in_pull_request(pr.id), + client.get_commits_in_pull_request(pr.id), + client.get_work_item_ids_in_pull_request(pr.id), + ) + .map_err(|e| SearchError::SourceError(e.to_string()))?; + + // Combine comments from all threads + let mut comment_texts = Vec::new(); + for thread in threads { + for comment in thread.comments { + if let Some(content) = comment.content { + comment_texts.push(content); + } + } + } + + // Combine commit messages + let mut commit_messages = Vec::new(); + for commit in commits { + if let Some(comment) = commit.comment { + commit_messages.push(comment); + } + } + + // Build additional content from comments and commits + let additional_content = format!( + "{}\n\n{}", + comment_texts.join("\n\n"), + commit_messages.join("\n") + ); + + // Map PR status to string + let status = format!("{:?}", pr.status); + + Ok::<_, SearchError>(PullRequestDocument { + id: pr.id, + title: pr.title, + description: pr.description, + organization: org, + project, + repo_name, + status, + author_id: Some(pr.created_by.id.clone()), + 
author_name: Some(pr.created_by.display_name.clone()), + is_draft: pr.is_draft, + created_at: pr.created_at, + updated_at: pr.created_at, // PRs don't have updated_at in the model + closed_at: pr.closed_at, + url: pr.url, + additional_content, + linked_work_items: work_item_ids, + }) + } + }) + .buffer_unordered(10) + .try_collect() + .await?; + + Ok(documents) + } + + async fn fetch_work_items( + &self, + org: &str, + project: &str, + _since: Option, + ) -> Result> { + // Fetch work item IDs from PRs + let prs = self + .client + .get_all_pull_requests(None) + .await + .map_err(|e| SearchError::SourceError(e.to_string()))?; + + // Fetch work item IDs in parallel + let all_work_item_ids: Vec> = stream::iter(prs) + .map(|pr| { + let client = &self.client; + async move { + client + .get_work_item_ids_in_pull_request(pr.id) + .await + .map_err(|e| SearchError::SourceError(e.to_string())) + } + }) + .buffer_unordered(10) + .try_collect() + .await?; + + // Flatten and remove duplicates + let mut all_work_item_ids: Vec = all_work_item_ids.into_iter().flatten().collect(); + all_work_item_ids.sort_unstable(); + all_work_item_ids.dedup(); + + if all_work_item_ids.is_empty() { + return Ok(Vec::new()); + } + + // Fetch work item details + let work_items = self + .client + .get_work_items(all_work_item_ids) + .await + .map_err(|e| SearchError::SourceError(e.to_string()))?; + + let documents = work_items + .into_iter() + .map(|wi| { + // For work items, we don't have an easy way to get comments + // This would require additional API calls to get work item comments + // For now, leaving additional_content empty + let additional_content = String::new(); + + WorkItemDocument { + id: wi.id, + title: wi.title, + description: None, // Work items don't expose description in the current model + organization: org.to_string(), + project: project.to_string(), + status: wi.state, + author_id: wi.created_by.as_ref().map(|i| i.id.clone()), + author_name: wi.created_by.as_ref().map(|i| 
i.display_name.clone()), + assigned_to_id: wi.assigned_to.as_ref().map(|i| i.id.clone()), + assigned_to_name: wi.assigned_to.as_ref().map(|i| i.display_name.clone()), + priority: wi.priority, + item_type: wi.item_type, + created_at: wi.created_at, + updated_at: wi.changed_at, + closed_at: None, // Work items don't have closed_at in the current model + url: format!( + "https://dev.azure.com/{}/{}/_workitems/edit/{}", + org, project, wi.id + ), + parent_id: wi.parent_id, + additional_content, + } + }) + .collect(); + + Ok(documents) + } +} + +#[cfg(test)] +mod tests { + // Note: Real tests would require mocking the RepoClient + // or using an actual ADO connection (which requires credentials) +} diff --git a/toki-api/src/domain/search/source/mod.rs b/toki-api/src/domain/search/source/mod.rs new file mode 100644 index 00000000..feffcbb2 --- /dev/null +++ b/toki-api/src/domain/search/source/mod.rs @@ -0,0 +1,5 @@ +//! Document source implementations for fetching from Azure DevOps. + +mod ado; + +pub use ado::AdoDocumentSource; diff --git a/toki-api/src/domain/search/traits.rs b/toki-api/src/domain/search/traits.rs new file mode 100644 index 00000000..d0a4afd1 --- /dev/null +++ b/toki-api/src/domain/search/traits.rs @@ -0,0 +1,162 @@ +//! Trait definitions for search domain abstractions. +//! +//! These traits enable dependency injection and easy testing through mocking. + +use async_trait::async_trait; +use time::OffsetDateTime; + +use super::types::{ + ParsedQuery, PullRequestDocument, SearchDocument, SearchResult, SearchSource, + WorkItemDocument, +}; + +/// Error type for search operations. 
+#[allow(dead_code)] +#[derive(Debug, thiserror::Error)] +pub enum SearchError { + #[error("Embedding generation failed: {0}")] + EmbeddingError(String), + + #[error("Database error: {0}")] + DatabaseError(String), + + #[error("Source fetch error: {0}")] + SourceError(String), + + #[error("Configuration error: {0}")] + ConfigError(String), + + #[error("{0}")] + Other(String), +} + +impl From for SearchError { + fn from(e: sqlx::Error) -> Self { + SearchError::DatabaseError(e.to_string()) + } +} + +pub type Result = std::result::Result; + +/// Trait for text embedding generation. +/// +/// Abstracts the embedding provider (Gemini, OpenAI, etc.) for easy testing. +/// +/// # Example +/// +/// ```ignore +/// let embedder = GeminiEmbedder::new(api_key); +/// let embedding = embedder.embed("authentication system").await?; +/// assert_eq!(embedding.len(), 1536); // Gemini embedding dimensions +/// ``` +#[async_trait] +pub trait Embedder: Send + Sync { + /// Generate embedding for a single text. + async fn embed(&self, text: &str) -> Result>; + + /// Generate embeddings for multiple texts in a batch. + /// + /// Default implementation calls `embed` sequentially. + /// Implementations should override for better performance. + async fn embed_batch(&self, texts: &[&str]) -> Result>> { + let mut results = Vec::with_capacity(texts.len()); + for text in texts { + results.push(self.embed(text).await?); + } + Ok(results) + } + + /// Returns the embedding dimensions for this embedder. + #[allow(dead_code)] + fn dimensions(&self) -> usize; +} + +/// Trait for search document persistence and retrieval. +/// +/// Abstracts database operations for testing without a real database. +#[async_trait] +pub trait SearchRepository: Send + Sync { + /// Execute hybrid search (BM25 + vector) with filters. + /// + /// When `embedding` is `None`, only BM25 full-text search is used. + /// Returns results sorted by combined RRF score (or BM25 score if no embedding). 
+ async fn search( + &self, + query: &ParsedQuery, + embedding: Option<&[f32]>, + limit: i32, + ) -> Result>; + + /// Insert or update a single document. + #[allow(dead_code)] + async fn upsert_document(&self, doc: &SearchDocument) -> Result<()>; + + /// Insert or update multiple documents in a batch. + /// + /// Returns the number of documents successfully upserted. + async fn upsert_documents(&self, docs: &[SearchDocument]) -> Result; + + /// Delete a document by source type and ID. + /// + /// Returns true if a document was deleted. + #[allow(dead_code)] + async fn delete_document(&self, source_type: SearchSource, source_id: &str) -> Result; + + /// Delete all documents that haven't been indexed since the given time. + /// + /// Returns the number of documents deleted. + async fn delete_stale_documents(&self, older_than: OffsetDateTime) -> Result; + + /// Get a document by source type and ID. + #[allow(dead_code)] + async fn get_document( + &self, + source_type: SearchSource, + source_id: &str, + ) -> Result>; + + /// Get total document count, optionally filtered by source type. + #[allow(dead_code)] + async fn count(&self, source_type: Option) -> Result; +} + +/// Trait for fetching documents from Azure DevOps. +/// +/// Abstracts ADO API calls for testing without real network requests. +#[async_trait] +pub trait DocumentSource: Send + Sync { + /// Fetch pull requests from a project. + /// + /// Should include active PRs and recently closed ones. + async fn fetch_pull_requests( + &self, + org: &str, + project: &str, + ) -> Result>; + + /// Fetch work items from a project. + /// + /// If `since` is provided, only fetch items updated after that time. 
+ async fn fetch_work_items( + &self, + org: &str, + project: &str, + since: Option, + ) -> Result>; +} + +#[cfg(test)] +mod tests { + use super::*; + + // Verify traits are object-safe (can be used as trait objects) + fn _assert_embedder_object_safe(_: &dyn Embedder) {} + fn _assert_repository_object_safe(_: &dyn SearchRepository) {} + fn _assert_source_object_safe(_: &dyn DocumentSource) {} + + #[test] + fn search_error_from_sqlx() { + // Just verify the conversion compiles + let _: SearchError = SearchError::DatabaseError("test".to_string()); + } +} diff --git a/toki-api/src/domain/search/types.rs b/toki-api/src/domain/search/types.rs new file mode 100644 index 00000000..be6dea36 --- /dev/null +++ b/toki-api/src/domain/search/types.rs @@ -0,0 +1,225 @@ +//! Core types for the search domain. + +use serde::{Deserialize, Serialize}; +use sqlx::Type; +use time::OffsetDateTime; + +/// Source type for searchable documents. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Type)] +#[sqlx(type_name = "search_source", rename_all = "snake_case")] +#[serde(rename_all = "snake_case")] +pub enum SearchSource { + Pr, + WorkItem, +} + +impl std::fmt::Display for SearchSource { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SearchSource::Pr => write!(f, "pr"), + SearchSource::WorkItem => write!(f, "work_item"), + } + } +} + +/// A document to be indexed for search. 
+#[derive(Debug, Clone)] +pub struct SearchDocument { + /// Source type (PR or work item) + pub source_type: SearchSource, + /// Unique identifier: "org/project/repo/123" for PR, "org/project/123" for WI + pub source_id: String, + /// PR number or work item ID + pub external_id: i32, + /// Document title + pub title: String, + /// Document description/body + pub description: Option, + /// Combined searchable content (description + comments + commits) + pub content: Option, + /// Azure DevOps organization + pub organization: String, + /// Azure DevOps project + pub project: String, + /// Repository name (None for work items) + pub repo_name: Option, + /// Status: 'active', 'completed', 'abandoned' for PRs; 'New', 'Active', 'Closed' for WI + pub status: String, + /// Author user ID + pub author_id: Option, + /// Author display name + pub author_name: Option, + /// Assigned user ID + pub assigned_to_id: Option, + /// Assigned user display name + pub assigned_to_name: Option, + /// Priority (1-4 for work items, None for PRs) + pub priority: Option, + /// Item type: 'Bug', 'Task', 'User Story' for WI; None for PRs + pub item_type: Option, + /// Whether PR is a draft + pub is_draft: bool, + /// Creation timestamp + pub created_at: OffsetDateTime, + /// Last update timestamp + pub updated_at: OffsetDateTime, + /// Closed/completed timestamp + pub closed_at: Option, + /// Direct URL to the item + pub url: String, + /// Parent work item ID (for hierarchical items) + pub parent_id: Option, + /// Work items linked to this PR + pub linked_work_items: Vec, + /// Pre-computed embedding vector (1536 dimensions for Gemini) + pub embedding: Option>, +} + +/// Result from a search query. 
+#[derive(Debug, Clone, Serialize)] +pub struct SearchResult { + pub id: i32, + pub source_type: SearchSource, + pub source_id: String, + pub external_id: i32, + pub title: String, + pub description: Option, + pub status: String, + pub priority: Option, + pub item_type: Option, + pub author_name: Option, + pub url: String, + #[serde(with = "time::serde::rfc3339")] + pub created_at: OffsetDateTime, + #[serde(with = "time::serde::rfc3339")] + pub updated_at: OffsetDateTime, + /// Combined relevance score (higher is better) + pub score: f64, +} + +/// Parsed search query with extracted filters. +#[derive(Debug, Clone, Default)] +pub struct ParsedQuery { + /// Remaining search text after filter extraction + pub search_text: String, + /// Extracted filters + pub filters: SearchFilters, +} + +/// Filters extracted from a search query. +#[allow(dead_code)] +#[derive(Debug, Clone, Default)] +pub struct SearchFilters { + /// Filter by source type + pub source_type: Option, + /// Filter by organization + pub organization: Option, + /// Filter by project + pub project: Option, + /// Filter by repository name + pub repo_name: Option, + /// Filter by status (multiple allowed) + pub status: Option>, + /// Filter by priority (multiple allowed) + pub priority: Option>, + /// Filter by item type (multiple allowed) + pub item_type: Option>, + /// Filter by author name/ID + pub author: Option, + /// Filter by assignee name/ID + pub assigned_to: Option, + /// Filter for draft PRs only + pub is_draft: Option, + /// Filter: created after this date + pub created_after: Option, + /// Filter: created before this date + pub created_before: Option, + /// Filter: updated after this date + pub updated_after: Option, +} + +/// Statistics from a sync operation. 
+#[derive(Debug, Clone, Default)]
+pub struct SyncStats {
+    pub prs_indexed: usize,
+    pub work_items_indexed: usize,
+    pub documents_deleted: usize,
+    pub errors: usize,
+}
+
+impl SyncStats {
+    /// Total number of documents indexed (PRs + work items).
+    #[allow(dead_code)]
+    pub fn total_indexed(&self) -> usize {
+        self.prs_indexed + self.work_items_indexed
+    }
+}
+
+/// Intermediate type for PR data from ADO.
+#[allow(dead_code)]
+#[derive(Debug, Clone)]
+pub struct PullRequestDocument {
+    pub id: i32,
+    pub title: String,
+    pub description: Option<String>,
+    pub organization: String,
+    pub project: String,
+    pub repo_name: String,
+    pub status: String,
+    // NOTE(review): restored as Option<String> (ADO identity IDs are GUID strings) — confirm against callers
+    pub author_id: Option<String>,
+    pub author_name: Option<String>,
+    pub is_draft: bool,
+    pub created_at: OffsetDateTime,
+    pub updated_at: OffsetDateTime,
+    pub closed_at: Option<OffsetDateTime>,
+    pub url: String,
+    /// Combined text from commits and comments
+    pub additional_content: String,
+    /// Linked work item IDs
+    pub linked_work_items: Vec<i32>,
+}
+
+/// Intermediate type for work item data from ADO.
+#[allow(dead_code)]
+#[derive(Debug, Clone)]
+pub struct WorkItemDocument {
+    pub id: i32,
+    pub title: String,
+    pub description: Option<String>,
+    pub organization: String,
+    pub project: String,
+    pub status: String,
+    pub author_id: Option<String>,
+    pub author_name: Option<String>,
+    pub assigned_to_id: Option<String>,
+    pub assigned_to_name: Option<String>,
+    pub priority: Option<i32>,
+    pub item_type: String,
+    pub created_at: OffsetDateTime,
+    pub updated_at: OffsetDateTime,
+    pub closed_at: Option<OffsetDateTime>,
+    pub url: String,
+    pub parent_id: Option<i32>,
+    /// Combined text from comments
+    pub additional_content: String,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn search_source_display() {
+        assert_eq!(SearchSource::Pr.to_string(), "pr");
+        assert_eq!(SearchSource::WorkItem.to_string(), "work_item");
+    }
+
+    #[test]
+    fn sync_stats_total() {
+        let stats = SyncStats {
+            prs_indexed: 10,
+            work_items_indexed: 20,
+            ..Default::default()
+        };
+        assert_eq!(stats.total_indexed(), 30);
+    }
+}
diff --git a/toki-api/src/router.rs
b/toki-api/src/router.rs index cec84cc1..304248f8 100644 --- a/toki-api/src/router.rs +++ b/toki-api/src/router.rs @@ -36,7 +36,8 @@ pub async fn create( .nest("/differs", routes::differs::router()) .nest("/repositories", routes::repositories::router()) .nest("/notifications", routes::notifications::router()) - .nest("/milltime", routes::milltime::router()); + .nest("/milltime", routes::milltime::router()) + .nest("/search", routes::search::router()); // If authentication is enabled, wrap the app with the auth middleware let app_with_auth = if config.application.disable_auth { @@ -63,6 +64,10 @@ pub async fn create( #[cfg(not(debug_assertions))] app_state.start_all_differs().await; + // Start search indexer background task (if in production) + #[cfg(not(debug_assertions))] + app_state.start_search_indexer(); + // Finally, wrap the app with tracing layer, state and CORS let cors = CorsLayer::new() .allow_methods([Method::GET, Method::POST, Method::PUT, Method::DELETE]) diff --git a/toki-api/src/routes/mod.rs b/toki-api/src/routes/mod.rs index ea1bebd5..3829a05e 100644 --- a/toki-api/src/routes/mod.rs +++ b/toki-api/src/routes/mod.rs @@ -3,3 +3,4 @@ pub(crate) mod milltime; pub(crate) mod notifications; pub(crate) mod pull_requests; pub(crate) mod repositories; +pub(crate) mod search; diff --git a/toki-api/src/routes/search.rs b/toki-api/src/routes/search.rs new file mode 100644 index 00000000..9b0de879 --- /dev/null +++ b/toki-api/src/routes/search.rs @@ -0,0 +1,41 @@ +use axum::{ + extract::{Query, State}, + http::StatusCode, + routing::get, + Json, Router, +}; +use serde::Deserialize; +use tracing::instrument; + +use crate::{ + domain::search::SearchResult, + AppState, +}; + +pub fn router() -> Router { + Router::new().route("/", get(search)) +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +struct SearchQuery { + q: String, + limit: Option, +} + +#[instrument(name = "GET /search", skip(app_state))] +async fn search( + 
State(app_state): State, + Query(query): Query, +) -> Result>, (StatusCode, String)> { + let search_service = app_state + .search_service() + .ok_or((StatusCode::SERVICE_UNAVAILABLE, "Search service not available".to_string()))?; + + let results = search_service + .search(&query.q, query.limit) + .await + .map_err(|err| (StatusCode::INTERNAL_SERVER_ERROR, err.to_string()))?; + + Ok(Json(results)) +}