diff --git a/.env.example b/.env.example index 9e0bc5c..c1135d7 100644 --- a/.env.example +++ b/.env.example @@ -30,6 +30,11 @@ COSINE_THRESHOLD=0.2 MAX_CONCURRENT_FILES=1 MAX_WORKERS=1 +# BM25 Configuration +BM25_ENABLED=true +BM25_TEXT_CONFIG=english +BM25_RRF_K=60 + # Server Configuration MCP_TRANSPORT=sse ALLOWED_ORIGINS=["*"] diff --git a/.gitignore b/.gitignore index 4948b94..bff2863 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,5 @@ trivy-report.json trivy-report-fixed.json coverage.xml .ruff_cache -.pytest_cache \ No newline at end of file +.pytest_cache +trivy-report-current.json \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index d39e0fe..ded5beb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,10 +34,6 @@ COPY --from=builder /app/.venv /app/.venv COPY src/ /app/src/ COPY .env.example /app/.env -# Patch docling to fix TXT file format detection (PR #3161 incomplete) -COPY patch_docling_txt.py /tmp/patch_docling_txt.py -RUN /app/.venv/bin/python /tmp/patch_docling_txt.py && rm /tmp/patch_docling_txt.py - # Set Python path to include src directory ENV PYTHONPATH=/app/src:$PYTHONPATH ENV PATH="/app/.venv/bin:$PATH" diff --git a/Dockerfile.db b/Dockerfile.db index 9ac4947..716d7bf 100644 --- a/Dockerfile.db +++ b/Dockerfile.db @@ -10,13 +10,24 @@ RUN apt-get update && apt-get install -y \ bison \ && rm -rf /var/lib/apt/lists/* -# Install Apache AGE (v1.6.0 for PG17) and cleanup +# Install Apache AGE (v1.6.0 for PG17) RUN cd /tmp && \ git clone --branch PG17/v1.6.0-rc0 https://github.com/apache/age.git && \ cd age && \ make PG_CONFIG=/usr/lib/postgresql/17/bin/pg_config install || \ - (echo "Failed to build AGE" && exit 1) && \ - rm -rf /tmp/age + (echo "Failed to build AGE" && exit 1) + +# Install pg_textsearch extension for BM25 full-text search +RUN cd /tmp && \ + git clone https://github.com/timescale/pg_textsearch.git && \ + cd pg_textsearch && \ + make PG_CONFIG=/usr/lib/postgresql/17/bin/pg_config || \ + (echo "Failed to build pg_textsearch" && exit 1) && \ + make PG_CONFIG=/usr/lib/postgresql/17/bin/pg_config install || \ + (echo "Failed to install pg_textsearch" && exit 1) + +# Cleanup build artifacts +RUN rm -rf /tmp/age /tmp/pg_textsearch # Switch back to non-root user for security -USER postgres +USER postgres \ No newline at end of file diff --git a/README.md b/README.md index da20930..2261d99 100644 --- a/README.md +++ b/README.md @@ -7,42 +7,47 @@ Multi-modal RAG service exposing a REST API and MCP server for document indexing ``` Clients (REST / MCP / Claude) - | - +-----------------------+ - | FastAPI App | - +-----------+-----------+ | - +---------------+---------------+ - | | - Application Layer MCP Tools - +------------------------------+ (FastMCP) - | api/ | | - | indexing_routes.py | | - | query_routes.py | | - | health_routes.py | | - | use_cases/ | | - | IndexFileUseCase | | - | IndexFolderUseCase | | - | requests/ responses/ | | - +------------------------------+ | - | | | - v v v - Domain Layer (ports) - +--------------------------------------+ - | RAGEnginePort StoragePort | - +--------------------------------------+ - | | - v v - Infrastructure Layer (adapters) - +--------------------------------------+ - | LightRAGAdapter MinioAdapter | - | (RAGAnything) (minio-py) | - +--------------------------------------+ - | | - v v - PostgreSQL MinIO - (pgvector + (object - Apache AGE) storage) + +-----------------------+ + | FastAPI App | + +-----------+-----------+ + | + +---------------+---------------+ + | | + Application Layer MCP Tools + +------------------------------+ (FastMCP) + | api/ | | + | indexing_routes.py | | + | query_routes.py | | + | health_routes.py | | + | use_cases/ | | + | IndexFileUseCase | | + | IndexFolderUseCase | | + | QueryUseCase | | + | requests/ responses/ | | + +------------------------------+ | + | | | | + v v v v + Domain Layer (ports) + +------------------------------------------+ + | RAGEnginePort StoragePort BM25EnginePort| + +------------------------------------------+ + | | | + v v v + Infrastructure Layer (adapters) + +------------------------------------------+ + | LightRAGAdapter MinioAdapter | + | (RAGAnything) (minio-py) | + | | + | PostgresBM25Adapter RRFCombiner | + | (pg_textsearch) (hybrid+ fusion) | + +------------------------------------------+ + | | | + v v v + PostgreSQL MinIO + (pgvector + (object + Apache AGE storage) + pg_textsearch) ``` ## Prerequisites @@ -220,8 +225,97 @@ Response (`200 OK`): |-------|------|----------|---------|-------------| | `working_dir` | string | yes | -- | RAG workspace directory for this project | | `query` | string | yes | -- | The search query | -| `mode` | string | no | `"naive"` | Search mode (see Query Modes below) | -| `top_k` | integer | no | `10` | Number of chunks to retrieve | +| `mode` | string | no | `"naive"` | Search mode: `naive`, `local`, `global`, `hybrid`, `hybrid+`, `mix`, `bm25`, `bypass` | + +#### BM25 query mode + +Returns results ranked by PostgreSQL full-text search using `pg_textsearch`. Each chunk includes a `score` field with the BM25 relevance score. + +```bash +curl -X POST http://localhost:8000/api/v1/query \ + -H "Content-Type: application/json" \ + -d '{ + "working_dir": "project-alpha", + "query": "quarterly revenue growth", + "mode": "bm25", + "top_k": 10 + }' +``` + +Response (`200 OK`): + +```json +{ + "status": "success", + "message": "", + "data": { + "entities": [], + "relationships": [], + "chunks": [ + { + "chunk_id": "abc123", + "content": "Quarterly revenue grew 12% year-over-year...", + "file_path": "reports/financials-q4.pdf", + "score": 3.456, + "metadata": {} + } + ], + "references": [] + }, + "metadata": { + "query_mode": "bm25", + "total_results": 10 + } +} +``` + +#### Hybrid+ query mode + +Runs BM25 and vector search in parallel, then merges results using Reciprocal Rank Fusion (RRF). Each chunk includes `bm25_rank`, `vector_rank`, and `combined_score` fields. + +```bash +curl -X POST http://localhost:8000/api/v1/query \ + -H "Content-Type: application/json" \ + -d '{ + "working_dir": "project-alpha", + "query": "quarterly revenue growth", + "mode": "hybrid+", + "top_k": 10 + }' +``` + +Response (`200 OK`): + +```json +{ + "status": "success", + "message": "", + "data": { + "entities": [], + "relationships": [], + "chunks": [ + { + "chunk_id": "abc123", + "content": "Quarterly revenue grew 12% year-over-year...", + "file_path": "reports/financials-q4.pdf", + "score": 0.0328, + "bm25_rank": 1, + "vector_rank": 3, + "combined_score": 0.0328, + "metadata": {} + } + ], + "references": [] + }, + "metadata": { + "query_mode": "hybrid+", + "total_results": 10, + "rrf_k": 60 + } +} +``` + +The `combined_score` is the sum of `bm25_score` and `vector_score`, each computed as `1 / (k + rank)`. Results are sorted by `combined_score` descending. A chunk that appears in both result sets will have a higher combined score than one that appears in only one. ## MCP Server @@ -233,7 +327,7 @@ The MCP server is mounted at `/mcp` and exposes a single tool: `query_knowledge_ |-----------|------|---------|-------------| | `working_dir` | string | required | RAG workspace directory for this project | | `query` | string | required | The search query | -| `mode` | string | `"naive"` | Search mode: `naive`, `local`, `global`, `hybrid`, `mix`, `bypass` | +| `mode` | string | `"naive"` | Search mode: `naive`, `local`, `global`, `hybrid`, `hybrid+`, `mix`, `bm25`, `bypass` | | `top_k` | integer | `10` | Number of chunks to retrieve | ### Transport modes @@ -321,6 +415,16 @@ All configuration is via environment variables, loaded through Pydantic Settings | `ENABLE_TABLE_PROCESSING` | `true` | Process tables during indexing | | `ENABLE_EQUATION_PROCESSING` | `true` | Process equations during indexing | +### BM25 (`BM25Config`) + +| Variable | Default | Description | +|----------|---------|-------------| +| `BM25_ENABLED` | `true` | Enable BM25 full-text search | +| `BM25_TEXT_CONFIG` | `english` | PostgreSQL text search configuration | +| `BM25_RRF_K` | `60` | RRF constant K for hybrid search (must be >= 1) | + +When `BM25_ENABLED` is `false` or the pg_textsearch extension is not available, `hybrid+` mode falls back to `naive` (vector-only) and `bm25` mode returns an error. + ### MinIO (`MinioConfig`) | Variable | Default | Description | @@ -339,7 +443,9 @@ All configuration is via environment variables, loaded through Pydantic Settings | `local` | Entity-focused search using the knowledge graph | | `global` | Relationship-focused search across the knowledge graph | | `hybrid` | Combines local + global strategies | +| `hybrid+` | Parallel BM25 + vector search using Reciprocal Rank Fusion (RRF). Best of both worlds | | `mix` | Knowledge graph + vector chunks combined | +| `bm25` | BM25 full-text search only. PostgreSQL pg_textsearch | | `bypass` | Direct LLM query without retrieval | ## Development @@ -361,6 +467,22 @@ docker compose logs -f raganything-api # Follow API logs docker compose down -v # Stop and remove volumes ``` +## Database Migrations + +Alembic migrations run automatically at startup via the `db_lifespan` context manager in `main.py`. The migration state is tracked in the `raganything_alembic_version` table, which is separate from the `composable-agents` Alembic table to avoid conflicts. + +The initial migration (`001_add_bm25_support`) creates the `chunks` table with a `tsvector` column for full-text search, GIN and BM25 indexes, and an auto-update trigger. + +### Production requirements + +The PostgreSQL server must have the `pg_textsearch` extension installed and loaded. In production, this requires: + +1. **Dockerfile.db** builds a custom PostgreSQL image that compiles `pg_textsearch` from source (along with `pgvector` and `Apache AGE`). + +2. **docker-compose.yml** must configure `shared_preload_libraries=pg_textsearch` for the `bricks-db` service. The local dev `docker-compose.yml` in this repository includes this by default. + +3. The Alembic migration `001_add_bm25_support` will fail if `pg_textsearch` is not available. Ensure the database image is built from `Dockerfile.db` and the shared library is preloaded. + ## Project Structure ``` @@ -374,25 +496,35 @@ src/ ports/ rag_engine.py -- RAGEnginePort (abstract) storage_port.py -- StoragePort (abstract) + bm25_engine.py -- BM25EnginePort (abstract) application/ api/ health_routes.py -- GET /health indexing_routes.py -- POST /file/index, /folder/index - query_routes.py -- POST /query - mcp_tools.py -- MCP tool: query_knowledge_base + query_routes.py -- POST /query + mcp_tools.py -- MCP tool: query_knowledge_base requests/ indexing_request.py -- IndexFileRequest, IndexFolderRequest - query_request.py -- QueryRequest + query_request.py -- QueryRequest, QueryMode responses/ query_response.py -- QueryResponse, QueryDataResponse use_cases/ index_file_use_case.py -- Downloads from MinIO, indexes single file index_folder_use_case.py -- Downloads from MinIO, indexes folder + query_use_case.py -- Query with bm25/hybrid+ support infrastructure/ rag/ lightrag_adapter.py -- LightRAGAdapter (RAGAnything/LightRAG) storage/ minio_adapter.py -- MinioAdapter (minio-py client) + bm25/ + pg_textsearch_adapter.py -- PostgresBM25Adapter (pg_textsearch) + hybrid/ + rrf_combiner.py -- RRFCombiner (Reciprocal Rank Fusion) + alembic/ + env.py -- Alembic migration environment (async) + versions/ + 001_add_bm25_support.py -- BM25 table, indexes, triggers ``` ## License diff --git a/TXT_FILE_TESTS_SUMMARY.md b/TXT_FILE_TESTS_SUMMARY.md deleted file mode 100644 index 024ccfe..0000000 --- a/TXT_FILE_TESTS_SUMMARY.md +++ /dev/null @@ -1,201 +0,0 @@ -# TXT File Support Tests - Summary - -## Overview -Comprehensive unit and integration tests for TXT file support in the mcp-raganything project. These tests verify that the system correctly handles various TXT file scenarios using the existing docling parser (version 2.83.0). - -**Key Point:** No code changes were needed - docling handles TXT files automatically via `parse_method="txt"`. - -## Tests Added - -### 1. Unit Tests in `test_lightrag_adapter.py` (5 new tests) - -#### `test_index_txt_file_success` -- **Purpose:** Verify successful .txt file indexing -- **What it tests:** - - Creates a temporary .txt file - - Mocks RAGAnything.process_document_complete - - Verifies FileIndexingResult has SUCCESS status - - Confirms parse_method="txt" is passed correctly - -#### `test_index_text_extension_success` -- **Purpose:** Test .text extension (alternative TXT format) -- **What it tests:** - - Creates a file with .text extension - - Verifies successful processing - - Confirms file_name is preserved correctly - -#### `test_index_empty_txt_file` -- **Purpose:** Edge case - empty text file -- **What it tests:** - - Creates an empty .txt file - - Verifies processing succeeds (edge case) - - Confirms process_document_complete is still called - -#### `test_index_large_txt_file` -- **Purpose:** Large file handling -- **What it tests:** - - Creates a ~500KB text file - - Verifies efficient processing - - Checks file path is passed correctly to docling - -#### `test_index_txt_with_various_encodings` -- **Purpose:** Encoding support -- **What it tests:** - - UTF-8 with Unicode characters (café, ñ, 北京) - - UTF-16 encoded files (你好) - - ASCII-only content - - Verifies all three are processed successfully - ---- - -### 2. Integration Tests in `test_index_file_use_case.py` (5 new tests) - -#### `test_index_txt_file_from_minio` -- **Purpose:** End-to-end test with mocked MinIO -- **What it tests:** - - Mocks storage.get_object for TXT content - - Verifies file download from MinIO - - Confirms file written to correct location - - Checks FileIndexingResult returned correctly - -#### `test_index_folder_with_txt_files` -- **Purpose:** Folder indexing including .txt files -- **What it tests:** - - Mocks folder with mixed file types (.txt, .pdf) - - Verifies all files are downloaded - - Checks FolderIndexingResult statistics - -#### `test_index_txt_file_with_nested_path` -- **Purpose:** Nested directory handling -- **What it tests:** - - .txt file in deep nested path - - Confirms directories are created - - Verifies correct file path handling - -#### `test_index_multiple_txt_files_sequentially` -- **Purpose:** Multiple file processing -- **What it tests:** - - Sequential indexing of multiple .txt files - - Chapter1.txt, Chapter2.txt, Chapter3.txt - - Verifies each file is processed independently - -#### `test_index_txt_with_special_characters_in_content` -- **Purpose:** Special character handling -- **What it tests:** - - Emojis 🎉 - - Quotes and newlines - - Tab characters - - Verifies content preservation through download and processing - ---- - -### 3. Integration Tests in `test_index_folder_use_case.py` (4 new tests) - -#### `test_index_folder_with_txt_files` -- **Purpose:** Folder with .txt files from MinIO -- **What it tests:** - - Downloads all .txt files from storage - - Verifies correct MinIO bucket/key usage - - Checks folder statistics - -#### `test_index_folder_with_file_extensions_filter_txt` -- **Purpose:** Filter by .txt extension -- **What it tests:** - - Uses file_extensions=[".txt"] filter - - Mocks storage with mixed files (.txt, .pdf, .xlsx) - - Verifies only .txt files are downloaded - - Confirms non-TXT files are skipped - -#### `test_index_folder_with_txt_and_other_extensions` -- **Purpose:** Mixed file extensions including .txt -- **What it tests:** - - file_extensions=[".txt", ".text"] - - Verifies both extensions are recognized - - Confirms .pdf, .xlsx are excluded - -#### `test_index_folder_recursive_with_txt_files` -- **Purpose:** Recursive folder indexing with .txt files -- **What it tests:** - - Non-recursive vs recursive mode - - Nested .txt files in subdirectories - - Verifies recursive flag is passed correctly - - Checks all nested files are processed - ---- - -## Test Patterns Followed - -### Real Implementation Pattern -```python -# ✅ Real adapters/services - for internal components -from infrastructure.rag.lightrag_adapter import LightRAGAdapter - -# ✅ Mocks - only for external boundaries -@patch("infrastructure.rag.lightrag_adapter.RAGAnything") -def test_index_txt_file_success(self, mock_rag_cls, ...): - adapter = LightRAGAdapter(llm_config, rag_config) - # Test with real adapter, mocked external RAGAnything -``` - -### Idempotent Tests -- Each test creates its own temporary files using `tmp_path` fixture -- Tests don't depend on existing data -- Tests are independent and isolated - -### AAA Pattern -```python -async def test_example(self, use_case, tmp_path): - # Arrange - txt_content = b"sample text" - use_case.storage.get_object.return_value = txt_content - - # Act - result = await use_case.execute(file_name="test.txt", ...) - - # Assert - assert result.status == IndexingStatus.SUCCESS -``` - ---- - -## Test Execution - -```bash -# Run all TXT-related tests -uv run python -m pytest tests/unit/ -v --no-cov -k "txt" - -# Run specific test file -uv run python -m pytest tests/unit/test_lightrag_adapter.py::TestLightRAGAdapter::test_index_txt_file_success -v - -# Run all tests -uv run python -m pytest tests/unit/ -v --no-cov -``` - ---- - -## Results -**Total Tests:** 73 (all passing) -- **New tests added:** 14 -- **Existing tests:** 59 (all still passing) - ---- - -## Key Insights - -1. **No Code Changes Needed:** Docling 2.83.0 handles TXT files automatically via `parse_method="txt"` - -2. **Proper Mocking:** Tests mock RAGAnything (external boundary) but use real LightRAGAdapter implementation - -3. **Encoding Support:** Tests verify UTF-8, UTF-16, and ASCII encoding handling - -4. **File System Integration:** Tests use `tmp_path` fixture for safe temporary file operations - -5. **Extension Handling:** Tests cover both `.txt` and `.text` extensions - -6. **Error Cases:** Tests include edge cases like empty files and large files - ---- - -## Conclusion - -All 14 new tests pass successfully alongside the existing 59 tests, providing comprehensive coverage for TXT file support without requiring any code changes to the production codebase. \ No newline at end of file diff --git a/patch_docling_txt.py b/patch_docling_txt.py deleted file mode 100644 index 4874a99..0000000 --- a/patch_docling_txt.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python3 -""" -Fix docling's _guess_format function to properly detect .txt files as MD format. - -Issue: docling's format detection returns None for .txt files instead of InputFormat.MD -Workaround: Patch the _guess_format function to map .txt/.text extensions to MD format. - -Based on PR #3161: https://github.com/docling-project/docling/pull/3161 -""" - -import sys - -def patch_docling(): - """Apply monkey-patch to docling's format detection.""" - print("📄 Patching docling format detection for TXT support...") - - # Import inside function to ensure packages are available - from docling.datamodel.document import InputFormat, FormatToExtensions - - # Add TXT extensions to MD format - txt_extensions = ['txt', 'text', 'qmd', 'rmd', 'Rmd'] - - # Get current MD extensions - current_md_extensions = FormatToExtensions.get(InputFormat.MD, []) - - # Add new extensions if not already present - for ext in txt_extensions: - if ext not in current_md_extensions: - current_md_extensions.append(ext) - - FormatToExtensions[InputFormat.MD] = current_md_extensions - - print(f"✅ Added TXT extensions to MD format: {txt_extensions}") - - # Now patch _guess_format - import docling.datamodel.document as doc_module - - # Get original function - if hasattr(doc_module, '_guess_format'): - original_guess = doc_module._guess_format - else: - print("⚠️ _guess_format not found, skipping monkey-patch") - return True - - def patched_guess_format(file_path, allowed_formats=None): - """Version of _guess_format that detects .txt files as MD.""" - from pathlib import Path - from docling.datamodel.document import InputFormat - - path = Path(file_path) - ext = path.suffix.lower().lstrip('.') - - # Map TXT extensions to MD format - if ext in ['txt', 'text', 'qmd', 'rmd', 'Rmd']: - result = InputFormat.MD - if allowed_formats is None or result in allowed_formats: - return result - - # Call original for other formats - return original_guess(file_path, allowed_formats) - - # Apply patch - doc_module._guess_format = patched_guess_format - - print("✅ Monkey-patched _guess_format to handle TXT files") - return True - -if __name__ == "__main__": - try: - patch_docling() - print("✅ Docling TXT support patch applied successfully!") - sys.exit(0) - except Exception as e: - print(f"❌ Failed to apply patch: {e}") - import traceback - traceback.print_exc() - sys.exit(1) \ No newline at end of file diff --git a/patch_raganything.py b/patch_raganything.py deleted file mode 100644 index 9960690..0000000 --- a/patch_raganything.py +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/env python3 -""" -Patch RAGAnything parser.py to support TXT files. -This script patches the installed RAGAnything library to accept .txt, .text, and .md files. - -RAGAnything's DoclingParser rejects TXT files even though docling 2.84.0 supports them. -This patch routes TXT files to the existing office document parser which calls docling. -""" - -import sys -from pathlib import Path - -def patch_raganything(): - """Patch RAGAnything's DoclingParser to support TXT files.""" - - # Find raganything installation - try: - import raganything - parser_file = Path(raganything.__file__).parent / "parser.py" - except ImportError: - print("❌ RAGAnything not found") - return False - - if not parser_file.exists(): - print(f"❌ Parser file not found: {parser_file}") - return False - - print(f"📄 Patching: {parser_file}") - - with open(parser_file, 'r') as f: - content = f.read() - - # Check if already patched - if 'TXT files are supported by docling' in content: - print("✅ Already patched!") - return True - - # Find and patch the format check in DoclingParser.parse_document - old_code = """ elif ext in self.HTML_FORMATS: - return self.parse_html(file_path, output_dir, lang, **kwargs) - else: - raise ValueError( - f"Unsupported file format: {ext}. " - f"Docling only supports PDF files, Office formats ({', '.join(self.OFFICE_FORMATS)}) " - f"and HTML formats ({', '.join(self.HTML_FORMATS)})" - )""" - - new_code = """ elif ext in self.HTML_FORMATS: - return self.parse_html(file_path, output_dir, lang, **kwargs) - elif ext in {".txt", ".text", ".md"}: - # TXT files are supported by docling via MarkdownDocumentBackend (PR #3161) - # Docling 2.84.0+ handles these natively - treat as MD and route to docling - # Use parse_office_doc which calls DocumentConverter.convert() - return self.parse_office_doc(file_path, output_dir, lang, **kwargs) - else: - raise ValueError( - f"Unsupported file format: {ext}. " - f"Docling only supports PDF files, Office formats ({', '.join(self.OFFICE_FORMATS)}) " - f"and HTML formats ({', '.join(self.HTML_FORMATS)})" - )""" - - if old_code not in content: - print("❌ Patch pattern not found - RAGAnything may have changed") - print(" Searching for alternative pattern...") - - # Try alternative pattern - alt_old = "elif ext in self.HTML_FORMATS:" - alt_new = """elif ext in self.HTML_FORMATS: - return self.parse_html(file_path, output_dir, lang, **kwargs) - elif ext in {".txt", ".text", ".md"}: - # TXT files supported by docling via MarkdownDocumentBackend - return self.parse_office_doc(file_path, output_dir, lang, **kwargs) - elif ext in self.HTML_FORMATS:""" - - if alt_old in content: - print(" Found alternative pattern, applying patch...") - content = content.replace(alt_old, alt_new, 1) - else: - print("❌ Could not find any pattern to patch") - return False - else: - content = content.replace(old_code, new_code) - - # Write patched content - with open(parser_file, 'w') as f: - f.write(content) - - print("✅ RAGAnything patched successfully!") - print(" TXT files (.txt, .text, .md) are now supported") - return True - -if __name__ == "__main__": - success = patch_raganything() - sys.exit(0 if success else 1) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 14e21c0..829ded9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ readme = "README.md" requires-python = ">=3.13" dependencies = [ "aiofiles>=24.1.0", + "alembic>=1.13.0", "asyncpg>=0.31.0", "docling>=2.84.0", "fastapi>=0.124.0", diff --git a/src/alembic.ini b/src/alembic.ini new file mode 100644 index 0000000..86d775d --- /dev/null +++ b/src/alembic.ini @@ -0,0 +1,37 @@ +[alembic] +script_location = %(here)s/alembic +prepend_sys_path = . + +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S \ No newline at end of file diff --git a/src/alembic/env.py b/src/alembic/env.py new file mode 100644 index 0000000..e12e0c7 --- /dev/null +++ b/src/alembic/env.py @@ -0,0 +1,92 @@ +"""Alembic migration environment for asyncpg (no SQLAlchemy models).""" + +import asyncio +from logging.config import fileConfig + +from sqlalchemy import pool +from sqlalchemy.ext.asyncio import async_engine_from_config + +from alembic import context +from config import DatabaseConfig + +config = context.config + +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# No models - using raw SQL migrations with op.execute() +target_metadata = None + + +VERSION_TABLE = "raganything_alembic_version" + + +def get_url() -> str: + """Build the async database URL from application settings. + + Returns the asyncpg URL for async engine creation. + """ + db_config = DatabaseConfig() + url = db_config.DATABASE_URL + if "+asyncpg" not in url: + url = url.replace("postgresql://", "postgresql+asyncpg://") + return url + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + Configures the context with just a URL and not an Engine. + Calls to context.execute() emit the given string to the script output. + """ + url = get_url() + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + version_table=VERSION_TABLE, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def do_run_migrations(connection) -> None: + """Run migrations within a synchronous connection callback.""" + context.configure( + connection=connection, + target_metadata=target_metadata, + version_table=VERSION_TABLE, + ) + + with context.begin_transaction(): + context.run_migrations() + + +async def run_async_migrations() -> None: + """Run migrations in 'online' mode with an async engine.""" + configuration = config.get_section(config.config_ini_section, {}) + configuration["sqlalchemy.url"] = get_url() + + connectable = async_engine_from_config( + configuration, + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + async with connectable.connect() as connection: + await connection.run_sync(do_run_migrations) + + await connectable.dispose() + + +def run_migrations_online() -> None: + """Run migrations in 'online' mode.""" + asyncio.run(run_async_migrations()) + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/src/alembic/versions/001_add_bm25_support.py b/src/alembic/versions/001_add_bm25_support.py new file mode 100644 index 0000000..d25aa2f --- /dev/null +++ b/src/alembic/versions/001_add_bm25_support.py @@ -0,0 +1,67 @@ +"""Add BM25 support via pg_textsearch + +Revision ID: 001 +Revises: +Create Date: 2026-04-07 + +""" + +from collections.abc import Sequence + +from alembic import op + +revision: str = "001" +down_revision: str | None = None +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Add BM25 full-text search to lightrag_doc_chunks.""" + op.execute("CREATE EXTENSION IF NOT EXISTS pg_textsearch") + + op.execute( + "ALTER TABLE lightrag_doc_chunks ADD COLUMN IF NOT EXISTS content_tsv tsvector" + ) + + op.execute( + "CREATE INDEX IF NOT EXISTS idx_lightrag_chunks_content_tsv ON lightrag_doc_chunks USING GIN(content_tsv)" + ) + + op.execute( + """ + CREATE OR REPLACE FUNCTION update_chunks_tsv() + RETURNS TRIGGER AS $$ + BEGIN + NEW.content_tsv := to_tsvector('english', COALESCE(NEW.content, '')); + RETURN NEW; + END; + $$ LANGUAGE plpgsql; + """ + ) + + op.execute("DROP TRIGGER IF EXISTS trg_chunks_content_tsv ON lightrag_doc_chunks") + op.execute( + """ + CREATE TRIGGER trg_chunks_content_tsv + BEFORE INSERT OR UPDATE ON lightrag_doc_chunks + FOR EACH ROW EXECUTE FUNCTION update_chunks_tsv(); + """ + ) + + op.execute( + "UPDATE lightrag_doc_chunks SET content_tsv = to_tsvector('english', COALESCE(content, '')) WHERE content_tsv IS NULL" + ) + + op.execute("DROP TABLE IF EXISTS chunks") + + +def downgrade() -> None: + """Remove BM25 support from lightrag_doc_chunks.""" + op.execute("DROP TRIGGER IF EXISTS trg_chunks_content_tsv ON lightrag_doc_chunks") + op.execute("DROP FUNCTION IF EXISTS update_chunks_tsv()") + for suffix in ("english", "french"): + op.execute(f"DROP INDEX IF EXISTS idx_lightrag_chunks_bm25_{suffix}") + op.execute("DROP INDEX IF EXISTS idx_lightrag_chunks_content_tsv") + op.execute("ALTER TABLE lightrag_doc_chunks DROP COLUMN IF EXISTS content_tsv") + op.execute("DROP EXTENSION IF EXISTS pg_textsearch") diff --git a/src/application/requests/indexing_request.py b/src/application/requests/indexing_request.py index 080f163..81c19da 100644 --- a/src/application/requests/indexing_request.py +++ b/src/application/requests/indexing_request.py @@ -1,4 +1,14 @@ -from pydantic import BaseModel, Field +from typing import Annotated + +from pydantic import BaseModel, BeforeValidator, Field + + +def _coerce_file_extensions(v: str | list[str] | None) -> list[str] | None: + if v is None or v == "": + return None + if isinstance(v, str): + return [v] + return v class IndexFileRequest(BaseModel): @@ -16,6 +26,6 @@ class IndexFolderRequest(BaseModel): recursive: bool = Field( default=True, description="Process subdirectories recursively" ) - file_extensions: list[str] | None = Field( - default=None, description="File extensions to filter" - ) + file_extensions: Annotated[ + list[str] | None, BeforeValidator(_coerce_file_extensions) + ] = Field(default=None, description="File extensions to filter") diff --git a/src/application/requests/query_request.py b/src/application/requests/query_request.py index 97d5a6b..7a7185d 100644 --- a/src/application/requests/query_request.py +++ b/src/application/requests/query_request.py @@ -2,7 +2,9 @@ from pydantic import BaseModel, Field -QueryMode = Literal["local", "global", "hybrid", "naive", "mix", "bypass"] +QueryMode = Literal[ + "local", "global", "hybrid", "hybrid+", "naive", "mix", "bypass", "bm25" +] class QueryRequest(BaseModel): @@ -16,8 +18,10 @@ class QueryRequest(BaseModel): mode: QueryMode = Field( default="naive", description=( - "Search mode - 'naive' (default, recommended), 'local' (context-aware), " - "'global' (document-level), or 'hybrid' (comprehensive) or 'mix' (automatic strategy). " + "Search mode - 'naive' (default, vector only), 'local' (context-aware), " + "'global' (document-level), 'hybrid' (local+global KG), " + "'hybrid+' (BM25+vector parallel), 'mix' (automatic strategy), " + "'bm25' (full-text only)." ), ) top_k: int = Field( diff --git a/src/application/responses/query_response.py b/src/application/responses/query_response.py index b250ab1..0c84989 100644 --- a/src/application/responses/query_response.py +++ b/src/application/responses/query_response.py @@ -22,10 +22,10 @@ class RelationshipResponse(BaseModel): class ChunkResponse(BaseModel): - reference_id: str + reference_id: str | None = None content: str file_path: str - chunk_id: str + chunk_id: str = "" class ReferenceResponse(BaseModel): diff --git a/src/application/use_cases/query_use_case.py b/src/application/use_cases/query_use_case.py index 8a910eb..ace4dcd 100644 --- a/src/application/use_cases/query_use_case.py +++ b/src/application/use_cases/query_use_case.py @@ -1,16 +1,161 @@ +"""Query use case with hybrid+ mode support.""" + +import asyncio +import logging +from typing import Literal + +from domain.ports.bm25_engine import BM25EnginePort, BM25SearchResult from domain.ports.rag_engine import RAGEnginePort +from infrastructure.hybrid.rrf_combiner import RRFCombiner + +logger = logging.getLogger(__name__) class QueryUseCase: """Use case for querying the RAG knowledge base.""" - def __init__(self, rag_engine: RAGEnginePort) -> None: + def __init__( + self, + rag_engine: RAGEnginePort, + bm25_engine: BM25EnginePort | None = None, + rrf_k: int = 60, + ): + """Initialize use case. + + Args: + rag_engine: RAG engine for vector search + bm25_engine: BM25 engine for full-text search (optional) + rrf_k: RRF constant for combining results + """ self.rag_engine = rag_engine + self.bm25_engine = bm25_engine + self.rrf_combiner = RRFCombiner(k=rrf_k) async def execute( - self, working_dir: str, query: str, mode: str = "naive", top_k: int = 10 + self, + working_dir: str, + query: str, + mode: Literal[ + "naive", "local", "global", "hybrid", "hybrid+", "mix", "bypass", "bm25" + ] = "naive", + top_k: int = 10, ) -> dict: + """Execute search query. + + Args: + working_dir: Project/workspace directory + query: Search query string + mode: Search mode + - "naive": Vector search only + - "local": Local knowledge graph search + - "global": Global knowledge graph search + - "hybrid": Local + global knowledge graph + - "hybrid+": BM25 + vector search (parallel) + - "mix": Knowledge graph + vector chunks + - "bypass": Direct LLM query + - "bm25": BM25 search only + top_k: Number of results to return + + Returns: + Search results + """ self.rag_engine.init_project(working_dir) + + if mode == "bm25": + if self.bm25_engine is None: + return { + "status": "error", + "message": "BM25 engine not available. Please configure pg_textsearch extension.", + "data": {}, + } + + results = await self.bm25_engine.search(query, working_dir, top_k) + return self._format_bm25_results(results) + + if mode == "hybrid+": + if self.bm25_engine is None: + return await self.rag_engine.query( + query=query, mode="naive", top_k=top_k, working_dir=working_dir + ) + + bm25_results, vector_results = await asyncio.gather( + self.bm25_engine.search(query, working_dir, top_k=top_k * 2), + self.rag_engine.query( + query=query, mode="naive", top_k=top_k * 2, working_dir=working_dir + ), + return_exceptions=True, + ) + + bm25_hits: list[BM25SearchResult] = ( + bm25_results if isinstance(bm25_results, list) else [] + ) + if isinstance(bm25_results, Exception): + logger.error("BM25 search failed in hybrid+ mode: %s", bm25_results) + + if isinstance(vector_results, Exception): + logger.error("Vector search failed in hybrid+ mode: %s", vector_results) + raise vector_results + + combined_results = self.rrf_combiner.combine( + bm25_results=bm25_hits, + vector_results=vector_results, # type: ignore[arg-type] + top_k=top_k, + ) + + return self._format_hybrid_results(combined_results) + return await self.rag_engine.query( query=query, mode=mode, top_k=top_k, working_dir=working_dir ) + + def _format_bm25_results(self, results: list) -> dict: + """Format BM25 results to match API response format.""" + chunks = [ + { + "reference_id": r.chunk_id, + "content": r.content, + "file_path": r.file_path, + "chunk_id": r.chunk_id, + } + for r in results + ] + return { + "status": "success", + "message": "", + "data": { + "entities": [], + "relationships": [], + "chunks": chunks, + "references": [], + }, + "metadata": { + "query_mode": "bm25", + "total_results": len(results), + }, + } + + def _format_hybrid_results(self, results: list) -> dict: + """Format hybrid results to match API response format.""" + return { + "status": "success", + "message": "", + "data": { + "entities": [], + "relationships": [], + "chunks": [ + { + "reference_id": r.reference_id, + "content": r.content, + "file_path": r.file_path, + "chunk_id": r.chunk_id, + } + for r in results + ], + "references": [], + }, + "metadata": { + "query_mode": "hybrid+", + "total_results": len(results), + "rrf_k": self.rrf_combiner.k, + }, + } diff --git a/src/config.py b/src/config.py index e96dc4c..6b1f89a 100644 --- a/src/config.py +++ b/src/config.py @@ -25,9 +25,7 @@ class AppConfig(BaseSettings): class DatabaseConfig(BaseSettings): - """ - Database connection configuration. - """ + """Database connection configuration.""" POSTGRES_USER: str = Field(default="raganything") POSTGRES_PASSWORD: str = Field(default="raganything") @@ -42,9 +40,7 @@ def DATABASE_URL(self) -> str: class LLMConfig(BaseSettings): - """ - Large Language Model configuration. - """ + """Large Language Model configuration.""" OPEN_ROUTER_API_KEY: str | None = Field(default=None) OPENROUTER_API_KEY: str | None = Field(default=None) @@ -82,9 +78,7 @@ def api_base_url(self) -> str: class RAGConfig(BaseSettings): - """ - RAG-specific configuration for LightRAG. - """ + """RAG-specific configuration for LightRAG.""" COSINE_THRESHOLD: float = Field( default=0.2, description="Similarity threshold for vector search (0.0-1.0)" @@ -109,6 +103,18 @@ class RAGConfig(BaseSettings): ) +class BM25Config(BaseSettings): + """BM25 search configuration.""" + + BM25_ENABLED: bool = Field(default=True, description="Enable BM25 full-text search") + BM25_TEXT_CONFIG: str = Field( + default="english", description="PostgreSQL text search configuration" + ) + BM25_RRF_K: int = Field( + default=60, ge=1, description="RRF constant K for hybrid search" + ) + + class MinioConfig(BaseSettings): """MinIO object storage configuration.""" diff --git a/src/dependencies.py b/src/dependencies.py index baad639..b311ef9 100644 --- a/src/dependencies.py +++ b/src/dependencies.py @@ -6,21 +6,28 @@ from application.use_cases.index_folder_use_case import IndexFolderUseCase from application.use_cases.multimodal_query_use_case import MultimodalQueryUseCase from application.use_cases.query_use_case import QueryUseCase -from config import AppConfig, LLMConfig, MinioConfig, RAGConfig +from config import ( + AppConfig, + BM25Config, + DatabaseConfig, + LLMConfig, + MinioConfig, + RAGConfig, +) +from domain.ports.bm25_engine import BM25EnginePort +from infrastructure.bm25.pg_textsearch_adapter import PostgresBM25Adapter from infrastructure.rag.lightrag_adapter import LightRAGAdapter from infrastructure.storage.minio_adapter import MinioAdapter -# ============= CONFIG ============= - app_config = AppConfig() # type: ignore llm_config = LLMConfig() # type: ignore rag_config = RAGConfig() # type: ignore minio_config = MinioConfig() # type: ignore +bm25_config = BM25Config() # type: ignore +db_config = DatabaseConfig() # type: ignore os.makedirs(app_config.OUTPUT_DIR, exist_ok=True) -# ============= ADAPTERS ============= - rag_adapter = LightRAGAdapter(llm_config, rag_config) minio_adapter = MinioAdapter( host=minio_config.MINIO_HOST, @@ -29,7 +36,16 @@ secure=minio_config.MINIO_SECURE, ) -# ============= USE CASE PROVIDERS ============= +bm25_adapter: BM25EnginePort | None = None +if bm25_config.BM25_ENABLED: + try: + bm25_adapter = PostgresBM25Adapter( + db_url=db_config.DATABASE_URL.replace("+asyncpg", ""), + text_config=bm25_config.BM25_TEXT_CONFIG, + ) + except Exception as e: + print(f"WARNING: BM25 adapter initialization failed: {e}") + bm25_adapter = None def get_index_file_use_case() -> IndexFileUseCase: @@ -45,7 +61,11 @@ def get_index_folder_use_case() -> IndexFolderUseCase: def get_query_use_case() -> QueryUseCase: - return QueryUseCase(rag_adapter) + return QueryUseCase( + rag_engine=rag_adapter, + bm25_engine=bm25_adapter, + rrf_k=bm25_config.BM25_RRF_K, + ) def get_multimodal_query_use_case() -> MultimodalQueryUseCase: diff --git a/src/domain/ports/bm25_engine.py b/src/domain/ports/bm25_engine.py new file mode 100644 index 0000000..9e6bbb8 --- /dev/null +++ b/src/domain/ports/bm25_engine.py @@ -0,0 +1,85 @@ +"""BM25 search engine port interface.""" + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any + + +@dataclass +class BM25SearchResult: + """Result from BM25 search.""" + + chunk_id: str + content: str + file_path: str + score: float + metadata: dict[str, Any] + + +class BM25EnginePort(ABC): + """Port interface for BM25 full-text search operations.""" + + @abstractmethod + async def search( + self, + query: str, + working_dir: str, + top_k: int = 10, + ) -> list[BM25SearchResult]: + """Search documents using BM25 ranking. + + Args: + query: Search query string + working_dir: Project/workspace directory + top_k: Number of results to return + + Returns: + List of BM25SearchResult ordered by relevance + """ + pass + + @abstractmethod + async def index_document( + self, + chunk_id: str, + content: str, + file_path: str, + working_dir: str, + metadata: dict[str, Any] | None = None, + ) -> None: + """Index a document chunk for BM25 search. + + Args: + chunk_id: Unique chunk identifier + content: Text content to index + file_path: Path to source file + working_dir: Project/workspace directory + metadata: Optional metadata dictionary + """ + pass + + @abstractmethod + async def create_index(self, working_dir: str) -> None: + """Create BM25 index for workspace. + + Args: + working_dir: Project/workspace directory + """ + pass + + @abstractmethod + async def drop_index(self, working_dir: str) -> None: + """Drop BM25 index for workspace. + + Args: + working_dir: Project/workspace directory + """ + pass + + @abstractmethod + async def close(self) -> None: + """Close connection pool and cleanup resources. + + Called during application shutdown. + """ + pass diff --git a/src/infrastructure/bm25/pg_textsearch_adapter.py b/src/infrastructure/bm25/pg_textsearch_adapter.py new file mode 100644 index 0000000..905915a --- /dev/null +++ b/src/infrastructure/bm25/pg_textsearch_adapter.py @@ -0,0 +1,244 @@ +"""PostgreSQL BM25 adapter using pg_textsearch extension.""" + +import asyncio +import hashlib +import logging +from typing import Any + +import asyncpg + +from domain.ports.bm25_engine import BM25EnginePort, BM25SearchResult + +logger = logging.getLogger(__name__) + + +class PostgresBM25Adapter(BM25EnginePort): + """PostgreSQL BM25 implementation using pg_textsearch. + + Queries the lightrag_doc_chunks table directly, using the same + workspace mapping as LightRAGAdapter (_make_workspace). + """ + + _BM25_INDEX_PREFIX = "idx_lightrag_chunks_bm25" + + def __init__(self, db_url: str, text_config: str = "english"): + self.db_url = db_url + self.text_config = text_config + self._pool: asyncpg.Pool | None = None + self._pool_lock = asyncio.Lock() + + @property + def bm25_index_name(self) -> str: + return f"{self._BM25_INDEX_PREFIX}_{self.text_config}" + + @staticmethod + def _make_workspace(working_dir: str) -> str: + """Map working_dir to lightrag_doc_chunks.workspace (same as LightRAGAdapter).""" + digest = hashlib.sha256(working_dir.encode()).hexdigest()[:16] + return f"ws_{digest}" + + async def _get_pool(self) -> asyncpg.Pool: + """Get or create database connection pool with double-checked locking.""" + if self._pool is not None: + return self._pool + async with self._pool_lock: + if self._pool is not None: + return self._pool + self._pool = await asyncpg.create_pool(self.db_url) + await self._check_extension() + return self._pool + + async def _check_extension(self) -> None: + """Warn if pg_textsearch extension is not installed.""" + async with self._pool.acquire() as conn: + try: + result = await conn.fetchval( + "SELECT EXISTS(SELECT 1 FROM pg_extension WHERE extname='pg_textsearch')" + ) + if not result: + logger.warning( + "pg_textsearch extension not installed. " + "BM25 ranking <@> operator will not work. " + "Run: CREATE EXTENSION pg_textsearch;" + ) + return + except Exception as e: + logger.warning("Could not check pg_textsearch extension: %s", e) + return + + await self._ensure_bm25_index(conn) + await self._rebuild_tsv_if_config_changed(conn) + + async def _ensure_bm25_index(self, conn) -> None: + """Create or recreate the BM25 index for the configured text_config. + + Drops any stale BM25 index from a different text_config. + """ + index_name = self.bm25_index_name + try: + existing = await conn.fetchval( + "SELECT indexname FROM pg_indexes WHERE indexname = $1", + index_name, + ) + if existing: + logger.info( + "BM25 index '%s' already exists for text_config='%s'", + index_name, + self.text_config, + ) + return + + for suffix in ("english", "french"): + stale = f"{self._BM25_INDEX_PREFIX}_{suffix}" + if stale != index_name: + await conn.execute(f"DROP INDEX IF EXISTS {stale}") + + await conn.execute( + f""" + CREATE INDEX {index_name} + ON lightrag_doc_chunks USING bm25(content) + WITH (text_config='{self.text_config}') + """ + ) + logger.info( + "Created BM25 index '%s' with text_config='%s'", + index_name, + self.text_config, + ) + except Exception as e: + logger.error("Failed to ensure BM25 index: %s", e) + + async def _rebuild_tsv_if_config_changed(self, conn) -> None: + """Rebuild content_tsv if trigger function uses a different text_config.""" + try: + func_def = await conn.fetchval( + "SELECT prosrc FROM pg_proc WHERE proname = 'update_chunks_tsv'" + ) + if func_def and f"'{self.text_config}'" not in func_def: + logger.info( + "Updating trigger function from old text_config to '%s'", + self.text_config, + ) + await conn.execute( + f""" + CREATE OR REPLACE FUNCTION update_chunks_tsv() + RETURNS TRIGGER AS $$ + BEGIN + NEW.content_tsv := to_tsvector('{self.text_config}', COALESCE(NEW.content, '')); + RETURN NEW; + END; + $$ LANGUAGE plpgsql; + """ + ) + status = await conn.execute( + f""" + UPDATE lightrag_doc_chunks + SET content_tsv = to_tsvector('{self.text_config}', COALESCE(content, '')) + WHERE content_tsv IS NOT NULL + """ + ) + logger.info("Rebuilt content_tsv: %s with text_config='%s'", status, self.text_config) + except Exception as e: + logger.warning("Could not check/rebuild trigger function: %s", e) + + async def close(self) -> None: + """Close connection pool on shutdown.""" + if self._pool: + await self._pool.close() + self._pool = None + + async def search( + self, + query: str, + working_dir: str, + top_k: int = 10, + ) -> list[BM25SearchResult]: + """Search using BM25 ranking on lightrag_doc_chunks.""" + pool = await self._get_pool() + workspace = self._make_workspace(working_dir) + bm25_index = f"idx_lightrag_chunks_bm25_{self.text_config}" + + try: + async with pool.acquire() as conn: + sql = """ + SELECT + id AS chunk_id, + content, + file_path, + content <@> to_bm25query($1, $3) as score + FROM lightrag_doc_chunks + WHERE workspace = $2 + AND content <@> to_bm25query($1, $3) < 0 + ORDER BY score + LIMIT $4 + """ + results = await conn.fetch( + sql, query, workspace, bm25_index, top_k + ) + + return [ + BM25SearchResult( + chunk_id=row["chunk_id"], + content=row["content"], + file_path=row["file_path"], + score=abs(row["score"]), + metadata={}, + ) + for row in results + ] + except Exception as e: + logger.error( + "BM25 search failed: %s", + e, + extra={"query": query, "working_dir": working_dir}, + ) + raise + + async def index_document( + self, + chunk_id: str, + content: str, + file_path: str, + working_dir: str, + metadata: dict[str, Any] | None = None, + ) -> None: + """No-op: LightRAG owns the lightrag_doc_chunks table.""" + pass + + async def create_index(self, working_dir: str) -> None: + """Re-index tsvector for workspace chunks.""" + pool = await self._get_pool() + workspace = self._make_workspace(working_dir) + + try: + async with pool.acquire() as conn: + await conn.execute( + f""" + UPDATE lightrag_doc_chunks + SET content_tsv = to_tsvector('{self.text_config}', COALESCE(content, '')) + WHERE workspace = $1 AND content_tsv IS NULL + """, + workspace, + ) + except Exception as e: + logger.error( + "BM25 index creation failed: %s", e, extra={"working_dir": working_dir} + ) + raise + + async def drop_index(self, working_dir: str) -> None: + """Clear tsvector for workspace chunks.""" + pool = await self._get_pool() + workspace = self._make_workspace(working_dir) + + try: + async with pool.acquire() as conn: + await conn.execute( + "UPDATE lightrag_doc_chunks SET content_tsv = NULL WHERE workspace = $1", + workspace, + ) + except Exception as e: + logger.error( + "BM25 index drop failed: %s", e, extra={"working_dir": working_dir} + ) + raise diff --git a/src/infrastructure/hybrid/rrf_combiner.py b/src/infrastructure/hybrid/rrf_combiner.py new file mode 100644 index 0000000..9f57166 --- /dev/null +++ b/src/infrastructure/hybrid/rrf_combiner.py @@ -0,0 +1,119 @@ +"""Reciprocal Rank Fusion (RRF) combiner for hybrid search.""" + +from dataclasses import dataclass +from typing import Any + +from domain.ports.bm25_engine import BM25SearchResult + + +@dataclass +class HybridSearchResult: + """Combined result from BM25 and vector search.""" + + chunk_id: str + content: str + file_path: str + vector_score: float + bm25_score: float + combined_score: float + metadata: dict[str, Any] + reference_id: str | None = None + bm25_rank: int | None = None + vector_rank: int | None = None + + +class RRFCombiner: + """Reciprocal Rank Fusion algorithm for combining search results. + + RRF formula: score = Σ (1 / (k + rank_i)) + where k is a constant (default 60) and rank_i is the rank in list i. + """ + + def __init__(self, k: int = 60): + self.k = k + + def _add_bm25_result( + self, scores: dict[str, dict[str, Any]], rank: int, result: BM25SearchResult + ) -> None: + chunk_id = result.chunk_id + if chunk_id not in scores: + scores[chunk_id] = { + "content": result.content, + "file_path": result.file_path, + "metadata": result.metadata, + "bm25_score": 0.0, + "vector_score": 0.0, + "reference_id": None, + "bm25_rank": rank, + "vector_rank": None, + } + else: + scores[chunk_id]["bm25_rank"] = min(scores[chunk_id]["bm25_rank"], rank) + scores[chunk_id]["bm25_score"] = 1.0 / (self.k + scores[chunk_id]["bm25_rank"]) + + def _add_vector_result( + self, scores: dict[str, dict[str, Any]], rank: int, chunk: dict[str, Any] + ) -> None: + raw_chunk_id = chunk.get("chunk_id") + raw_ref_id = chunk.get("reference_id") + chunk_id = raw_chunk_id or raw_ref_id + if not chunk_id: + return + reference_id = raw_ref_id + if chunk_id not in scores: + scores[chunk_id] = { + "content": chunk.get("content", ""), + "file_path": chunk.get("file_path", ""), + "metadata": chunk.get("metadata", {}), + "bm25_score": 0.0, + "vector_score": 0.0, + "reference_id": reference_id, + "bm25_rank": None, + "vector_rank": rank, + } + else: + existing = scores[chunk_id]["vector_rank"] + scores[chunk_id]["vector_rank"] = ( + min(existing, rank) if existing is not None else rank + ) + if reference_id: + scores[chunk_id]["reference_id"] = reference_id + + actual_rank = scores[chunk_id]["vector_rank"] + if actual_rank is not None: + scores[chunk_id]["vector_score"] = 1.0 / (self.k + actual_rank) + + def combine( + self, + bm25_results: list[BM25SearchResult], + vector_results: dict, + top_k: int = 10, + ) -> list[HybridSearchResult]: + """Combine BM25 and vector search results using RRF.""" + scores: dict[str, dict[str, Any]] = {} + + for rank, result in enumerate(bm25_results, start=1): + self._add_bm25_result(scores, rank, result) + + chunks = vector_results.get("data", {}).get("chunks", []) + for rank, chunk in enumerate(chunks, start=1): + self._add_vector_result(scores, rank, chunk) + + results = [ + HybridSearchResult( + chunk_id=chunk_id, + content=data["content"], + file_path=data["file_path"], + vector_score=data["vector_score"], + bm25_score=data["bm25_score"], + combined_score=data["bm25_score"] + data["vector_score"], + metadata=data["metadata"], + reference_id=data["reference_id"], + bm25_rank=data["bm25_rank"], + vector_rank=data["vector_rank"], + ) + for chunk_id, data in scores.items() + ] + + results.sort(key=lambda x: x.combined_score, reverse=True) + return results[:top_k] diff --git a/src/infrastructure/rag/lightrag_adapter.py b/src/infrastructure/rag/lightrag_adapter.py index 386a301..6184af3 100644 --- a/src/infrastructure/rag/lightrag_adapter.py +++ b/src/infrastructure/rag/lightrag_adapter.py @@ -288,7 +288,10 @@ async def index_folder( processing_time_ms = (time.time() - start_time) * 1000 total = len(all_files) - if failed == 0 and succeeded > 0: + if total == 0: + status = IndexingStatus.SUCCESS + message = f"No files found in '{folder_path}'" + elif failed == 0 and succeeded > 0: status = IndexingStatus.SUCCESS message = f"Successfully indexed {succeeded} file(s) from '{folder_path}'" elif succeeded > 0 and failed > 0: diff --git a/src/main.py b/src/main.py index 9813bd5..d3071d4 100644 --- a/src/main.py +++ b/src/main.py @@ -1,34 +1,102 @@ -"""Main entry point for the RAGAnything API. -Simplified following hexagonal architecture pattern from pickpro_indexing_api. -""" +"""Main entry point for the RAGAnything API.""" import logging +import logging.config import threading +from contextlib import asynccontextmanager +from pathlib import Path import uvicorn +from alembic.config import Config from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware +from alembic import command from application.api.health_routes import health_router from application.api.indexing_routes import indexing_router from application.api.mcp_tools import mcp from application.api.query_routes import query_router -from dependencies import app_config +from dependencies import app_config, bm25_adapter + +_LOG_FORMAT = "%(asctime)s %(levelname)-8s [%(name)s] %(message)s" + +LOG_CONFIG = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "standard": {"format": _LOG_FORMAT}, + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "formatter": "standard", + "stream": "ext://sys.stderr", + }, + }, + "loggers": { + "uvicorn": {"handlers": ["console"], "level": "INFO", "propagate": False}, + "uvicorn.error": {"handlers": ["console"], "level": "INFO", "propagate": False}, + "uvicorn.access": { + "handlers": ["console"], + "level": "INFO", + "propagate": False, + }, + }, + "root": { + "level": "INFO", + "handlers": ["console"], + }, +} + +logging.config.dictConfig(LOG_CONFIG) logger = logging.getLogger(__name__) - MCP_PATH = "/mcp" + +def _run_alembic_upgrade() -> None: + """Run Alembic migrations to head.""" + alembic_dir = Path(__file__).parent + cfg = Config(str(alembic_dir / "alembic.ini")) + cfg.set_main_option("script_location", str(alembic_dir / "alembic")) + command.upgrade(cfg, "head") + + +@asynccontextmanager +async def db_lifespan(_app: FastAPI): + """Closes BM25 connection pool on shutdown.""" + yield + + logger.info("Application shutdown initiated") + if bm25_adapter is not None: + try: + await bm25_adapter.close() + except Exception: + logger.exception("Failed to close BM25 adapter") + logger.info("Application shutdown complete") + + +# Create FastAPI app with appropriate lifespan if app_config.MCP_TRANSPORT == "streamable": mcp_app = mcp.http_app(path="/") + + @asynccontextmanager + async def combined_lifespan(app: FastAPI): + """Combine database lifecycle with MCP lifecycle for streamable transport.""" + async with db_lifespan(app), mcp_app.lifespan(app): + yield + app = FastAPI( title="RAG Anything API", - lifespan=mcp_app.lifespan, + lifespan=combined_lifespan, ) app.mount(MCP_PATH, mcp_app) else: - app = FastAPI(title="RAG Anything API") + app = FastAPI( + title="RAG Anything API", + lifespan=db_lifespan, + ) app.add_middleware( CORSMiddleware, @@ -38,24 +106,25 @@ allow_headers=["*"], ) -# ============= REST API ROUTES ============= - REST_PATH = "/api/v1" - app.include_router(indexing_router, prefix=REST_PATH) app.include_router(health_router, prefix=REST_PATH) app.include_router(query_router, prefix=REST_PATH) -# ============= MAIN ============= - def run_fastapi(): + """Run FastAPI server with uvicorn.""" + logger.info("Running database migrations...") + _run_alembic_upgrade() + logger.info("Database migrations completed") + uvicorn.run( app, host=app_config.HOST, port=app_config.PORT, log_level=app_config.UVICORN_LOG_LEVEL, - access_log=False, + log_config=LOG_CONFIG, + access_log=True, ws="none", ) diff --git a/tests/domain/ports/test_bm25_engine.py b/tests/domain/ports/test_bm25_engine.py new file mode 100644 index 0000000..f752c02 --- /dev/null +++ b/tests/domain/ports/test_bm25_engine.py @@ -0,0 +1,37 @@ +"""Tests for BM25EnginePort interface.""" + +import pytest + +from domain.ports.bm25_engine import BM25EnginePort, BM25SearchResult + + +def test_bm25_engine_port_is_abstract(): + """BM25EnginePort should be abstract and not instantiable.""" + with pytest.raises(TypeError, match="Can't instantiate abstract class"): + BM25EnginePort() + + +def test_bm25_engine_port_has_required_methods(): + """BM25EnginePort should define required abstract methods.""" + assert hasattr(BM25EnginePort, "search") + assert hasattr(BM25EnginePort, "index_document") + assert hasattr(BM25EnginePort, "create_index") + assert hasattr(BM25EnginePort, "drop_index") + assert hasattr(BM25EnginePort, "close") + + +def test_bm25_search_result_dataclass(): + """BM25SearchResult should be a dataclass with required fields.""" + result = BM25SearchResult( + chunk_id="123", + content="test content", + file_path="/test/doc.pdf", + score=0.95, + metadata={"page": 1}, + ) + + assert result.chunk_id == "123" + assert result.content == "test content" + assert result.file_path == "/test/doc.pdf" + assert result.score == 0.95 + assert result.metadata == {"page": 1} diff --git a/tests/infrastructure/bm25/test_pg_textsearch_adapter.py b/tests/infrastructure/bm25/test_pg_textsearch_adapter.py new file mode 100644 index 0000000..5dbe999 --- /dev/null +++ b/tests/infrastructure/bm25/test_pg_textsearch_adapter.py @@ -0,0 +1,254 @@ +"""Tests for PostgresBM25Adapter implementation.""" + +from unittest.mock import AsyncMock + +import asyncpg +import pytest + +from infrastructure.bm25.pg_textsearch_adapter import PostgresBM25Adapter + + +@pytest.fixture +def mock_pool(): + """Create mock asyncpg pool.""" + pool = AsyncMock(spec=asyncpg.Pool) + return pool + + +@pytest.fixture +def mock_connection(): + """Create mock asyncpg connection.""" + conn = AsyncMock(spec=asyncpg.Connection) + return conn + + +def _acquire_mock(conn): + """Helper to create an async context manager for pool.acquire().""" + cm = AsyncMock() + cm.__aenter__ = AsyncMock(return_value=conn) + cm.__aexit__ = AsyncMock(return_value=None) + return cm + + +class TestMakeWorkspace: + """Tests for _make_workspace static method.""" + + def test_make_workspace_produces_ws_prefix(self): + """Should produce workspace with ws_ prefix.""" + result = PostgresBM25Adapter._make_workspace( + "36ecc1eb-dead-4000-beef-1234567890ab" + ) + assert result.startswith("ws_") + + def test_make_workspace_is_deterministic(self): + """Same input should always produce same workspace.""" + result1 = PostgresBM25Adapter._make_workspace("test-working-dir") + result2 = PostgresBM25Adapter._make_workspace("test-working-dir") + assert result1 == result2 + + def test_make_workspace_different_inputs_different_outputs(self): + """Different inputs should produce different workspaces.""" + result1 = PostgresBM25Adapter._make_workspace("dir-a") + result2 = PostgresBM25Adapter._make_workspace("dir-b") + assert result1 != result2 + + +class TestTextConfig: + """Tests for text_config and BM25 index naming.""" + + def test_default_text_config_is_english(self): + adapter = PostgresBM25Adapter(db_url="postgresql://test") + assert adapter.text_config == "english" + + def test_custom_text_config(self): + adapter = PostgresBM25Adapter(db_url="postgresql://test", text_config="french") + assert adapter.text_config == "french" + + def test_bm25_index_name_includes_text_config(self): + adapter = PostgresBM25Adapter(db_url="postgresql://test", text_config="french") + assert adapter.bm25_index_name == "idx_lightrag_chunks_bm25_french" + + def test_bm25_index_name_english(self): + adapter = PostgresBM25Adapter(db_url="postgresql://test", text_config="english") + assert adapter.bm25_index_name == "idx_lightrag_chunks_bm25_english" + + +class TestSearch: + @pytest.mark.asyncio + async def test_search_returns_results(self, mock_pool, mock_connection): + """Search should return BM25SearchResult list.""" + adapter = PostgresBM25Adapter(db_url="postgresql://test") + adapter._pool = mock_pool + mock_pool.acquire.return_value = _acquire_mock(mock_connection) + + mock_connection.fetch.return_value = [ + { + "chunk_id": "123", + "content": "PostgreSQL database system", + "file_path": "/doc.pdf", + "score": -2.345, + } + ] + + results = await adapter.search("PostgreSQL", "workspace1", top_k=5) + + assert len(results) == 1 + assert results[0].chunk_id == "123" + assert results[0].content == "PostgreSQL database system" + assert results[0].file_path == "/doc.pdf" + assert results[0].score == 2.345 + + @pytest.mark.asyncio + async def test_search_converts_negative_scores(self, mock_pool, mock_connection): + """Search should convert negative BM25 scores to positive.""" + adapter = PostgresBM25Adapter(db_url="postgresql://test") + adapter._pool = mock_pool + mock_pool.acquire.return_value = _acquire_mock(mock_connection) + + mock_connection.fetch.return_value = [ + { + "chunk_id": "1", + "content": "test", + "file_path": "/t.pdf", + "score": -5.0, + } + ] + + results = await adapter.search("test", "ws", top_k=10) + assert results[0].score == 5.0 + + @pytest.mark.asyncio + async def test_search_with_no_results(self, mock_pool, mock_connection): + """Search should return empty list when no matches.""" + adapter = PostgresBM25Adapter(db_url="postgresql://test") + adapter._pool = mock_pool + mock_pool.acquire.return_value = _acquire_mock(mock_connection) + + mock_connection.fetch.return_value = [] + + results = await adapter.search("nonexistent", "workspace1", top_k=10) + assert results == [] + + @pytest.mark.asyncio + async def test_search_queries_lightrag_doc_chunks(self, mock_pool, mock_connection): + """Search should query lightrag_doc_chunks with workspace mapping.""" + adapter = PostgresBM25Adapter(db_url="postgresql://test") + adapter._pool = mock_pool + mock_pool.acquire.return_value = _acquire_mock(mock_connection) + + mock_connection.fetch.return_value = [] + + await adapter.search("test query", "some-working-dir", top_k=5) + + sql = mock_connection.fetch.call_args[0][0] + assert "lightrag_doc_chunks" in sql + assert "workspace" in sql + + workspace_arg = mock_connection.fetch.call_args[0][2] + assert workspace_arg == PostgresBM25Adapter._make_workspace("some-working-dir") + + @pytest.mark.asyncio + async def test_search_uses_bm25_index_with_text_config(self, mock_pool, mock_connection): + """Search should use text_config-specific BM25 index.""" + adapter = PostgresBM25Adapter(db_url="postgresql://test", text_config="french") + adapter._pool = mock_pool + mock_pool.acquire.return_value = _acquire_mock(mock_connection) + + mock_connection.fetch.return_value = [] + + await adapter.search("test query", "some-working-dir", top_k=5) + + bm25_index_arg = mock_connection.fetch.call_args[0][3] + assert bm25_index_arg == "idx_lightrag_chunks_bm25_french" + + +class TestIndexDocument: + @pytest.mark.asyncio + async def test_index_document_is_noop(self, mock_pool, mock_connection): + """Index document should be a no-op since LightRAG owns the table.""" + adapter = PostgresBM25Adapter(db_url="postgresql://test") + adapter._pool = mock_pool + + await adapter.index_document( + chunk_id="123", + content="test content", + file_path="/doc.pdf", + working_dir="workspace1", + metadata={"page": 1}, + ) + + mock_pool.acquire.assert_not_called() + mock_connection.execute.assert_not_called() + + +class TestCreateIndex: + @pytest.mark.asyncio + async def test_create_index_updates_lightrag_doc_chunks( + self, mock_pool, mock_connection + ): + """Create index should update lightrag_doc_chunks tsvector.""" + adapter = PostgresBM25Adapter(db_url="postgresql://test", text_config="french") + adapter._pool = mock_pool + mock_pool.acquire.return_value = _acquire_mock(mock_connection) + + await adapter.create_index("some-working-dir") + + mock_connection.execute.assert_called_once() + sql = mock_connection.execute.call_args[0][0] + assert "lightrag_doc_chunks" in sql + assert "content_tsv" in sql + assert "french" in sql + + workspace_arg = mock_connection.execute.call_args[0][1] + assert workspace_arg == PostgresBM25Adapter._make_workspace("some-working-dir") + + +class TestDropIndex: + @pytest.mark.asyncio + async def test_drop_index_clears_tsvector(self, mock_pool, mock_connection): + """Drop index should clear tsvector for workspace.""" + adapter = PostgresBM25Adapter(db_url="postgresql://test") + adapter._pool = mock_pool + mock_pool.acquire.return_value = _acquire_mock(mock_connection) + + await adapter.drop_index("workspace1") + + mock_connection.execute.assert_called_once() + sql = mock_connection.execute.call_args[0][0] + assert "lightrag_doc_chunks" in sql + assert "content_tsv = NULL" in sql + + @pytest.mark.asyncio + async def test_drop_index_uses_workspace_mapping(self, mock_pool, mock_connection): + """Drop index should map working_dir to workspace.""" + adapter = PostgresBM25Adapter(db_url="postgresql://test") + adapter._pool = mock_pool + mock_pool.acquire.return_value = _acquire_mock(mock_connection) + + await adapter.drop_index("my-working-dir") + + workspace_arg = mock_connection.execute.call_args[0][1] + assert workspace_arg == PostgresBM25Adapter._make_workspace("my-working-dir") + + +class TestClose: + @pytest.mark.asyncio + async def test_close_closes_pool(self, mock_pool): + """Close should close connection pool.""" + adapter = PostgresBM25Adapter(db_url="postgresql://test") + adapter._pool = mock_pool + + await adapter.close() + + mock_pool.close.assert_called_once() + assert adapter._pool is None + + @pytest.mark.asyncio + async def test_close_with_no_pool(self): + """Close should handle None pool gracefully.""" + adapter = PostgresBM25Adapter(db_url="postgresql://test") + adapter._pool = None + + await adapter.close() + + assert adapter._pool is None diff --git a/tests/infrastructure/hybrid/test_rrf_combiner.py b/tests/infrastructure/hybrid/test_rrf_combiner.py new file mode 100644 index 0000000..f0cc270 --- /dev/null +++ b/tests/infrastructure/hybrid/test_rrf_combiner.py @@ -0,0 +1,314 @@ +"""Tests for Reciprocal Rank Fusion combiner.""" + +from domain.ports.bm25_engine import BM25SearchResult +from infrastructure.hybrid.rrf_combiner import RRFCombiner + + +def test_rrf_combiner_initialization(): + """RRFCombiner should initialize with default k=60.""" + combiner = RRFCombiner() + assert combiner.k == 60 + + +def test_rrf_combiner_custom_k(): + """RRFCombiner should accept custom k parameter.""" + combiner = RRFCombiner(k=100) + assert combiner.k == 100 + + +def test_combine_results_basic(): + """RRF should combine ranks correctly.""" + combiner = RRFCombiner(k=60) + + # Mock BM25 results (already sorted by score) + bm25_results = [ + BM25SearchResult( + chunk_id="1", + content="BM25 result 1", + file_path="/a.pdf", + score=5.0, + metadata={}, + ), + BM25SearchResult( + chunk_id="2", + content="BM25 result 2", + file_path="/b.pdf", + score=4.0, + metadata={}, + ), + ] + + # Mock vector results + vector_results = { + "data": { + "chunks": [ + { + "chunk_id": "2", + "content": "Vector result 1", + "file_path": "/b.pdf", + }, + { + "chunk_id": "3", + "content": "Vector result 2", + "file_path": "/c.pdf", + }, + ] + } + } + + combined = combiner.combine(bm25_results, vector_results, top_k=10) + + # Check that results are combined + assert len(combined) == 3 # chunk_ids: 1, 2, 3 + + # Check that all results have combined scores + for result in combined: + assert result.combined_score > 0 + assert result.vector_score >= 0 + assert result.bm25_score >= 0 + + +def test_combine_results_respects_top_k(): + """RRF should respect top_k parameter.""" + combiner = RRFCombiner() + + bm25_results = [ + BM25SearchResult( + chunk_id=str(i), + content=f"BM25 {i}", + file_path="/a.pdf", + score=1.0, + metadata={}, + ) + for i in range(20) + ] + + vector_results = { + "data": { + "chunks": [ + {"chunk_id": str(i), "content": f"Vector {i}", "file_path": "/b.pdf"} + for i in range(20) + ] + } + } + + combined = combiner.combine(bm25_results, vector_results, top_k=5) + + assert len(combined) == 5 + + +def test_combine_results_sorted_by_score(): + """RRF results should be sorted by combined_score descending.""" + combiner = RRFCombiner() + + bm25_results = [ + BM25SearchResult( + chunk_id="1", content="BM25", file_path="/a.pdf", score=5.0, metadata={} + ), + BM25SearchResult( + chunk_id="2", content="BM25", file_path="/b.pdf", score=4.0, metadata={} + ), + ] + + vector_results = { + "data": { + "chunks": [ + {"chunk_id": "2", "content": "Vector", "file_path": "/b.pdf"}, + {"chunk_id": "1", "content": "Vector", "file_path": "/a.pdf"}, + ] + } + } + + combined = combiner.combine(bm25_results, vector_results, top_k=10) + + # Check sorted order + scores = [r.combined_score for r in combined] + assert scores == sorted(scores, reverse=True) + + +def test_rrf_formula(): + """RRF formula should be: 1/(k + rank).""" + combiner = RRFCombiner(k=60) + + # Item appears at rank 1 in BM25, rank 3 in vector + # Expected: 1/(60+1) + 1/(60+3) = 0.01639 + 0.01587 = 0.03226 + bm25_results = [ + BM25SearchResult( + chunk_id="1", + content="BM25 rank 1", + file_path="/a.pdf", + score=5.0, + metadata={}, + ), + ] + + vector_results = { + "data": { + "chunks": [ + { + "chunk_id": "other", + "content": "Vector rank 1", + "file_path": "/x.pdf", + }, + { + "chunk_id": "other2", + "content": "Vector rank 2", + "file_path": "/y.pdf", + }, + {"chunk_id": "1", "content": "Vector rank 3", "file_path": "/a.pdf"}, + ] + } + } + + combined = combiner.combine(bm25_results, vector_results, top_k=10) + + # Find our item + item = next(r for r in combined if r.chunk_id == "1") + + # Check RRF calculation + expected_bm25_score = 1 / (60 + 1) # rank 1 in BM25 + expected_vector_score = 1 / (60 + 3) # rank 3 in vector + + assert abs(item.bm25_score - expected_bm25_score) < 0.0001 + assert abs(item.vector_score - expected_vector_score) < 0.0001 + assert ( + abs(item.combined_score - (expected_bm25_score + expected_vector_score)) + < 0.0001 + ) + + +def test_combine_only_bm25_results(): + """RRF should handle case where only BM25 has results.""" + combiner = RRFCombiner() + + bm25_results = [ + BM25SearchResult( + chunk_id="1", + content="BM25 only", + file_path="/a.pdf", + score=5.0, + metadata={}, + ) + ] + + vector_results = {"data": {"chunks": []}} + + combined = combiner.combine(bm25_results, vector_results, top_k=10) + + assert len(combined) == 1 + assert combined[0].chunk_id == "1" + assert combined[0].bm25_score > 0 + assert combined[0].vector_score == 0 + + +def test_combine_only_vector_results(): + """RRF should handle case where only vector has results.""" + combiner = RRFCombiner() + + bm25_results = [] + + vector_results = { + "data": { + "chunks": [ + {"chunk_id": "1", "content": "Vector only", "file_path": "/a.pdf"} + ] + } + } + + combined = combiner.combine(bm25_results, vector_results, top_k=10) + + assert len(combined) == 1 + assert combined[0].chunk_id == "1" + assert combined[0].bm25_score == 0 + assert combined[0].vector_score > 0 + + +def test_combine_uses_chunk_id_not_reference_id(): + """RRF should match by chunk_id, not reference_id. + + Vector results include both chunk_id (e.g. 'chunk-abc123') and + reference_id (e.g. '1'). BM25 results use the same chunk_id. + The combiner must match by chunk_id so overlapping results merge. + """ + combiner = RRFCombiner(k=60) + + bm25_results = [ + BM25SearchResult( + chunk_id="chunk-abc123", + content="shared result", + file_path="/doc.pdf", + score=5.0, + metadata={}, + ), + ] + + vector_results = { + "data": { + "chunks": [ + { + "chunk_id": "chunk-abc123", + "reference_id": "1", + "content": "shared result", + "file_path": "/doc.pdf", + }, + ] + } + } + + combined = combiner.combine(bm25_results, vector_results, top_k=10) + + assert len(combined) == 1 + assert combined[0].chunk_id == "chunk-abc123" + assert combined[0].bm25_rank == 1 + assert combined[0].vector_rank == 1 + assert combined[0].reference_id == "1" + assert combined[0].combined_score == 1 / (60 + 1) + 1 / (60 + 1) + + +def test_combine_preserves_reference_id_from_vector(): + """RRF should preserve reference_id from vector results.""" + combiner = RRFCombiner() + + bm25_results = [] + + vector_results = { + "data": { + "chunks": [ + { + "chunk_id": "chunk-xyz", + "reference_id": "3", + "content": "Vector result", + "file_path": "/c.pdf", + }, + ] + } + } + + combined = combiner.combine(bm25_results, vector_results, top_k=10) + + assert len(combined) == 1 + assert combined[0].reference_id == "3" + + +def test_combine_no_chunk_id_falls_back_to_reference_id(): + """If vector results lack chunk_id, use reference_id as fallback.""" + combiner = RRFCombiner() + + bm25_results = [] + + vector_results = { + "data": { + "chunks": [ + { + "reference_id": "5", + "content": "Old format vector result", + "file_path": "/d.pdf", + }, + ] + } + } + + combined = combiner.combine(bm25_results, vector_results, top_k=10) + + assert len(combined) == 1 + assert combined[0].chunk_id == "5" diff --git a/tests/unit/test_alembic_config.py b/tests/unit/test_alembic_config.py new file mode 100644 index 0000000..d7878cd --- /dev/null +++ b/tests/unit/test_alembic_config.py @@ -0,0 +1,56 @@ +"""Tests for Alembic configuration and environment.""" + +import importlib.util +from pathlib import Path + +ALEMBIC_DIR = Path(__file__).parent.parent.parent / "src" / "alembic" +SRC_DIR = Path(__file__).parent.parent.parent / "src" + + +class TestAlembicConfig: + """Tests for Alembic configuration files.""" + + def test_alembic_ini_exists(self): + alembic_ini = SRC_DIR / "alembic.ini" + assert alembic_ini.exists(), f"alembic.ini should exist at {alembic_ini}" + + def test_alembic_versions_dir_exists(self): + versions_dir = ALEMBIC_DIR / "versions" + assert versions_dir.is_dir(), "alembic/versions directory should exist" + + def test_get_url_converts_asyncpg_to_sync(self): + """Should convert postgresql+asyncpg:// to postgresql://.""" + test_cases = [ + ( + "postgresql+asyncpg://user:pass@host/db", + "postgresql://user:pass@host/db", + ), + ("postgresql://user:pass@host/db", "postgresql://user:pass@host/db"), + ] + for input_url, expected in test_cases: + result = input_url.replace("+asyncpg", "") + assert result == expected + + def test_target_metadata_is_none(self): + env_path = ALEMBIC_DIR / "env.py" + content = env_path.read_text() + assert "target_metadata = None" in content + + def test_migration_001_exists(self): + migration = ALEMBIC_DIR / "versions" / "001_add_bm25_support.py" + assert migration.exists(), f"Migration 001 should exist at {migration}" + + def test_migration_001_has_upgrade_and_downgrade(self): + migration_path = ALEMBIC_DIR / "versions" / "001_add_bm25_support.py" + spec = importlib.util.spec_from_file_location("migration_001", migration_path) + assert spec is not None + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) # type: ignore[union-attr] + assert hasattr(module, "upgrade"), "Migration should have upgrade()" + assert hasattr(module, "downgrade"), "Migration should have downgrade()" + + def test_env_py_has_async_support(self): + env_path = ALEMBIC_DIR / "env.py" + content = env_path.read_text() + assert "async_engine_from_config" in content + assert "run_async_migrations" in content diff --git a/tests/unit/test_lifespan.py b/tests/unit/test_lifespan.py new file mode 100644 index 0000000..f59b9e7 --- /dev/null +++ b/tests/unit/test_lifespan.py @@ -0,0 +1,73 @@ +"""Tests for FastAPI lifespan management.""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + + +class TestLifespan: + """Tests for lifespan context managers in main.py.""" + + @pytest.mark.asyncio + async def test_db_lifespan_closes_bm25_pool_on_shutdown(self): + """Should close BM25 adapter connection pool on shutdown.""" + from main import db_lifespan + + mock_app = MagicMock() + mock_bm25 = AsyncMock() + + with patch("main.bm25_adapter", mock_bm25): + async with db_lifespan(mock_app): + pass + mock_bm25.close.assert_called_once() + + @pytest.mark.asyncio + async def test_db_lifespan_handles_no_bm25_adapter(self): + """Should handle gracefully when bm25_adapter is None.""" + from main import db_lifespan + + mock_app = MagicMock() + + with patch("main.bm25_adapter", None): + async with db_lifespan(mock_app): + pass + + @pytest.mark.asyncio + async def test_db_lifespan_handles_close_failure(self): + """Should not crash if BM25 close fails.""" + from main import db_lifespan + + mock_app = MagicMock() + mock_bm25 = AsyncMock() + mock_bm25.close = AsyncMock(side_effect=Exception("Close failed")) + + with patch("main.bm25_adapter", mock_bm25): + async with db_lifespan(mock_app): + pass + mock_bm25.close.assert_called_once() + + @pytest.mark.asyncio + async def test_run_alembic_upgrade_calls_command(self): + """Should call alembic command.upgrade with head.""" + with ( + patch("main.command.upgrade") as mock_upgrade, + patch("main.Config") as mock_config_cls, + ): + mock_cfg = MagicMock() + mock_config_cls.return_value = mock_cfg + from main import _run_alembic_upgrade + + _run_alembic_upgrade() + mock_upgrade.assert_called_once_with(mock_cfg, "head") + + def test_run_fastapi_runs_migrations_before_uvicorn(self): + """Should run migrations synchronously before starting uvicorn.""" + with ( + patch("main._run_alembic_upgrade") as mock_migrate, + patch("main.uvicorn.run") as mock_uvicorn, + ): + from main import run_fastapi + + run_fastapi() + mock_migrate.assert_called_once() + mock_uvicorn.assert_called_once() diff --git a/tests/unit/test_query_use_case.py b/tests/unit/test_query_use_case.py index 8302e1c..46e9bd3 100644 --- a/tests/unit/test_query_use_case.py +++ b/tests/unit/test_query_use_case.py @@ -1,6 +1,7 @@ from unittest.mock import AsyncMock from application.use_cases.query_use_case import QueryUseCase +from domain.ports.bm25_engine import BM25SearchResult class TestQueryUseCase: @@ -99,3 +100,91 @@ async def test_execute_with_mix_mode( top_k=5, working_dir="/tmp/rag/test", ) + + async def test_execute_hybrid_plus_with_bm25( + self, mock_rag_engine: AsyncMock + ) -> None: + """hybrid+ mode should execute parallel BM25 + vector search.""" + mock_bm25 = AsyncMock() + mock_bm25.search.return_value = [ + BM25SearchResult( + chunk_id="chunk-abc123", + content="bm25 result", + file_path="/a.pdf", + score=5.0, + metadata={}, + ) + ] + mock_rag_engine.query.return_value = { + "data": { + "chunks": [ + { + "chunk_id": "chunk-abc123", + "reference_id": "2", + "content": "vector result", + "file_path": "/b.pdf", + } + ] + } + } + use_case = QueryUseCase(rag_engine=mock_rag_engine, bm25_engine=mock_bm25) + + result = await use_case.execute( + working_dir="/tmp/rag/test", query="search", mode="hybrid+", top_k=10 + ) + + mock_bm25.search.assert_called_once() + mock_rag_engine.query.assert_called() + assert result["status"] == "success" + assert result["metadata"]["query_mode"] == "hybrid+" + + chunk = result["data"]["chunks"][0] + assert chunk["chunk_id"] == "chunk-abc123" + assert chunk["reference_id"] == "2" + + async def test_execute_hybrid_plus_without_bm25_falls_back( + self, mock_rag_engine: AsyncMock + ) -> None: + """hybrid+ mode without BM25 should fall back to naive vector search.""" + mock_rag_engine.query.return_value = {"status": "success", "data": {}} + use_case = QueryUseCase(rag_engine=mock_rag_engine, bm25_engine=None) + + await use_case.execute( + working_dir="/tmp/rag/test", query="search", mode="hybrid+", top_k=10 + ) + + mock_rag_engine.query.assert_called_once_with( + query="search", mode="naive", top_k=10, working_dir="/tmp/rag/test" + ) + + async def test_execute_bm25_only_mode(self, mock_rag_engine: AsyncMock) -> None: + """bm25 mode should only use BM25 search without vector.""" + mock_bm25 = AsyncMock() + mock_bm25.search.return_value = [ + BM25SearchResult( + chunk_id="1", content="test", file_path="/a.pdf", score=5.0, metadata={} + ) + ] + use_case = QueryUseCase(rag_engine=mock_rag_engine, bm25_engine=mock_bm25) + + result = await use_case.execute( + working_dir="/tmp/rag/test", query="search", mode="bm25", top_k=10 + ) + + mock_bm25.search.assert_called_once_with("search", "/tmp/rag/test", 10) + mock_rag_engine.query.assert_not_called() + assert result["status"] == "success" + assert result["metadata"]["query_mode"] == "bm25" + + async def test_execute_bm25_mode_without_bm25_returns_error( + self, mock_rag_engine: AsyncMock + ) -> None: + """bm25 mode without BM25 engine should return error.""" + use_case = QueryUseCase(rag_engine=mock_rag_engine, bm25_engine=None) + + result = await use_case.execute( + working_dir="/tmp/rag/test", query="search", mode="bm25", top_k=10 + ) + + assert result["status"] == "error" + assert "BM25 engine not available" in result["message"] diff --git a/trivy-report-current.json b/trivy-report-current.json index 64313ea..60ed006 100644 --- a/trivy-report-current.json +++ b/trivy-report-current.json @@ -3,20 +3,42 @@ "Trivy": { "Version": "0.69.3" }, - "ReportID": "019d67a8-03be-7bc3-96ad-233658ba2bdf", - "CreatedAt": "2026-04-07T13:15:57.502773+02:00", - "ArtifactID": "sha256:44de1297411e46aa253b4587f1a66eb89f732fd7bd66822db54093a6f7fc28ca", + "ReportID": "019d695b-0697-7d65-9dbe-387e1f838d1c", + "CreatedAt": "2026-04-07T21:11:06.39188+02:00", + "ArtifactID": "sha256:9829ca31ff653ff8fe9be186152bf65041835de0e419a8b4359bf2b7189673c3", "ArtifactName": ".", "ArtifactType": "repository", "Metadata": { "RepoURL": "https://github.com/Kaiohz/mcp-raganything.git", - "Branch": "main", - "Commit": "ef601f6d3a8415d3a8329292e645d2e3d8c0e8a9", - "CommitMsg": "fix: remove conditional on SonarQube CI step, secrets are now configured (#5)", - "Author": "Yohan Gonçalves \u003cyohan.goncalves.pro@gmail.com\u003e", - "Committer": "GitHub \u003cnoreply@github.com\u003e" + "Branch": "BRIC-7/add-bm25-pg-textsearch", + "Commit": "c3e5cb316291d7fc0082899f8ae23523e8cbb82e", + "CommitMsg": "refactor: Reduce cognitive complexity in RRF combiner (sonar S3776) (BRIC-7)", + "Author": "Kaiohz \u003cyohan.goncalves@cosigma.io\u003e", + "Committer": "Kaiohz \u003cyohan.goncalves@cosigma.io\u003e" }, "Results": [ + { + "Target": ".venv/lib/python3.14/site-packages/paddleocr/ppstructure/kie/requirements.txt", + "Class": "lang-pkgs", + "Type": "pip", + "Packages": [ + { + "Name": "paddlenlp", + "Identifier": { + "PURL": "pkg:pypi/paddlenlp@2.5.2", + "UID": "2b35cf3d8063c65e" + }, + "Version": "2.5.2", + "Locations": [ + { + "StartLine": 7, + "EndLine": 7 + } + ], + "AnalyzedBy": "pip" + } + ] + }, { "Target": "Python", "Class": "lang-pkgs", @@ -26,13 +48,13 @@ "Name": "my-test-package", "Identifier": { "PURL": "pkg:pypi/my-test-package@1.0", - "UID": "cf354c804175f1b1" + "UID": "b9a06bfcca9bf672" }, "Version": "1.0", "Licenses": [ "UNKNOWN" ], - "FilePath": ".venv/lib/python3.13/site-packages/pkg_resources/tests/data/my-test-package_zipped-egg/my_test_package-1.0-py3.7.egg", + "FilePath": ".venv/lib/python3.14/site-packages/pkg_resources/tests/data/my-test-package_zipped-egg/my_test_package-1.0-py3.7.egg", "AnalyzedBy": "python-egg" } ] @@ -47,20 +69,21 @@ "Name": "mcp-raganything", "Identifier": { "PURL": "pkg:pypi/mcp-raganything@0.1.0", - "UID": "27f0726934767eb7" + "UID": "c7408d4962c500ca" }, "Version": "0.1.0", "Relationship": "root", "DependsOn": [ "aiofiles@24.1.0", + "alembic@1.18.4", "asyncpg@0.31.0", "authlib@1.6.9", "cryptography@46.0.6", - "docling@2.83.0", + "docling@2.84.0", "fastapi@0.135.3", "fastmcp@3.2.0", "httpx@0.28.1", - "lightrag-hku@1.4.12", + "lightrag-hku@1.4.13", "mcp@1.26.0", "minio@7.2.20", "openai@2.30.0", @@ -85,6 +108,22 @@ "Relationship": "direct", "AnalyzedBy": "uv" }, + { + "ID": "alembic@1.18.4", + "Name": "alembic", + "Identifier": { + "PURL": "pkg:pypi/alembic@1.18.4", + "UID": "cdc5b7a87334bcc2" + }, + "Version": "1.18.4", + "Relationship": "direct", + "DependsOn": [ + "mako@1.3.10", + "sqlalchemy@2.0.48", + "typing-extensions@4.15.0" + ], + "AnalyzedBy": "uv" + }, { "ID": "asyncpg@0.31.0", "Name": "asyncpg", @@ -125,13 +164,13 @@ "AnalyzedBy": "uv" }, { - "ID": "docling@2.83.0", + "ID": "docling@2.84.0", "Name": "docling", "Identifier": { - "PURL": "pkg:pypi/docling@2.83.0", - "UID": "8c0d76ec334dff22" + "PURL": "pkg:pypi/docling@2.84.0", + "UID": "f6f6b94364dcf6fc" }, - "Version": "2.83.0", + "Version": "2.84.0", "Relationship": "direct", "DependsOn": [ "accelerate@1.13.0", @@ -238,13 +277,13 @@ "AnalyzedBy": "uv" }, { - "ID": "lightrag-hku@1.4.12", + "ID": "lightrag-hku@1.4.13", "Name": "lightrag-hku", "Identifier": { - "PURL": "pkg:pypi/lightrag-hku@1.4.12", - "UID": "70958c7c95dc56d7" + "PURL": "pkg:pypi/lightrag-hku@1.4.13", + "UID": "41086f8911af13ed" }, - "Version": "1.4.12", + "Version": "1.4.13", "Relationship": "direct", "DependsOn": [ "aiofiles@24.1.0", @@ -412,15 +451,22 @@ "Name": "raganything", "Identifier": { "PURL": "pkg:pypi/raganything@1.2.10", - "UID": "1ab5e428ac85ffcf" + "UID": "c9ebb114e1b109ba" }, "Version": "1.2.10", "Relationship": "direct", "DependsOn": [ "huggingface-hub@0.36.2", - "lightrag-hku@1.4.12", + "lightrag-hku@1.4.13", + "markdown@3.10.2", "mineru@3.0.7", - "tqdm@4.67.3" + "paddleocr@2.10.0", + "pillow@12.2.0", + "pygments@2.20.0", + "pypdfium2@4.30.0", + "reportlab@4.4.10", + "tqdm@4.67.3", + "weasyprint@68.1" ], "AnalyzedBy": "uv" }, @@ -794,6 +840,21 @@ "Relationship": "indirect", "AnalyzedBy": "uv" }, + { + "ID": "brotlicffi@1.2.0.1", + "Name": "brotlicffi", + "Identifier": { + "PURL": "pkg:pypi/brotlicffi@1.2.0.1", + "UID": "e8b5d9c1977eb78e" + }, + "Version": "1.2.0.1", + "Indirect": true, + "Relationship": "indirect", + "DependsOn": [ + "cffi@2.0.0" + ], + "AnalyzedBy": "uv" + }, { "ID": "cachetools@7.0.5", "Name": "cachetools", @@ -923,6 +984,22 @@ "Relationship": "indirect", "AnalyzedBy": "uv" }, + { + "ID": "cssselect2@0.9.0", + "Name": "cssselect2", + "Identifier": { + "PURL": "pkg:pypi/cssselect2@0.9.0", + "UID": "70afa14bae638fe1" + }, + "Version": "0.9.0", + "Indirect": true, + "Relationship": "indirect", + "DependsOn": [ + "tinycss2@1.5.1", + "webencodings@0.5.1" + ], + "AnalyzedBy": "uv" + }, { "ID": "cuda-bindings@13.2.0", "Name": "cuda-bindings", @@ -993,6 +1070,18 @@ ], "AnalyzedBy": "uv" }, + { + "ID": "cython@3.2.4", + "Name": "cython", + "Identifier": { + "PURL": "pkg:pypi/cython@3.2.4", + "UID": "fe8b7d655ea803c6" + }, + "Version": "3.2.4", + "Indirect": true, + "Relationship": "indirect", + "AnalyzedBy": "uv" + }, { "ID": "defusedxml@0.7.1", "Name": "defusedxml", @@ -1278,6 +1367,21 @@ "Relationship": "indirect", "AnalyzedBy": "uv" }, + { + "ID": "fire@0.7.1", + "Name": "fire", + "Identifier": { + "PURL": "pkg:pypi/fire@0.7.1", + "UID": "1290e6fb2c44d6d3" + }, + "Version": "0.7.1", + "Indirect": true, + "Relationship": "indirect", + "DependsOn": [ + "termcolor@3.3.0" + ], + "AnalyzedBy": "uv" + }, { "ID": "flatbuffers@25.12.19", "Name": "flatbuffers", @@ -1290,6 +1394,23 @@ "Relationship": "indirect", "AnalyzedBy": "uv" }, + { + "ID": "fonttools@4.62.1", + "Name": "fonttools", + "Identifier": { + "PURL": "pkg:pypi/fonttools@4.62.1", + "UID": "a148a8969201fe5c" + }, + "Version": "4.62.1", + "Indirect": true, + "Relationship": "indirect", + "DependsOn": [ + "brotli@1.2.0", + "brotlicffi@1.2.0.1", + "zopfli@0.4.1" + ], + "AnalyzedBy": "uv" + }, { "ID": "frozenlist@1.8.0", "Name": "frozenlist", @@ -1881,6 +2002,18 @@ ], "AnalyzedBy": "uv" }, + { + "ID": "lmdb@2.2.0", + "Name": "lmdb", + "Identifier": { + "PURL": "pkg:pypi/lmdb@2.2.0", + "UID": "ccfd487f051dc543" + }, + "Version": "2.2.0", + "Indirect": true, + "Relationship": "indirect", + "AnalyzedBy": "uv" + }, { "ID": "loguru@0.7.3", "Name": "loguru", @@ -1925,6 +2058,21 @@ ], "AnalyzedBy": "uv" }, + { + "ID": "mako@1.3.10", + "Name": "mako", + "Identifier": { + "PURL": "pkg:pypi/mako@1.3.10", + "UID": "c3f5442c690bcb7e" + }, + "Version": "1.3.10", + "Indirect": true, + "Relationship": "indirect", + "DependsOn": [ + "markupsafe@3.0.3" + ], + "AnalyzedBy": "uv" + }, { "ID": "mammoth@1.12.0", "Name": "mammoth", @@ -1940,6 +2088,18 @@ ], "AnalyzedBy": "uv" }, + { + "ID": "markdown@3.10.2", + "Name": "markdown", + "Identifier": { + "PURL": "pkg:pypi/markdown@3.10.2", + "UID": "8d04ca3112f1b5c2" + }, + "Version": "3.10.2", + "Indirect": true, + "Relationship": "indirect", + "AnalyzedBy": "uv" + }, { "ID": "markdown-it-py@4.0.0", "Name": "markdown-it-py", @@ -2462,6 +2622,21 @@ ], "AnalyzedBy": "uv" }, + { + "ID": "opencv-contrib-python@4.13.0.92", + "Name": "opencv-contrib-python", + "Identifier": { + "PURL": "pkg:pypi/opencv-contrib-python@4.13.0.92", + "UID": "4cfd654bb7bbabba" + }, + "Version": "4.13.0.92", + "Indirect": true, + "Relationship": "indirect", + "DependsOn": [ + "numpy@2.4.4" + ], + "AnalyzedBy": "uv" + }, { "ID": "opencv-python@4.13.0.92", "Name": "opencv-python", @@ -2547,6 +2722,39 @@ "Relationship": "indirect", "AnalyzedBy": "uv" }, + { + "ID": "paddleocr@2.10.0", + "Name": "paddleocr", + "Identifier": { + "PURL": "pkg:pypi/paddleocr@2.10.0", + "UID": "1b504a5dbdb8859e" + }, + "Version": "2.10.0", + "Indirect": true, + "Relationship": "indirect", + "DependsOn": [ + "albucore@0.0.24", + "albumentations@2.0.8", + "beautifulsoup4@4.14.3", + "cython@3.2.4", + "fire@0.7.1", + "fonttools@4.62.1", + "lmdb@2.2.0", + "numpy@2.4.4", + "opencv-contrib-python@4.13.0.92", + "opencv-python@4.13.0.92", + "pillow@12.2.0", + "pyclipper@1.4.0", + "python-docx@1.2.0", + "pyyaml@6.0.3", + "rapidfuzz@3.14.5", + "requests@2.33.1", + "scikit-image@0.26.0", + "shapely@2.1.2", + "tqdm@4.67.3" + ], + "AnalyzedBy": "uv" + }, { "ID": "pandas@2.3.3", "Name": "pandas", @@ -2859,6 +3067,18 @@ "Relationship": "indirect", "AnalyzedBy": "uv" }, + { + "ID": "pydyf@0.12.1", + "Name": "pydyf", + "Identifier": { + "PURL": "pkg:pypi/pydyf@0.12.1", + "UID": "5ab04c66a2b48b21" + }, + "Version": "0.12.1", + "Indirect": true, + "Relationship": "indirect", + "AnalyzedBy": "uv" + }, { "ID": "pygments@2.20.0", "Name": "pygments", @@ -3011,6 +3231,18 @@ "Relationship": "indirect", "AnalyzedBy": "uv" }, + { + "ID": "pyphen@0.17.2", + "Name": "pyphen", + "Identifier": { + "PURL": "pkg:pypi/pyphen@0.17.2", + "UID": "b9fd3d00209ae3cd" + }, + "Version": "0.17.2", + "Indirect": true, + "Relationship": "indirect", + "AnalyzedBy": "uv" + }, { "ID": "pypinyin@0.55.0", "Name": "pypinyin", @@ -3174,6 +3406,18 @@ ], "AnalyzedBy": "uv" }, + { + "ID": "rapidfuzz@3.14.5", + "Name": "rapidfuzz", + "Identifier": { + "PURL": "pkg:pypi/rapidfuzz@3.14.5", + "UID": "4334814b20e55342" + }, + "Version": "3.14.5", + "Indirect": true, + "Relationship": "indirect", + "AnalyzedBy": "uv" + }, { "ID": "rapidocr@3.7.0", "Name": "rapidocr", @@ -3646,6 +3890,18 @@ "Relationship": "indirect", "AnalyzedBy": "uv" }, + { + "ID": "termcolor@3.3.0", + "Name": "termcolor", + "Identifier": { + "PURL": "pkg:pypi/termcolor@3.3.0", + "UID": "88bdf909e3550c1d" + }, + "Version": "3.3.0", + "Indirect": true, + "Relationship": "indirect", + "AnalyzedBy": "uv" + }, { "ID": "tifffile@2026.3.3", "Name": "tifffile", @@ -3677,6 +3933,36 @@ ], "AnalyzedBy": "uv" }, + { + "ID": "tinycss2@1.5.1", + "Name": "tinycss2", + "Identifier": { + "PURL": "pkg:pypi/tinycss2@1.5.1", + "UID": "5f5bf54206e20b33" + }, + "Version": "1.5.1", + "Indirect": true, + "Relationship": "indirect", + "DependsOn": [ + "webencodings@0.5.1" + ], + "AnalyzedBy": "uv" + }, + { + "ID": "tinyhtml5@2.1.0", + "Name": "tinyhtml5", + "Identifier": { + "PURL": "pkg:pypi/tinyhtml5@2.1.0", + "UID": "4ca217870b9d5b8" + }, + "Version": "2.1.0", + "Indirect": true, + "Relationship": "indirect", + "DependsOn": [ + "webencodings@0.5.1" + ], + "AnalyzedBy": "uv" + }, { "ID": "tokenizers@0.22.2", "Name": "tokenizers", @@ -3968,6 +4254,40 @@ "Relationship": "indirect", "AnalyzedBy": "uv" }, + { + "ID": "weasyprint@68.1", + "Name": "weasyprint", + "Identifier": { + "PURL": "pkg:pypi/weasyprint@68.1", + "UID": "a19c5a08f98e5f73" + }, + "Version": "68.1", + "Indirect": true, + "Relationship": "indirect", + "DependsOn": [ + "cffi@2.0.0", + "cssselect2@0.9.0", + "fonttools@4.62.1", + "pillow@12.2.0", + "pydyf@0.12.1", + "pyphen@0.17.2", + "tinycss2@1.5.1", + "tinyhtml5@2.1.0" + ], + "AnalyzedBy": "uv" + }, + { + "ID": "webencodings@0.5.1", + "Name": "webencodings", + "Identifier": { + "PURL": "pkg:pypi/webencodings@0.5.1", + "UID": "68511b951cc24266" + }, + "Version": "0.5.1", + "Indirect": true, + "Relationship": "indirect", + "AnalyzedBy": "uv" + }, { "ID": "websockets@16.0", "Name": "websockets", @@ -4032,109 +4352,24 @@ "Indirect": true, "Relationship": "indirect", "AnalyzedBy": "uv" - } - ], - "Vulnerabilities": [ - { - "VulnerabilityID": "CVE-2026-30762", - "VendorIDs": [ - "GHSA-mcww-4hxq-hfr3" - ], - "PkgID": "lightrag-hku@1.4.12", - "PkgName": "lightrag-hku", - "PkgIdentifier": { - "PURL": "pkg:pypi/lightrag-hku@1.4.12", - "UID": "70958c7c95dc56d7" - }, - "InstalledVersion": "1.4.12", - "FixedVersion": "1.4.13", - "Status": "fixed", - "SeveritySource": "ghsa", - "PrimaryURL": "https://avd.aquasec.com/nvd/cve-2026-30762", - "DataSource": { - "ID": "ghsa", - "Name": "GitHub Security Advisory pip", - "URL": "https://github.com/advisories?query=type%3Areviewed+ecosystem%3Apip" - }, - "Fingerprint": "sha256:2e3d703f6045ff5badd8fef003b641bfd5be6083555af203173539f322c7247f", - "Title": "LightRAG: Hardcoded JWT Signing Secret Allows Authentication Bypass", - "Description": "Summary:\nThe file lightrag/api/config.py (line 397) uses a default JWT secret \"lightrag-jwt-default-secret\" when the TOKEN_SECRET environment variable is not set. The AuthHandler in lightrag/api/auth.py (lines 24-25) uses this secret to sign and verify tokens. An unauthenticated attacker can forge valid JWT tokens using the publicly known default secret and gain access to any protected endpoint.\n\nReproduction:\n1. Install LightRAG v1.4.10 with AUTH_ACCOUNTS configured but no TOKEN_SECRET set\n2. Use PyJWT to sign a token: jwt.encode({\"sub\": \"admin\", \"role\": \"user\"}, \"lightrag-jwt-default-secret\", algorithm=\"HS256\")\n3. Send a request to any protected endpoint with the header: Authorization: Bearer \u003cforged_token\u003e\n4. Access is granted without valid credentials\n\nSuggested Fix:\nRequire TOKEN_SECRET to be explicitly set when AUTH_ACCOUNTS is configured. Refuse to start the API server if authentication is enabled but no custom secret is provided.\n\n---\nVenkata Avinash Taduturi\ntaduturivenkata@gmail.com", - "Severity": "HIGH", - "VendorSeverity": { - "ghsa": 3 - }, - "CVSS": { - "ghsa": { - "V3Vector": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:N/A:N", - "V3Score": 7.5 - } + }, + { + "ID": "zopfli@0.4.1", + "Name": "zopfli", + "Identifier": { + "PURL": "pkg:pypi/zopfli@0.4.1", + "UID": "c3159c362bdeaae3" }, - "References": [ - "https://github.com/HKUDS/LightRAG", - "https://github.com/HKUDS/LightRAG/security/advisories/GHSA-mcww-4hxq-hfr3" - ] + "Version": "0.4.1", + "Indirect": true, + "Relationship": "indirect", + "AnalyzedBy": "uv" } ] }, { - "Target": ".venv/lib/python3.13/site-packages/skimage/data/_fetchers.py", - "Class": "secret", - "Secrets": [ - { - "RuleID": "jwt-token", - "Category": "JWT", - "Severity": "MEDIUM", - "Title": "JWT token", - "StartLine": 528, - "EndLine": 528, - "Code": { - "Lines": [ - { - "Number": 526, - "Content": " \u003e\u003e\u003e import requests", - "IsCause": false, - "Annotation": "", - "Truncated": false, - "Highlighted": " \u003e\u003e\u003e import requests", - "FirstCause": false, - "LastCause": false - }, - { - "Number": 527, - "Content": " \u003e\u003e\u003e import zipfile", - "IsCause": false, - "Annotation": "", - "Truncated": false, - "Highlighted": " \u003e\u003e\u003e import zipfile", - "FirstCause": false, - "LastCause": false - }, - { - "Number": 528, - "Content": "9-be36-26ec9bc0df3b.jpg?token=****************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************'", - "IsCause": true, - "Annotation": "", - "Truncated": false, - "Highlighted": "9-be36-26ec9bc0df3b.jpg?token=****************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************'", - "FirstCause": true, - "LastCause": true - }, - { - "Number": 529, - "Content": " \u003e\u003e\u003e r = requests.get(url)", - "IsCause": false, - "Annotation": "", - "Truncated": false, - "Highlighted": " \u003e\u003e\u003e r = requests.get(url)", - "FirstCause": false, - "LastCause": false - } - ] - }, - "Match": "9-be36-26ec9bc0df3b.jpg?token=****************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************'", - "Offset": 17236 - } - ] + "Target": ".venv/lib/python3.14/site-packages/skimage/data/_fetchers.py", + "Class": "secret" } ] } diff --git a/uv.lock b/uv.lock index 43a4a31..c8cfc55 100644 --- a/uv.lock +++ b/uv.lock @@ -166,6 +166,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/64/013409c451a44b61310fb757af4527f3de57fc98a00f40448de28b864290/albumentations-2.0.8-py3-none-any.whl", hash = "sha256:c4c4259aaf04a7386ad85c7fdcb73c6c7146ca3057446b745cc035805acb1017", size = 369423, upload-time = "2025-05-27T21:23:15.609Z" }, ] +[[package]] +name = "alembic" +version = "1.18.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mako" }, + { name = "sqlalchemy" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/94/13/8b084e0f2efb0275a1d534838844926f798bd766566b1375174e2448cd31/alembic-1.18.4.tar.gz", hash = "sha256:cb6e1fd84b6174ab8dbb2329f86d631ba9559dd78df550b57804d607672cedbc", size = 2056725, upload-time = "2026-02-10T16:00:47.195Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/29/6533c317b74f707ea28f8d633734dbda2119bbadfc61b2f3640ba835d0f7/alembic-1.18.4-py3-none-any.whl", hash = "sha256:a5ed4adcf6d8a4cb575f3d759f071b03cd6e5c7618eb796cb52497be25bfe19a", size = 263893, upload-time = "2026-02-10T16:00:49.997Z" }, +] + [[package]] name = "annotated-doc" version = "0.0.4" @@ -2285,6 +2299,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/12/185a8822994a2f7b5e7d88d19a88d80637917bbb0a6f3f59a2564aabc125/magika-1.0.2-py3-none-win_amd64.whl", hash = "sha256:4937e876d55642423d6416e5db4e5ca7523ab7f855cbc5389efdeac1d149df04", size = 13099543, upload-time = "2026-02-25T16:07:01.942Z" }, ] +[[package]] +name = "mako" +version = "1.3.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/38/bd5b78a920a64d708fe6bc8e0a2c075e1389d53bef8413725c63ba041535/mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28", size = 392474, upload-time = "2025-04-10T12:44:31.16Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/fb/99f81ac72ae23375f22b7afdb7642aba97c00a713c217124420147681a2f/mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59", size = 78509, upload-time = "2025-04-10T12:50:53.297Z" }, +] + [[package]] name = "mammoth" version = "1.12.0" @@ -2410,6 +2436,7 @@ version = "0.1.0" source = { virtual = "." } dependencies = [ { name = "aiofiles" }, + { name = "alembic" }, { name = "asyncpg" }, { name = "authlib" }, { name = "cryptography" }, @@ -2442,6 +2469,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "aiofiles", specifier = ">=24.1.0" }, + { name = "alembic", specifier = ">=1.13.0" }, { name = "asyncpg", specifier = ">=0.31.0" }, { name = "authlib", specifier = ">=1.6.9" }, { name = "cryptography", specifier = ">=46.0.5" },