diff --git a/README.md b/README.md index 2261d99..fec5ab1 100644 --- a/README.md +++ b/README.md @@ -19,21 +19,24 @@ Multi-modal RAG service exposing a REST API and MCP server for document indexing | api/ | | | indexing_routes.py | | | query_routes.py | | + | file_routes.py | | | health_routes.py | | | use_cases/ | | | IndexFileUseCase | | | IndexFolderUseCase | | | QueryUseCase | | + | ListFilesUseCase | | + | ReadFileUseCase | | | requests/ responses/ | | +------------------------------+ | | | | | v v v v Domain Layer (ports) +------------------------------------------+ - | RAGEnginePort StoragePort BM25EnginePort| + | RAGEnginePort StoragePort BM25EnginePort DocumentReaderPort | +------------------------------------------+ - | | | - v v v + | | | | + v v v v Infrastructure Layer (adapters) +------------------------------------------+ | LightRAGAdapter MinioAdapter | @@ -41,12 +44,15 @@ Multi-modal RAG service exposing a REST API and MCP server for document indexing | | | PostgresBM25Adapter RRFCombiner | | (pg_textsearch) (hybrid+ fusion) | + | | + | KreuzbergAdapter | + | (kreuzberg - 91 formats) | +------------------------------------------+ - | | | - v v v - PostgreSQL MinIO - (pgvector + (object - Apache AGE storage) + | | | | + v v v v + PostgreSQL MinIO Kreuzberg + (pgvector + (object (document + Apache AGE storage) extraction) pg_textsearch) ``` @@ -179,6 +185,65 @@ The service automatically detects and processes the following document formats t **Note:** File format detection is automatic. No configuration is required to specify the document type. The service will process any supported format when indexed. All document and image formats are supported out-of-the-box when installed with `raganything[all]`. +## File Browsing & Reading + +Browse and read files directly from MinIO without indexing them into the RAG knowledge base. Powered by [Kreuzberg](https://github.com/kreuzberg-dev/kreuzberg) for document text extraction (91 file formats). + +### List files + +```bash +# List all files in the bucket +curl http://localhost:8000/api/v1/files/list + +# List files under a specific prefix +curl "http://localhost:8000/api/v1/files/list?prefix=documents/&recursive=true" +``` + +Response (`200 OK`): + +```json +[ + {"object_name": "documents/report.pdf", "size": 1024, "last_modified": "2026-01-01 00:00:00+00:00"}, + {"object_name": "documents/notes.txt", "size": 512, "last_modified": "2026-01-02 00:00:00+00:00"} +] +``` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `prefix` | string | `""` | MinIO prefix to filter files by | +| `recursive` | boolean | `true` | List files in subdirectories | + +### Read a file + +Downloads the file from MinIO, extracts its text content using Kreuzberg, and returns the result. Supports 91 file formats including PDF, Office documents, images, and HTML. + +```bash +curl -X POST http://localhost:8000/api/v1/files/read \ + -H "Content-Type: application/json" \ + -d '{"file_path": "documents/report.pdf"}' +``` + +Response (`200 OK`): + +```json +{ + "content": "Extracted text from the document...", + "metadata": {"format_type": "pdf", "mime_type": "application/pdf"}, + "tables": [{"markdown": "| Header | Value |\n|---|---|\n| A | 1 |"}] +} +``` + +| Field | Type | Description | +|-------|------|-------------| +| `file_path` | string | **Required.** File path in the MinIO bucket (relative, no `..` or absolute paths) | + +Error responses: + +| Status | Condition | +|--------|-----------| +| `404` | File not found in MinIO | +| `422` | Unsupported file format or invalid path (path traversal, absolute path) | + ### Query Query the indexed knowledge base. The RAG engine is initialized for the given `working_dir` before executing the query. @@ -319,7 +384,7 @@ The `combined_score` is the sum of `bm25_score` and `vector_score`, each compute ## MCP Server -The MCP server is mounted at `/mcp` and exposes a single tool: `query_knowledge_base`. +The MCP server is mounted at `/mcp` and exposes the following tools: ### Tool: `query_knowledge_base` @@ -330,6 +395,31 @@ The MCP server is mounted at `/mcp` and exposes a single tool: `query_knowledge_ | `mode` | string | `"naive"` | Search mode: `naive`, `local`, `global`, `hybrid`, `hybrid+`, `mix`, `bm25`, `bypass` | | `top_k` | integer | `10` | Number of chunks to retrieve | +### Tool: `query_knowledge_base_multimodal` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `working_dir` | string | required | RAG workspace directory for this project | +| `query` | string | required | The search query | +| `multimodal_content` | list | required | List of multimodal content items | +| `mode` | string | `"hybrid"` | Search mode | +| `top_k` | integer | `5` | Number of chunks to retrieve | + +### Tool: `list_files` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `prefix` | string | `""` | MinIO prefix to filter files by | +| `recursive` | boolean | `true` | List files in subdirectories | + +### Tool: `read_file` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `file_path` | string | required | File path in MinIO bucket (e.g. `documents/report.pdf`) | + +Downloads the file from MinIO, extracts its text content using Kreuzberg, and returns the extracted text along with metadata and any detected tables. + ### Transport modes The `MCP_TRANSPORT` environment variable controls how the MCP server is exposed: @@ -495,28 +585,36 @@ src/ indexing_result.py -- FileIndexingResult, FolderIndexingResult ports/ rag_engine.py -- RAGEnginePort (abstract) - storage_port.py -- StoragePort (abstract) + storage_port.py -- StoragePort (abstract) + FileInfo bm25_engine.py -- BM25EnginePort (abstract) + document_reader_port.py -- DocumentReaderPort (abstract) + DocumentContent application/ api/ health_routes.py -- GET /health indexing_routes.py -- POST /file/index, /folder/index query_routes.py -- POST /query - mcp_tools.py -- MCP tool: query_knowledge_base + file_routes.py -- GET /files/list, POST /files/read + mcp_tools.py -- MCP tools: query_knowledge_base, list_files, read_file requests/ indexing_request.py -- IndexFileRequest, IndexFolderRequest - query_request.py -- QueryRequest, QueryMode + query_request.py -- QueryRequest, MultimodalQueryRequest + file_request.py -- ListFilesRequest, ReadFileRequest responses/ query_response.py -- QueryResponse, QueryDataResponse + file_response.py -- FileInfoResponse, FileContentResponse use_cases/ index_file_use_case.py -- Downloads from MinIO, indexes single file index_folder_use_case.py -- Downloads from MinIO, indexes folder query_use_case.py -- Query with bm25/hybrid+ support + list_files_use_case.py -- Lists files with metadata from MinIO + read_file_use_case.py -- Reads file from MinIO, extracts content via Kreuzberg infrastructure/ rag/ lightrag_adapter.py -- LightRAGAdapter (RAGAnything/LightRAG) storage/ minio_adapter.py -- MinioAdapter (minio-py client) + document_reader/ + kreuzberg_adapter.py -- KreuzbergAdapter (kreuzberg, 91 formats) bm25/ pg_textsearch_adapter.py -- PostgresBM25Adapter (pg_textsearch) hybrid/ diff --git a/pyproject.toml b/pyproject.toml index 829ded9..ae181e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "fastmcp>=3.2.0", "cryptography>=46.0.5", "httpx>=0.27.0", + "kreuzberg>=4.0.0", "lightrag-hku>=1.4.13", "lightrag-hku[api]>=1.4.13", "mcp>=1.24.0", diff --git a/src/application/api/file_routes.py b/src/application/api/file_routes.py new file mode 100644 index 0000000..7031004 --- /dev/null +++ b/src/application/api/file_routes.py @@ -0,0 +1,53 @@ +from dataclasses import asdict + +from fastapi import APIRouter, Depends, HTTPException, status + +from application.requests.file_request import ReadFileRequest +from application.responses.file_response import FileContentResponse, FileInfoResponse +from application.use_cases.list_files_use_case import ListFilesUseCase +from application.use_cases.read_file_use_case import ReadFileUseCase +from dependencies import get_list_files_use_case, get_read_file_use_case + +file_router = APIRouter(tags=["Files"]) + + +@file_router.get( + "/files/list", + response_model=list[FileInfoResponse], + status_code=status.HTTP_200_OK, +) +async def list_files( + prefix: str = "", + recursive: bool = True, + use_case: ListFilesUseCase = Depends(get_list_files_use_case), +) -> list[FileInfoResponse]: + files = await use_case.execute(prefix=prefix, recursive=recursive) + return [FileInfoResponse(**asdict(f)) for f in files] + + +@file_router.post( + "/files/read", + response_model=FileContentResponse, + status_code=status.HTTP_200_OK, +) +async def read_file( + request: ReadFileRequest, + use_case: ReadFileUseCase = Depends(get_read_file_use_case), +) -> FileContentResponse: + try: + result = await use_case.execute(file_path=request.file_path) + except FileNotFoundError: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File not found: {request.file_path}", + ) from None + except ValueError as e: + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=str(e), + ) from None + return FileContentResponse( + content=result.content, + metadata=result.metadata, + tables=result.tables, + ) diff --git a/src/application/api/mcp_tools.py b/src/application/api/mcp_tools.py index 697a780..11f98c8 100644 --- a/src/application/api/mcp_tools.py +++ b/src/application/api/mcp_tools.py @@ -3,11 +3,22 @@ These tools are registered with FastMCP for Claude Desktop integration. """ +import logging +from dataclasses import asdict + from fastmcp import FastMCP from application.requests.query_request import MultimodalContentItem +from application.responses.file_response import FileContentResponse, FileInfoResponse from application.responses.query_response import ChunkResponse, QueryResponse -from dependencies import get_multimodal_query_use_case, get_query_use_case +from dependencies import ( + get_list_files_use_case, + get_multimodal_query_use_case, + get_query_use_case, + get_read_file_use_case, +) + +logger = logging.getLogger(__name__) mcp = FastMCP("RAGAnything") @@ -77,3 +88,49 @@ async def query_knowledge_base_multimodal( mode=mode, top_k=top_k, ) + + +@mcp.tool() +async def list_files( + prefix: str = "", recursive: bool = True +) -> list[FileInfoResponse]: + """List files in MinIO storage under a given prefix. + + Args: + prefix: MinIO prefix/path to filter files by (e.g. 'documents/') + recursive: Whether to list files in subdirectories (default True) + + Returns: + List of file objects with object_name, size, and last_modified + """ + use_case = get_list_files_use_case() + files = await use_case.execute(prefix=prefix, recursive=recursive) + return [FileInfoResponse(**asdict(f)) for f in files] + + +@mcp.tool() +async def read_file(file_path: str) -> FileContentResponse: + """Read and extract text content from a file stored in MinIO. + + Supports 91 file formats including PDF, Office documents, images, HTML, etc. + Uses Kreuzberg for document intelligence extraction. + + Args: + file_path: Path to the file in MinIO bucket (e.g. 'documents/report.pdf') + + Returns: + Extracted text content with metadata and any detected tables + """ + use_case = get_read_file_use_case() + try: + result = await use_case.execute(file_path=file_path) + except FileNotFoundError: + raise ValueError(f"File not found: {file_path}") from None + except Exception: + logger.exception("Unexpected error reading file: %s", file_path) + raise RuntimeError("Failed to read file") from None + return FileContentResponse( + content=result.content, + metadata=result.metadata, + tables=result.tables, + ) diff --git a/src/application/requests/file_request.py b/src/application/requests/file_request.py new file mode 100644 index 0000000..86b7d4a --- /dev/null +++ b/src/application/requests/file_request.py @@ -0,0 +1,15 @@ +import os + +from pydantic import BaseModel, Field, field_validator + + +class ReadFileRequest(BaseModel): + file_path: str = Field(..., description="File path in MinIO bucket") + + @field_validator("file_path") + @classmethod + def validate_file_path(cls, v: str) -> str: + normalized = os.path.normpath(v).replace("\\", "/") + if normalized.startswith("..") or os.path.isabs(normalized): + raise ValueError("file_path must be a relative path within the bucket") + return normalized diff --git a/src/application/responses/file_response.py b/src/application/responses/file_response.py new file mode 100644 index 0000000..492fa02 --- /dev/null +++ b/src/application/responses/file_response.py @@ -0,0 +1,15 @@ +from pydantic import BaseModel, Field + +from domain.ports.document_reader_port import DocumentMetadata, TableData + + +class FileInfoResponse(BaseModel): + object_name: str + size: int + last_modified: str | None = None + + +class FileContentResponse(BaseModel): + content: str + metadata: DocumentMetadata + tables: list[TableData] = Field(default_factory=list) diff --git a/src/application/use_cases/list_files_use_case.py b/src/application/use_cases/list_files_use_case.py new file mode 100644 index 0000000..df5fceb --- /dev/null +++ b/src/application/use_cases/list_files_use_case.py @@ -0,0 +1,10 @@ +from domain.ports.storage_port import FileInfo, StoragePort + + +class ListFilesUseCase: + def __init__(self, storage: StoragePort, bucket: str) -> None: + self.storage = storage + self.bucket = bucket + + async def execute(self, prefix: str = "", recursive: bool = True) -> list[FileInfo]: + return await self.storage.list_files_metadata(self.bucket, prefix, recursive) diff --git a/src/application/use_cases/read_file_use_case.py b/src/application/use_cases/read_file_use_case.py new file mode 100644 index 0000000..34a0acf --- /dev/null +++ b/src/application/use_cases/read_file_use_case.py @@ -0,0 +1,38 @@ +import contextlib +import os +import tempfile + +import aiofiles + +from domain.ports.document_reader_port import DocumentContent, DocumentReaderPort +from domain.ports.storage_port import StoragePort + + +class ReadFileUseCase: + def __init__( + self, + storage: StoragePort, + document_reader: DocumentReaderPort, + bucket: str, + output_dir: str, + ) -> None: + self.storage = storage + self.document_reader = document_reader + self.bucket = bucket + self.output_dir = output_dir + + async def execute(self, file_path: str) -> DocumentContent: + data = await self.storage.get_object(self.bucket, file_path) + + os.makedirs(self.output_dir, exist_ok=True) + suffix = os.path.splitext(file_path)[1] or ".bin" + fd, tmp_path = tempfile.mkstemp(suffix=suffix, dir=self.output_dir) + try: + os.close(fd) + async with aiofiles.open(tmp_path, "wb") as f: + await f.write(data) + + return await self.document_reader.extract_content(tmp_path) + finally: + with contextlib.suppress(OSError): + os.unlink(tmp_path) diff --git a/src/dependencies.py b/src/dependencies.py index 6b76062..781a8fb 100644 --- a/src/dependencies.py +++ b/src/dependencies.py @@ -4,8 +4,10 @@ from application.use_cases.index_file_use_case import IndexFileUseCase from application.use_cases.index_folder_use_case import IndexFolderUseCase +from application.use_cases.list_files_use_case import ListFilesUseCase from application.use_cases.multimodal_query_use_case import MultimodalQueryUseCase from application.use_cases.query_use_case import QueryUseCase +from application.use_cases.read_file_use_case import ReadFileUseCase from config import ( AppConfig, BM25Config, @@ -15,8 +17,9 @@ RAGConfig, ) from domain.ports.bm25_engine import BM25EnginePort -from infrastructure.rag.pg_textsearch_adapter import PostgresBM25Adapter +from infrastructure.document_reader.kreuzberg_adapter import KreuzbergAdapter from infrastructure.rag.lightrag_adapter import LightRAGAdapter +from infrastructure.rag.pg_textsearch_adapter import PostgresBM25Adapter from infrastructure.storage.minio_adapter import MinioAdapter app_config = AppConfig() # type: ignore @@ -47,6 +50,8 @@ print(f"WARNING: BM25 adapter initialization failed: {e}") bm25_adapter = None +kreuzberg_adapter = KreuzbergAdapter() + def get_index_file_use_case() -> IndexFileUseCase: return IndexFileUseCase( @@ -70,3 +75,16 @@ def get_query_use_case() -> QueryUseCase: def get_multimodal_query_use_case() -> MultimodalQueryUseCase: return MultimodalQueryUseCase(rag_adapter) + + +def get_list_files_use_case() -> ListFilesUseCase: + return ListFilesUseCase(storage=minio_adapter, bucket=minio_config.MINIO_BUCKET) + + +def get_read_file_use_case() -> ReadFileUseCase: + return ReadFileUseCase( + storage=minio_adapter, + document_reader=kreuzberg_adapter, + bucket=minio_config.MINIO_BUCKET, + output_dir=app_config.OUTPUT_DIR, + ) diff --git a/src/domain/ports/document_reader_port.py b/src/domain/ports/document_reader_port.py new file mode 100644 index 0000000..a10a8ea --- /dev/null +++ b/src/domain/ports/document_reader_port.py @@ -0,0 +1,24 @@ +from abc import ABC, abstractmethod + +from pydantic import BaseModel + + +class DocumentMetadata(BaseModel): + format_type: str = "" + mime_type: str = "" + + +class TableData(BaseModel): + markdown: str = "" + + +class DocumentContent(BaseModel): + content: str + metadata: DocumentMetadata + tables: list[TableData] = [] + + +class DocumentReaderPort(ABC): + @abstractmethod + async def extract_content(self, file_path: str) -> DocumentContent: + pass diff --git a/src/domain/ports/storage_port.py b/src/domain/ports/storage_port.py index c9a0631..ba50e62 100644 --- a/src/domain/ports/storage_port.py +++ b/src/domain/ports/storage_port.py @@ -1,4 +1,12 @@ from abc import ABC, abstractmethod +from dataclasses import dataclass + + +@dataclass +class FileInfo: + object_name: str + size: int + last_modified: str | None = None class StoragePort(ABC): @@ -37,3 +45,20 @@ async def list_objects( A list of object keys matching the prefix. """ pass + + @abstractmethod + async def list_files_metadata( + self, bucket: str, prefix: str, recursive: bool = True + ) -> list[FileInfo]: + """ + List files with metadata under a given prefix. + + Args: + bucket: The bucket name to list files from. + prefix: The prefix to filter files by. + recursive: Whether to list files recursively. + + Returns: + A list of FileInfo objects with object_name, size, and last_modified. + """ + pass diff --git a/src/infrastructure/document_reader/kreuzberg_adapter.py b/src/infrastructure/document_reader/kreuzberg_adapter.py new file mode 100644 index 0000000..79e42da --- /dev/null +++ b/src/infrastructure/document_reader/kreuzberg_adapter.py @@ -0,0 +1,31 @@ +from kreuzberg import ParsingError, ValidationError, extract_file + +from domain.ports.document_reader_port import ( + DocumentContent, + DocumentMetadata, + DocumentReaderPort, + TableData, +) + + +class KreuzbergAdapter(DocumentReaderPort): + async def extract_content(self, file_path: str) -> DocumentContent: + try: + result = await extract_file(file_path) + except ParsingError as e: + raise ValueError(f"Unsupported file format: {e}") from e + except ValidationError as e: + raise ValueError(f"Invalid file: {e}") from e + + metadata = result.metadata if isinstance(result.metadata, dict) else {} + return DocumentContent( + content=result.content or "", + metadata=DocumentMetadata( + format_type=metadata.get("format_type", ""), + mime_type=result.mime_type or "", + ), + tables=[ + TableData(markdown=getattr(t, "markdown", str(t))) + for t in (result.tables or []) + ], + ) diff --git a/src/infrastructure/storage/minio_adapter.py b/src/infrastructure/storage/minio_adapter.py index ae5bdd6..db9dafe 100644 --- a/src/infrastructure/storage/minio_adapter.py +++ b/src/infrastructure/storage/minio_adapter.py @@ -4,7 +4,7 @@ from minio import Minio from minio.error import S3Error -from domain.ports.storage_port import StoragePort +from domain.ports.storage_port import FileInfo, StoragePort logger = logging.getLogger(__name__) @@ -15,15 +15,6 @@ class MinioAdapter(StoragePort): def __init__( self, host: str, access: str, secret: str, secure: bool = False ) -> None: - """ - Initialize the MinIO adapter with connection parameters. - - Args: - host: The MinIO server endpoint (host:port). - access: The access key for authentication. - secret: The secret key for authentication. - secure: Whether to use HTTPS. Defaults to False. - """ self.client = Minio( endpoint=host, access_key=access, @@ -32,19 +23,6 @@ def __init__( ) async def get_object(self, bucket: str, object_path: str) -> bytes: - """ - Retrieve an object from MinIO storage. - - Args: - bucket: The bucket name where the object is stored. - object_path: The path/key of the object within the bucket. - - Returns: - The object content as bytes. - - Raises: - FileNotFoundError: If the object or bucket does not exist. - """ try: loop = asyncio.get_running_loop() response = await loop.run_in_executor( @@ -57,32 +35,48 @@ async def get_object(self, bucket: str, object_path: str) -> bytes: response.release_conn() except S3Error as e: if e.code in ("NoSuchKey", "NoSuchBucket"): - logger.warning(f"Object not found: bucket={bucket}, path={object_path}") + logger.info("Object not found: bucket=%s, path=%s", bucket, object_path) raise FileNotFoundError( f"Object not found: bucket={bucket}, path={object_path}" - ) from None - logger.error(f"MinIO error retrieving object: {e}", exc_info=True) + ) from e + logger.error("MinIO error retrieving object: %s", e, exc_info=True) + raise + + async def _list_minio_objects( + self, bucket: str, prefix: str, recursive: bool = True + ) -> list: + """Fetch raw MinIO object list, raising FileNotFoundError for missing buckets.""" + try: + loop = asyncio.get_running_loop() + return await loop.run_in_executor( + None, + lambda: list( + self.client.list_objects(bucket, prefix=prefix, recursive=recursive) + ), + ) + except S3Error as e: + if e.code == "NoSuchBucket": + logger.info("Bucket not found: %s", bucket) + raise FileNotFoundError(f"Bucket not found: {bucket}") from e + logger.error("MinIO error listing objects: %s", e, exc_info=True) raise async def list_objects( self, bucket: str, prefix: str, recursive: bool = True ) -> list[str]: - """ - List object keys under a given prefix in MinIO. - - Args: - bucket: The bucket name to list objects from. - prefix: The prefix to filter objects by. - recursive: Whether to list objects recursively. - - Returns: - A list of object keys (excluding directories). - """ - loop = asyncio.get_running_loop() - objects = await loop.run_in_executor( - None, - lambda: list( - self.client.list_objects(bucket, prefix=prefix, recursive=recursive) - ), - ) + objects = await self._list_minio_objects(bucket, prefix, recursive) return [obj.object_name for obj in objects if not obj.is_dir] + + async def list_files_metadata( + self, bucket: str, prefix: str, recursive: bool = True + ) -> list[FileInfo]: + objects = await self._list_minio_objects(bucket, prefix, recursive) + return [ + FileInfo( + object_name=obj.object_name, + size=obj.size or 0, + last_modified=str(obj.last_modified) if obj.last_modified else None, + ) + for obj in objects + if not obj.is_dir + ] diff --git a/src/main.py b/src/main.py index d3071d4..992372a 100644 --- a/src/main.py +++ b/src/main.py @@ -12,6 +12,7 @@ from fastapi.middleware.cors import CORSMiddleware from alembic import command +from application.api.file_routes import file_router from application.api.health_routes import health_router from application.api.indexing_routes import indexing_router from application.api.mcp_tools import mcp @@ -110,6 +111,7 @@ async def combined_lifespan(app: FastAPI): app.include_router(indexing_router, prefix=REST_PATH) app.include_router(health_router, prefix=REST_PATH) app.include_router(query_router, prefix=REST_PATH) +app.include_router(file_router, prefix=REST_PATH) def run_fastapi(): diff --git a/tests/conftest.py b/tests/conftest.py index 7e422ab..fb71b45 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -19,6 +19,7 @@ # Re-export fixtures so pytest discovers them mock_rag_engine = _external.mock_rag_engine mock_storage = _external.mock_storage +mock_document_reader = _external.mock_document_reader @pytest.fixture diff --git a/tests/fixtures/external.py b/tests/fixtures/external.py index 9f696f2..9539656 100644 --- a/tests/fixtures/external.py +++ b/tests/fixtures/external.py @@ -8,8 +8,13 @@ FolderIndexingStats, IndexingStatus, ) +from domain.ports.document_reader_port import ( + DocumentContent, + DocumentMetadata, + DocumentReaderPort, +) from domain.ports.rag_engine import RAGEnginePort -from domain.ports.storage_port import StoragePort +from domain.ports.storage_port import FileInfo, StoragePort @pytest.fixture @@ -50,4 +55,27 @@ def mock_storage() -> AsyncMock: mock = AsyncMock(spec=StoragePort) mock.get_object.return_value = b"fake file content" mock.list_objects.return_value = ["project/doc1.pdf", "project/doc2.pdf"] + mock.list_files_metadata.return_value = [ + FileInfo( + object_name="project/doc1.pdf", + size=1024, + last_modified="2026-01-01 00:00:00+00:00", + ), + FileInfo( + object_name="project/doc2.pdf", + size=2048, + last_modified="2026-01-02 00:00:00+00:00", + ), + ] + return mock + + +@pytest.fixture +def mock_document_reader() -> AsyncMock: + mock = AsyncMock(spec=DocumentReaderPort) + mock.extract_content.return_value = DocumentContent( + content="Extracted text content from document.", + metadata=DocumentMetadata(format_type="pdf", mime_type="application/pdf"), + tables=[], + ) return mock diff --git a/tests/unit/test_file_routes.py b/tests/unit/test_file_routes.py new file mode 100644 index 0000000..e612284 --- /dev/null +++ b/tests/unit/test_file_routes.py @@ -0,0 +1,208 @@ +from unittest.mock import AsyncMock + +import httpx +import pytest +from httpx import ASGITransport + +from application.use_cases.list_files_use_case import ListFilesUseCase +from application.use_cases.read_file_use_case import ReadFileUseCase +from dependencies import get_list_files_use_case, get_read_file_use_case +from domain.ports.document_reader_port import DocumentContent, DocumentMetadata +from domain.ports.storage_port import FileInfo +from main import app + + +@pytest.fixture(autouse=True) +def _clear_dependency_overrides(): + yield + app.dependency_overrides.clear() + + +class TestListFilesRoute: + @pytest.fixture + def mock_list_files_use_case(self) -> AsyncMock: + mock = AsyncMock(spec=ListFilesUseCase) + mock.execute.return_value = [ + FileInfo( + object_name="docs/report.pdf", size=1024, last_modified="2026-01-01" + ), + FileInfo( + object_name="docs/notes.txt", size=512, last_modified="2026-01-02" + ), + ] + return mock + + async def test_list_files_returns_200( + self, mock_list_files_use_case: AsyncMock + ) -> None: + app.dependency_overrides[get_list_files_use_case] = lambda: ( + mock_list_files_use_case + ) + + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.get("/api/v1/files/list") + + assert response.status_code == 200 + + async def test_list_files_returns_file_list( + self, mock_list_files_use_case: AsyncMock + ) -> None: + app.dependency_overrides[get_list_files_use_case] = lambda: ( + mock_list_files_use_case + ) + + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.get("/api/v1/files/list") + + body = response.json() + assert len(body) == 2 + assert body[0]["object_name"] == "docs/report.pdf" + assert body[0]["size"] == 1024 + + async def test_list_files_with_prefix_param( + self, mock_list_files_use_case: AsyncMock + ) -> None: + app.dependency_overrides[get_list_files_use_case] = lambda: ( + mock_list_files_use_case + ) + + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.get( + "/api/v1/files/list", params={"prefix": "docs/", "recursive": "false"} + ) + + assert response.status_code == 200 + mock_list_files_use_case.execute.assert_called_once_with( + prefix="docs/", recursive=False + ) + + async def test_list_files_empty_result( + self, mock_list_files_use_case: AsyncMock + ) -> None: + mock_list_files_use_case.execute.return_value = [] + app.dependency_overrides[get_list_files_use_case] = lambda: ( + mock_list_files_use_case + ) + + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.get("/api/v1/files/list") + + body = response.json() + assert body == [] + + +class TestReadFileRoute: + @pytest.fixture + def mock_read_file_use_case(self) -> AsyncMock: + mock = AsyncMock(spec=ReadFileUseCase) + mock.execute.return_value = DocumentContent( + content="Extracted text from PDF", + metadata=DocumentMetadata(format_type="pdf", mime_type="application/pdf"), + tables=[], + ) + return mock + + async def test_read_file_returns_200( + self, mock_read_file_use_case: AsyncMock + ) -> None: + app.dependency_overrides[get_read_file_use_case] = lambda: ( + mock_read_file_use_case + ) + + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.post( + "/api/v1/files/read", + json={"file_path": "docs/report.pdf"}, + ) + + assert response.status_code == 200 + + async def test_read_file_returns_content( + self, mock_read_file_use_case: AsyncMock + ) -> None: + app.dependency_overrides[get_read_file_use_case] = lambda: ( + mock_read_file_use_case + ) + + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.post( + "/api/v1/files/read", + json={"file_path": "docs/report.pdf"}, + ) + + body = response.json() + assert body["content"] == "Extracted text from PDF" + assert body["metadata"]["mime_type"] == "application/pdf" + + async def test_read_file_returns_404_for_missing_file( + self, mock_read_file_use_case: AsyncMock + ) -> None: + mock_read_file_use_case.execute.side_effect = FileNotFoundError("not found") + app.dependency_overrides[get_read_file_use_case] = lambda: ( + mock_read_file_use_case + ) + + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.post( + "/api/v1/files/read", + json={"file_path": "nonexistent.pdf"}, + ) + + assert response.status_code == 404 + assert ( + "not found" in response.json()["detail"].lower() + or "File not found" in response.json()["detail"] + ) + + async def test_read_file_returns_422_for_unsupported_format( + self, mock_read_file_use_case: AsyncMock + ) -> None: + mock_read_file_use_case.execute.side_effect = ValueError( + "Unsupported file format: video.mp4" + ) + app.dependency_overrides[get_read_file_use_case] = lambda: ( + mock_read_file_use_case + ) + + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.post( + "/api/v1/files/read", + json={"file_path": "video.mp4"}, + ) + + assert response.status_code == 422 + + async def test_read_file_rejects_missing_file_path(self) -> None: + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.post("/api/v1/files/read", json={}) + + assert response.status_code == 422 + + async def test_read_file_rejects_path_traversal(self) -> None: + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.post( + "/api/v1/files/read", + json={"file_path": "../../etc/passwd"}, + ) + + assert response.status_code == 422 diff --git a/tests/unit/test_kreuzberg_adapter.py b/tests/unit/test_kreuzberg_adapter.py new file mode 100644 index 0000000..2f5b184 --- /dev/null +++ b/tests/unit/test_kreuzberg_adapter.py @@ -0,0 +1,77 @@ +from unittest.mock import AsyncMock, patch + +import pytest + +from domain.ports.document_reader_port import DocumentContent +from infrastructure.document_reader.kreuzberg_adapter import KreuzbergAdapter + + +class TestKreuzbergAdapter: + @patch("infrastructure.document_reader.kreuzberg_adapter.extract_file") + async def test_extract_content_returns_document_content(self, mock_extract) -> None: + mock_result = AsyncMock() + mock_result.content = "Hello world" + mock_result.mime_type = "application/pdf" + mock_result.metadata = {"format_type": "pdf"} + mock_result.tables = [] + mock_extract.return_value = mock_result + + adapter = KreuzbergAdapter() + result = await adapter.extract_content("/tmp/test.pdf") + + assert isinstance(result, DocumentContent) + assert result.content == "Hello world" + assert result.metadata.mime_type == "application/pdf" + + @patch("infrastructure.document_reader.kreuzberg_adapter.extract_file") + async def test_extract_content_with_tables(self, mock_extract) -> None: + mock_result = AsyncMock() + mock_result.content = "text with table" + mock_result.mime_type = "application/pdf" + mock_result.metadata = {} + mock_table = AsyncMock() + mock_table.markdown = "| A | B |\n|---|---|" + mock_result.tables = [mock_table] + mock_extract.return_value = mock_result + + adapter = KreuzbergAdapter() + result = await adapter.extract_content("/tmp/test.pdf") + + assert len(result.tables) == 1 + assert result.tables[0].markdown == "| A | B |\n|---|---|" + + @patch("infrastructure.document_reader.kreuzberg_adapter.extract_file") + async def test_extract_content_raises_value_error_for_parsing_error( + self, mock_extract + ) -> None: + from kreuzberg import ParsingError + + mock_extract.side_effect = ParsingError("unsupported format") + adapter = KreuzbergAdapter() + + with pytest.raises(ValueError, match="Unsupported file format"): + await adapter.extract_content("/tmp/test.xyz") + + @patch("infrastructure.document_reader.kreuzberg_adapter.extract_file") + async def test_extract_content_raises_value_error_for_validation_error( + self, mock_extract + ) -> None: + from kreuzberg import ValidationError + + mock_extract.side_effect = ValidationError("invalid file") + adapter = KreuzbergAdapter() + + with pytest.raises(ValueError, match="Invalid file"): + await adapter.extract_content("/tmp/test.bad") + + @patch("infrastructure.document_reader.kreuzberg_adapter.extract_file") + async def test_extract_content_raises_on_other_kreuzberg_error( + self, mock_extract + ) -> None: + from kreuzberg import KreuzbergError + + mock_extract.side_effect = KreuzbergError("some other error") + adapter = KreuzbergAdapter() + + with pytest.raises(KreuzbergError): + await adapter.extract_content("/tmp/test.pdf") diff --git a/tests/unit/test_list_files_use_case.py b/tests/unit/test_list_files_use_case.py new file mode 100644 index 0000000..be945ed --- /dev/null +++ b/tests/unit/test_list_files_use_case.py @@ -0,0 +1,59 @@ +from unittest.mock import AsyncMock + +from application.use_cases.list_files_use_case import ListFilesUseCase +from domain.ports.storage_port import FileInfo + + +class TestListFilesUseCase: + async def test_execute_calls_storage_list_files_metadata( + self, mock_storage: AsyncMock + ) -> None: + mock_storage.list_files_metadata.return_value = [ + FileInfo( + object_name="docs/report.pdf", size=1024, last_modified="2026-01-01" + ), + ] + use_case = ListFilesUseCase(storage=mock_storage, bucket="test-bucket") + + await use_case.execute(prefix="docs/", recursive=True) + + mock_storage.list_files_metadata.assert_called_once_with( + "test-bucket", "docs/", True + ) + + async def test_execute_returns_file_infos(self, mock_storage: AsyncMock) -> None: + expected_files = [ + FileInfo( + object_name="docs/report.pdf", size=1024, last_modified="2026-01-01" + ), + FileInfo( + object_name="docs/notes.txt", size=512, last_modified="2026-01-02" + ), + ] + mock_storage.list_files_metadata.return_value = expected_files + use_case = ListFilesUseCase(storage=mock_storage, bucket="test-bucket") + + result = await use_case.execute(prefix="docs/") + + assert len(result) == 2 + assert result[0].object_name == "docs/report.pdf" + assert result[0].size == 1024 + assert result[1].object_name == "docs/notes.txt" + + async def test_execute_with_default_prefix(self, mock_storage: AsyncMock) -> None: + mock_storage.list_files_metadata.return_value = [] + use_case = ListFilesUseCase(storage=mock_storage, bucket="my-bucket") + + await use_case.execute() + + mock_storage.list_files_metadata.assert_called_once_with("my-bucket", "", True) + + async def test_execute_non_recursive(self, mock_storage: AsyncMock) -> None: + mock_storage.list_files_metadata.return_value = [] + use_case = ListFilesUseCase(storage=mock_storage, bucket="my-bucket") + + await use_case.execute(prefix="docs/", recursive=False) + + mock_storage.list_files_metadata.assert_called_once_with( + "my-bucket", "docs/", False + ) diff --git a/tests/unit/test_read_file_use_case.py b/tests/unit/test_read_file_use_case.py new file mode 100644 index 0000000..59d630c --- /dev/null +++ b/tests/unit/test_read_file_use_case.py @@ -0,0 +1,173 @@ +import os +from unittest.mock import AsyncMock + +import pytest + +from application.use_cases.read_file_use_case import ReadFileUseCase +from domain.ports.document_reader_port import DocumentContent, DocumentMetadata + + +class TestReadFileUseCase: + async def test_execute_downloads_file_from_storage( + self, + mock_storage: AsyncMock, + mock_document_reader: AsyncMock, + tmp_path, + ) -> None: + mock_storage.get_object.return_value = b"file content" + mock_document_reader.extract_content.return_value = DocumentContent( + content="extracted text", + metadata=DocumentMetadata(format_type="pdf", mime_type="application/pdf"), + tables=[], + ) + use_case = ReadFileUseCase( + storage=mock_storage, + document_reader=mock_document_reader, + bucket="my-bucket", + output_dir=str(tmp_path), + ) + + await use_case.execute(file_path="docs/report.pdf") + + mock_storage.get_object.assert_called_once_with("my-bucket", "docs/report.pdf") + + async def test_execute_returns_document_content( + self, + mock_storage: AsyncMock, + mock_document_reader: AsyncMock, + tmp_path, + ) -> None: + mock_storage.get_object.return_value = b"file content" + expected = DocumentContent( + content="extracted text", + metadata=DocumentMetadata(format_type="pdf", mime_type="application/pdf"), + tables=[], + ) + mock_document_reader.extract_content.return_value = expected + use_case = ReadFileUseCase( + storage=mock_storage, + document_reader=mock_document_reader, + bucket="my-bucket", + output_dir=str(tmp_path), + ) + + result = await use_case.execute(file_path="docs/report.pdf") + + assert result.content == "extracted text" + assert result.metadata.format_type == "pdf" + + async def test_execute_calls_document_reader_with_temp_file( + self, + mock_storage: AsyncMock, + mock_document_reader: AsyncMock, + tmp_path, + ) -> None: + mock_storage.get_object.return_value = b"pdf binary data" + mock_document_reader.extract_content.return_value = DocumentContent( + content="text", + metadata=DocumentMetadata(format_type="pdf"), + tables=[], + ) + use_case = ReadFileUseCase( + storage=mock_storage, + document_reader=mock_document_reader, + bucket="my-bucket", + output_dir=str(tmp_path), + ) + + await use_case.execute(file_path="docs/report.pdf") + + call_args = mock_document_reader.extract_content.call_args + tmp_file_path = call_args[0][0] + assert tmp_file_path.endswith(".pdf") + assert os.path.dirname(tmp_file_path) == str(tmp_path) + + async def test_execute_propagates_file_not_found( + self, + mock_storage: AsyncMock, + mock_document_reader: AsyncMock, + tmp_path, + ) -> None: + mock_storage.get_object.side_effect = FileNotFoundError("not found") + use_case = ReadFileUseCase( + storage=mock_storage, + document_reader=mock_document_reader, + bucket="my-bucket", + output_dir=str(tmp_path), + ) + + with pytest.raises(FileNotFoundError): + await use_case.execute(file_path="nonexistent.pdf") + + async def test_execute_cleans_up_temp_file( + self, + mock_storage: AsyncMock, + mock_document_reader: AsyncMock, + tmp_path, + ) -> None: + mock_storage.get_object.return_value = b"data" + mock_document_reader.extract_content.return_value = DocumentContent( + content="text", + metadata=DocumentMetadata(format_type="txt"), + tables=[], + ) + use_case = ReadFileUseCase( + storage=mock_storage, + document_reader=mock_document_reader, + bucket="my-bucket", + output_dir=str(tmp_path), + ) + + await use_case.execute(file_path="report.pdf") + + call_args = mock_document_reader.extract_content.call_args + tmp_file_path = call_args[0][0] + assert not os.path.exists(tmp_file_path) + + async def test_execute_cleans_up_temp_file_on_error( + self, + mock_storage: AsyncMock, + mock_document_reader: AsyncMock, + tmp_path, + ) -> None: + mock_storage.get_object.return_value = b"data" + mock_document_reader.extract_content.side_effect = ValueError("bad format") + use_case = ReadFileUseCase( + storage=mock_storage, + document_reader=mock_document_reader, + bucket="my-bucket", + output_dir=str(tmp_path), + ) + + with pytest.raises(ValueError): + await use_case.execute(file_path="report.pdf") + + call_args = mock_document_reader.extract_content.call_args + tmp_file_path = call_args[0][0] + assert not os.path.exists(tmp_file_path) + + async def test_execute_with_tables( + self, + mock_storage: AsyncMock, + mock_document_reader: AsyncMock, + tmp_path, + ) -> None: + from domain.ports.document_reader_port import TableData + + mock_storage.get_object.return_value = b"data" + mock_document_reader.extract_content.return_value = DocumentContent( + content="text with table", + metadata=DocumentMetadata(format_type="pdf", mime_type="application/pdf"), + tables=[TableData(markdown="| A | B |\n|---|---|\n| 1 | 2 |")], + ) + use_case = ReadFileUseCase( + storage=mock_storage, + document_reader=mock_document_reader, + bucket="my-bucket", + output_dir=str(tmp_path), + ) + + result = await use_case.execute(file_path="report.pdf") + + assert len(result.tables) == 1 + assert result.tables[0].markdown == "| A | B |\n|---|---|\n| 1 | 2 |" diff --git a/uv.lock b/uv.lock index c8cfc55..0cb893f 100644 --- a/uv.lock +++ b/uv.lock @@ -2049,6 +2049,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/db/e655086b7f3a705df045bf0933bdd9c2f79bb3c97bfef1384598bb79a217/keyring-25.7.0-py3-none-any.whl", hash = "sha256:be4a0b195f149690c166e850609a477c532ddbfbaed96a404d4e43f8d5e2689f", size = 39160, upload-time = "2025-11-16T16:26:08.402Z" }, ] +[[package]] +name = "kreuzberg" +version = "4.8.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e9/20/5dd24291d7a527fd05ea6288d53c2c11d5b38fed099ba1f7ab4949acabe4/kreuzberg-4.8.2.tar.gz", hash = "sha256:acf2067a8b0b64b99658048f0b830b4e9c4f085de6f674abe12079b2e89a6401", size = 2374937, upload-time = "2026-04-10T08:07:15.523Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/0d/05da654aee8c1388f2c97a39c3b63fca0b7141571fed2a529d855d854c35/kreuzberg-4.8.2-cp310-abi3-macosx_14_0_arm64.whl", hash = "sha256:70fcb2d003a36a78c09d128a9e1ac2f31bcf38c9c01ca2d08445afce8f1d85aa", size = 58423542, upload-time = "2026-04-10T08:07:00.768Z" }, + { url = "https://files.pythonhosted.org/packages/90/13/eb755e5eb06acb002aa99f618001d870cd614a9fec24ef10ff881b7e9075/kreuzberg-4.8.2-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:338f53afa1787de9b52aa6de4e7a4b2680333a8d89495d5585505c20af87b65b", size = 57380314, upload-time = "2026-04-10T08:07:04.409Z" }, + { url = "https://files.pythonhosted.org/packages/bc/2a/2279d15e46a504814a5b3ea5cf6389d8ecbcbd6691df753ce0f89f23a18c/kreuzberg-4.8.2-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9f0ec11f96259d579fac1941bc4ad5114cfcd007d07d4e60b29664de1fd1bd1d", size = 60628097, upload-time = "2026-04-10T08:07:08.217Z" }, + { url = "https://files.pythonhosted.org/packages/33/56/258d1d227620075d6f1a21cd7a5f0b80488d816598c3c9edcdf1d067a6eb/kreuzberg-4.8.2-cp310-abi3-win_amd64.whl", hash = "sha256:232b3bdbd30ca850e129f55ef8735be7696f91cc372ad29c77ef0539a13974de", size = 66916658, upload-time = "2026-04-10T08:07:12.619Z" }, +] + [[package]] name = "latex2mathml" version = "3.79.0" @@ -2444,6 +2456,7 @@ dependencies = [ { name = "fastapi" }, { name = "fastmcp" }, { name = "httpx" }, + { name = "kreuzberg" }, { name = "lightrag-hku", extra = ["api"] }, { name = "mcp" }, { name = "minio" }, @@ -2477,6 +2490,7 @@ requires-dist = [ { name = "fastapi", specifier = ">=0.124.0" }, { name = "fastmcp", specifier = ">=3.2.0" }, { name = "httpx", specifier = ">=0.27.0" }, + { name = "kreuzberg", specifier = ">=4.0.0" }, { name = "lightrag-hku", specifier = ">=1.4.13" }, { name = "lightrag-hku", extras = ["api"], specifier = ">=1.4.13" }, { name = "mcp", specifier = ">=1.24.0" },