SoluDevTech · Kaiohz · Apr 10, 2026 · Apr 10, 2026
diff --git a/README.md b/README.md
@@ -19,34 +19,40 @@ Multi-modal RAG service exposing a REST API and MCP server for document indexing
    | api/                         |       |
    |   indexing_routes.py         |       |
    |   query_routes.py           |       |
+   |   file_routes.py             |       |
    |   health_routes.py          |       |
    | use_cases/                   |       |
    |   IndexFileUseCase           |       |
    |   IndexFolderUseCase         |       |
    |   QueryUseCase               |       |
+   |   ListFilesUseCase            |       |
+   |   ReadFileUseCase             |       |
    | requests/ responses/         |       |
    +------------------------------+       |
             |         |          |        |
             v         v          v        v
    Domain Layer (ports)
    +------------------------------------------+
-   | RAGEnginePort  StoragePort  BM25EnginePort|
+   | RAGEnginePort  StoragePort  BM25EnginePort  DocumentReaderPort |
    +------------------------------------------+
-            |         |          |
-            v         v          v
+            |         |          |              |
+            v         v          v              v
    Infrastructure Layer (adapters)
    +------------------------------------------+
    | LightRAGAdapter  MinioAdapter            |
    | (RAGAnything)    (minio-py)              |
    |                                              |
    | PostgresBM25Adapter    RRFCombiner          |
    | (pg_textsearch)         (hybrid+ fusion)   |
+   |                                              |
+   | KreuzbergAdapter                             |
+   | (kreuzberg - 91 formats)                    |
    +------------------------------------------+
-            |         |          |
-            v         v          v
-      PostgreSQL        MinIO
-      (pgvector +     (object
-       Apache AGE     storage)
+            |         |          |              |
+            v         v          v              v
+      PostgreSQL        MinIO         Kreuzberg
+      (pgvector +     (object       (document
+       Apache AGE      storage)       extraction)
        pg_textsearch)
 ```
 
@@ -179,6 +185,65 @@ The service automatically detects and processes the following document formats t
 
 **Note:** File format detection is automatic. No configuration is required to specify the document type. The service will process any supported format when indexed. All document and image formats are supported out-of-the-box when installed with `raganything[all]`.
 
+## File Browsing & Reading
+
+Browse and read files directly from MinIO without indexing them into the RAG knowledge base. Powered by [Kreuzberg](https://github.com/kreuzberg-dev/kreuzberg) for document text extraction (91 file formats).
+
+### List files
+
+```bash
+# List all files in the bucket
+curl http://localhost:8000/api/v1/files/list
+
+# List files under a specific prefix
+curl "http://localhost:8000/api/v1/files/list?prefix=documents/&recursive=true"
+```
+
+Response (`200 OK`):
+
+```json
+[
+  {"object_name": "documents/report.pdf", "size": 1024, "last_modified": "2026-01-01 00:00:00+00:00"},
+  {"object_name": "documents/notes.txt", "size": 512, "last_modified": "2026-01-02 00:00:00+00:00"}
+]
+```
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `prefix` | string | `""` | MinIO prefix to filter files by |
+| `recursive` | boolean | `true` | List files in subdirectories |
+
+### Read a file
+
+Downloads the file from MinIO, extracts its text content using Kreuzberg, and returns the result. Supports 91 file formats including PDF, Office documents, images, and HTML.
+
+```bash
+curl -X POST http://localhost:8000/api/v1/files/read \
+  -H "Content-Type: application/json" \
+  -d '{"file_path": "documents/report.pdf"}'
+```
+
+Response (`200 OK`):
+
+```json
+{
+  "content": "Extracted text from the document...",
+  "metadata": {"format_type": "pdf", "mime_type": "application/pdf"},
+  "tables": [{"markdown": "| Header | Value |\n|---|---|\n| A | 1 |"}]
+}
+```
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `file_path` | string | **Required.** File path in the MinIO bucket (relative, no `..` or absolute paths) |
+
+Error responses:
+
+| Status | Condition |
+|--------|-----------|
+| `404` | File not found in MinIO |
+| `422` | Unsupported file format or invalid path (path traversal, absolute path) |
+
 ### Query
 
 Query the indexed knowledge base. The RAG engine is initialized for the given `working_dir` before executing the query.
@@ -319,7 +384,7 @@ The `combined_score` is the sum of `bm25_score` and `vector_score`, each compute
 
 ## MCP Server
 
-The MCP server is mounted at `/mcp` and exposes a single tool: `query_knowledge_base`.
+The MCP server is mounted at `/mcp` and exposes the following tools:
 
 ### Tool: `query_knowledge_base`
 
@@ -330,6 +395,31 @@ The MCP server is mounted at `/mcp` and exposes a single tool: `query_knowledge_
 | `mode` | string | `"naive"` | Search mode: `naive`, `local`, `global`, `hybrid`, `hybrid+`, `mix`, `bm25`, `bypass` |
 | `top_k` | integer | `10` | Number of chunks to retrieve |
 
+### Tool: `query_knowledge_base_multimodal`
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `working_dir` | string | required | RAG workspace directory for this project |
+| `query` | string | required | The search query |
+| `multimodal_content` | list | required | List of multimodal content items |
+| `mode` | string | `"hybrid"` | Search mode |
+| `top_k` | integer | `5` | Number of chunks to retrieve |
+
+### Tool: `list_files`
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `prefix` | string | `""` | MinIO prefix to filter files by |
+| `recursive` | boolean | `true` | List files in subdirectories |
+
+### Tool: `read_file`
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `file_path` | string | required | File path in MinIO bucket (e.g. `documents/report.pdf`) |
+
+Downloads the file from MinIO, extracts its text content using Kreuzberg, and returns the extracted text along with metadata and any detected tables.
+
 ### Transport modes
 
 The `MCP_TRANSPORT` environment variable controls how the MCP server is exposed:
@@ -495,28 +585,36 @@ src/
       indexing_result.py             -- FileIndexingResult, FolderIndexingResult
     ports/
       rag_engine.py                  -- RAGEnginePort (abstract)
-      storage_port.py                -- StoragePort (abstract)
+      storage_port.py                -- StoragePort (abstract) + FileInfo
       bm25_engine.py                 -- BM25EnginePort (abstract)
+      document_reader_port.py        -- DocumentReaderPort (abstract) + DocumentContent
   application/
     api/
       health_routes.py               -- GET /health
       indexing_routes.py              -- POST /file/index, /folder/index
       query_routes.py                 -- POST /query
-      mcp_tools.py                    -- MCP tool: query_knowledge_base
+      file_routes.py                  -- GET /files/list, POST /files/read
+      mcp_tools.py                    -- MCP tools: query_knowledge_base, query_knowledge_base_multimodal, list_files, read_file
     requests/
       indexing_request.py            -- IndexFileRequest, IndexFolderRequest
-      query_request.py                -- QueryRequest, QueryMode
+      query_request.py                -- QueryRequest, MultimodalQueryRequest
+      file_request.py                 -- ReadFileRequest
     responses/
       query_response.py              -- QueryResponse, QueryDataResponse
+      file_response.py                -- FileInfoResponse, FileContentResponse
     use_cases/
       index_file_use_case.py         -- Downloads from MinIO, indexes single file
       index_folder_use_case.py       -- Downloads from MinIO, indexes folder
       query_use_case.py              -- Query with bm25/hybrid+ support
+      list_files_use_case.py          -- Lists files with metadata from MinIO
+      read_file_use_case.py           -- Reads file from MinIO, extracts content via Kreuzberg
   infrastructure/
     rag/
       lightrag_adapter.py            -- LightRAGAdapter (RAGAnything/LightRAG)
     storage/
       minio_adapter.py               -- MinioAdapter (minio-py client)
+    document_reader/
+      kreuzberg_adapter.py            -- KreuzbergAdapter (kreuzberg, 91 formats)
     bm25/
       pg_textsearch_adapter.py        -- PostgresBM25Adapter (pg_textsearch)
     hybrid/

diff --git a/pyproject.toml b/pyproject.toml
@@ -14,6 +14,7 @@ dependencies = [
     "fastmcp>=3.2.0",
     "cryptography>=46.0.5",
     "httpx>=0.27.0",
+    "kreuzberg>=4.0.0",
     "lightrag-hku>=1.4.13",
     "lightrag-hku[api]>=1.4.13",
     "mcp>=1.24.0",

diff --git a/src/application/api/file_routes.py b/src/application/api/file_routes.py
@@ -0,0 +1,53 @@
+from dataclasses import asdict
+
+from fastapi import APIRouter, Depends, HTTPException, status
+
+from application.requests.file_request import ReadFileRequest
+from application.responses.file_response import FileContentResponse, FileInfoResponse
+from application.use_cases.list_files_use_case import ListFilesUseCase
+from application.use_cases.read_file_use_case import ReadFileUseCase
+from dependencies import get_list_files_use_case, get_read_file_use_case
+
+# Router that groups the file listing/reading endpoints; its routes are tagged "Files".
+file_router = APIRouter(tags=["Files"])
+
+
+@file_router.get(
+    "/files/list",
+    response_model=list[FileInfoResponse],
+    status_code=status.HTTP_200_OK,
+)
+async def list_files(
+    prefix: str = "",
+    recursive: bool = True,
+    use_case: ListFilesUseCase = Depends(get_list_files_use_case),
+) -> list[FileInfoResponse]:
+    files = await use_case.execute(prefix=prefix, recursive=recursive)
+    return [FileInfoResponse(**asdict(f)) for f in files]
+
+
+@file_router.post(
+    "/files/read",
+    response_model=FileContentResponse,
+    status_code=status.HTTP_200_OK,
+)
+async def read_file(
+    request: ReadFileRequest,
+    use_case: ReadFileUseCase = Depends(get_read_file_use_case),
+) -> FileContentResponse:
+    try:
+        result = await use_case.execute(file_path=request.file_path)
+    except FileNotFoundError:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail=f"File not found: {request.file_path}",
+        ) from None
+    except ValueError as e:
+        raise HTTPException(
+            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+            detail=str(e),
+        ) from None
+    return FileContentResponse(
+        content=result.content,
+        metadata=result.metadata,
+        tables=result.tables,
+    )
diff --git a/src/application/api/mcp_tools.py b/src/application/api/mcp_tools.py
@@ -3,11 +3,22 @@
 These tools are registered with FastMCP for Claude Desktop integration.
 """
 
+import logging
+from dataclasses import asdict
+
 from fastmcp import FastMCP
 
 from application.requests.query_request import MultimodalContentItem
+from application.responses.file_response import FileContentResponse, FileInfoResponse
 from application.responses.query_response import ChunkResponse, QueryResponse
-from dependencies import get_multimodal_query_use_case, get_query_use_case
+from dependencies import (
+    get_list_files_use_case,
+    get_multimodal_query_use_case,
+    get_query_use_case,
+    get_read_file_use_case,
+)
+
+logger = logging.getLogger(__name__)
 
 mcp = FastMCP("RAGAnything")
 
@@ -77,3 +88,49 @@ async def query_knowledge_base_multimodal(
         mode=mode,
         top_k=top_k,
     )
+
+
+@mcp.tool()
+async def list_files(
+    prefix: str = "", recursive: bool = True
+) -> list[FileInfoResponse]:
+    """List files in MinIO storage under a given prefix.
+
+    Args:
+        prefix: MinIO prefix/path to filter files by (e.g. 'documents/')
+        recursive: Whether to list files in subdirectories (default True)
+
+    Returns:
+        List of file objects with object_name, size, and last_modified
+    """
+    use_case = get_list_files_use_case()
+    files = await use_case.execute(prefix=prefix, recursive=recursive)
+    return [FileInfoResponse(**asdict(f)) for f in files]
+
+
+@mcp.tool()
+async def read_file(file_path: str) -> FileContentResponse:
+    """Read and extract text content from a file stored in MinIO.
+
+    Supports 91 file formats including PDF, Office documents, images, HTML, etc.
+    Uses Kreuzberg for document intelligence extraction.
+
+    Args:
+        file_path: Path to the file in MinIO bucket (e.g. 'documents/report.pdf')
+
+    Returns:
+        Extracted text content with metadata and any detected tables
+    """
+    use_case = get_read_file_use_case()
+    try:
+        result = await use_case.execute(file_path=file_path)
+    except FileNotFoundError:
+        raise ValueError(f"File not found: {file_path}") from None
+    except Exception:
+        logger.exception("Unexpected error reading file: %s", file_path)
+        raise RuntimeError("Failed to read file") from None
+    return FileContentResponse(
+        content=result.content,
+        metadata=result.metadata,
+        tables=result.tables,
+    )
diff --git a/src/application/requests/file_request.py b/src/application/requests/file_request.py
@@ -0,0 +1,15 @@
+import os
+
+from pydantic import BaseModel, Field, field_validator
+
+
+class ReadFileRequest(BaseModel):
+    file_path: str = Field(..., description="File path in MinIO bucket")
+
+    @field_validator("file_path")
+    @classmethod
+    def validate_file_path(cls, v: str) -> str:
+        normalized = os.path.normpath(v).replace("\\", "/")
+        if normalized.startswith("..") or os.path.isabs(normalized):
+            raise ValueError("file_path must be a relative path within the bucket")
+        return normalized
diff --git a/src/application/responses/file_response.py b/src/application/responses/file_response.py
@@ -0,0 +1,15 @@
+from pydantic import BaseModel, Field
+
+from domain.ports.document_reader_port import DocumentMetadata, TableData
+
+
+# API representation of one stored object as returned by the file listing.
+class FileInfoResponse(BaseModel):
+    # Object name (key) of the file in storage.
+    object_name: str
+    # Size of the object; presumably in bytes — confirm against the storage adapter.
+    size: int
+    # Last-modified timestamp as text; optional and defaults to None.
+    last_modified: str | None = None
+
+
+# API representation of the content extracted from a single file.
+class FileContentResponse(BaseModel):
+    # Extracted text of the document.
+    content: str
+    # Metadata describing the document (type declared in the document reader port).
+    metadata: DocumentMetadata
+    # Tables found in the document; defaults to a fresh empty list.
+    tables: list[TableData] = Field(default_factory=list)
diff --git a/src/application/use_cases/list_files_use_case.py b/src/application/use_cases/list_files_use_case.py
@@ -0,0 +1,10 @@
+from domain.ports.storage_port import FileInfo, StoragePort
+
+
+class ListFilesUseCase:
+    def __init__(self, storage: StoragePort, bucket: str) -> None:
+        self.storage = storage
+        self.bucket = bucket
+
+    async def execute(self, prefix: str = "", recursive: bool = True) -> list[FileInfo]:
+        return await self.storage.list_files_metadata(self.bucket, prefix, recursive)