Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"python-envs.pythonProjects": [
{
"path": ".",
"envManager": "ms-python.python:venv",
"packageManager": "ms-python.python:pip"
}
]
}
24 changes: 18 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -214,25 +214,36 @@ Response (`200 OK`):
| `prefix` | string | `""` | MinIO prefix to filter files by |
| `recursive` | boolean | `true` | List files in subdirectories |

### List folders
### Upload a file

Returns top-level folder prefixes in the bucket. REST-only endpoint (not exposed as an MCP tool).
Uploads a file directly to the MinIO bucket. The file is stored at `{prefix}{filename}`. This endpoint does **not** index the file — use the `POST /file/index` endpoint after uploading to add it to the RAG knowledge base.

**Allowed file types:** `.pdf`, `.txt`, `.docx`, `.xlsx`, `.pptx`, `.md`, `.csv`, `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.svg`, `.bmp`, `.html`, `.xml`, `.json`, `.rtf`, `.odt`, `.ods`
**Maximum file size:** 50 MB

```bash
curl http://localhost:8000/api/v1/files/folders
curl -X POST http://localhost:8000/api/v1/files/upload \
-F "file=@report.pdf" \
-F "prefix=documents/"
```

Response (`200 OK`):
Response (`201 Created`):

```json
["documents/", "photos/", "reports/"]
{"object_name": "documents/report.pdf", "size": 2048, "message": "File uploaded successfully"}
```

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `file` | file | yes | -- | The file to upload (multipart form) |
| `prefix` | string | no | `""` | MinIO prefix (folder path). Must be a relative path |

Error responses:

| Status | Condition |
|--------|-----------|
| `404` | Bucket not found |
| `413` | File exceeds 50 MB limit |
| `422` | Invalid prefix (path traversal/absolute), disallowed file type, or missing file |

### Read a file

Expand Down Expand Up @@ -612,6 +623,7 @@ src/
list_files_use_case.py -- Lists files with metadata from MinIO
list_folders_use_case.py -- Lists folder prefixes from MinIO
read_file_use_case.py -- Reads file from MinIO, extracts content via Kreuzberg
upload_file_use_case.py -- Uploads file to MinIO storage
infrastructure/
rag/
lightrag_adapter.py -- LightRAGAdapter (RAGAnything/LightRAG)
Expand Down
88 changes: 87 additions & 1 deletion src/application/api/file_routes.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,63 @@
import posixpath
from dataclasses import asdict

from fastapi import APIRouter, Depends, HTTPException, status
from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile, status

from application.requests.file_request import ReadFileRequest
from application.responses.file_response import FileContentResponse, FileInfoResponse
from application.use_cases.list_files_use_case import ListFilesUseCase
from application.use_cases.list_folders_use_case import ListFoldersUseCase
from application.use_cases.read_file_use_case import ReadFileUseCase
from application.use_cases.upload_file_use_case import UploadFileUseCase
from dependencies import (
get_list_files_use_case,
get_list_folders_use_case,
get_read_file_use_case,
get_upload_file_use_case,
)

file_router = APIRouter(tags=["Files"])

MAX_UPLOAD_SIZE = 50 * 1024 * 1024

ALLOWED_EXTENSIONS = {
".pdf", ".txt", ".docx", ".xlsx", ".pptx", ".md", ".csv",
".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".bmp",
".html", ".xml", ".json", ".rtf", ".odt", ".ods",
}

ALLOWED_MIME_PREFIXES = (
"application/pdf",
"text/",
"image/",
"application/vnd.openxmlformats-officedocument",
"application/vnd.ms-",
"application/json",
"application/rtf",
"application/vnd.oasis.opendocument",
)


def _sanitize_filename(filename: str | None) -> str:
if not filename:
raise HTTPException(status_code=422, detail="Filename is required")
clean = posixpath.basename(filename.replace("\\", "/"))
if not clean or clean.startswith("."):
raise HTTPException(status_code=422, detail="Invalid filename")
return clean


def _validate_file_type(filename: str, content_type: str) -> None:
ext = posixpath.splitext(filename)[1].lower()
if ext not in ALLOWED_EXTENSIONS:
raise HTTPException(
status_code=422, detail=f"File type '{ext}' is not allowed"
)
if not any(content_type.startswith(p) for p in ALLOWED_MIME_PREFIXES):
raise HTTPException(
status_code=422, detail=f"Content type '{content_type}' is not allowed"
)


@file_router.get(
"/files/list",
Expand Down Expand Up @@ -70,3 +113,46 @@ async def read_file(
metadata=result.metadata,
tables=result.tables,
)


@file_router.post(
"/files/upload",
status_code=status.HTTP_201_CREATED,
)
async def upload_file(
file: UploadFile = File(...),
prefix: str = Form(default=""),
use_case: UploadFileUseCase = Depends(get_upload_file_use_case),
):
normalized = posixpath.normpath(prefix.replace("\\", "/"))
if normalized == ".":
normalized = ""
if normalized.startswith("..") or posixpath.isabs(normalized):
raise HTTPException(
status_code=422, detail="prefix must be a relative path"
)
if prefix.endswith("/") and not normalized.endswith("/"):
normalized += "/"

safe_name = _sanitize_filename(file.filename)
content_type = file.content_type or "application/octet-stream"
_validate_file_type(safe_name, content_type)

file_data = await file.read()
if len(file_data) > MAX_UPLOAD_SIZE:
raise HTTPException(
status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
detail=f"File exceeds maximum allowed size of {MAX_UPLOAD_SIZE // (1024 * 1024)} MB",
)

result = await use_case.execute(
file_data=file_data,
file_name=safe_name,
prefix=normalized,
content_type=content_type,
)
return {
"object_name": result.object_name,
"size": result.size,
"message": "File uploaded successfully",
}
16 changes: 16 additions & 0 deletions src/application/use_cases/upload_file_use_case.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from domain.ports.storage_port import FileInfo, StoragePort


class UploadFileUseCase:
def __init__(self, storage: StoragePort, bucket: str) -> None:
self.storage = storage
self.bucket = bucket

async def execute(
self, file_data: bytes, file_name: str, prefix: str, content_type: str
) -> FileInfo:
if prefix and not prefix.endswith("/"):
prefix += "/"
object_path = prefix + file_name
await self.storage.put_object(self.bucket, object_path, file_data, content_type)
return FileInfo(object_name=object_path, size=len(file_data))
4 changes: 4 additions & 0 deletions src/dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from application.use_cases.multimodal_query_use_case import MultimodalQueryUseCase
from application.use_cases.query_use_case import QueryUseCase
from application.use_cases.read_file_use_case import ReadFileUseCase
from application.use_cases.upload_file_use_case import UploadFileUseCase
from config import (
AppConfig,
BM25Config,
Expand Down Expand Up @@ -98,6 +99,9 @@ def get_read_file_use_case() -> ReadFileUseCase:
)


def get_upload_file_use_case() -> UploadFileUseCase:
return UploadFileUseCase(storage=minio_adapter, bucket=minio_config.MINIO_BUCKET)

def get_liveness_check_use_case() -> LivenessCheckUseCase:
return LivenessCheckUseCase(
storage=minio_adapter,
Expand Down
18 changes: 18 additions & 0 deletions src/domain/ports/storage_port.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,24 @@ async def list_objects(
"""
pass

@abstractmethod
async def put_object(
self, bucket: str, object_path: str, data: bytes, content_type: str
) -> None:
"""
Store an object in the given bucket.

Args:
bucket: The bucket name to store the object in.
object_path: The path/key for the object within the bucket.
data: The raw bytes to store.
content_type: The MIME type of the content.

Raises:
FileNotFoundError: If the bucket does not exist.
"""
pass

@abstractmethod
async def list_files_metadata(
self, bucket: str, prefix: str, recursive: bool = True
Expand Down
23 changes: 23 additions & 0 deletions src/infrastructure/storage/minio_adapter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import asyncio
import io
import logging

from minio import Minio
Expand Down Expand Up @@ -42,6 +43,28 @@ async def get_object(self, bucket: str, object_path: str) -> bytes:
logger.error("MinIO error retrieving object: %s", e, exc_info=True)
raise

async def put_object(
self, bucket: str, object_path: str, data: bytes, content_type: str
) -> None:
try:
loop = asyncio.get_running_loop()
await loop.run_in_executor(
None,
lambda: self.client.put_object(
bucket,
object_path,
io.BytesIO(data),
len(data),
content_type=content_type,
),
)
except S3Error as e:
if e.code == "NoSuchBucket":
logger.info("Bucket not found: %s", bucket)
raise FileNotFoundError(f"Bucket not found: {bucket}") from e
logger.error("MinIO error uploading object: %s", e, exc_info=True)
raise

async def _list_minio_objects(
self, bucket: str, prefix: str, recursive: bool = True
) -> list:
Expand Down
129 changes: 129 additions & 0 deletions tests/unit/test_upload_file_use_case.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""TDD tests for UploadFileUseCase — the implementation does not exist yet."""

from unittest.mock import AsyncMock

import pytest

from application.use_cases.upload_file_use_case import UploadFileUseCase
from domain.ports.storage_port import FileInfo


class TestUploadFileUseCase:
"""Tests for the upload file use case.

The use case will:
- Depend on StoragePort (abstract port)
- Construct the full object path as prefix + file_name
- Call storage.put_object(bucket, object_path, file_data, content_type)
- Return a FileInfo with object_name, size (len of data), last_modified=None
"""

@pytest.fixture
def use_case(self, mock_storage: AsyncMock) -> UploadFileUseCase:
return UploadFileUseCase(storage=mock_storage, bucket="test-bucket")

async def test_successful_upload_with_prefix_calls_put_object(
self, use_case: UploadFileUseCase, mock_storage: AsyncMock
) -> None:
"""Should call storage.put_object with prefix + file_name."""
file_data = b"hello world"
file_name = "report.pdf"
prefix = "documents/"
content_type = "application/pdf"

await use_case.execute(
file_data=file_data,
file_name=file_name,
prefix=prefix,
content_type=content_type,
)

mock_storage.put_object.assert_called_once_with(
"test-bucket",
"documents/report.pdf",
b"hello world",
"application/pdf",
)

async def test_successful_upload_returns_file_info(
self, use_case: UploadFileUseCase
) -> None:
"""Should return a FileInfo with object_name, size, and last_modified=None."""
file_data = b"hello world"
file_name = "report.pdf"
prefix = "documents/"
content_type = "application/pdf"

result = await use_case.execute(
file_data=file_data,
file_name=file_name,
prefix=prefix,
content_type=content_type,
)

assert isinstance(result, FileInfo)
assert result.object_name == "documents/report.pdf"
assert result.size == len(b"hello world")
assert result.last_modified is None

async def test_upload_with_empty_prefix_uses_just_file_name(
self, use_case: UploadFileUseCase, mock_storage: AsyncMock
) -> None:
"""When prefix is empty, object_path should be just the file_name."""
file_data = b"data"
file_name = "image.png"
prefix = ""
content_type = "image/png"

result = await use_case.execute(
file_data=file_data,
file_name=file_name,
prefix=prefix,
content_type=content_type,
)

assert result.object_name == "image.png"
mock_storage.put_object.assert_called_once_with(
"test-bucket",
"image.png",
b"data",
"image/png",
)

async def test_upload_with_prefix_not_ending_in_slash_still_works(
self, use_case: UploadFileUseCase, mock_storage: AsyncMock
) -> None:
"""Prefix without trailing slash should have slash appended."""
file_data = b"content"
file_name = "file.txt"
prefix = "folder"
content_type = "text/plain"

result = await use_case.execute(
file_data=file_data,
file_name=file_name,
prefix=prefix,
content_type=content_type,
)

assert result.object_name == "folder/file.txt"
mock_storage.put_object.assert_called_once_with(
"test-bucket",
"folder/file.txt",
b"content",
"text/plain",
)

async def test_file_not_found_error_from_storage_propagates(
self, use_case: UploadFileUseCase, mock_storage: AsyncMock
) -> None:
"""FileNotFoundError raised by storage should propagate to the caller."""
mock_storage.put_object.side_effect = FileNotFoundError("bucket not found")

with pytest.raises(FileNotFoundError, match="bucket not found"):
await use_case.execute(
file_data=b"data",
file_name="test.txt",
prefix="",
content_type="text/plain",
)
Loading
Loading