diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..1c49803 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,9 @@ +{ + "python-envs.pythonProjects": [ + { + "path": ".", + "envManager": "ms-python.python:venv", + "packageManager": "ms-python.python:pip" + } + ] +} \ No newline at end of file diff --git a/README.md b/README.md index 0d04255..e380b84 100644 --- a/README.md +++ b/README.md @@ -214,25 +214,36 @@ Response (`200 OK`): | `prefix` | string | `""` | MinIO prefix to filter files by | | `recursive` | boolean | `true` | List files in subdirectories | -### List folders +### Upload a file -Returns top-level folder prefixes in the bucket. REST-only endpoint (not exposed as an MCP tool). +Uploads a file directly to the MinIO bucket. The file is stored at `{prefix}{filename}`. This endpoint does **not** index the file — use the `POST /file/index` endpoint after uploading to add it to the RAG knowledge base. + +**Allowed file types:** `.pdf`, `.txt`, `.docx`, `.xlsx`, `.pptx`, `.md`, `.csv`, `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.svg`, `.bmp`, `.html`, `.xml`, `.json`, `.rtf`, `.odt`, `.ods` +**Maximum file size:** 50 MB ```bash -curl http://localhost:8000/api/v1/files/folders +curl -X POST http://localhost:8000/api/v1/files/upload \ + -F "file=@report.pdf" \ + -F "prefix=documents/" ``` -Response (`200 OK`): +Response (`201 Created`): ```json -["documents/", "photos/", "reports/"] +{"object_name": "documents/report.pdf", "size": 2048, "message": "File uploaded successfully"} ``` +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `file` | file | yes | -- | The file to upload (multipart form) | +| `prefix` | string | no | `""` | MinIO prefix (folder path). Must be a relative path | + Error responses: | Status | Condition | |--------|-----------| -| `404` | Bucket not found | +| `413` | File exceeds 50 MB limit | +| `422` | Invalid prefix (path traversal/absolute), disallowed file type, or missing file | ### Read a file @@ -612,6 +623,7 @@ src/ list_files_use_case.py -- Lists files with metadata from MinIO list_folders_use_case.py -- Lists folder prefixes from MinIO read_file_use_case.py -- Reads file from MinIO, extracts content via Kreuzberg + upload_file_use_case.py -- Uploads file to MinIO storage infrastructure/ rag/ lightrag_adapter.py -- LightRAGAdapter (RAGAnything/LightRAG) diff --git a/src/application/api/file_routes.py b/src/application/api/file_routes.py index 91e7ea2..1af66a6 100644 --- a/src/application/api/file_routes.py +++ b/src/application/api/file_routes.py @@ -1,20 +1,63 @@ +import posixpath from dataclasses import asdict -from fastapi import APIRouter, Depends, HTTPException, status +from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile, status from application.requests.file_request import ReadFileRequest from application.responses.file_response import FileContentResponse, FileInfoResponse from application.use_cases.list_files_use_case import ListFilesUseCase from application.use_cases.list_folders_use_case import ListFoldersUseCase from application.use_cases.read_file_use_case import ReadFileUseCase +from application.use_cases.upload_file_use_case import UploadFileUseCase from dependencies import ( get_list_files_use_case, get_list_folders_use_case, get_read_file_use_case, + get_upload_file_use_case, ) file_router = APIRouter(tags=["Files"]) +MAX_UPLOAD_SIZE = 50 * 1024 * 1024 + +ALLOWED_EXTENSIONS = { + ".pdf", ".txt", ".docx", ".xlsx", ".pptx", ".md", ".csv", + ".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".bmp", + ".html", ".xml", ".json", ".rtf", ".odt", ".ods", +} + +ALLOWED_MIME_PREFIXES = ( + "application/pdf", + "text/", + "image/", + "application/vnd.openxmlformats-officedocument", + "application/vnd.ms-", + "application/json", + "application/rtf", + "application/vnd.oasis.opendocument", +) + + +def _sanitize_filename(filename: str | None) -> str: + if not filename: + raise HTTPException(status_code=422, detail="Filename is required") + clean = posixpath.basename(filename.replace("\\", "/")) + if not clean or clean.startswith("."): + raise HTTPException(status_code=422, detail="Invalid filename") + return clean + + +def _validate_file_type(filename: str, content_type: str) -> None: + ext = posixpath.splitext(filename)[1].lower() + if ext not in ALLOWED_EXTENSIONS: + raise HTTPException( + status_code=422, detail=f"File type '{ext}' is not allowed" + ) + if not any(content_type.startswith(p) for p in ALLOWED_MIME_PREFIXES): + raise HTTPException( + status_code=422, detail=f"Content type '{content_type}' is not allowed" + ) + @file_router.get( "/files/list", @@ -70,3 +113,46 @@ async def read_file( metadata=result.metadata, tables=result.tables, ) + + +@file_router.post( + "/files/upload", + status_code=status.HTTP_201_CREATED, +) +async def upload_file( + file: UploadFile = File(...), + prefix: str = Form(default=""), + use_case: UploadFileUseCase = Depends(get_upload_file_use_case), +): + normalized = posixpath.normpath(prefix.replace("\\", "/")) + if normalized == ".": + normalized = "" + if normalized.startswith("..") or posixpath.isabs(normalized): + raise HTTPException( + status_code=422, detail="prefix must be a relative path" + ) + if prefix.endswith("/") and not normalized.endswith("/"): + normalized += "/" + + safe_name = _sanitize_filename(file.filename) + content_type = file.content_type or "application/octet-stream" + _validate_file_type(safe_name, content_type) + + file_data = await file.read() + if len(file_data) > MAX_UPLOAD_SIZE: + raise HTTPException( + status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, + detail=f"File exceeds maximum allowed size of {MAX_UPLOAD_SIZE // (1024 * 1024)} MB", + ) + + result = await use_case.execute( + file_data=file_data, + file_name=safe_name, + prefix=normalized, + content_type=content_type, + ) + return { + "object_name": result.object_name, + "size": result.size, + "message": "File uploaded successfully", + } diff --git a/src/application/use_cases/upload_file_use_case.py b/src/application/use_cases/upload_file_use_case.py new file mode 100644 index 0000000..3480ec9 --- /dev/null +++ b/src/application/use_cases/upload_file_use_case.py @@ -0,0 +1,16 @@ +from domain.ports.storage_port import FileInfo, StoragePort + + +class UploadFileUseCase: + def __init__(self, storage: StoragePort, bucket: str) -> None: + self.storage = storage + self.bucket = bucket + + async def execute( + self, file_data: bytes, file_name: str, prefix: str, content_type: str + ) -> FileInfo: + if prefix and not prefix.endswith("/"): + prefix += "/" + object_path = prefix + file_name + await self.storage.put_object(self.bucket, object_path, file_data, content_type) + return FileInfo(object_name=object_path, size=len(file_data)) diff --git a/src/dependencies.py b/src/dependencies.py index 87a263a..feef70e 100644 --- a/src/dependencies.py +++ b/src/dependencies.py @@ -10,6 +10,7 @@ from application.use_cases.multimodal_query_use_case import MultimodalQueryUseCase from application.use_cases.query_use_case import QueryUseCase from application.use_cases.read_file_use_case import ReadFileUseCase +from application.use_cases.upload_file_use_case import UploadFileUseCase from config import ( AppConfig, BM25Config, @@ -98,6 +99,9 @@ def get_read_file_use_case() -> ReadFileUseCase: ) +def get_upload_file_use_case() -> UploadFileUseCase: + return UploadFileUseCase(storage=minio_adapter, bucket=minio_config.MINIO_BUCKET) + def get_liveness_check_use_case() -> LivenessCheckUseCase: return LivenessCheckUseCase( storage=minio_adapter, diff --git a/src/domain/ports/storage_port.py b/src/domain/ports/storage_port.py index 4f5f733..45ce784 100644 --- a/src/domain/ports/storage_port.py +++ b/src/domain/ports/storage_port.py @@ -46,6 +46,24 @@ async def list_objects( """ pass + @abstractmethod + async def put_object( + self, bucket: str, object_path: str, data: bytes, content_type: str + ) -> None: + """ + Store an object in the given bucket. + + Args: + bucket: The bucket name to store the object in. + object_path: The path/key for the object within the bucket. + data: The raw bytes to store. + content_type: The MIME type of the content. + + Raises: + FileNotFoundError: If the bucket does not exist. + """ + pass + @abstractmethod async def list_files_metadata( self, bucket: str, prefix: str, recursive: bool = True diff --git a/src/infrastructure/storage/minio_adapter.py b/src/infrastructure/storage/minio_adapter.py index 3eb0b9a..ac557c6 100644 --- a/src/infrastructure/storage/minio_adapter.py +++ b/src/infrastructure/storage/minio_adapter.py @@ -1,4 +1,5 @@ import asyncio +import io import logging from minio import Minio @@ -42,6 +43,28 @@ async def get_object(self, bucket: str, object_path: str) -> bytes: logger.error("MinIO error retrieving object: %s", e, exc_info=True) raise + async def put_object( + self, bucket: str, object_path: str, data: bytes, content_type: str + ) -> None: + try: + loop = asyncio.get_running_loop() + await loop.run_in_executor( + None, + lambda: self.client.put_object( + bucket, + object_path, + io.BytesIO(data), + len(data), + content_type=content_type, + ), + ) + except S3Error as e: + if e.code == "NoSuchBucket": + logger.info("Bucket not found: %s", bucket) + raise FileNotFoundError(f"Bucket not found: {bucket}") from e + logger.error("MinIO error uploading object: %s", e, exc_info=True) + raise + async def _list_minio_objects( self, bucket: str, prefix: str, recursive: bool = True ) -> list: diff --git a/tests/unit/test_upload_file_use_case.py b/tests/unit/test_upload_file_use_case.py new file mode 100644 index 0000000..a9b0c82 --- /dev/null +++ b/tests/unit/test_upload_file_use_case.py @@ -0,0 +1,129 @@ +"""TDD tests for UploadFileUseCase — the implementation does not exist yet.""" + +from unittest.mock import AsyncMock + +import pytest + +from application.use_cases.upload_file_use_case import UploadFileUseCase +from domain.ports.storage_port import FileInfo + + +class TestUploadFileUseCase: + """Tests for the upload file use case. + + The use case will: + - Depend on StoragePort (abstract port) + - Construct the full object path as prefix + file_name + - Call storage.put_object(bucket, object_path, file_data, content_type) + - Return a FileInfo with object_name, size (len of data), last_modified=None + """ + + @pytest.fixture + def use_case(self, mock_storage: AsyncMock) -> UploadFileUseCase: + return UploadFileUseCase(storage=mock_storage, bucket="test-bucket") + + async def test_successful_upload_with_prefix_calls_put_object( + self, use_case: UploadFileUseCase, mock_storage: AsyncMock + ) -> None: + """Should call storage.put_object with prefix + file_name.""" + file_data = b"hello world" + file_name = "report.pdf" + prefix = "documents/" + content_type = "application/pdf" + + await use_case.execute( + file_data=file_data, + file_name=file_name, + prefix=prefix, + content_type=content_type, + ) + + mock_storage.put_object.assert_called_once_with( + "test-bucket", + "documents/report.pdf", + b"hello world", + "application/pdf", + ) + + async def test_successful_upload_returns_file_info( + self, use_case: UploadFileUseCase + ) -> None: + """Should return a FileInfo with object_name, size, and last_modified=None.""" + file_data = b"hello world" + file_name = "report.pdf" + prefix = "documents/" + content_type = "application/pdf" + + result = await use_case.execute( + file_data=file_data, + file_name=file_name, + prefix=prefix, + content_type=content_type, + ) + + assert isinstance(result, FileInfo) + assert result.object_name == "documents/report.pdf" + assert result.size == len(b"hello world") + assert result.last_modified is None + + async def test_upload_with_empty_prefix_uses_just_file_name( + self, use_case: UploadFileUseCase, mock_storage: AsyncMock + ) -> None: + """When prefix is empty, object_path should be just the file_name.""" + file_data = b"data" + file_name = "image.png" + prefix = "" + content_type = "image/png" + + result = await use_case.execute( + file_data=file_data, + file_name=file_name, + prefix=prefix, + content_type=content_type, + ) + + assert result.object_name == "image.png" + mock_storage.put_object.assert_called_once_with( + "test-bucket", + "image.png", + b"data", + "image/png", + ) + + async def test_upload_with_prefix_not_ending_in_slash_still_works( + self, use_case: UploadFileUseCase, mock_storage: AsyncMock + ) -> None: + """Prefix without trailing slash should have slash appended.""" + file_data = b"content" + file_name = "file.txt" + prefix = "folder" + content_type = "text/plain" + + result = await use_case.execute( + file_data=file_data, + file_name=file_name, + prefix=prefix, + content_type=content_type, + ) + + assert result.object_name == "folder/file.txt" + mock_storage.put_object.assert_called_once_with( + "test-bucket", + "folder/file.txt", + b"content", + "text/plain", + ) + + async def test_file_not_found_error_from_storage_propagates( + self, use_case: UploadFileUseCase, mock_storage: AsyncMock + ) -> None: + """FileNotFoundError raised by storage should propagate to the caller.""" + mock_storage.put_object.side_effect = FileNotFoundError("bucket not found") + + with pytest.raises(FileNotFoundError, match="bucket not found"): + await use_case.execute( + file_data=b"data", + file_name="test.txt", + prefix="", + content_type="text/plain", + ) diff --git a/tests/unit/test_upload_routes.py b/tests/unit/test_upload_routes.py new file mode 100644 index 0000000..e96d78b --- /dev/null +++ b/tests/unit/test_upload_routes.py @@ -0,0 +1,161 @@ +"""TDD tests for the upload file route — the implementation does not exist yet.""" + +from unittest.mock import AsyncMock + +import httpx +import pytest +from httpx import ASGITransport + +from application.use_cases.upload_file_use_case import UploadFileUseCase +from dependencies import get_upload_file_use_case +from domain.ports.storage_port import FileInfo +from main import app + + +@pytest.fixture(autouse=True) +def _clear_dependency_overrides(): + yield + app.dependency_overrides.clear() + + +class TestUploadFileRoute: + """Tests for POST /api/v1/files/upload. + + The route will: + - Accept multipart form with `file` (UploadFile) and `prefix` (optional, default "") + - Validate prefix (reject path traversal `..` and absolute paths) + - Call UploadFileUseCase + - Return HTTP 201 with {object_name, size, message} + """ + + @pytest.fixture + def mock_upload_use_case(self) -> AsyncMock: + mock = AsyncMock(spec=UploadFileUseCase) + mock.execute.return_value = FileInfo( + object_name="documents/report.pdf", + size=2048, + last_modified=None, + ) + return mock + + async def test_successful_upload_returns_201( + self, mock_upload_use_case: AsyncMock + ) -> None: + """Should return 201 with object_name and size on successful upload.""" + app.dependency_overrides[get_upload_file_use_case] = lambda: ( + mock_upload_use_case + ) + + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.post( + "/api/v1/files/upload", + files={"file": ("report.pdf", b"fake pdf content", "application/pdf")}, + data={"prefix": "documents/"}, + ) + + assert response.status_code == 201 + body = response.json() + assert body["object_name"] == "documents/report.pdf" + assert body["size"] == 2048 + + async def test_successful_upload_includes_message( + self, mock_upload_use_case: AsyncMock + ) -> None: + """Should include a success message in the response.""" + app.dependency_overrides[get_upload_file_use_case] = lambda: ( + mock_upload_use_case + ) + + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.post( + "/api/v1/files/upload", + files={"file": ("report.pdf", b"data", "application/pdf")}, + data={"prefix": "documents/"}, + ) + + body = response.json() + assert "message" in body + + async def test_upload_without_prefix_uses_empty_string( + self, mock_upload_use_case: AsyncMock + ) -> None: + """Should default to empty prefix when prefix is not provided.""" + app.dependency_overrides[get_upload_file_use_case] = lambda: ( + mock_upload_use_case + ) + + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.post( + "/api/v1/files/upload", + files={"file": ("photo.jpg", b"image data", "image/jpeg")}, + ) + + assert response.status_code == 201 + mock_upload_use_case.execute.assert_called_once_with( + file_data=b"image data", + file_name="photo.jpg", + prefix="", + content_type="image/jpeg", + ) + + async def test_upload_with_path_traversal_in_prefix_returns_422( + self, mock_upload_use_case: AsyncMock + ) -> None: + """Should reject prefix containing '..' path traversal.""" + app.dependency_overrides[get_upload_file_use_case] = lambda: ( + mock_upload_use_case + ) + + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.post( + "/api/v1/files/upload", + files={"file": ("report.pdf", b"data", "application/pdf")}, + data={"prefix": "../../etc/"}, + ) + + assert response.status_code == 422 + + async def test_upload_with_absolute_path_in_prefix_returns_422( + self, mock_upload_use_case: AsyncMock + ) -> None: + """Should reject prefix that is an absolute path.""" + app.dependency_overrides[get_upload_file_use_case] = lambda: ( + mock_upload_use_case + ) + + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.post( + "/api/v1/files/upload", + files={"file": ("report.pdf", b"data", "application/pdf")}, + data={"prefix": "/etc/secrets/"}, + ) + + assert response.status_code == 422 + + async def test_upload_missing_file_returns_422( + self, mock_upload_use_case: AsyncMock + ) -> None: + """Should return 422 when no file is provided in the request.""" + app.dependency_overrides[get_upload_file_use_case] = lambda: ( + mock_upload_use_case + ) + + async with httpx.AsyncClient( + transport=ASGITransport(app=app), base_url="http://test" + ) as client: + response = await client.post( + "/api/v1/files/upload", + data={"prefix": "documents/"}, + ) + + assert response.status_code == 422