diff --git a/.ai-context/COMMANDS.md b/.ai-context/COMMANDS.md index dbb8cac..0ad29a6 100644 --- a/.ai-context/COMMANDS.md +++ b/.ai-context/COMMANDS.md @@ -35,12 +35,16 @@ Run the v2 financial intelligence CLI with: bb --help bb init bb status +bb documents import --dry-run --file path/to/document.pdf +bb documents import --file path/to/document.pdf ``` `bb` is the side-by-side command surface for new `BB_` schema work. It should grow new document/entity/observation workflows while the legacy command surfaces remain available. Use `bb init` to apply current migrations and prepare v2 `financial/` storage roots for the active data home. +`bb documents import` is parser-free: it hashes the explicit file, records v2 +document/object metadata, and copies the canonical object into managed storage. ## BankBuddy CLI diff --git a/.ai-context/STATUS.md b/.ai-context/STATUS.md index f3ed8d8..ab4fdde 100644 --- a/.ai-context/STATUS.md +++ b/.ai-context/STATUS.md @@ -15,7 +15,8 @@ section is `Unreleased`. - Banking CLI commands for banks, accounts, statement refs, imports, transactions, categories, reports, exports, storage migration, and status. - Side-by-side `bb` CLI for v2 financial intelligence initialization, - foundation status, storage readiness, and `BB_` schema visibility. + foundation status, storage readiness, generic document import, and `BB_` + schema visibility. - Supported banking imports for Bank of America PDF/CSV, Apple Card PDF, ICICI `.xls`, and HDFC `.xls`. - Statement inventory and statement coverage auditing. @@ -32,6 +33,8 @@ section is `Unreleased`. leaving existing `bankbuddy` and `taxbuddy` commands in place. - `bb init` for applying migrations and preparing v2 financial storage roots without depending on the legacy `bankbuddy` command surface. +- Generic `bb documents import` for parser-free explicit-file intake into + `BB_DOCUMENT` and `BB_DOCUMENT_OBJECT` with SHA-256 canonical storage keys. - Prospective relicensing from MIT to AGPL-3.0-or-later. - Canonical data-home layout with `database/`, `bank/`, and `tax/` directories. - First TaxBuddy CLI slice and `tax_documents` metadata index. diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d53b8c..16c92ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,9 @@ and versions are tracked in the repo-root `VERSION` file. ### Added +- Added `bb documents import --dry-run --file ...` and + `bb documents import --file ...` for parser-free v2 document intake with + SHA-256 canonical object storage and idempotent duplicate handling. - Added `bb init` and v2 storage readiness output in `bb status`, so the v2 command surface can initialize migrations and financial storage roots independently of the legacy `bankbuddy` CLI. diff --git a/README.md b/README.md index 74d6465..4cd444d 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,8 @@ Activate the project shell once, then run the CLI directly: basectl activate bankbuddy bb init bb status +bb documents import --dry-run --file path/to/document.pdf +bb documents import --file path/to/document.pdf bankbuddy --help bankbuddy status bankbuddy init @@ -164,6 +166,11 @@ Use `bb init` for the v2 financial intelligence foundation. It applies the current SQLite migrations and prepares the `financial/` storage roots used by document-first workflows. +Use `bb documents import` for parser-free v2 document intake. It hashes the +file, records a `BB_DOCUMENT`, stores a canonical object under +`financial/canonical`, and leaves bank-specific parsing or inference for later +workflows. + Switch the current shell by exporting `BANKBUDDY_ENV`: ```bash diff --git a/src/bankbuddy/bb/cli.py b/src/bankbuddy/bb/cli.py index efcb05f..7135584 100644 --- a/src/bankbuddy/bb/cli.py +++ b/src/bankbuddy/bb/cli.py @@ -2,12 +2,16 @@ from __future__ import annotations +from pathlib import Path import sqlite3 import click from bankbuddy import __version__ from bankbuddy.database import initialize_database +from bankbuddy.bb.documents import DocumentImportError +from bankbuddy.bb.documents import import_document +from bankbuddy.bb.documents import plan_document_import from bankbuddy.paths import resolve_app_paths from bankbuddy.bb.storage import ensure_financial_storage_dirs from bankbuddy.runtime import CliRuntime @@ -160,6 +164,47 @@ def init_command(ctx: click.Context) -> None: click.echo(f"Initialized BankBuddy v2 at {paths.root}") +@main.group() +def documents() -> None: + """Manage v2 documents.""" + + +@documents.command("import") +@click.option("--dry-run", is_flag=True, help="Plan the import without writes.") +@click.option( + "--file", + "file_path", + required=True, + type=click.Path(dir_okay=False, path_type=Path), + help="Document file to import.", +) +@click.pass_context +def documents_import( + ctx: click.Context, + dry_run: bool, + file_path: Path, +) -> None: + """Import one document into v2 canonical storage.""" + + runtime = runtime_from_context(ctx) + paths = resolve_app_paths(environment=runtime.environment) + try: + if dry_run: + plan = plan_document_import(paths, file_path) + _print_document_import_plan(plan, dry_run=True) + click.echo("Database changed: no") + click.echo("Files changed: none") + return + result = import_document(paths, file_path) + except (OSError, DocumentImportError) as exc: + raise click.ClickException(str(exc)) from exc + + _print_document_import_plan(result.plan, dry_run=False) + click.echo(f"Document ID: {result.document.document_id}") + click.echo(f"Document object ID: {result.document_object.document_object_id}") + click.echo(f"Duplicate: {_yes_no(result.duplicate)}") + + def _database_table_names(database_path) -> set[str]: if not database_path.exists(): return set() @@ -176,6 +221,15 @@ def _yes_no(value: bool) -> str: return "yes" if value else "no" +def _print_document_import_plan(plan, *, dry_run: bool) -> None: + click.echo(f"Dry run: {_yes_no(dry_run)}") + click.echo(f"File: {plan.source_path.name}") + click.echo(f"SHA-256: {plan.file_hash}") + click.echo(f"Size: {plan.byte_size} bytes") + click.echo(f"Media type: {plan.media_type}") + click.echo(f"Canonical object: {plan.canonical_relative_path}") + + def _v2_storage_ready(paths) -> bool: return all( path.is_dir() diff --git a/src/bankbuddy/bb/documents.py b/src/bankbuddy/bb/documents.py new file mode 100644 index 0000000..839aadd --- /dev/null +++ b/src/bankbuddy/bb/documents.py @@ -0,0 +1,153 @@ +"""Generic document import services for the BankBuddy v2 model.""" + +from __future__ import annotations + +from dataclasses import dataclass +from hashlib import sha256 +import mimetypes +from pathlib import Path +import shutil + +from bankbuddy.bb.dao import FinancialIntelligenceDAO +from bankbuddy.bb.records import DocumentCreate +from bankbuddy.bb.records import DocumentObjectCreate +from bankbuddy.bb.records import DocumentObjectRecord +from bankbuddy.bb.records import DocumentRecord +from bankbuddy.bb.storage import FinancialStorageDAO +from bankbuddy.bb.storage import object_key_for_hash +from bankbuddy.bb.storage import protect_managed_path +from bankbuddy.bb.storage import resolve_storage_path +from bankbuddy.database import connect_database +from bankbuddy.database import initialize_database +from bankbuddy.paths import AppPaths + + +@dataclass(frozen=True) +class DocumentImportPlan: + """Dry-run-safe plan for importing one document.""" + + source_path: Path + file_hash: str + byte_size: int + media_type: str + object_key: str + canonical_relative_path: str + + +@dataclass(frozen=True) +class DocumentImportResult: + """Result of importing one document into v2 storage.""" + + plan: DocumentImportPlan + document: DocumentRecord + document_object: DocumentObjectRecord + duplicate: bool + + +class DocumentImportError(ValueError): + """Raised when a generic document import cannot be planned or completed.""" + + +def plan_document_import(paths: AppPaths, source_path: Path) -> DocumentImportPlan: + """Return a deterministic import plan without creating directories or rows.""" + + resolved_source = source_path.expanduser() + if not resolved_source.is_file(): + raise DocumentImportError(f"Document file does not exist: {resolved_source}") + + file_hash = hash_file(resolved_source) + object_key = object_key_for_hash(file_hash, resolved_source.suffix) + return DocumentImportPlan( + source_path=resolved_source, + file_hash=file_hash, + byte_size=resolved_source.stat().st_size, + media_type=guess_media_type(resolved_source), + object_key=object_key, + canonical_relative_path=f"financial/canonical/{object_key}", + ) + + +def import_document(paths: AppPaths, source_path: Path) -> DocumentImportResult: + """Import one document into the v2 canonical object store.""" + + plan = plan_document_import(paths, source_path) + initialize_database(paths) + + with connect_database(paths) as conn: + documents = FinancialIntelligenceDAO(conn) + storage = FinancialStorageDAO(conn) + document = documents.find_document_by_hash(plan.file_hash) + document_existed = document is not None + if document is None: + document = documents.create_document( + DocumentCreate( + file_hash=plan.file_hash, + original_file_name=plan.source_path.name, + ) + ) + + document_object = storage.find_document_object( + storage_root_code="FINANCIAL_CANONICAL", + object_key=plan.object_key, + ) + object_existed = document_object is not None + canonical_root = storage.get_storage_root("FINANCIAL_CANONICAL") + canonical_path = resolve_storage_path(paths, canonical_root, plan.object_key) + + if document_object is None: + _copy_canonical_object(plan.source_path, canonical_path, plan.file_hash) + document_object = storage.create_document_object( + DocumentObjectCreate( + document_id=document.document_id, + storage_root_code="FINANCIAL_CANONICAL", + object_key=plan.object_key, + object_role="canonical", + content_hash=plan.file_hash, + byte_size=plan.byte_size, + media_type=plan.media_type, + original_file_name=plan.source_path.name, + ) + ) + elif not canonical_path.exists(): + _copy_canonical_object(plan.source_path, canonical_path, plan.file_hash) + elif hash_file(canonical_path) != plan.file_hash: + raise DocumentImportError( + f"Canonical object content mismatch: {plan.canonical_relative_path}" + ) + + conn.commit() + + return DocumentImportResult( + plan=plan, + document=document, + document_object=document_object, + duplicate=document_existed and object_existed, + ) + + +def hash_file(path: Path) -> str: + """Return the SHA-256 hex digest for a local file.""" + + digest = sha256() + with path.open("rb") as source_file: + for chunk in iter(lambda: source_file.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def guess_media_type(path: Path) -> str: + """Return a stable media type for a local document path.""" + + media_type, _ = mimetypes.guess_type(path.name) + return media_type or "application/octet-stream" + + +def _copy_canonical_object(source_path: Path, canonical_path: Path, file_hash: str) -> None: + canonical_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(source_path, canonical_path) + if hash_file(canonical_path) != file_hash: + canonical_path.unlink(missing_ok=True) + raise DocumentImportError( + f"Copied document hash did not match source: {source_path.name}" + ) + protect_managed_path(canonical_path) diff --git a/src/bankbuddy/bb/storage.py b/src/bankbuddy/bb/storage.py index dc33b25..8120c74 100644 --- a/src/bankbuddy/bb/storage.py +++ b/src/bankbuddy/bb/storage.py @@ -110,6 +110,40 @@ def create_document_object( **{**record.__dict__, "object_key": object_key}, ) + def find_document_object( + self, + *, + storage_root_code: str, + object_key: str, + ) -> DocumentObjectRecord | None: + """Return one document object by storage root and key.""" + + normalized_key = validate_storage_key(object_key) + row = self._conn.execute( + """ + select + BB_DOCUMENT_OBJECT.document_object_id, + BB_DOCUMENT_OBJECT.document_id, + BB_STORAGE_ROOT.storage_root_code, + BB_DOCUMENT_OBJECT.object_key, + BB_DOCUMENT_OBJECT.object_role, + BB_DOCUMENT_OBJECT.content_hash, + BB_DOCUMENT_OBJECT.byte_size, + BB_DOCUMENT_OBJECT.media_type, + BB_DOCUMENT_OBJECT.original_file_name, + BB_DOCUMENT_OBJECT.storage_root_id + from BB_DOCUMENT_OBJECT + join BB_STORAGE_ROOT using (storage_root_id) + where + BB_STORAGE_ROOT.storage_root_code = ? + and BB_DOCUMENT_OBJECT.object_key = ? + """, + (storage_root_code, normalized_key), + ).fetchone() + if row is None: + return None + return _document_object_from_row(row) + def create_document_view( self, record: DocumentViewCreate, @@ -238,3 +272,18 @@ def _storage_root_from_row(row: sqlite3.Row) -> StorageRootRecord: permissions_mode=str(row["permissions_mode"]), active=bool(row["active"]), ) + + +def _document_object_from_row(row: sqlite3.Row) -> DocumentObjectRecord: + return DocumentObjectRecord( + document_object_id=int(row["document_object_id"]), + document_id=int(row["document_id"]), + storage_root_id=int(row["storage_root_id"]), + storage_root_code=str(row["storage_root_code"]), + object_key=str(row["object_key"]), + object_role=str(row["object_role"]), + content_hash=str(row["content_hash"]), + byte_size=row["byte_size"], + media_type=row["media_type"], + original_file_name=row["original_file_name"], + ) diff --git a/tests/test_bb_documents_cli.py b/tests/test_bb_documents_cli.py new file mode 100644 index 0000000..efab9a9 --- /dev/null +++ b/tests/test_bb_documents_cli.py @@ -0,0 +1,118 @@ +from hashlib import sha256 + +from click.testing import CliRunner + +from bankbuddy.bb.cli import main +from bankbuddy.database import connect_database +from bankbuddy.paths import resolve_app_paths + + +def test_bb_documents_import_dry_run_reports_plan_without_writes(tmp_path) -> None: + home = tmp_path / "home" + source = tmp_path / "statement.pdf" + source_bytes = b"%PDF-1.4 placeholder" + source.write_bytes(source_bytes) + file_hash = sha256(source_bytes).hexdigest() + + result = CliRunner().invoke( + main, + ["documents", "import", "--dry-run", "--file", str(source)], + env={"BANKBUDDY_HOME": str(home)}, + ) + + assert result.exit_code == 0 + assert "Dry run: yes" in result.output + assert "File: statement.pdf" in result.output + assert f"SHA-256: {file_hash}" in result.output + assert "Media type: application/pdf" in result.output + assert ( + f"Canonical object: financial/canonical/sha256/{file_hash[:2]}/" + f"{file_hash[2:4]}/{file_hash}.pdf" + ) in result.output + assert "Database changed: no" in result.output + assert "Files changed: none" in result.output + assert not (home / "database" / "bankbuddy.sqlite3").exists() + assert not (home / "financial" / "canonical").exists() + + +def test_bb_documents_import_records_document_and_canonical_object( + tmp_path, +) -> None: + home = tmp_path / "home" + source = tmp_path / "statement.pdf" + source_bytes = b"%PDF-1.4 placeholder" + source.write_bytes(source_bytes) + file_hash = sha256(source_bytes).hexdigest() + + result = CliRunner().invoke( + main, + ["documents", "import", "--file", str(source)], + env={"BANKBUDDY_HOME": str(home)}, + ) + + assert result.exit_code == 0 + assert "Dry run: no" in result.output + assert "Document ID: 1" in result.output + assert "Document object ID: 1" in result.output + assert "Duplicate: no" in result.output + canonical_path = ( + home / "financial" / "canonical" / "sha256" / file_hash[:2] / file_hash[2:4] + / f"{file_hash}.pdf" + ) + assert canonical_path.read_bytes() == source_bytes + with connect_database(resolve_app_paths(home)) as conn: + document_count = conn.execute("select count(*) from BB_DOCUMENT").fetchone()[0] + object_count = conn.execute( + "select count(*) from BB_DOCUMENT_OBJECT" + ).fetchone()[0] + document = conn.execute( + """ + select file_hash, original_file_name + from BB_DOCUMENT + where document_id = 1 + """ + ).fetchone() + document_object = conn.execute( + """ + select object_key, object_role, content_hash, byte_size, media_type + from BB_DOCUMENT_OBJECT + where document_object_id = 1 + """ + ).fetchone() + + assert document_count == 1 + assert object_count == 1 + assert document["file_hash"] == file_hash + assert document["original_file_name"] == "statement.pdf" + assert document_object["object_key"] == ( + f"sha256/{file_hash[:2]}/{file_hash[2:4]}/{file_hash}.pdf" + ) + assert document_object["object_role"] == "canonical" + assert document_object["content_hash"] == file_hash + assert document_object["byte_size"] == len(source_bytes) + assert document_object["media_type"] == "application/pdf" + + +def test_bb_documents_import_is_idempotent_for_existing_hash(tmp_path) -> None: + home = tmp_path / "home" + source = tmp_path / "statement.pdf" + source.write_bytes(b"%PDF-1.4 placeholder") + + runner = CliRunner() + env = {"BANKBUDDY_HOME": str(home)} + first = runner.invoke(main, ["documents", "import", "--file", str(source)], env=env) + second = runner.invoke(main, ["documents", "import", "--file", str(source)], env=env) + + assert first.exit_code == 0 + assert second.exit_code == 0 + assert "Document ID: 1" in second.output + assert "Document object ID: 1" in second.output + assert "Duplicate: yes" in second.output + with connect_database(resolve_app_paths(home)) as conn: + document_count = conn.execute("select count(*) from BB_DOCUMENT").fetchone()[0] + object_count = conn.execute( + "select count(*) from BB_DOCUMENT_OBJECT" + ).fetchone()[0] + + assert document_count == 1 + assert object_count == 1