codeforester · codeforester · Jun 16, 2026 · Jun 16, 2026
diff --git a/.ai-context/COMMANDS.md b/.ai-context/COMMANDS.md
@@ -35,12 +35,16 @@ Run the v2 financial intelligence CLI with:
 bb --help
 bb init
 bb status
+bb documents import --dry-run --file path/to/document.pdf
+bb documents import --file path/to/document.pdf
 ```
 
 `bb` is the side-by-side command surface for new `BB_` schema work. It should
 grow new document/entity/observation workflows while the legacy command
 surfaces remain available. Use `bb init` to apply current migrations and
 prepare v2 `financial/` storage roots for the active data home.
+`bb documents import` is parser-free: it hashes the explicit file, records v2
+document/object metadata, and copies the canonical object into managed storage.
 
 ## BankBuddy CLI
 

diff --git a/.ai-context/STATUS.md b/.ai-context/STATUS.md
@@ -15,7 +15,8 @@ section is `Unreleased`.
 - Banking CLI commands for banks, accounts, statement refs, imports,
   transactions, categories, reports, exports, storage migration, and status.
 - Side-by-side `bb` CLI for v2 financial intelligence initialization,
-  foundation status, storage readiness, and `BB_` schema visibility.
+  foundation status, storage readiness, generic document import, and `BB_`
+  schema visibility.
 - Supported banking imports for Bank of America PDF/CSV, Apple Card PDF, ICICI
   `.xls`, and HDFC `.xls`.
 - Statement inventory and statement coverage auditing.
@@ -32,6 +33,8 @@ section is `Unreleased`.
   leaving existing `bankbuddy` and `taxbuddy` commands in place.
 - `bb init` for applying migrations and preparing v2 financial storage roots
   without depending on the legacy `bankbuddy` command surface.
+- Generic `bb documents import` for parser-free explicit-file intake into
+  `BB_DOCUMENT` and `BB_DOCUMENT_OBJECT` with SHA-256 canonical storage keys.
 - Prospective relicensing from MIT to AGPL-3.0-or-later.
 - Canonical data-home layout with `database/`, `bank/`, and `tax/` directories.
 - First TaxBuddy CLI slice and `tax_documents` metadata index.

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -32,6 +32,9 @@ and versions are tracked in the repo-root `VERSION` file.
 
 ### Added
 
+- Added `bb documents import --dry-run --file ...` and
+  `bb documents import --file ...` for parser-free v2 document intake with
+  SHA-256 canonical object storage and idempotent duplicate handling.
 - Added `bb init` and v2 storage readiness output in `bb status`, so the v2
   command surface can initialize migrations and financial storage roots
   independently of the legacy `bankbuddy` CLI.

diff --git a/README.md b/README.md
@@ -45,6 +45,8 @@ Activate the project shell once, then run the CLI directly:
 basectl activate bankbuddy
 bb init
 bb status
+bb documents import --dry-run --file path/to/document.pdf
+bb documents import --file path/to/document.pdf
 bankbuddy --help
 bankbuddy status
 bankbuddy init
@@ -164,6 +166,11 @@ Use `bb init` for the v2 financial intelligence foundation. It applies the
 current SQLite migrations and prepares the `financial/` storage roots used by
 document-first workflows.
 
+Use `bb documents import` for parser-free v2 document intake. It hashes the
+file, records a `BB_DOCUMENT`, stores a canonical object under
+`financial/canonical`, and leaves bank-specific parsing or inference for later
+workflows.
+
 Switch the current shell by exporting `BANKBUDDY_ENV`:
 
 ```bash

diff --git a/src/bankbuddy/bb/cli.py b/src/bankbuddy/bb/cli.py
@@ -2,12 +2,16 @@
 
 from __future__ import annotations
 
+from pathlib import Path
 import sqlite3
 
 import click
 
 from bankbuddy import __version__
 from bankbuddy.database import initialize_database
+from bankbuddy.bb.documents import DocumentImportError
+from bankbuddy.bb.documents import import_document
+from bankbuddy.bb.documents import plan_document_import
 from bankbuddy.paths import resolve_app_paths
 from bankbuddy.bb.storage import ensure_financial_storage_dirs
 from bankbuddy.runtime import CliRuntime
@@ -160,6 +164,47 @@ def init_command(ctx: click.Context) -> None:
     click.echo(f"Initialized BankBuddy v2 at {paths.root}")
 
 
+@main.group()
+def documents() -> None:
+    """Manage v2 documents.
+
+    Click group registered on ``main``; it only namespaces the document
+    subcommands (such as ``import``) and performs no work itself.
+    """
+
+
+@documents.command("import")
+@click.option("--dry-run", is_flag=True, help="Plan the import without writes.")
+@click.option(
+    "--file",
+    "file_path",
+    required=True,
+    type=click.Path(dir_okay=False, path_type=Path),
+    help="Document file to import.",
+)
+@click.pass_context
+def documents_import(
+    ctx: click.Context,
+    dry_run: bool,
+    file_path: Path,
+) -> None:
+    """Import one document into v2 canonical storage.
+
+    With ``--dry-run`` only the import plan is computed and printed, followed
+    by explicit "nothing changed" lines. Otherwise the document is imported
+    and the resulting row identifiers and duplicate flag are printed.
+    ``OSError`` and ``DocumentImportError`` are reported as a
+    ``click.ClickException``.
+    """
+
+    app_paths = resolve_app_paths(environment=runtime_from_context(ctx).environment)
+    outcome = None
+    try:
+        if not dry_run:
+            outcome = import_document(app_paths, file_path)
+        else:
+            preview = plan_document_import(app_paths, file_path)
+            _print_document_import_plan(preview, dry_run=True)
+            click.echo("Database changed: no")
+            click.echo("Files changed: none")
+    except (OSError, DocumentImportError) as exc:
+        raise click.ClickException(str(exc)) from exc
+
+    # A dry run never produces an import result; its report is already printed.
+    if outcome is None:
+        return
+
+    _print_document_import_plan(outcome.plan, dry_run=False)
+    document_id = outcome.document.document_id
+    document_object_id = outcome.document_object.document_object_id
+    click.echo(f"Document ID: {document_id}")
+    click.echo(f"Document object ID: {document_object_id}")
+    click.echo(f"Duplicate: {_yes_no(outcome.duplicate)}")
+
+
 def _database_table_names(database_path) -> set[str]:
     if not database_path.exists():
         return set()
@@ -176,6 +221,15 @@ def _yes_no(value: bool) -> str:
     return "yes" if value else "no"
 
 
+def _print_document_import_plan(plan, *, dry_run: bool) -> None:
+    """Echo the summary of an import plan, one ``Label: value`` line per field.
+
+    Only the source file's name (not its full path) is printed.
+    """
+
+    report = (
+        ("Dry run", _yes_no(dry_run)),
+        ("File", plan.source_path.name),
+        ("SHA-256", plan.file_hash),
+        ("Size", f"{plan.byte_size} bytes"),
+        ("Media type", plan.media_type),
+        ("Canonical object", plan.canonical_relative_path),
+    )
+    for label, value in report:
+        click.echo(f"{label}: {value}")
+
+
 def _v2_storage_ready(paths) -> bool:
     return all(
         path.is_dir()

diff --git a/src/bankbuddy/bb/documents.py b/src/bankbuddy/bb/documents.py
@@ -0,0 +1,153 @@
+"""Generic document import services for the BankBuddy v2 model."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from hashlib import sha256
+import mimetypes
+from pathlib import Path
+import shutil
+
+from bankbuddy.bb.dao import FinancialIntelligenceDAO
+from bankbuddy.bb.records import DocumentCreate
+from bankbuddy.bb.records import DocumentObjectCreate
+from bankbuddy.bb.records import DocumentObjectRecord
+from bankbuddy.bb.records import DocumentRecord
+from bankbuddy.bb.storage import FinancialStorageDAO
+from bankbuddy.bb.storage import object_key_for_hash
+from bankbuddy.bb.storage import protect_managed_path
+from bankbuddy.bb.storage import resolve_storage_path
+from bankbuddy.database import connect_database
+from bankbuddy.database import initialize_database
+from bankbuddy.paths import AppPaths
+
+
+@dataclass(frozen=True)
+class DocumentImportPlan:
+    """Dry-run-safe plan for importing one document.
+
+    Built by ``plan_document_import`` from the source file alone; it carries
+    the values an import would record, without touching the database or the
+    storage directories.
+    """
+
+    # Source path after ``expanduser()``.
+    source_path: Path
+    # SHA-256 hex digest of the file contents.
+    file_hash: str
+    # File size in bytes as reported by ``stat()``.
+    byte_size: int
+    # Guessed from the file name; "application/octet-stream" when unknown.
+    media_type: str
+    # Storage key derived from the hash and the source file suffix.
+    object_key: str
+    # Display path of the form "financial/canonical/<object_key>".
+    canonical_relative_path: str
+
+
+@dataclass(frozen=True)
+class DocumentImportResult:
+    """Result of importing one document into v2 storage.
+
+    Returned by ``import_document`` after the database commit.
+    """
+
+    # The plan the import was executed from.
+    plan: DocumentImportPlan
+    # Document row matched by file hash, or created when none matched.
+    document: DocumentRecord
+    # Canonical object row matched by storage key, or created when none matched.
+    document_object: DocumentObjectRecord
+    # True only when both the document row and the object row already existed.
+    duplicate: bool
+
+
+class DocumentImportError(ValueError):
+    """Raised when a generic document import cannot be planned or completed.
+
+    Used when the source path is not a file and when a SHA-256 comparison
+    between the source and its canonical copy fails.
+    """
+
+
+def plan_document_import(paths: AppPaths, source_path: Path) -> DocumentImportPlan:
+    """Describe how ``source_path`` would be imported, without writing anything.
+
+    The file is hashed and measured, and its storage key is derived from the
+    hash and the file suffix. ``paths`` is accepted for symmetry with
+    ``import_document`` but is not consulted here.
+
+    Raises:
+        DocumentImportError: if the (user-expanded) path is not a regular file.
+    """
+
+    document_path = source_path.expanduser()
+    if document_path.is_file():
+        digest = hash_file(document_path)
+        storage_key = object_key_for_hash(digest, document_path.suffix)
+        size_in_bytes = document_path.stat().st_size
+        detected_type = guess_media_type(document_path)
+        return DocumentImportPlan(
+            source_path=document_path,
+            file_hash=digest,
+            byte_size=size_in_bytes,
+            media_type=detected_type,
+            object_key=storage_key,
+            canonical_relative_path=f"financial/canonical/{storage_key}",
+        )
+
+    raise DocumentImportError(f"Document file does not exist: {document_path}")
+
+
+def import_document(paths: AppPaths, source_path: Path) -> DocumentImportResult:
+    """Import one document into the v2 canonical object store.
+
+    The import plan is computed first, so an unusable source path fails before
+    the database is initialized. Inside a single connection the function then
+    finds or creates the document row by file hash, finds or creates the
+    canonical object row by storage key, makes sure the canonical file is
+    present with the expected hash, and commits.
+
+    Returns:
+        A ``DocumentImportResult`` whose ``duplicate`` flag is true only when
+        both the document row and the object row existed beforehand.
+
+    Raises:
+        DocumentImportError: if the source is not a file, if a freshly copied
+            object fails hash verification, or if an existing canonical file's
+            hash differs from the source hash.
+
+    File-system errors raised while hashing or copying are not caught here.
+    """
+
+    plan = plan_document_import(paths, source_path)
+    initialize_database(paths)
+
+    with connect_database(paths) as conn:
+        documents = FinancialIntelligenceDAO(conn)
+        storage = FinancialStorageDAO(conn)
+        # Documents are identified by content hash, so re-importing the same
+        # bytes reuses the existing row instead of creating another one.
+        document = documents.find_document_by_hash(plan.file_hash)
+        document_existed = document is not None
+        if document is None:
+            document = documents.create_document(
+                DocumentCreate(
+                    file_hash=plan.file_hash,
+                    original_file_name=plan.source_path.name,
+                )
+            )
+
+        document_object = storage.find_document_object(
+            storage_root_code="FINANCIAL_CANONICAL",
+            object_key=plan.object_key,
+        )
+        object_existed = document_object is not None
+        canonical_root = storage.get_storage_root("FINANCIAL_CANONICAL")
+        canonical_path = resolve_storage_path(paths, canonical_root, plan.object_key)
+
+        # Three cases: no object row yet (copy, then record it); row present
+        # but file missing (restore the file); row and file both present
+        # (the stored bytes must still hash to the planned value).
+        if document_object is None:
+            # NOTE(review): the file is copied before the row is inserted and
+            # committed. If the insert fails the copied file stays on disk;
+            # confirm a retry can overwrite it after protect_managed_path.
+            _copy_canonical_object(plan.source_path, canonical_path, plan.file_hash)
+            document_object = storage.create_document_object(
+                DocumentObjectCreate(
+                    document_id=document.document_id,
+                    storage_root_code="FINANCIAL_CANONICAL",
+                    object_key=plan.object_key,
+                    object_role="canonical",
+                    content_hash=plan.file_hash,
+                    byte_size=plan.byte_size,
+                    media_type=plan.media_type,
+                    original_file_name=plan.source_path.name,
+                )
+            )
+        elif not canonical_path.exists():
+            _copy_canonical_object(plan.source_path, canonical_path, plan.file_hash)
+        elif hash_file(canonical_path) != plan.file_hash:
+            # Raised before the commit below is reached.
+            # NOTE(review): presumably connect_database rolls back on error -
+            # confirm.
+            raise DocumentImportError(
+                f"Canonical object content mismatch: {plan.canonical_relative_path}"
+            )
+
+        conn.commit()
+
+    return DocumentImportResult(
+        plan=plan,
+        document=document,
+        document_object=document_object,
+        duplicate=document_existed and object_existed,
+    )
+
+
+def hash_file(path: Path) -> str:
+    """Compute the SHA-256 hex digest of the file at ``path``.
+
+    The file is read in 1 MiB blocks so large documents are never loaded into
+    memory in one piece.
+    """
+
+    block_size = 1024 * 1024
+    hasher = sha256()
+    with path.open("rb") as handle:
+        while True:
+            block = handle.read(block_size)
+            if not block:
+                break
+            hasher.update(block)
+    return hasher.hexdigest()
+
+
+def guess_media_type(path: Path) -> str:
+    """Guess a media type from the file name of ``path``.
+
+    Only the name is inspected, never the contents. Names that
+    ``mimetypes`` cannot classify map to ``application/octet-stream``.
+    """
+
+    guessed = mimetypes.guess_type(path.name)[0]
+    if guessed:
+        return guessed
+    return "application/octet-stream"
+
+
+def _copy_canonical_object(source_path: Path, canonical_path: Path, file_hash: str) -> None:
+    """Copy ``source_path`` to ``canonical_path`` and verify the copied bytes.
+
+    The copy is staged in a temporary file inside the destination directory
+    and moved into place with ``os.replace`` only after its SHA-256 digest
+    matches ``file_hash``. An interrupted or failed copy therefore never
+    leaves a truncated file under the content-addressed name, where later
+    imports would reject it as a content mismatch.
+
+    Raises:
+        DocumentImportError: if the staged copy does not hash to ``file_hash``.
+        OSError: if creating, copying, or renaming the file fails.
+    """
+
+    import os
+    import tempfile
+
+    canonical_path.parent.mkdir(parents=True, exist_ok=True)
+    # Stage next to the destination so the final rename stays on one
+    # filesystem, which is what lets os.replace swap the file in one step.
+    staging_fd, staging_name = tempfile.mkstemp(
+        prefix=f".{canonical_path.name}.",
+        suffix=".tmp",
+        dir=canonical_path.parent,
+    )
+    os.close(staging_fd)
+    staging_path = Path(staging_name)
+    try:
+        shutil.copy2(source_path, staging_path)
+        if hash_file(staging_path) != file_hash:
+            raise DocumentImportError(
+                f"Copied document hash did not match source: {source_path.name}"
+            )
+        os.replace(staging_path, canonical_path)
+    finally:
+        # After a successful replace the staging name is already gone; on any
+        # failure this removes the partial or unverified copy.
+        staging_path.unlink(missing_ok=True)
+    protect_managed_path(canonical_path)
diff --git a/src/bankbuddy/bb/storage.py b/src/bankbuddy/bb/storage.py
@@ -110,6 +110,40 @@ def create_document_object(
             **{**record.__dict__, "object_key": object_key},
         )
 
+    def find_document_object(
+        self,
+        *,
+        storage_root_code: str,
+        object_key: str,
+    ) -> DocumentObjectRecord | None:
+        """Look up a document object by its storage root code and object key.
+
+        The key is passed through ``validate_storage_key`` before it is used
+        in the query. Returns ``None`` when no row matches.
+        """
+
+        lookup_params = (storage_root_code, validate_storage_key(object_key))
+        match = self._conn.execute(
+            """
+            select
+                BB_DOCUMENT_OBJECT.document_object_id,
+                BB_DOCUMENT_OBJECT.document_id,
+                BB_STORAGE_ROOT.storage_root_code,
+                BB_DOCUMENT_OBJECT.object_key,
+                BB_DOCUMENT_OBJECT.object_role,
+                BB_DOCUMENT_OBJECT.content_hash,
+                BB_DOCUMENT_OBJECT.byte_size,
+                BB_DOCUMENT_OBJECT.media_type,
+                BB_DOCUMENT_OBJECT.original_file_name,
+                BB_DOCUMENT_OBJECT.storage_root_id
+            from BB_DOCUMENT_OBJECT
+            join BB_STORAGE_ROOT using (storage_root_id)
+            where
+                BB_STORAGE_ROOT.storage_root_code = ?
+                and BB_DOCUMENT_OBJECT.object_key = ?
+            """,
+            lookup_params,
+        ).fetchone()
+        return None if match is None else _document_object_from_row(match)
+
     def create_document_view(
         self,
         record: DocumentViewCreate,
@@ -238,3 +272,18 @@ def _storage_root_from_row(row: sqlite3.Row) -> StorageRootRecord:
         permissions_mode=str(row["permissions_mode"]),
         active=bool(row["active"]),
     )
+
+
+def _document_object_from_row(row: sqlite3.Row) -> DocumentObjectRecord:
+    """Build a ``DocumentObjectRecord`` from a joined document-object row.
+
+    Identifier columns are coerced to ``int`` and the key, role, root code and
+    hash columns to ``str``. Size, media type and original file name are
+    passed through exactly as stored.
+    """
+
+    id_columns = ("document_object_id", "document_id", "storage_root_id")
+    text_columns = ("storage_root_code", "object_key", "object_role", "content_hash")
+    raw_columns = ("byte_size", "media_type", "original_file_name")
+
+    fields = {column: int(row[column]) for column in id_columns}
+    fields.update({column: str(row[column]) for column in text_columns})
+    fields.update({column: row[column] for column in raw_columns})
+    return DocumentObjectRecord(**fields)