Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .ai-context/COMMANDS.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,16 @@ Run the v2 financial intelligence CLI with:
bb --help
bb init
bb status
bb documents import --dry-run --file path/to/document.pdf
bb documents import --file path/to/document.pdf
```

`bb` is the side-by-side command surface for new `BB_` schema work. It should
grow new document/entity/observation workflows while the legacy command
surfaces remain available. Use `bb init` to apply current migrations and
prepare v2 `financial/` storage roots for the active data home.
`bb documents import` is parser-free: it hashes the file named by `--file`,
records v2 document/object metadata, and copies the file into managed storage
as the canonical object.

## BankBuddy CLI

Expand Down
5 changes: 4 additions & 1 deletion .ai-context/STATUS.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ section is `Unreleased`.
- Banking CLI commands for banks, accounts, statement refs, imports,
transactions, categories, reports, exports, storage migration, and status.
- Side-by-side `bb` CLI for v2 financial intelligence initialization,
foundation status, storage readiness, and `BB_` schema visibility.
foundation status, storage readiness, generic document import, and `BB_`
schema visibility.
- Supported banking imports for Bank of America PDF/CSV, Apple Card PDF, ICICI
`.xls`, and HDFC `.xls`.
- Statement inventory and statement coverage auditing.
Expand All @@ -32,6 +33,8 @@ section is `Unreleased`.
leaving existing `bankbuddy` and `taxbuddy` commands in place.
- `bb init` for applying migrations and preparing v2 financial storage roots
without depending on the legacy `bankbuddy` command surface.
- Generic `bb documents import` for parser-free explicit-file intake into
`BB_DOCUMENT` and `BB_DOCUMENT_OBJECT` with SHA-256 canonical storage keys.
- Prospective relicensing from MIT to AGPL-3.0-or-later.
- Canonical data-home layout with `database/`, `bank/`, and `tax/` directories.
- First TaxBuddy CLI slice and `tax_documents` metadata index.
Expand Down
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ and versions are tracked in the repo-root `VERSION` file.

### Added

- Added `bb documents import --dry-run --file ...` and
`bb documents import --file ...` for parser-free v2 document intake with
SHA-256 canonical object storage and idempotent duplicate handling.
- Added `bb init` and v2 storage readiness output in `bb status`, so the v2
command surface can initialize migrations and financial storage roots
independently of the legacy `bankbuddy` CLI.
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ Activate the project shell once, then run the CLI directly:
basectl activate bankbuddy
bb init
bb status
bb documents import --dry-run --file path/to/document.pdf
bb documents import --file path/to/document.pdf
bankbuddy --help
bankbuddy status
bankbuddy init
Expand Down Expand Up @@ -164,6 +166,11 @@ Use `bb init` for the v2 financial intelligence foundation. It applies the
current SQLite migrations and prepares the `financial/` storage roots used by
document-first workflows.

Use `bb documents import` for parser-free v2 document intake. It hashes the
file, records a `BB_DOCUMENT`, stores a canonical object under
`financial/canonical`, and leaves bank-specific parsing or inference for later
workflows.

Switch the current shell by exporting `BANKBUDDY_ENV`:

```bash
Expand Down
54 changes: 54 additions & 0 deletions src/bankbuddy/bb/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@

from __future__ import annotations

from pathlib import Path
import sqlite3

import click

from bankbuddy import __version__
from bankbuddy.database import initialize_database
from bankbuddy.bb.documents import DocumentImportError
from bankbuddy.bb.documents import import_document
from bankbuddy.bb.documents import plan_document_import
from bankbuddy.paths import resolve_app_paths
from bankbuddy.bb.storage import ensure_financial_storage_dirs
from bankbuddy.runtime import CliRuntime
Expand Down Expand Up @@ -160,6 +164,47 @@ def init_command(ctx: click.Context) -> None:
click.echo(f"Initialized BankBuddy v2 at {paths.root}")


# Click renders the docstring below as the help text of this command group,
# so it is user-facing output rather than internal documentation.
@main.group()
def documents() -> None:
    """Manage v2 documents."""


@documents.command("import")
@click.option("--dry-run", is_flag=True, help="Plan the import without writes.")
@click.option(
    "--file",
    "file_path",
    required=True,
    type=click.Path(dir_okay=False, path_type=Path),
    help="Document file to import.",
)
@click.pass_context
def documents_import(
    ctx: click.Context,
    dry_run: bool,
    file_path: Path,
) -> None:
    """Import one document into v2 canonical storage."""

    # The docstring above is rendered by Click as the command's --help text,
    # so implementation notes are kept in comments.
    #
    # ctx:       Click context used to recover the CLI runtime.
    # dry_run:   when set, only the import plan is computed and printed.
    # file_path: the document to plan or import.
    #
    # Raises click.ClickException when planning or importing fails.
    runtime = runtime_from_context(ctx)
    paths = resolve_app_paths(environment=runtime.environment)
    try:
        if dry_run:
            plan = plan_document_import(paths, file_path)
            _print_document_import_plan(plan, dry_run=True)
            click.echo("Database changed: no")
            click.echo("Files changed: none")
            return
        result = import_document(paths, file_path)
    except (OSError, sqlite3.Error, DocumentImportError) as exc:
        # import_document initializes and writes the SQLite database, so
        # database failures are reported the same way as filesystem and
        # import errors instead of surfacing as a raw traceback.
        raise click.ClickException(str(exc)) from exc

    _print_document_import_plan(result.plan, dry_run=False)
    click.echo(f"Document ID: {result.document.document_id}")
    click.echo(f"Document object ID: {result.document_object.document_object_id}")
    click.echo(f"Duplicate: {_yes_no(result.duplicate)}")


def _database_table_names(database_path) -> set[str]:
if not database_path.exists():
return set()
Expand All @@ -176,6 +221,15 @@ def _yes_no(value: bool) -> str:
return "yes" if value else "no"


def _print_document_import_plan(plan, *, dry_run: bool) -> None:
    """Echo the summary lines shared by dry-run and real document imports.

    Prints, in order: the dry-run flag, the source file name, the SHA-256
    hash, the size in bytes, the media type, and the canonical object path,
    all read from ``plan``.
    """
    summary_lines = (
        f"Dry run: {_yes_no(dry_run)}",
        f"File: {plan.source_path.name}",
        f"SHA-256: {plan.file_hash}",
        f"Size: {plan.byte_size} bytes",
        f"Media type: {plan.media_type}",
        f"Canonical object: {plan.canonical_relative_path}",
    )
    for summary_line in summary_lines:
        click.echo(summary_line)


def _v2_storage_ready(paths) -> bool:
return all(
path.is_dir()
Expand Down
153 changes: 153 additions & 0 deletions src/bankbuddy/bb/documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
"""Generic document import services for the BankBuddy v2 model."""

from __future__ import annotations

from dataclasses import dataclass
from hashlib import sha256
import mimetypes
from pathlib import Path
import shutil

from bankbuddy.bb.dao import FinancialIntelligenceDAO
from bankbuddy.bb.records import DocumentCreate
from bankbuddy.bb.records import DocumentObjectCreate
from bankbuddy.bb.records import DocumentObjectRecord
from bankbuddy.bb.records import DocumentRecord
from bankbuddy.bb.storage import FinancialStorageDAO
from bankbuddy.bb.storage import object_key_for_hash
from bankbuddy.bb.storage import protect_managed_path
from bankbuddy.bb.storage import resolve_storage_path
from bankbuddy.database import connect_database
from bankbuddy.database import initialize_database
from bankbuddy.paths import AppPaths


@dataclass(frozen=True)
class DocumentImportPlan:
    """Immutable description of what importing one document would store.

    Instances only carry values; building one performs no writes, which is
    what makes the plan usable for dry runs.
    """

    # Path of the file to import.
    source_path: Path
    # Hash of the file contents.
    file_hash: str
    # Size of the file in bytes.
    byte_size: int
    # Media type recorded for the file.
    media_type: str
    # Key of the canonical object inside its storage root.
    object_key: str
    # Display path of the canonical object, relative to the data home.
    canonical_relative_path: str


@dataclass(frozen=True)
class DocumentImportResult:
    """Immutable outcome of one document import into v2 storage."""

    # The plan the import was executed from.
    plan: DocumentImportPlan
    # The document row associated with the imported file.
    document: DocumentRecord
    # The document object row associated with the imported file.
    document_object: DocumentObjectRecord
    # Whether the import was a repeat of an earlier one.
    duplicate: bool


class DocumentImportError(ValueError):
    """Signals that planning or completing a generic document import failed."""


def plan_document_import(paths: AppPaths, source_path: Path) -> DocumentImportPlan:
    """Build the import plan for one file without creating directories or rows.

    The source path has ``~`` expanded, is hashed and measured in place, and
    its object key is derived from the hash and the file suffix. ``paths`` is
    accepted but not read by this function.

    Raises:
        DocumentImportError: if the expanded path is not an existing file.
    """

    candidate = source_path.expanduser()
    if not candidate.is_file():
        raise DocumentImportError(f"Document file does not exist: {candidate}")

    digest = hash_file(candidate)
    key = object_key_for_hash(digest, candidate.suffix)
    size_in_bytes = candidate.stat().st_size
    detected_media_type = guess_media_type(candidate)
    relative_path = f"financial/canonical/{key}"

    return DocumentImportPlan(
        source_path=candidate,
        file_hash=digest,
        byte_size=size_in_bytes,
        media_type=detected_media_type,
        object_key=key,
        canonical_relative_path=relative_path,
    )


def import_document(paths: AppPaths, source_path: Path) -> DocumentImportResult:
    """Import one file into the v2 canonical object store and record it.

    Steps, in order:

    1. Plan the import and initialize the database.
    2. Find the document row by file hash, creating it when absent.
    3. Find the canonical document object row by object key.
       - No row: copy the file into canonical storage, then create the row.
       - Row but no file on disk: copy the file again.
       - Row and file, but the stored file hashes differently: raise.
    4. Commit.

    ``duplicate`` in the result is true only when both the document row and
    the document object row already existed.

    Raises:
        DocumentImportError: if the source cannot be planned, a copied file
            does not match the planned hash, or an existing canonical object
            has different content.
    """

    canonical_root_code = "FINANCIAL_CANONICAL"

    plan = plan_document_import(paths, source_path)
    initialize_database(paths)

    with connect_database(paths) as conn:
        document_dao = FinancialIntelligenceDAO(conn)
        storage_dao = FinancialStorageDAO(conn)

        existing_document = document_dao.find_document_by_hash(plan.file_hash)
        if existing_document is None:
            new_document = DocumentCreate(
                file_hash=plan.file_hash,
                original_file_name=plan.source_path.name,
            )
            document = document_dao.create_document(new_document)
        else:
            document = existing_document

        existing_object = storage_dao.find_document_object(
            storage_root_code=canonical_root_code,
            object_key=plan.object_key,
        )
        canonical_root = storage_dao.get_storage_root(canonical_root_code)
        canonical_path = resolve_storage_path(paths, canonical_root, plan.object_key)

        if existing_object is not None:
            document_object = existing_object
            if not canonical_path.exists():
                # The row is known but the stored file is gone: restore it.
                _copy_canonical_object(plan.source_path, canonical_path, plan.file_hash)
            elif hash_file(canonical_path) != plan.file_hash:
                raise DocumentImportError(
                    f"Canonical object content mismatch: {plan.canonical_relative_path}"
                )
        else:
            # Copy first so the row is only created for a verified file.
            _copy_canonical_object(plan.source_path, canonical_path, plan.file_hash)
            new_object = DocumentObjectCreate(
                document_id=document.document_id,
                storage_root_code=canonical_root_code,
                object_key=plan.object_key,
                object_role="canonical",
                content_hash=plan.file_hash,
                byte_size=plan.byte_size,
                media_type=plan.media_type,
                original_file_name=plan.source_path.name,
            )
            document_object = storage_dao.create_document_object(new_object)

        conn.commit()

    already_imported = existing_document is not None and existing_object is not None
    return DocumentImportResult(
        plan=plan,
        document=document,
        document_object=document_object,
        duplicate=already_imported,
    )


def hash_file(path: Path) -> str:
    """Compute the SHA-256 hex digest of a local file.

    The file is read in 1 MiB chunks so large documents are never loaded
    into memory at once.
    """

    chunk_size = 1024 * 1024
    hasher = sha256()
    with path.open("rb") as stream:
        while block := stream.read(chunk_size):
            hasher.update(block)
    return hasher.hexdigest()


def guess_media_type(path: Path) -> str:
    """Guess a media type from the file name of a local document path.

    Falls back to ``application/octet-stream`` when no type can be guessed
    from the name.
    """

    guessed = mimetypes.guess_type(path.name)[0]
    if not guessed:
        return "application/octet-stream"
    return guessed


def _copy_canonical_object(source_path: Path, canonical_path: Path, file_hash: str) -> None:
    """Copy a source file to its canonical location and verify the copy.

    Missing parent directories are created first. The copy is re-hashed; when
    it matches ``file_hash`` the path is handed to ``protect_managed_path``,
    otherwise the copy is removed.

    Raises:
        DocumentImportError: if the copied file's hash differs from
            ``file_hash``.
    """
    canonical_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(source_path, canonical_path)

    copied_hash = hash_file(canonical_path)
    if copied_hash == file_hash:
        protect_managed_path(canonical_path)
        return

    # Do not leave an object in canonical storage that failed verification.
    canonical_path.unlink(missing_ok=True)
    raise DocumentImportError(
        f"Copied document hash did not match source: {source_path.name}"
    )
49 changes: 49 additions & 0 deletions src/bankbuddy/bb/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,40 @@ def create_document_object(
**{**record.__dict__, "object_key": object_key},
)

def find_document_object(
    self,
    *,
    storage_root_code: str,
    object_key: str,
) -> DocumentObjectRecord | None:
    """Look up a single document object by storage root code and object key.

    The key is passed through ``validate_storage_key`` before it is used in
    the query. Returns ``None`` when no row matches.
    """

    lookup_key = validate_storage_key(object_key)
    # The SQL text is kept verbatim; only the surrounding Python is restyled.
    cursor = self._conn.execute(
        """
select
BB_DOCUMENT_OBJECT.document_object_id,
BB_DOCUMENT_OBJECT.document_id,
BB_STORAGE_ROOT.storage_root_code,
BB_DOCUMENT_OBJECT.object_key,
BB_DOCUMENT_OBJECT.object_role,
BB_DOCUMENT_OBJECT.content_hash,
BB_DOCUMENT_OBJECT.byte_size,
BB_DOCUMENT_OBJECT.media_type,
BB_DOCUMENT_OBJECT.original_file_name,
BB_DOCUMENT_OBJECT.storage_root_id
from BB_DOCUMENT_OBJECT
join BB_STORAGE_ROOT using (storage_root_id)
where
BB_STORAGE_ROOT.storage_root_code = ?
and BB_DOCUMENT_OBJECT.object_key = ?
""",
        (storage_root_code, lookup_key),
    )
    match = cursor.fetchone()
    return None if match is None else _document_object_from_row(match)

def create_document_view(
self,
record: DocumentViewCreate,
Expand Down Expand Up @@ -238,3 +272,18 @@ def _storage_root_from_row(row: sqlite3.Row) -> StorageRootRecord:
permissions_mode=str(row["permissions_mode"]),
active=bool(row["active"]),
)


def _document_object_from_row(row: sqlite3.Row) -> DocumentObjectRecord:
    """Convert one query row into a ``DocumentObjectRecord``.

    Identifier columns are coerced to ``int``, code/key/role/hash columns to
    ``str``, and the remaining columns are passed through unchanged.
    """
    integer_columns = ("document_object_id", "document_id", "storage_root_id")
    text_columns = ("storage_root_code", "object_key", "object_role", "content_hash")
    passthrough_columns = ("byte_size", "media_type", "original_file_name")

    fields = {column: int(row[column]) for column in integer_columns}
    fields.update({column: str(row[column]) for column in text_columns})
    fields.update({column: row[column] for column in passthrough_columns})
    return DocumentObjectRecord(**fields)
Loading
Loading