diff --git a/README.ja.md b/README.ja.md index 05e8470..9ac4633 100644 --- a/README.ja.md +++ b/README.ja.md @@ -82,7 +82,7 @@ exstruct input.xlsx -o out.json --pretty # 整形 JSON をファイルへ exstruct input.xlsx --format yaml # YAML(pyyaml が必要) exstruct input.xlsx --format toon # TOON(python-toon が必要) exstruct input.xlsx --sheets-dir sheets/ # シートごとに分割出力 -exstruct input.xlsx --auto-page-breaks-dir auto_areas/ # COM 限定(利用可能な環境のみ表示) +exstruct input.xlsx --auto-page-breaks-dir auto_areas/ # 常時表示。実行には standard/verbose + Excel COM が必要 exstruct input.xlsx --alpha-col # 列キーを A, B, ..., AA 形式で出力 exstruct input.xlsx --include-backend-metadata # shape/chart の backend metadata を含める exstruct input.xlsx --mode light # セル+テーブル候補のみ @@ -90,8 +90,8 @@ exstruct input.xlsx --mode libreoffice # COM なしで図形/コネクタ/ exstruct input.xlsx --pdf --image # PDF と PNG(Excel COM 必須) ``` -自動改ページ範囲の書き出しは API/CLI 両方に対応(Excel/COM が必要)し、CLI は利用可能な環境でのみ `--auto-page-breaks-dir` を表示します。 -`mode=libreoffice` では `--pdf` / `--image` / `--auto-page-breaks-dir` を早期エラーにし、これらの機能は `standard` または `verbose` + Excel COM を前提にします。 +自動改ページ範囲の書き出しは API/CLI 両方に対応(Excel/COM が必要)し、CLI では `--auto-page-breaks-dir` を常時表示したうえで実行時に検証します。 +`mode=libreoffice` では `--pdf` / `--image` / `--auto-page-breaks-dir` を早期エラーにし、`mode=light` でも `--auto-page-breaks-dir` を拒否します。これらの機能は `standard` または `verbose` + Excel COM を前提にします。 CLI の既定では列キーは従来どおり 0 始まりの数値文字列(`"0"`, `"1"`, ...)です。Excel 形式(`"A"`, `"B"`, ...)が必要な場合は `--alpha-col` を指定してください。 CLI の既定では shape/chart の `provenance` / `approximation_level` / `confidence` も出力しません。必要な場合は `--include-backend-metadata` を指定してください。 注意: MCP の `exstruct_extract` は `options.alpha_col=true` が既定で、CLI の既定(`false`)とは異なります。 diff --git a/README.md b/README.md index a59b358..885605a 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ exstruct input.xlsx -o out.json --pretty # write pretty JSON to a file exstruct input.xlsx --format yaml # YAML (requires pyyaml) exstruct input.xlsx --format toon # TOON (requires python-toon) exstruct input.xlsx --sheets-dir sheets/ # write one file per sheet -exstruct input.xlsx --auto-page-breaks-dir auto_areas/ # COM only; shown only when available +exstruct input.xlsx --auto-page-breaks-dir auto_areas/ # always shown; execution requires standard/verbose + Excel COM exstruct input.xlsx --alpha-col # output column keys as A, B, ..., AA exstruct input.xlsx --include-backend-metadata # include shape/chart backend metadata exstruct input.xlsx --mode light # cells + table candidates only @@ -101,8 +101,8 @@ exstruct input.xlsx --mode libreoffice # best-effort extraction of shapes/co exstruct input.xlsx --pdf --image # PDF and PNGs (Excel COM required) ``` -Auto page-break export is available from both the API and the CLI when Excel/COM is available. The CLI exposes `--auto-page-breaks-dir` only in COM-capable environments. -`mode=libreoffice` rejects `--pdf`, `--image`, and `--auto-page-breaks-dir` early. Use `standard` or `verbose` with Excel COM for those features. +Auto page-break export is available from both the API and the CLI when Excel/COM is available. The CLI always exposes `--auto-page-breaks-dir`, but validates it at execution time. +`mode=libreoffice` rejects `--pdf`, `--image`, and `--auto-page-breaks-dir` early, and `mode=light` also rejects `--auto-page-breaks-dir`. Use `standard` or `verbose` with Excel COM for those features. By default, the CLI keeps legacy 0-based numeric string column keys (`"0"`, `"1"`, ...). Use `--alpha-col` when you need Excel-style keys (`"A"`, `"B"`, ...). By default, serialized shape/chart output omits backend metadata (`provenance`, `approximation_level`, `confidence`) to reduce token usage. Use `--include-backend-metadata` or the corresponding Python/MCP option when you need it. Note: MCP `exstruct_extract` defaults to `options.alpha_col=true`, which differs from the CLI default (`false`). diff --git a/dev-docs/adr/ADR-0008-extraction-cli-runtime-capability-validation.md b/dev-docs/adr/ADR-0008-extraction-cli-runtime-capability-validation.md new file mode 100644 index 0000000..60a1f0e --- /dev/null +++ b/dev-docs/adr/ADR-0008-extraction-cli-runtime-capability-validation.md @@ -0,0 +1,75 @@ +# ADR-0008: Extraction CLI Runtime Capability Validation + +## Status + +`accepted` + +## Background + +The extraction CLI currently probes Excel COM availability while building its +argument parser so that `--auto-page-breaks-dir` is shown only on COM-capable +hosts. On Windows this probe may instantiate `xlwings.App()` and launch Excel +even for lightweight commands such as `exstruct --help`. + +This behavior creates two policy problems that are likely to recur: + +- parser construction performs host-dependent side effects and adds startup + latency before the user has requested any COM-only behavior +- help output changes by host capability even though the CLI syntax itself is + stable across hosts + +ExStruct already treats extraction-mode validation as part of the product +contract in ADR-0001, and treats rich-backend fallback as a runtime concern in +ADR-0002. We need a CLI-facing policy for how capability-gated extraction flags +should be exposed and validated without reintroducing startup probes. + +## Decision + +- Extraction CLI parser construction must remain side-effect-free and must not + probe COM or launch Excel. +- Capability-gated extraction flags may remain visible in help output when + their syntax is stable across hosts and their requirements can be validated at + execution time. +- `--auto-page-breaks-dir` is always exposed in extraction CLI help. +- `--auto-page-breaks-dir` is validated only when the user requests it: + - `mode="libreoffice"` keeps the existing invalid-combination validation + - `mode="light"` is rejected explicitly because auto page-break export + requires COM-backed `standard` or `verbose` + - `mode="standard"` / `mode="verbose"` require Excel COM and fail with an + explicit runtime error when COM is unavailable +- Ordinary extraction without requested COM-only side outputs keeps the existing + fallback policy from ADR-0002. + +## Consequences + +- `exstruct --help` and parser construction become faster and stop triggering + Excel startup side effects. +- CLI help becomes consistent across hosts because it documents syntax instead + of reflecting a startup-time environment probe. +- Users on unsupported hosts may still see `--auto-page-breaks-dir`, but they + now receive an actionable runtime error instead of hidden syntax or silent + export skipping. +- Future extraction CLI features that are host-capability-gated should prefer + execution-time validation over parser-time probing unless the syntax itself is + host-specific. + +## Rationale + +- Tests: + - `tests/cli/test_cli.py` +- Code: + - `src/exstruct/cli/main.py` + - `src/exstruct/cli/availability.py` +- Related specs: + - `docs/cli.md` + - `README.md` + - `README.ja.md` + - `dev-docs/specs/excel-extraction.md` + +## Supersedes + +- None + +## Superseded by + +- None diff --git a/dev-docs/adr/README.md b/dev-docs/adr/README.md index 07d34ff..77a7ac4 100644 --- a/dev-docs/adr/README.md +++ b/dev-docs/adr/README.md @@ -42,3 +42,4 @@ ADRs record what was decided, under which constraints, and which trade-offs were | `ADR-0005` | PathPolicy Safety Boundary | `accepted` | `safety` | | `ADR-0006` | Public Edit API and Host-Owned Safety Boundary | `accepted` | `editing` | | `ADR-0007` | Editing CLI as Public Operational Interface | `accepted` | `editing` | +| `ADR-0008` | Extraction CLI Runtime Capability Validation | `accepted` | `cli` | diff --git a/dev-docs/adr/decision-map.md b/dev-docs/adr/decision-map.md index a0e3b46..2620cc7 100644 --- a/dev-docs/adr/decision-map.md +++ b/dev-docs/adr/decision-map.md @@ -6,6 +6,7 @@ This document is a human-readable map for navigating ADRs by domain. - `ADR-0001` Extraction Mode Responsibility Boundaries (`accepted`) - `ADR-0002` Rich Backend Fallback Policy (`accepted`) +- `ADR-0008` Extraction CLI Runtime Capability Validation (`accepted`) ## mode @@ -33,6 +34,7 @@ This document is a human-readable map for navigating ADRs by domain. - `ADR-0003` Output Serialization Omission Policy (`accepted`) - `ADR-0004` Patch Backend Selection Policy (`accepted`) - `ADR-0007` Editing CLI as Public Operational Interface (`accepted`) +- `ADR-0008` Extraction CLI Runtime Capability Validation (`accepted`) ## mcp @@ -57,6 +59,7 @@ This document is a human-readable map for navigating ADRs by domain. ## cli - `ADR-0007` Editing CLI as Public Operational Interface (`accepted`) +- `ADR-0008` Extraction CLI Runtime Capability Validation (`accepted`) ## Supersession Relationships diff --git a/dev-docs/adr/index.yaml b/dev-docs/adr/index.yaml index 15089ad..e2fcd6b 100644 --- a/dev-docs/adr/index.yaml +++ b/dev-docs/adr/index.yaml @@ -102,3 +102,19 @@ adrs: - docs/api.md - README.md - README.ja.md + - id: ADR-0008 + title: Extraction CLI Runtime Capability Validation + status: accepted + path: dev-docs/adr/ADR-0008-extraction-cli-runtime-capability-validation.md + primary_domain: cli + domains: + - cli + - extraction + - compatibility + supersedes: [] + superseded_by: [] + related_specs: + - docs/cli.md + - README.md + - README.ja.md + - dev-docs/specs/excel-extraction.md diff --git a/dev-docs/specs/excel-extraction.md b/dev-docs/specs/excel-extraction.md index 18d6e70..4cf454d 100644 --- a/dev-docs/specs/excel-extraction.md +++ b/dev-docs/specs/excel-extraction.md @@ -56,6 +56,7 @@ What is extracted: - print_areas are retrieved via pre-com (openpyxl); COM only supplements missing parts - auto_page_breaks are retrieved via COM only +- The extraction CLI always exposes auto page-break export syntax and validates the required mode/runtime at execution time instead of probing COM during parser construction ## Colors Map diff --git a/docs/cli.md b/docs/cli.md index 687e650..8309870 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -120,6 +120,7 @@ empty workbook. | `--include-backend-metadata` | Include shape/chart backend metadata (`provenance`, `approximation_level`, `confidence`) in structured output. | | `--sheets-dir DIR` | Write one file per sheet (format follows `--format`). | | `--print-areas-dir DIR` | Write one file per print area (format follows `--format`). | +| `--auto-page-breaks-dir DIR` | Write one file per auto page-break area. The flag is always shown in help, but execution requires `--mode standard` or `--mode verbose` with Excel COM. | ## Common workflows @@ -157,6 +158,8 @@ exstruct sample.xlsx --pdf --image --dpi 144 -o out.json are usually the better fit. - On non-COM environments, prefer `--mode libreoffice` for best-effort rich extraction on `.xlsx/.xlsm`, or `--mode light` for minimal extraction. - `--mode libreoffice` is best-effort, not a strict subset of COM output. It does not render PDFs/PNGs and does not compute auto page-break areas in v1. +- `--auto-page-breaks-dir` is always shown in help output and is validated at execution time. - `--mode libreoffice` combined with `--pdf`, `--image`, or `--auto-page-breaks-dir` fails early with a configuration error instead of silently ignoring the option. +- `--mode light` also rejects `--auto-page-breaks-dir`; use `--mode standard` or `--mode verbose` with Excel COM for auto page-break export. - `--sheets-dir` and `--print-areas-dir` accept existing or new directories (created if missing). - `--alpha-col` switches row column keys from legacy numeric strings (`"0"`, `"1"`, ...) to Excel-style keys (`"A"`, `"B"`, ...). CLI default is disabled for backward compatibility. diff --git a/src/exstruct/cli/availability.py b/src/exstruct/cli/availability.py index fc4c6b7..0e7ec78 100644 --- a/src/exstruct/cli/availability.py +++ b/src/exstruct/cli/availability.py @@ -22,7 +22,7 @@ class ComAvailability(BaseModel): def get_com_availability() -> ComAvailability: - """Detect whether Excel COM is available for CLI features. + """Detect whether Excel COM is available for runtime CLI features. Returns: ComAvailability describing whether COM features can be used. diff --git a/src/exstruct/cli/main.py b/src/exstruct/cli/main.py index c3d66d3..5d9a25f 100644 --- a/src/exstruct/cli/main.py +++ b/src/exstruct/cli/main.py @@ -7,8 +7,9 @@ import sys from exstruct import process_excel -from exstruct.cli.availability import ComAvailability, get_com_availability +from exstruct.cli.availability import get_com_availability from exstruct.cli.edit import is_edit_subcommand, run_edit_cli +from exstruct.constraints import validate_libreoffice_process_request def _ensure_utf8_stdout() -> None: @@ -29,33 +30,21 @@ def _ensure_utf8_stdout() -> None: return -def _add_auto_page_breaks_argument( - parser: argparse.ArgumentParser, availability: ComAvailability -) -> None: - """Add auto page-break export option when COM is available.""" - if not availability.available: - return +def _add_auto_page_breaks_argument(parser: argparse.ArgumentParser) -> None: + """Add the auto page-break export option to the extraction CLI.""" parser.add_argument( "--auto-page-breaks-dir", type=Path, help=( "Optional directory to write one file per auto page-break area " - "(Excel COM only; not supported in libreoffice mode)." + "(format follows --format; requires --mode standard or " + "--mode verbose with Excel COM)." ), ) -def build_parser( - availability: ComAvailability | None = None, -) -> argparse.ArgumentParser: - """Build the CLI argument parser. - - Args: - availability: Optional COM availability for tests or overrides. - - Returns: - Configured argument parser. - """ +def build_parser() -> argparse.ArgumentParser: + """Build the CLI argument parser.""" parser = argparse.ArgumentParser( description="CLI for ExStruct extraction.", epilog=( @@ -130,10 +119,7 @@ def build_parser( type=Path, help="Optional directory to write one file per print area (format follows --format).", ) - resolved_availability = ( - availability if availability is not None else get_com_availability() - ) - _add_auto_page_breaks_argument(parser, resolved_availability) + _add_auto_page_breaks_argument(parser) parser.add_argument( "--alpha-col", action="store_true", @@ -150,6 +136,36 @@ def build_parser( return parser +def _validate_auto_page_breaks_request(args: argparse.Namespace) -> None: + """Validate runtime requirements for auto page-break export.""" + auto_page_breaks_dir = getattr(args, "auto_page_breaks_dir", None) + if auto_page_breaks_dir is None: + return + + message = ( + "--auto-page-breaks-dir requires --mode standard or --mode verbose " + "with Excel COM." + ) + if args.mode == "libreoffice": + validate_libreoffice_process_request( + args.input, + mode=args.mode, + include_auto_page_breaks=True, + pdf=args.pdf, + image=args.image, + ) + return + if args.mode == "light": + raise RuntimeError(message) + + availability = get_com_availability() + if availability.available: + return + + reason = f" Reason: {availability.reason}" if availability.reason else "" + raise RuntimeError(f"{message}{reason}") + + def main(argv: list[str] | None = None) -> int: """Run the CLI entrypoint. @@ -173,6 +189,7 @@ def main(argv: list[str] | None = None) -> int: return 0 try: + _validate_auto_page_breaks_request(args) process_excel( file_path=input_path, output_path=args.output, diff --git a/tasks/feature_spec.md b/tasks/feature_spec.md index fe3d38b..f4bb9dc 100644 --- a/tasks/feature_spec.md +++ b/tasks/feature_spec.md @@ -42,3 +42,128 @@ - `not-needed` - rationale: this was release preparation and task-log retention cleanup. The policy decisions already live in `ADR-0006`, `ADR-0007`, and the editing specs. + +## 2026-03-20 issue #107 extraction CLI startup optimization + +### Goal + +- Stop probing Excel COM availability while building the extraction CLI parser. +- Keep `--auto-page-breaks-dir` visible in help output on every host. +- Validate `--auto-page-breaks-dir` only when the user actually requests it at execution time. +- Return an explicit CLI error when auto page-break export is requested from an unsupported mode or unsupported runtime. + +### Public contract + +- `build_parser()` and `exstruct --help` must not call the COM availability probe. +- `--auto-page-breaks-dir` is always listed in extraction CLI help output. +- `--auto-page-breaks-dir` runtime behavior: + - `mode="libreoffice"` keeps the existing `ConfigError` path and combined-error precedence. + - `mode="light"` is rejected explicitly by the CLI with a message that auto page-break export requires `standard` or `verbose` with Excel COM. + - `mode="standard"` / `mode="verbose"` trigger COM availability probing only when the flag is present. + - When COM is unavailable, the CLI exits non-zero and prints an actionable message that names the flag and includes the availability reason when present. +- Existing `--pdf` / `--image` runtime behavior is unchanged in this task. + +### Permanent destinations + +- `dev-docs/adr/` + - `ADR-0008` records the extraction CLI policy change: runtime capability validation instead of parser-time environment probing. +- `docs/` + - `docs/cli.md` becomes the canonical public CLI contract for always-visible `--auto-page-breaks-dir` and runtime validation wording. +- `README.md` and `README.ja.md` + - Update quick-start examples and current-behavior prose so they no longer claim the flag is hidden on unsupported hosts. +- `dev-docs/specs/` + - `dev-docs/specs/excel-extraction.md` should keep the internal guarantee that auto page-break extraction is COM-only and tied to runtime validation. +- `tasks/feature_spec.md` and `tasks/todo.md` + - Retain only the temporary working record, verification, and migration notes for this issue. + +### Constraints + +- Do not broaden the task into a Python API or MCP contract change. +- Do not rewrite historical release notes that describe the old behavior at the time they shipped. +- Preserve existing file-not-found behavior and existing `libreoffice` combined-error precedence. + +### Verification plan + +- `tests/cli/test_cli.py` + - Help output always includes `--auto-page-breaks-dir`. + - Parser/help generation does not call `get_com_availability()`. + - `mode="light"` + `--auto-page-breaks-dir` fails with a clear runtime error. + - `mode="standard"` / `mode="verbose"` + `--auto-page-breaks-dir` + unavailable COM fails with a clear runtime error. + - Existing `libreoffice` rejection tests still pass. +- Targeted pytest for the CLI extraction test module. +- `uv run task build-docs` +- `uv run task precommit-run` + +### ADR verdict + +- `required` +- rationale: this changes the public extraction CLI contract for help visibility and execution-time validation of a COM-only flag, and it sets a reusable policy for future capability-gated CLI features. + +## 2026-03-20 issue #107 review follow-up: libreoffice auto page-break fast-fail + +### Goal + +- Align `mode="libreoffice"` handling for `--auto-page-breaks-dir` with the new CLI-side fast-fail policy. +- Preserve the existing LibreOffice single-error and combined-error precedence from the shared validator. +- Prove that the CLI rejects the invalid request before `process_excel()` runs. + +### Public contract + +- `--mode libreoffice --auto-page-breaks-dir ...` fails in the CLI layer without calling `process_excel()`. +- When `--pdf` or `--image` is also present, the CLI keeps the existing combined LibreOffice error message precedence. +- This follow-up does not change the already documented `standard` / `verbose` COM-runtime validation policy. + +### Permanent destinations + +- No new permanent destination is required beyond the documents already updated for issue `#107`. +- The durable contract remains in `docs/cli.md`, `README.md`, `README.ja.md`, `dev-docs/specs/excel-extraction.md`, and `dev-docs/adr/ADR-0008-extraction-cli-runtime-capability-validation.md`. + +### Constraints + +- Reuse the existing LibreOffice validator instead of duplicating message composition in the CLI. +- Keep parser/help no-probe behavior unchanged. + +### Verification plan + +- `tests/cli/test_cli.py` + - `mode="libreoffice"` + `--auto-page-breaks-dir` rejects before `process_excel()`. + - `mode="libreoffice"` + rendering + `--auto-page-breaks-dir` keeps the combined error and also rejects before `process_excel()`. +- Targeted pytest for `tests/cli/test_cli.py`. + +### ADR verdict + +- `not-needed` +- rationale: this is a corrective follow-up that aligns implementation with the already-recorded `ADR-0008` policy rather than creating a new architectural decision. + +## 2026-03-20 issue #107 review follow-up: wording and help-text clarity + +### Goal + +- Resolve the PR review wording nits in tracked documentation. +- Make the extraction CLI help text for `--auto-page-breaks-dir` match the runtime contract already documented elsewhere. + +### Public contract + +- The help text for `--auto-page-breaks-dir` states that it writes one file per auto page-break area, follows `--format`, and requires `--mode standard` or `--mode verbose` with Excel COM. +- This follow-up does not change runtime behavior; it only tightens wording and help-text clarity. + +### Permanent destinations + +- No new permanent destination is required. +- The durable wording lives in `src/exstruct/cli/main.py`, `dev-docs/adr/ADR-0008-extraction-cli-runtime-capability-validation.md`, and the existing issue `#107` task notes. + +### Constraints + +- Keep the change limited to wording/help-text clarity; do not change runtime validation or expand scope beyond the reviewed lines. + +### Verification plan + +- `tests/cli/test_cli.py` + - Help output still includes `--auto-page-breaks-dir`. + - Help output includes the clarified runtime-contract wording. +- `uv run pytest tests/cli/test_cli.py -q` + +### ADR verdict + +- `not-needed` +- rationale: this is a wording-only follow-up under the existing `ADR-0008` decision. diff --git a/tasks/todo.md b/tasks/todo.md index a1fb48d..32ed7d7 100644 --- a/tasks/todo.md +++ b/tasks/todo.md @@ -26,3 +26,95 @@ - `rg -n "0\.7\.0|v0\.7\.0" pyproject.toml uv.lock CHANGELOG.md mkdocs.yml docs/release-notes/v0.7.0.md` - `rg -n "^## " tasks/feature_spec.md tasks/todo.md` - `git diff --check -- CHANGELOG.md docs/release-notes/v0.7.0.md mkdocs.yml pyproject.toml uv.lock tasks/feature_spec.md tasks/todo.md dev-docs/architecture/overview.md` + +## 2026-03-20 issue #107 extraction CLI startup optimization + +### Planning + +- [x] Confirm issue `#107` details with `gh issue view 107`. +- [x] Read current extraction CLI code, related docs/specs, and relevant tests. +- [x] Classify ADR need for the public CLI contract change. +- [x] Add the issue `#107` working spec to `tasks/feature_spec.md`. +- [x] Add or update the ADR for extraction CLI runtime capability validation. +- [x] Refactor extraction CLI parser construction so it never probes COM availability. +- [x] Always register `--auto-page-breaks-dir` in extraction CLI help. +- [x] Add runtime validation for `--auto-page-breaks-dir` covering `light` mode and unavailable COM on `standard` / `verbose`. +- [x] Keep existing `libreoffice` rejection and combined-error precedence intact. +- [x] Update extraction CLI tests for no-probe help and runtime validation. +- [x] Update current user docs and internal specs for the new CLI contract. +- [x] Run targeted pytest for extraction CLI coverage. +- [x] Run `uv run task build-docs`. +- [x] Run `uv run task precommit-run`. +- [x] Review task/spec retention and record permanent destinations in the Review section. + +### Review + +- Extraction CLI parser construction is now side-effect free: `build_parser()` always registers `--auto-page-breaks-dir`, and COM probing happens only when that flag is requested at runtime from a supported mode. +- `mode="light"` now fails explicitly for `--auto-page-breaks-dir`, while `mode="libreoffice"` keeps the existing core validation and combined-error precedence. +- Permanent destinations: + - `dev-docs/adr/ADR-0008-extraction-cli-runtime-capability-validation.md` records the policy change and why parser-time probing is forbidden. + - `docs/cli.md`, `README.md`, and `README.ja.md` now describe the always-visible flag and execution-time validation contract. + - `dev-docs/specs/excel-extraction.md` now records the internal guarantee that the extraction CLI validates auto page-break export at runtime instead of parser construction. +- ADR checks: + - `adr-linter`: no high/medium/low findings for `ADR-0008`. + - `adr-reviewer`: `ready`, no findings. + - `adr-reconciler`: no policy drift, no stale references, two low-evidence findings addressed by adding `verbose` and `main(["--help"])` regression tests. + - `adr-indexer`: index artifacts were synchronized manually from the ADR source text (`README.md`, `index.yaml`, `decision-map.md`). +- Verification: + - `gh issue view 107 --json number,title,body,labels,assignees,state,url` + - `uv run pytest tests/cli/test_cli.py tests/cli/test_edit_cli.py -q` + - `uv run task build-docs` + - `uv run task precommit-run` + - `git diff --check` + +## 2026-03-20 issue #107 review follow-up: libreoffice auto page-break fast-fail + +### Planning + +- [x] Re-read the CLI review comment and compare it with `src/exstruct/cli/main.py` and the shared LibreOffice validator. +- [x] Update `tasks/feature_spec.md` with the follow-up contract and verification scope. +- [x] Make the CLI reject `--mode libreoffice --auto-page-breaks-dir` before `process_excel()`. +- [x] Preserve the existing combined LibreOffice error precedence when rendering flags are also present. +- [x] Add regression tests that prove the CLI rejects these requests before `process_excel()` runs. +- [x] Run targeted pytest for `tests/cli/test_cli.py`. +- [x] Update this Review section with the final validation result and retention decision. + +### Review + +- The review finding was valid: `_validate_auto_page_breaks_request()` treated `mode="light"` as a CLI-side fast-fail but let `mode="libreoffice"` fall through to the engine layer, which made responsibility and failure timing inconsistent. +- `src/exstruct/cli/main.py` now reuses `validate_libreoffice_process_request(...)` for `--auto-page-breaks-dir` in `mode="libreoffice"`, so the CLI rejects invalid requests before `process_excel()` while preserving the existing single-error and combined-error message precedence. +- `tests/cli/test_cli.py` now proves both `libreoffice + --auto-page-breaks-dir` and `libreoffice + --pdf + --auto-page-breaks-dir` fail before `process_excel()` runs, and that these paths do not probe COM availability. +- Retention decision: + - No new permanent document was needed. This follow-up only brought the implementation back into alignment with the policy already recorded in `ADR-0008`, `docs/cli.md`, `README.md`, `README.ja.md`, and `dev-docs/specs/excel-extraction.md`. + - The temporary working notes for this follow-up can stay limited to this section in `tasks/feature_spec.md` and `tasks/todo.md`. +- Verification: + - `uv run pytest tests/cli/test_cli.py -q` + - `uv run task precommit-run` + - `git diff --check` + +## 2026-03-20 issue #107 review follow-up: wording and help-text clarity + +### Planning + +- [x] Retrieve the PR review comments with `gh` and classify which findings are substantive. +- [x] Confirm the wording nits in `tasks/todo.md` and `dev-docs/adr/ADR-0008-extraction-cli-runtime-capability-validation.md`. +- [x] Clarify the `--auto-page-breaks-dir` help text in `src/exstruct/cli/main.py` so it matches the runtime contract. +- [x] Update CLI help tests for the clarified wording. +- [x] Run targeted pytest for `tests/cli/test_cli.py`. +- [x] Run `uv run task precommit-run`. +- [x] Record the review outcome and retention decision here. + +### Review + +- The explicit PR review findings were valid but minor: `tasks/todo.md` and `dev-docs/adr/ADR-0008-extraction-cli-runtime-capability-validation.md` each had a hyphenation nit (`low-evidence`, `side-effect-free`). +- A separate suppressed Copilot note about `--auto-page-breaks-dir` help text was also substantively valid: the old string mentioned only LibreOffice rejection and omitted that output files follow `--format`, while the actual runtime contract also rejects `light` and requires `standard`/`verbose` with Excel COM. +- `src/exstruct/cli/main.py` now states the fuller contract in the argument help text, and `tests/cli/test_cli.py` now checks for the clarified help wording without depending on exact `argparse` line wrapping. +- Retention decision: + - No new ADR or spec migration was needed. The durable contract remains in `ADR-0008`, `docs/cli.md`, and the README files; this follow-up only aligns wording and help text with that existing policy. + - The temporary working record can stay limited to this section in `tasks/feature_spec.md` and `tasks/todo.md`. +- Verification: + - `gh pr view 111 --json number,title,reviewDecision,reviews,comments,files,url` + - `gh api repos/harumiWeb/exstruct/pulls/111/comments` + - `uv run pytest tests/cli/test_cli.py -q` + - `uv run task precommit-run` + - `git diff --check` diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py index d35eb1d..786a1e6 100644 --- a/tests/cli/test_cli.py +++ b/tests/cli/test_cli.py @@ -372,12 +372,17 @@ def test_cli_libreoffice_rejects_auto_page_breaks_dir( ) -> None: """Verify that the CLI rejects auto page-break export in LibreOffice mode.""" + def _raise_process_excel(*_args: object, **_kwargs: object) -> None: + raise AssertionError("process_excel should not run for CLI-side rejection") + + def _raise_com_probe() -> ComAvailability: + raise AssertionError("LibreOffice validation should not probe COM") + + monkeypatch.setattr("exstruct.cli.main.process_excel", _raise_process_excel) + monkeypatch.setattr("exstruct.cli.main.get_com_availability", _raise_com_probe) + xlsx = _prepare_sample_excel(tmp_path) auto_dir = tmp_path / "auto" - monkeypatch.setattr( - "exstruct.cli.main.get_com_availability", - lambda: ComAvailability(available=True, reason=None), - ) result = _run_cli( [ @@ -399,12 +404,17 @@ def test_cli_libreoffice_rejects_rendering_and_auto_page_breaks( ) -> None: """Verify that the CLI LibreOffice rejects rendering and auto page breaks.""" + def _raise_process_excel(*_args: object, **_kwargs: object) -> None: + raise AssertionError("process_excel should not run for CLI-side rejection") + + def _raise_com_probe() -> ComAvailability: + raise AssertionError("LibreOffice validation should not probe COM") + + monkeypatch.setattr("exstruct.cli.main.process_excel", _raise_process_excel) + monkeypatch.setattr("exstruct.cli.main.get_com_availability", _raise_com_probe) + xlsx = _prepare_sample_excel(tmp_path) auto_dir = tmp_path / "auto" - monkeypatch.setattr( - "exstruct.cli.main.get_com_availability", - lambda: ComAvailability(available=True, reason=None), - ) result = _run_cli( [ @@ -425,20 +435,109 @@ def test_cli_libreoffice_rejects_rendering_and_auto_page_breaks( ) -def test_cli_parser_includes_auto_page_breaks_option() -> None: - """Ensure the auto page-breaks option is registered when COM is available.""" - availability = ComAvailability(available=True, reason=None) - parser = build_parser(availability=availability) +def test_cli_parser_always_includes_auto_page_breaks_option() -> None: + """Ensure the auto page-breaks option is always registered with clear help.""" + parser = build_parser() help_text = parser.format_help() assert "--auto-page-breaks-dir" in help_text + assert "format follows --format" in help_text + assert "requires --mode" in help_text + assert "standard or --mode verbose with Excel COM" in help_text + +def test_cli_parser_help_does_not_probe_com_availability( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Ensure help generation never triggers COM availability probing.""" + + def _raise() -> ComAvailability: + raise AssertionError("get_com_availability should not run during help setup") -def test_cli_parser_excludes_auto_page_breaks_option() -> None: - """Ensure the auto page-breaks option is hidden when COM is unavailable.""" - availability = ComAvailability(available=False, reason="disabled") - parser = build_parser(availability=availability) + monkeypatch.setattr("exstruct.cli.main.get_com_availability", _raise) + + parser = build_parser() help_text = parser.format_help() - assert "--auto-page-breaks-dir" not in help_text + assert "--auto-page-breaks-dir" in help_text + + +def test_cli_main_help_does_not_probe_com_availability( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Ensure the main help entrypoint never triggers COM availability probing.""" + + def _raise() -> ComAvailability: + raise AssertionError("get_com_availability should not run during --help") + + monkeypatch.setattr("exstruct.cli.main.get_com_availability", _raise) + + stdout_buffer = io.StringIO() + with redirect_stdout(stdout_buffer), pytest.raises(SystemExit) as exc_info: + cli_main(["--help"]) + + assert exc_info.value.code == 0 + assert "--auto-page-breaks-dir" in stdout_buffer.getvalue() + + +def test_cli_auto_page_breaks_rejects_light_mode( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """Verify that light mode rejects auto page-break export explicitly.""" + + def _raise() -> ComAvailability: + raise AssertionError("light mode validation should not probe COM") + + monkeypatch.setattr("exstruct.cli.main.get_com_availability", _raise) + + xlsx = _prepare_sample_excel(tmp_path) + auto_dir = tmp_path / "auto" + result = _run_cli( + [ + str(xlsx), + "--mode", + "light", + "--auto-page-breaks-dir", + str(auto_dir), + ] + ) + + assert result.returncode == 1 + combined_output = _stdout_text(result) + _stderr_text(result) + assert ( + "--auto-page-breaks-dir requires --mode standard or --mode verbose " + "with Excel COM." + ) in combined_output + + +@pytest.mark.parametrize("mode", ["standard", "verbose"]) # type: ignore[misc] +def test_cli_auto_page_breaks_requires_com_at_runtime( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, mode: str +) -> None: + """Verify that auto page-break export fails clearly when COM is unavailable.""" + + xlsx = _prepare_sample_excel(tmp_path) + auto_dir = tmp_path / "auto" + monkeypatch.setattr( + "exstruct.cli.main.get_com_availability", + lambda: ComAvailability(available=False, reason="Non-Windows platform."), + ) + + result = _run_cli( + [ + str(xlsx), + "--mode", + mode, + "--auto-page-breaks-dir", + str(auto_dir), + ] + ) + + assert result.returncode == 1 + combined_output = _stdout_text(result) + _stderr_text(result) + assert ( + "--auto-page-breaks-dir requires --mode standard or --mode verbose " + "with Excel COM." + ) in combined_output + assert "Reason: Non-Windows platform." in combined_output def test_CLI_stdout_is_utf8_with_cp932_env(tmp_path: Path) -> None: diff --git a/tests/cli/test_edit_cli.py b/tests/cli/test_edit_cli.py index 078b4f6..1fed5fc 100644 --- a/tests/cli/test_edit_cli.py +++ b/tests/cli/test_edit_cli.py @@ -9,7 +9,6 @@ from pydantic import BaseModel import pytest -from exstruct.cli.availability import ComAvailability import exstruct.cli.edit as edit_cli_module from exstruct.cli.edit import is_edit_subcommand import exstruct.cli.main as cli_main_module @@ -429,9 +428,7 @@ def _raise_os_error(_request: object) -> object: def test_extraction_help_mentions_editing_commands() -> None: - help_text = build_parser( - availability=ComAvailability(available=False, reason="test") - ).format_help() + help_text = build_parser().format_help() assert "Editing commands:" in help_text assert "exstruct patch --input book.xlsx --ops ops.json" in help_text