Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions dev-docs/architecture/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,11 @@ CLI entry point
`validate`
- `edit.py` contains the Phase 2 editing parser, JSON serialization helpers,
and wrappers around `exstruct.edit`
- `exstruct.__init__`, `exstruct.edit.__init__`, and lightweight CLI startup
paths must remain side-effect-free: `--help` and `ops` routing should defer
heavy extraction/edit implementation imports until command execution needs
them
- `exstruct.__init__`, `exstruct.edit.__init__`, `exstruct.engine`, and
lightweight CLI startup paths must remain side-effect-free where practical:
`--help` and `ops` routing should defer heavy extraction/edit implementation
imports until command execution needs them, and importing `exstruct.engine`
should not eagerly load extraction/render runtime dependencies

### edit/

Expand Down
213 changes: 201 additions & 12 deletions src/exstruct/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,207 @@
validate_libreoffice_extraction_request,
validate_libreoffice_process_request,
)
from .core import cells as _cells
from .core.cells import set_table_detection_params
from .core.integrate import extract_workbook
from .io import (
save_auto_page_break_views,
save_print_area_views,
save_sheets,
serialize_workbook,
)
from .models import SheetData, WorkbookData, convert_workbook_keys_to_alpha
from .render import export_pdf, export_sheet_images
from .models import SheetData, WorkbookData

# Extraction mode names accepted by the ``mode`` parameter of the
# ``extract_workbook`` proxy defined in this module.
ExtractionMode = Literal["light", "libreoffice", "standard", "verbose"]


def set_table_detection_params(
    *,
    table_score_threshold: float | None = None,
    density_min: float | None = None,
    coverage_min: float | None = None,
    min_nonempty_cells: int | None = None,
) -> None:
    """Forward table-detection settings to ``exstruct.core.cells``.

    The real implementation is imported on first call instead of at module
    import time, so importing this module does not load ``core.cells``.

    Args:
        table_score_threshold: Forwarded unchanged to the implementation.
        density_min: Forwarded unchanged to the implementation.
        coverage_min: Forwarded unchanged to the implementation.
        min_nonempty_cells: Forwarded unchanged to the implementation.
    """
    # Deferred import: keeps ``import exstruct.engine`` lightweight.
    from .core.cells import set_table_detection_params as _apply_params

    _apply_params(
        min_nonempty_cells=min_nonempty_cells,
        coverage_min=coverage_min,
        density_min=density_min,
        table_score_threshold=table_score_threshold,
    )


def extract_workbook(
    file_path: str | Path,
    mode: ExtractionMode = "standard",
    *,
    include_cell_links: bool | None = None,
    include_print_areas: bool | None = None,
    include_auto_page_breaks: bool = False,
    include_colors_map: bool | None = None,
    include_default_background: bool = False,
    ignore_colors: set[str] | None = None,
    include_formulas_map: bool | None = None,
    include_merged_cells: bool | None = None,
    include_merged_values_in_rows: bool = True,
) -> WorkbookData:
    """Delegate workbook extraction to ``exstruct.core.integrate``.

    The implementation module is imported only when this function runs, so
    importing this module does not load the extraction pipeline.

    Args:
        file_path: Workbook path, passed positionally to the implementation.
        mode: Extraction mode, forwarded as ``mode=``.
        include_cell_links: Forwarded unchanged.
        include_print_areas: Forwarded unchanged.
        include_auto_page_breaks: Forwarded unchanged.
        include_colors_map: Forwarded unchanged.
        include_default_background: Forwarded unchanged.
        ignore_colors: Forwarded unchanged.
        include_formulas_map: Forwarded unchanged.
        include_merged_cells: Forwarded unchanged.
        include_merged_values_in_rows: Forwarded unchanged.

    Returns:
        The ``WorkbookData`` produced by the implementation.
    """
    # Deferred import: keeps ``import exstruct.engine`` lightweight.
    from .core.integrate import extract_workbook as _extract

    workbook = _extract(
        file_path,
        mode=mode,
        include_cell_links=include_cell_links,
        include_print_areas=include_print_areas,
        include_auto_page_breaks=include_auto_page_breaks,
        include_colors_map=include_colors_map,
        include_default_background=include_default_background,
        ignore_colors=ignore_colors,
        include_formulas_map=include_formulas_map,
        include_merged_cells=include_merged_cells,
        include_merged_values_in_rows=include_merged_values_in_rows,
    )
    return workbook


def convert_workbook_keys_to_alpha(workbook: WorkbookData) -> WorkbookData:
    """Delegate workbook key conversion to ``exstruct.models``.

    Args:
        workbook: Workbook passed unchanged to the implementation.

    Returns:
        Whatever the ``exstruct.models`` implementation returns.
    """
    # Resolved at call time under a private alias to avoid shadowing this
    # wrapper's own name.
    from .models import convert_workbook_keys_to_alpha as _convert_keys

    converted = _convert_keys(workbook)
    return converted


def serialize_workbook(
    model: WorkbookData,
    fmt: Literal["json", "yaml", "yml", "toon"] = "json",
    *,
    pretty: bool = False,
    indent: int | None = None,
    include_backend_metadata: bool = False,
) -> str:
    """Delegate workbook serialization to ``exstruct.io``.

    ``exstruct.io`` is imported only when this function is called.

    Args:
        model: Workbook passed positionally to the implementation.
        fmt: Output format name, forwarded as ``fmt=``.
        pretty: Forwarded unchanged.
        indent: Forwarded unchanged.
        include_backend_metadata: Forwarded unchanged.

    Returns:
        The serialized text returned by the implementation.
    """
    # Deferred import: keeps ``import exstruct.engine`` lightweight.
    from .io import serialize_workbook as _serialize

    text = _serialize(
        model,
        fmt=fmt,
        pretty=pretty,
        indent=indent,
        include_backend_metadata=include_backend_metadata,
    )
    return text


def save_sheets(
    workbook: WorkbookData,
    output_dir: Path,
    fmt: Literal["json", "yaml", "yml", "toon"] = "json",
    *,
    pretty: bool = False,
    indent: int | None = None,
    include_backend_metadata: bool = False,
) -> dict[str, Path]:
    """Delegate per-sheet export to ``exstruct.io``.

    ``exstruct.io`` is imported only when this function is called.

    Args:
        workbook: Workbook passed positionally to the implementation.
        output_dir: Output directory, passed positionally.
        fmt: Output format name, forwarded as ``fmt=``.
        pretty: Forwarded unchanged.
        indent: Forwarded unchanged.
        include_backend_metadata: Forwarded unchanged.

    Returns:
        The mapping returned by the implementation.
    """
    # Deferred import: keeps ``import exstruct.engine`` lightweight.
    from .io import save_sheets as _save_sheets

    written = _save_sheets(
        workbook,
        output_dir,
        fmt=fmt,
        pretty=pretty,
        indent=indent,
        include_backend_metadata=include_backend_metadata,
    )
    return written


def save_print_area_views(
    workbook: WorkbookData,
    output_dir: Path,
    fmt: Literal["json", "yaml", "yml", "toon"] = "json",
    *,
    pretty: bool = False,
    indent: int | None = None,
    normalize: bool = False,
    include_shapes: bool = True,
    include_charts: bool = True,
    include_shape_size: bool = True,
    include_chart_size: bool = True,
    include_backend_metadata: bool = False,
) -> dict[str, Path]:
    """Delegate print-area view export to ``exstruct.io``.

    ``exstruct.io`` is imported only when this function is called.

    Args:
        workbook: Workbook passed positionally to the implementation.
        output_dir: Output directory, passed positionally.
        fmt: Output format name, forwarded as ``fmt=``.
        pretty: Forwarded unchanged.
        indent: Forwarded unchanged.
        normalize: Forwarded unchanged.
        include_shapes: Forwarded unchanged.
        include_charts: Forwarded unchanged.
        include_shape_size: Forwarded unchanged.
        include_chart_size: Forwarded unchanged.
        include_backend_metadata: Forwarded unchanged.

    Returns:
        The mapping returned by the implementation.
    """
    # Deferred import: keeps ``import exstruct.engine`` lightweight.
    from .io import save_print_area_views as _save_views

    written = _save_views(
        workbook,
        output_dir,
        fmt=fmt,
        pretty=pretty,
        indent=indent,
        normalize=normalize,
        include_shapes=include_shapes,
        include_charts=include_charts,
        include_shape_size=include_shape_size,
        include_chart_size=include_chart_size,
        include_backend_metadata=include_backend_metadata,
    )
    return written


def save_auto_page_break_views(
    workbook: WorkbookData,
    output_dir: Path,
    fmt: Literal["json", "yaml", "yml", "toon"] = "json",
    *,
    pretty: bool = False,
    indent: int | None = None,
    normalize: bool = False,
    include_shapes: bool = True,
    include_charts: bool = True,
    include_shape_size: bool = True,
    include_chart_size: bool = True,
    include_backend_metadata: bool = False,
) -> dict[str, Path]:
    """Delegate auto page-break view export to ``exstruct.io``.

    ``exstruct.io`` is imported only when this function is called.

    Args:
        workbook: Workbook passed positionally to the implementation.
        output_dir: Output directory, passed positionally.
        fmt: Output format name, forwarded as ``fmt=``.
        pretty: Forwarded unchanged.
        indent: Forwarded unchanged.
        normalize: Forwarded unchanged.
        include_shapes: Forwarded unchanged.
        include_charts: Forwarded unchanged.
        include_shape_size: Forwarded unchanged.
        include_chart_size: Forwarded unchanged.
        include_backend_metadata: Forwarded unchanged.

    Returns:
        The mapping returned by the implementation.
    """
    # Deferred import: keeps ``import exstruct.engine`` lightweight.
    from .io import save_auto_page_break_views as _save_views

    written = _save_views(
        workbook,
        output_dir,
        fmt=fmt,
        pretty=pretty,
        indent=indent,
        normalize=normalize,
        include_shapes=include_shapes,
        include_charts=include_charts,
        include_shape_size=include_shape_size,
        include_chart_size=include_chart_size,
        include_backend_metadata=include_backend_metadata,
    )
    return written


def export_pdf(excel_path: str | Path, output_pdf: str | Path) -> list[str]:
    """Delegate PDF rendering to ``exstruct.render``.

    ``exstruct.render`` is imported only when this function is called.

    Args:
        excel_path: Source workbook path, passed positionally.
        output_pdf: Destination PDF path, passed positionally.

    Returns:
        The list returned by the implementation.
    """
    # Deferred import: keeps ``import exstruct.engine`` lightweight.
    from .render import export_pdf as _render_pdf

    rendered = _render_pdf(excel_path, output_pdf)
    return rendered


def export_sheet_images(
    excel_path: str | Path,
    output_dir: str | Path,
    dpi: int = 144,
    *,
    sheet: str | None = None,
    a1_range: str | None = None,
) -> list[Path]:
    """Delegate sheet image rendering to ``exstruct.render``.

    ``exstruct.render`` is imported only when this function is called.

    Args:
        excel_path: Source workbook path, passed positionally.
        output_dir: Output directory, passed positionally.
        dpi: Forwarded as ``dpi=``.
        sheet: Forwarded unchanged.
        a1_range: Forwarded unchanged.

    Returns:
        The list of paths returned by the implementation.
    """
    # Deferred import: keeps ``import exstruct.engine`` lightweight.
    from .render import export_sheet_images as _render_images

    image_paths = _render_images(
        excel_path,
        output_dir,
        dpi=dpi,
        sheet=sheet,
        a1_range=a1_range,
    )
    return image_paths


class TableParams(TypedDict, total=False):
"""Table detection parameter overrides."""

Expand Down Expand Up @@ -240,7 +426,9 @@ def _table_params_scope(self) -> Iterator[None]:
if not self.options.table_params:
yield
return
prev = cast(TableParams, dict(_cells._DETECTION_CONFIG))
from .core import cells as cells_module

prev = cast(TableParams, dict(cells_module._DETECTION_CONFIG))
set_table_detection_params(**self.options.table_params)
try:
yield
Expand Down Expand Up @@ -496,6 +684,7 @@ def serialize(
use_fmt = fmt or self.output.format.fmt
use_pretty = self.output.format.pretty if pretty is None else pretty
use_indent = self.output.format.indent if indent is None else indent

return serialize_workbook(
filtered,
fmt=use_fmt,
Expand Down
70 changes: 70 additions & 0 deletions tests/cli/test_cli_lazy_imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,76 @@ def test_import_exstruct_stays_lightweight() -> None:
}


def test_import_engine_module_stays_lightweight() -> None:
    """Check that ``import exstruct.engine`` loads no heavy modules.

    The probe snippet reports, for each module of interest, whether it is
    present in ``sys.modules`` after the import. Only ``exstruct.engine``
    itself is expected to be loaded.
    """
    probe_source = """
import json
import sys
import exstruct.engine
print(json.dumps({
"engine": "exstruct.engine" in sys.modules,
"core_cells": "exstruct.core.cells" in sys.modules,
"core_integrate": "exstruct.core.integrate" in sys.modules,
"io": "exstruct.io" in sys.modules,
"render": "exstruct.render" in sys.modules,
"numpy": "numpy" in sys.modules,
"pandas": "pandas" in sys.modules,
"openpyxl": "openpyxl" in sys.modules,
"xlwings": "xlwings" in sys.modules,
"PIL": "PIL" in sys.modules,
}))
"""
    observed = _run_probe(probe_source)

    # Every module listed here must remain unimported by the probe.
    must_stay_unloaded = (
        "core_cells",
        "core_integrate",
        "io",
        "render",
        "numpy",
        "pandas",
        "openpyxl",
        "xlwings",
        "PIL",
    )
    expected = dict.fromkeys(must_stay_unloaded, False)
    expected["engine"] = True

    assert observed == expected


def test_import_public_engine_export_stays_lightweight() -> None:
    """Check that ``from exstruct import ExStructEngine`` stays lightweight.

    The probe snippet reports, for each module of interest, whether it is
    present in ``sys.modules`` after the import. Only ``exstruct.engine``
    itself is expected to be loaded.
    """
    probe_source = """
import json
import sys
from exstruct import ExStructEngine
print(json.dumps({
"engine": "exstruct.engine" in sys.modules,
"core_cells": "exstruct.core.cells" in sys.modules,
"core_integrate": "exstruct.core.integrate" in sys.modules,
"io": "exstruct.io" in sys.modules,
"render": "exstruct.render" in sys.modules,
"numpy": "numpy" in sys.modules,
"pandas": "pandas" in sys.modules,
"openpyxl": "openpyxl" in sys.modules,
"xlwings": "xlwings" in sys.modules,
"PIL": "PIL" in sys.modules,
}))
"""
    observed = _run_probe(probe_source)

    # Every module listed here must remain unimported by the probe.
    must_stay_unloaded = (
        "core_cells",
        "core_integrate",
        "io",
        "render",
        "numpy",
        "pandas",
        "openpyxl",
        "xlwings",
        "PIL",
    )
    expected = dict.fromkeys(must_stay_unloaded, False)
    expected["engine"] = True

    assert observed == expected


def test_import_cli_main_does_not_load_edit_or_extraction_modules() -> None:
payload = _run_probe(
"""
Expand Down
Loading