Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions ryan_library/functions/misc_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import TypedDict
from pathlib import Path
from importlib import metadata
import re
from openpyxl.utils import get_column_letter
from openpyxl import Workbook
from openpyxl.worksheet.worksheet import Worksheet
Expand Down Expand Up @@ -101,6 +102,9 @@ class ExcelExporter:
set_column_widths: Apply specific column widths to a worksheet.
auto_adjust_column_widths: Automatically adjust column widths based on data."""

MAX_EXCEL_ROWS: int = 1_048_576  # Excel worksheet hard limit: 2**20 rows per sheet
MAX_EXCEL_COLUMNS: int = 16_384  # Excel worksheet hard limit: 2**14 columns (column "XFD")

def export_dataframes(
self,
export_dict: dict[str, ExportContent],
Expand Down Expand Up @@ -155,6 +159,20 @@ def export_dataframes(
f"For file '{file_name}', the number of dataframes ({len(dataframes)}) and sheets ({len(sheets)}) must match."
)

if self._exceeds_excel_limits(dataframes=dataframes):
logging.warning(
"Data for '%s' exceeds Excel size limits. Exporting to Parquet and CSV instead.",
file_name,
)
self._export_as_parquet_and_csv(
file_name=file_name,
dataframes=dataframes,
sheets=sheets,
datetime_string=datetime_string,
output_directory=output_directory,
)
continue

# Determine the export path
export_filename: str = f"{datetime_string}_{file_name}.xlsx"
export_path: Path = (
Expand Down Expand Up @@ -208,6 +226,86 @@ def export_dataframes(
logging.error(f"Failed to write to '{export_path}': {e}")
raise

def _exceeds_excel_limits(self, dataframes: list[pd.DataFrame]) -> bool:
"""Return True if any dataframe exceeds Excel's size limits."""

for df in dataframes:
num_data_rows: int = len(df.index)
num_columns: int = len(df.columns)
header_rows: int = df.columns.nlevels if num_columns > 0 else 0
total_rows: int = num_data_rows + header_rows

if total_rows > self.MAX_EXCEL_ROWS or num_columns > self.MAX_EXCEL_COLUMNS:
logging.debug(
"Dataframe size rows=%s (including %s header rows) columns=%s exceeds Excel limits (rows=%s, columns=%s).",
total_rows,
header_rows,
num_columns,
self.MAX_EXCEL_ROWS,
self.MAX_EXCEL_COLUMNS,
)
return True
return False

def _export_as_parquet_and_csv(
    self,
    file_name: str,
    dataframes: list[pd.DataFrame],
    sheets: list[str],
    datetime_string: str,
    output_directory: Path | None,
) -> None:
    """Export dataframes to Parquet and CSV files when Excel limits are exceeded.

    Args:
        file_name: Logical export name embedded in each output filename.
        dataframes: Dataframes to export, paired positionally with ``sheets``.
        sheets: Sheet names; each is sanitized and embedded in the filenames.
        datetime_string: Timestamp prefix for the output filenames.
        output_directory: Destination directory, or ``None`` to write
            relative to the current working directory.
    """

    export_targets: list[tuple[pd.DataFrame, str, Path, Path]] = []
    # Sanitizing can map distinct sheet names onto the same string (e.g.
    # "Sales#1" and "Sales@1" both become "Sales_1"). Track names already
    # used and append a numeric suffix on collision so a later sheet never
    # silently overwrites an earlier sheet's Parquet/CSV files.
    used_names: set[str] = set()

    for df, sheet in zip(dataframes, sheets):
        sanitized_sheet: str = self._sanitize_name(sheet)
        unique_sheet: str = sanitized_sheet
        suffix: int = 1
        while unique_sheet in used_names:
            suffix += 1
            unique_sheet = f"{sanitized_sheet}_{suffix}"
        used_names.add(unique_sheet)

        base_filename: str = f"{datetime_string}_{file_name}_{unique_sheet}"

        parquet_path: Path = self._build_output_path(
            base_filename=f"{base_filename}.parquet", output_directory=output_directory
        )
        csv_path: Path = self._build_output_path(
            base_filename=f"{base_filename}.csv", output_directory=output_directory
        )

        # Parquet and CSV share the same parent directory; one mkdir covers both.
        parquet_path.parent.mkdir(parents=True, exist_ok=True)
        export_targets.append((df, sheet, parquet_path, csv_path))

    for df, sheet, parquet_path, _ in export_targets:
        try:
            df.to_parquet(path=parquet_path, index=False)
            logging.info("Exported Parquet to %s", parquet_path)
        except (ImportError, ValueError) as exc:
            # Parquet support requires an optional engine; report and fall
            # through so the CSV pass below still produces usable output.
            message: str = (
                "Unable to export Parquet for "
                f"'{file_name}' sheet '{sheet}': {exc}. Install pyarrow or fastparquet."
            )
            logging.error(message)
            print(message)
        except Exception as exc:  # pragma: no cover - unforeseen errors should be logged
            logging.exception(
                "Unexpected error during Parquet export for '%s' sheet '%s': %s", file_name, sheet, exc
            )

    for df, sheet, _, csv_path in export_targets:
        df.to_csv(path_or_buf=csv_path, index=False)
        logging.info("Exported CSV to %s", csv_path)

def _build_output_path(self, base_filename: str, output_directory: Path | None) -> Path:
"""Create the full output path for a file name."""

if output_directory is not None:
return output_directory / base_filename
return Path(base_filename)

def _sanitize_name(self, value: str) -> str:
"""Return a filesystem-friendly version of the provided value."""

sanitized: str = re.sub(pattern=r"[^A-Za-z0-9_-]+", repl="_", string=value).strip("_")
return sanitized or "Sheet"

def save_to_excel(
self,
data_frame: pd.DataFrame,
Expand Down