diff --git a/great-docs.yml b/great-docs.yml index bfffd875..792706f3 100644 --- a/great-docs.yml +++ b/great-docs.yml @@ -253,6 +253,7 @@ nav_icons: API Evolution: git-compare-arrows Versioned Docs: git-branch Color Swatches: pipette + Table Previews: table # Author Information # ------------------ diff --git a/great_docs/__init__.py b/great_docs/__init__.py index 3fd40fdd..da2a4bd2 100644 --- a/great_docs/__init__.py +++ b/great_docs/__init__.py @@ -8,6 +8,8 @@ except PackageNotFoundError: # pragma: no cover __version__ = "0.0.0" +from ._tbl_display import disable_tbl_preview, enable_tbl_preview +from ._tbl_preview import tbl_preview from .cli import main from .config import Config, create_default_config, load_config from .core import GreatDocs @@ -16,9 +18,12 @@ "Config", "GreatDocs", "create_default_config", + "disable_tbl_preview", + "enable_tbl_preview", "load_config", "main", "render_evolution_table", + "tbl_preview", ] diff --git a/great_docs/_tbl_display.py b/great_docs/_tbl_display.py new file mode 100644 index 00000000..ce185404 --- /dev/null +++ b/great_docs/_tbl_display.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +from typing import Any + + +def enable_tbl_preview(**kwargs: Any) -> None: + """Register `tbl_preview` as the default DataFrame display formatter. + + After calling this, any Polars or Pandas DataFrame that is the last expression in a cell (or + passed to `display()`) will be rendered as a `tbl_preview()` table instead of the library's + default HTML. + + Parameters + ---------- + **kwargs + Keyword arguments forwarded to `tbl_preview()` (e.g., `n_head=10`, `show_all=True`, + `show_dimensions=False`). + + Examples + -------- + In a notebook or `.qmd` file: + + .. 
code-block:: python + + import great_docs as gd + gd.enable_tbl_preview(n_head=8, n_tail=3) + + # Now any DataFrame displayed will use tbl_preview() automatically: + import pandas as pd + pd.read_csv("data.csv") # → rendered as a preview table + """ + try: + ip = _get_ipython() + except RuntimeError: + return + + from great_docs._tbl_preview import _is_pandas, _is_polars, tbl_preview + + def _tbl_preview_formatter(obj: Any) -> str | None: + if _is_polars(obj) or _is_pandas(obj): + return tbl_preview(obj, **kwargs).as_html() + return None + + html_formatter = ip.display_formatter.formatters["text/html"] # type: ignore[union-attr] + + # Register for known DataFrame types (if available) + try: + import polars as pl + + html_formatter.for_type(pl.DataFrame, _tbl_preview_formatter) + except ImportError: + pass + + try: + import pandas as pd + + html_formatter.for_type(pd.DataFrame, _tbl_preview_formatter) + except ImportError: + pass + + +def disable_tbl_preview() -> None: + """Remove the `tbl_preview` display formatter and restore defaults.""" + try: + ip = _get_ipython() + except RuntimeError: + return + + html_formatter = ip.display_formatter.formatters["text/html"] # type: ignore[union-attr] + + try: + import polars as pl + + html_formatter.pop(pl.DataFrame, None) + except ImportError: + pass + + try: + import pandas as pd + + html_formatter.pop(pd.DataFrame, None) + except ImportError: + pass + + +def _get_ipython() -> Any: + """Get the active IPython instance or raise RuntimeError.""" + try: + from IPython import get_ipython + + ip = get_ipython() + if ip is None: + raise RuntimeError("No active IPython session.") + return ip + except ImportError: + raise RuntimeError("IPython is not installed.") diff --git a/great_docs/_tbl_preview.py b/great_docs/_tbl_preview.py new file mode 100644 index 00000000..538f3867 --- /dev/null +++ b/great_docs/_tbl_preview.py @@ -0,0 +1,1230 @@ +from __future__ import annotations + +import html as _html_mod +import secrets +from 
pathlib import Path +from typing import Any + +# --------------------------------------------------------------------------- +# Public result class +# --------------------------------------------------------------------------- + + +class TblPreview: + """Rendered table preview with `_repr_html_()` support.""" + + def __init__(self, html: str) -> None: + self._html = html + + def _repr_html_(self) -> str: + return self._html + + def as_html(self) -> str: + """Return the raw HTML string.""" + return self._html + + def save(self, path: str | Path) -> None: + """Write the HTML to a file.""" + Path(path).write_text(self._html, encoding="utf-8") + + def __repr__(self) -> str: + return f"TblPreview({len(self._html)} chars)" + + +# --------------------------------------------------------------------------- +# Table-type badge colors (subset of Pointblank's TABLE_TYPE_STYLES) +# --------------------------------------------------------------------------- + +_TABLE_TYPE_STYLES: dict[str, dict[str, str]] = { + "polars": {"bg": "#0075FF", "fg": "#FFFFFF", "label": "Polars"}, + "pandas": {"bg": "#150458", "fg": "#FFFFFF", "label": "Pandas"}, + "csv": {"bg": "#FFF8E1", "fg": "#7A6200", "label": "CSV"}, + "tsv": {"bg": "#E8F5E9", "fg": "#2E7D32", "label": "TSV"}, + "jsonl": {"bg": "#E3F2FD", "fg": "#1565C0", "label": "JSONL"}, + "arrow": {"bg": "#E8EAF6", "fg": "#283593", "label": "Arrow"}, + "parquet": {"bg": "#F3E5F5", "fg": "#6A1B9A", "label": "Parquet"}, + "feather": {"bg": "#FFF3E0", "fg": "#E65100", "label": "Feather"}, + "dict": {"bg": "#F0F0F0", "fg": "#333333", "label": "Table"}, +} + +# --------------------------------------------------------------------------- +# Short dtype labels +# --------------------------------------------------------------------------- + +_POLARS_DTYPE_SHORT: dict[str, str] = { + "Int8": "i8", + "Int16": "i16", + "Int32": "i32", + "Int64": "i64", + "UInt8": "u8", + "UInt16": "u16", + "UInt32": "u32", + "UInt64": "u64", + "Float32": "f32", + 
"Float64": "f64", + "Boolean": "bool", + "String": "str", + "Utf8": "str", + "Date": "date", + "Datetime": "dtime", + "Time": "time", + "Duration": "dur", + "Categorical": "cat", + "Enum": "enum", + "Binary": "bin", + "Null": "null", + "Object": "obj", + "Decimal": "dec", +} + +_PANDAS_DTYPE_SHORT: dict[str, str] = { + "int8": "i8", + "int16": "i16", + "int32": "i32", + "int64": "i64", + "uint8": "u8", + "uint16": "u16", + "uint32": "u32", + "uint64": "u64", + "float16": "f16", + "float32": "f32", + "float64": "f64", + "bool": "bool", + "object": "str", + "string": "str", + "category": "cat", + "datetime64[ns]": "dtime", + "timedelta64[ns]": "dur", +} + +_ARROW_DTYPE_SHORT: dict[str, str] = { + "int8": "i8", + "int16": "i16", + "int32": "i32", + "int64": "i64", + "uint8": "u8", + "uint16": "u16", + "uint32": "u32", + "uint64": "u64", + "float": "f32", + "halffloat": "f16", + "float16": "f16", + "float32": "f32", + "double": "f64", + "float64": "f64", + "bool": "bool", + "string": "str", + "utf8": "str", + "large_string": "str", + "large_utf8": "str", + "binary": "bin", + "date32": "date", + "date32[day]": "date", + "date64": "date", + "timestamp[ns]": "dtime", + "timestamp[us]": "dtime", + "timestamp[ms]": "dtime", + "timestamp[s]": "dtime", + "time32[ms]": "time", + "time64[us]": "time", + "duration[ns]": "dur", + "duration[us]": "dur", + "null": "null", + "dictionary": "cat", + "decimal128": "dec", +} + + +def _arrow_dtype_short(dtype_str: str) -> str: + """Convert a PyArrow dtype string to a short label.""" + # Try exact match first, then strip parameterized parts + if dtype_str in _ARROW_DTYPE_SHORT: + return _ARROW_DTYPE_SHORT[dtype_str] + base = dtype_str.split("[")[0].split("(")[0].strip() + return _ARROW_DTYPE_SHORT.get(base, base[:4]) + + +# Character width approximation for IBM Plex Mono at 12px +_CHAR_PX = 7.2 +_LABEL_CHAR_PX = 7.8 +_COL_PADDING_PX = 16 +_MIN_COL_WIDTH = 50 + +# --------------------------------------------------------------------------- 
+# Data normalization +# --------------------------------------------------------------------------- + + +def _is_polars(data: Any) -> bool: + t = type(data).__module__ + return t.startswith("polars") + + +def _is_pandas(data: Any) -> bool: + t = type(data).__module__ + return t.startswith("pandas") + + +def _is_arrow(data: Any) -> bool: + t = type(data).__module__ + return t.startswith("pyarrow") + + +def _normalize_data( + data: Any, +) -> tuple[list[str], list[str], list[list[Any]], int, str]: + """Normalize input data to a common internal representation. + + Returns + ------- + tuple + (col_names, col_dtypes_short, all_rows, total_row_count, tbl_type) where `all_rows` is a + list of rows, each row a list of cell values. + """ + if isinstance(data, (str, Path)): + return _from_file(data) + if isinstance(data, dict): + return _from_dict(data) + if isinstance(data, list) and data and isinstance(data[0], dict): + return _from_list_of_dicts(data) + if _is_polars(data): + return _from_polars(data) + if _is_pandas(data): + return _from_pandas(data) + if _is_arrow(data): + return _from_arrow(data) + raise TypeError( + f"Unsupported data type: {type(data).__name__}. " + "Pass a Polars/Pandas DataFrame, PyArrow Table, file path, dict, " + "or list of dicts." 
+ ) + + +def _from_polars( + df: Any, +) -> tuple[list[str], list[str], list[list[Any]], int, str]: + col_names = df.columns + col_dtypes = [_polars_dtype_short(str(df[c].dtype)) for c in col_names] + n_rows = df.height if hasattr(df, "height") else len(df) + rows = df.rows() # list of tuples + return col_names, col_dtypes, [list(r) for r in rows], n_rows, "polars" + + +def _from_pandas( + df: Any, +) -> tuple[list[str], list[str], list[list[Any]], int, str]: + col_names = list(df.columns) + col_dtypes = [_pandas_dtype_short(str(df[c].dtype)) for c in col_names] + n_rows = len(df) + rows = df.values.tolist() + return col_names, col_dtypes, rows, n_rows, "pandas" + + +_FILE_EXT_MAP: dict[str, str] = { + ".csv": "csv", + ".tsv": "tsv", + ".tab": "tsv", + ".jsonl": "jsonl", + ".ndjson": "jsonl", + ".parquet": "parquet", + ".pq": "parquet", + ".feather": "feather", + ".arrow": "arrow", + ".ipc": "arrow", +} + + +def _from_file( + path: str | Path, +) -> tuple[list[str], list[str], list[list[Any]], int, str]: + """Dispatch to the correct reader based on file extension.""" + p = Path(path) + ext = p.suffix.lower() + fmt = _FILE_EXT_MAP.get(ext, "csv") # default to CSV for unknown extensions + if fmt == "csv": + return _from_csv(p) + if fmt == "tsv": + return _from_tsv(p) + if fmt == "jsonl": + return _from_jsonl(p) + if fmt == "parquet": + return _from_parquet(p) + if fmt in ("feather", "arrow"): + return _from_feather(p, fmt) + return _from_csv(p) + + +def _from_csv( + path: str | Path, +) -> tuple[list[str], list[str], list[list[Any]], int, str]: + path = str(path) + try: + import polars as pl + + df = pl.read_csv(path) + names, dtypes, rows, n, _ = _from_polars(df) + return names, dtypes, rows, n, "csv" + except ImportError: + pass + try: + import pandas as pd + + df = pd.read_csv(path) + names, dtypes, rows, n, _ = _from_pandas(df) + return names, dtypes, rows, n, "csv" + except ImportError: + pass + raise ImportError( + "Reading CSV files requires either Polars or 
Pandas. " + "Install one with: pip install polars (or) pip install pandas" + ) + + +def _from_tsv( + path: str | Path, +) -> tuple[list[str], list[str], list[list[Any]], int, str]: + path = str(path) + try: + import polars as pl + + df = pl.read_csv(path, separator="\t") + names, dtypes, rows, n, _ = _from_polars(df) + return names, dtypes, rows, n, "tsv" + except ImportError: + pass + try: + import pandas as pd + + df = pd.read_csv(path, sep="\t") + names, dtypes, rows, n, _ = _from_pandas(df) + return names, dtypes, rows, n, "tsv" + except ImportError: + pass + raise ImportError( + "Reading TSV files requires either Polars or Pandas. " + "Install one with: pip install polars (or) pip install pandas" + ) + + +def _from_jsonl( + path: str | Path, +) -> tuple[list[str], list[str], list[list[Any]], int, str]: + path = str(path) + try: + import polars as pl + + df = pl.read_ndjson(path) + names, dtypes, rows, n, _ = _from_polars(df) + return names, dtypes, rows, n, "jsonl" + except ImportError: + pass + try: + import pandas as pd + + df = pd.read_json(path, lines=True) + names, dtypes, rows, n, _ = _from_pandas(df) + return names, dtypes, rows, n, "jsonl" + except ImportError: + pass + raise ImportError( + "Reading JSONL files requires either Polars or Pandas. " + "Install one with: pip install polars (or) pip install pandas" + ) + + +def _from_parquet( + path: str | Path, +) -> tuple[list[str], list[str], list[list[Any]], int, str]: + path = str(path) + try: + import polars as pl + + df = pl.read_parquet(path) + names, dtypes, rows, n, _ = _from_polars(df) + return names, dtypes, rows, n, "parquet" + except ImportError: + pass + try: + import pandas as pd + + df = pd.read_parquet(path) + names, dtypes, rows, n, _ = _from_pandas(df) + return names, dtypes, rows, n, "parquet" + except ImportError: + pass + raise ImportError( + "Reading Parquet files requires Polars, Pandas+pyarrow, or " + "Pandas+fastparquet. 
Install one with: pip install polars (or) " + "pip install pandas pyarrow" + ) + + +def _from_feather( + path: str | Path, + fmt: str = "feather", +) -> tuple[list[str], list[str], list[list[Any]], int, str]: + path = str(path) + try: + import polars as pl + + df = pl.read_ipc(path) + names, dtypes, rows, n, _ = _from_polars(df) + return names, dtypes, rows, n, fmt + except ImportError: + pass + try: + import pandas as pd + + df = pd.read_feather(path) + names, dtypes, rows, n, _ = _from_pandas(df) + return names, dtypes, rows, n, fmt + except ImportError: + pass + raise ImportError( + "Reading Feather/Arrow IPC files requires Polars or Pandas+pyarrow. " + "Install one with: pip install polars (or) pip install pandas pyarrow" + ) + + +def _from_arrow( + tbl: Any, +) -> tuple[list[str], list[str], list[list[Any]], int, str]: + """Convert a PyArrow Table to the internal representation.""" + col_names = tbl.column_names + col_dtypes = [_arrow_dtype_short(str(tbl.field(c).type)) for c in col_names] + n_rows = tbl.num_rows + rows = [list(row.values()) for row in tbl.to_pylist()] + return col_names, col_dtypes, rows, n_rows, "arrow" + + +def _from_dict( + d: dict[str, list], +) -> tuple[list[str], list[str], list[list[Any]], int, str]: + col_names = list(d.keys()) + if not col_names: + return [], [], [], 0, "dict" + n_rows = len(d[col_names[0]]) + col_dtypes = [_infer_dtype(d[c]) for c in col_names] + rows = [[d[c][i] if i < len(d[c]) else None for c in col_names] for i in range(n_rows)] + return col_names, col_dtypes, rows, n_rows, "dict" + + +def _from_list_of_dicts( + lst: list[dict], +) -> tuple[list[str], list[str], list[list[Any]], int, str]: + col_names = list(dict.fromkeys(k for row in lst for k in row)) + n_rows = len(lst) + rows = [[row.get(c) for c in col_names] for row in lst] + col_dtypes = [_infer_dtype([row[i] for row in rows]) for i, _ in enumerate(col_names)] + return col_names, col_dtypes, rows, n_rows, "dict" + + +def _polars_dtype_short(dtype_str: 
str) -> str: + """Convert a Polars dtype string to a short label.""" + # Strip parameterized parts: Datetime(time_unit='us', ...) → Datetime + base = dtype_str.split("(")[0].strip() + return _POLARS_DTYPE_SHORT.get(base, base.lower()[:4]) + + +def _pandas_dtype_short(dtype_str: str) -> str: + """Convert a Pandas dtype string to a short label.""" + return _PANDAS_DTYPE_SHORT.get(dtype_str, dtype_str[:4]) + + +def _infer_dtype(values: list) -> str: + """Infer a short dtype label from a list of Python values.""" + types = {type(v) for v in values if v is not None} + if not types: + return "null" + if types == {int}: + return "i64" + if types <= {int, float}: + return "f64" + if types == {float}: + return "f64" + if types == {bool}: + return "bool" + return "str" + + +# --------------------------------------------------------------------------- +# Column subsetting +# --------------------------------------------------------------------------- + + +def _apply_column_subset( + col_names: list[str], + col_dtypes: list[str], + rows: list[list], + columns: list[str] | None, +) -> tuple[list[str], list[str], list[list]]: + if columns is None: + return col_names, col_dtypes, rows + indices = [] + for c in columns: + if c not in col_names: + raise ValueError(f"Column {c!r} not found. Available: {col_names}") + indices.append(col_names.index(c)) + new_names = [col_names[i] for i in indices] + new_dtypes = [col_dtypes[i] for i in indices] + new_rows = [[row[i] for i in indices] for row in rows] + return new_names, new_dtypes, new_rows + + +# --------------------------------------------------------------------------- +# Head/tail split +# --------------------------------------------------------------------------- + + +def _compute_head_tail( + rows: list[list], + total_rows: int, + n_head: int, + n_tail: int, + show_all: bool, +) -> tuple[list[list], list[int], bool]: + """Select head and tail rows, compute row numbers. 
+ + Returns + ------- + tuple + (display_rows, row_numbers, is_full_dataset) + """ + if show_all or n_head + n_tail >= total_rows: + row_numbers = list(range(1, total_rows + 1)) + return rows, row_numbers, True + + head_rows = rows[:n_head] + tail_rows = rows[-n_tail:] if n_tail > 0 else [] + display_rows = head_rows + tail_rows + + head_nums = list(range(1, n_head + 1)) + tail_nums = list(range(total_rows - n_tail + 1, total_rows + 1)) if n_tail > 0 else [] + row_numbers = head_nums + tail_nums + + return display_rows, row_numbers, False + + +# --------------------------------------------------------------------------- +# Missing value detection +# --------------------------------------------------------------------------- + +_MISSING_REPRS = {"None", "nan", "NaN", "NA", "NaT", "", ""} + + +def _is_missing(value: Any) -> bool: + if value is None: + return True + try: + import math + + if isinstance(value, float) and math.isnan(value): + return True + except (TypeError, ValueError): + pass + return False + + +# --------------------------------------------------------------------------- +# Alignment detection +# --------------------------------------------------------------------------- + + +def _detect_alignments(col_dtypes: list[str]) -> list[str]: + """Return 'right' for numeric columns, 'left' for everything else.""" + numeric = {"i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f16", "f32", "f64", "dec"} + return ["right" if d in numeric else "left" for d in col_dtypes] + + +# --------------------------------------------------------------------------- +# Column width calculation +# --------------------------------------------------------------------------- + + +def _compute_col_widths( + col_names: list[str], + col_dtypes: list[str], + rows: list[list], + max_col_width: int, + min_tbl_width: int, + show_row_numbers: bool, + row_numbers: list[int], +) -> tuple[list[int], int]: + """Compute pixel widths for each column. 
+ + Returns + ------- + tuple + (col_widths, rownum_width) where rownum_width is 0 if hidden. + """ + widths: list[int] = [] + for i, name in enumerate(col_names): + # Width from column name + name_w = _LABEL_CHAR_PX * len(name) + _COL_PADDING_PX + # Width from dtype label + dtype_w = _LABEL_CHAR_PX * len(col_dtypes[i]) + _COL_PADDING_PX + # Width from content (sample all displayed rows) + max_content_len = 0 + for row in rows: + val = row[i] + cell_str = _format_cell(val) + max_content_len = max(max_content_len, len(cell_str)) + content_w = _CHAR_PX * max_content_len + _COL_PADDING_PX + + w = int(round(min(max(name_w, dtype_w, content_w, _MIN_COL_WIDTH), max_col_width))) + widths.append(w) + + # Row number column width + rownum_width = 0 + if show_row_numbers and row_numbers: + max_num = max(row_numbers) + rownum_width = int(round(len(str(max_num)) * _CHAR_PX + 10)) + rownum_width = max(rownum_width, 35) + + # Scale up to min_tbl_width + total = sum(widths) + rownum_width + if total < min_tbl_width and widths: + remaining = min_tbl_width - total + per_col = remaining // len(widths) + widths = [w + per_col for w in widths] + + return widths, rownum_width + + +# --------------------------------------------------------------------------- +# Cell formatting +# --------------------------------------------------------------------------- + + +def _format_cell(value: Any) -> str: + """Format a cell value as a display string.""" + if value is None: + return "None" + if isinstance(value, bool): + return str(value) + if isinstance(value, float): + import math + + if math.isnan(value): + return "NaN" + if math.isinf(value): + return "Inf" if value > 0 else "-Inf" + # Use 12 significant digits — enough precision for real data + # while trimming IEEE 754 noise (e.g. 
3.3000000000000003 → 3.3) + return f"{value:.12g}" + return str(value) + + +def _escape(text: str) -> str: + """HTML-escape a string.""" + return _html_mod.escape(text, quote=True) + + +# --------------------------------------------------------------------------- +# Number formatting +# --------------------------------------------------------------------------- + + +def _format_number(n: int) -> str: + """Format an integer with comma separators.""" + return f"{n:,}" + + +# --------------------------------------------------------------------------- +# HTML rendering +# --------------------------------------------------------------------------- + + +def _render_scoped_css(uid: str) -> str: + """Generate the scoped CSS block for a table instance.""" + s = f"#gd-tbl-{uid}" + return f"""""" + + +def _render_header_html( + uid: str, + tbl_type: str, + n_rows: int, + n_cols: int, + caption: str | None, + show_dimensions: bool, + total_cols: int, +) -> str: + """Render the header rows (banner + optional caption + column labels).""" + parts: list[str] = [] + + if show_dimensions: + style_info = _TABLE_TYPE_STYLES.get(tbl_type, _TABLE_TYPE_STYLES["dict"]) + type_badge = ( + f'{_escape(style_info["label"])}' + ) + rows_badge = ( + f'Rows' + f'{_format_number(n_rows)}' + ) + cols_badge = ( + f'Columns' + f'{_format_number(n_cols)}' + ) + parts.append( + f'' + f'' + f'
' + f"{type_badge}{rows_badge}{cols_badge}" + f"
" + ) + + if caption: + border_class = " gt_bottom_border" if not show_dimensions else "" + parts.append( + f'' + f'{_escape(caption)}' + ) + + return "\n".join(parts) + + +def _render_colgroup_html( + col_widths: list[int], + rownum_width: int, + show_row_numbers: bool, +) -> str: + """Render the element.""" + parts = [""] + if show_row_numbers: + parts.append(f'') + for w in col_widths: + parts.append(f'') + parts.append("") + return "\n".join(parts) + + +def _render_column_labels_html( + col_names: list[str], + col_dtypes: list[str], + alignments: list[str], + show_dtypes: bool, + show_row_numbers: bool, +) -> str: + """Render the column label .""" + parts = [''] + + if show_row_numbers: + parts.append( + '' + ) + + for i, name in enumerate(col_names): + align_cls = f"gt_{alignments[i]}" + if show_dtypes: + label_html = ( + f"
" + f'
{_escape(name)}
' + f'
{_escape(col_dtypes[i])}
' + f"
" + ) + else: + label_html = _escape(name) + + parts.append( + f'' + f"{label_html}" + ) + + parts.append("") + return "\n".join(parts) + + +def _render_body_html( + rows: list[list], + row_numbers: list[int], + col_names: list[str], + alignments: list[str], + col_widths: list[int], + n_head: int, + is_full_dataset: bool, + show_row_numbers: bool, + highlight_missing: bool, +) -> str: + """Render the rows.""" + parts = [''] + + for row_idx, (row, row_num) in enumerate(zip(rows, row_numbers)): + # Add divider class to the last head row + divider = "" + if not is_full_dataset and row_idx == n_head - 1: + divider = ' class="gd-tbl-divider"' + + parts.append(f"") + + if show_row_numbers: + parts.append(f'{row_num}') + + for col_idx, val in enumerate(row): + align_cls = f"gt_{alignments[col_idx]}" + cell_str = _format_cell(val) + cell_html = _escape(cell_str) + + missing_cls = "" + if highlight_missing and _is_missing(val): + missing_cls = " gd-tbl-missing" + + w = col_widths[col_idx] + parts.append( + f'{cell_html}' + ) + + parts.append("") + + parts.append("") + return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# Main entry point +# --------------------------------------------------------------------------- + + +def tbl_preview( + data: Any, + columns: list[str] | None = None, + n_head: int = 5, + n_tail: int = 5, + limit: int = 50, + show_all: bool = False, + show_row_numbers: bool = True, + show_dtypes: bool = True, + show_dimensions: bool = True, + max_col_width: int = 250, + min_tbl_width: int = 500, + caption: str | None = None, + highlight_missing: bool = True, + id: str | None = None, +) -> TblPreview: + """Generate a beautiful table preview. + + Parameters + ---------- + data + The table data. Accepts a Polars DataFrame, Pandas DataFrame, PyArrow Table, file path (CSV, + TSV, JSONL, Parquet, Feather/Arrow IPC), column-oriented dict, or list of row dicts. + columns + Subset of columns to display. 
`None` shows all columns. + n_head + Number of rows to show from the start of the table. + n_tail + Number of rows to show from the end of the table. + limit + Maximum allowed sum of `n_head` and `n_tail`. + show_all + If `True`, display the entire table (ignores `n_head`/`n_tail`). + show_row_numbers + Whether to show a row-number column on the left. + show_dtypes + Whether to show dtype sublabels under column names. + show_dimensions + Whether to show the header banner with row/column counts. + max_col_width + Maximum width of any column in pixels. + min_tbl_width + Minimum total table width in pixels. + caption + Optional caption displayed below the header banner. + highlight_missing + Whether to highlight missing values (None/NaN/NA). + id + HTML id for the table container. Auto-generated if `None`. + + Returns + ------- + TblPreview + Rendered table with `_repr_html_()`, `as_html()`, and `save()` methods. + """ + if not show_all and n_head + n_tail > limit: + raise ValueError( + f"n_head ({n_head}) + n_tail ({n_tail}) = {n_head + n_tail} " + f"exceeds limit ({limit}). Increase limit= or set show_all=True." + ) + + # 1. Normalize input data + col_names, col_dtypes, all_rows, total_rows, tbl_type = _normalize_data(data) + original_n_cols = len(col_names) + + # 2. Apply column subset + col_names, col_dtypes, all_rows = _apply_column_subset(col_names, col_dtypes, all_rows, columns) + + # 3. Compute head/tail split + display_rows, row_numbers, is_full = _compute_head_tail( + all_rows, total_rows, n_head, n_tail, show_all + ) + + # 4. Detect alignments + alignments = _detect_alignments(col_dtypes) + + # 5. Compute column widths + col_widths, rownum_width = _compute_col_widths( + col_names, + col_dtypes, + display_rows, + max_col_width, + min_tbl_width, + show_row_numbers, + row_numbers, + ) + + # 6. 
Generate unique ID + uid = id or secrets.token_hex(4) + + # Total columns including row number column + total_cols = len(col_names) + (1 if show_row_numbers else 0) + + # 7. Render HTML components + css = _render_scoped_css(uid) + + header = _render_header_html( + uid, tbl_type, total_rows, original_n_cols, caption, show_dimensions, total_cols + ) + + colgroup = _render_colgroup_html(col_widths, rownum_width, show_row_numbers) + + column_labels = _render_column_labels_html( + col_names, col_dtypes, alignments, show_dtypes, show_row_numbers + ) + + body = _render_body_html( + display_rows, + row_numbers, + col_names, + alignments, + col_widths, + n_head, + is_full, + show_row_numbers, + highlight_missing, + ) + + # 8. Assemble final HTML + html = ( + f'
\n' + f"{css}\n" + f'\n' + f"{colgroup}\n" + f"\n{header}\n{column_labels}\n\n" + f"{body}\n" + f"
\n" + f"
" + ) + + return TblPreview(html) diff --git a/great_docs/assets/_extensions/tbl-preview/_extension.yml b/great_docs/assets/_extensions/tbl-preview/_extension.yml new file mode 100644 index 00000000..217f1e5c --- /dev/null +++ b/great_docs/assets/_extensions/tbl-preview/_extension.yml @@ -0,0 +1,7 @@ +title: Table Preview +author: Great Docs +version: 1.0.0 +quarto-required: ">=1.3.0" +contributes: + shortcodes: + - tbl-preview.lua diff --git a/great_docs/assets/_extensions/tbl-preview/_tbl_preview_shortcode.py b/great_docs/assets/_extensions/tbl-preview/_tbl_preview_shortcode.py new file mode 100644 index 00000000..1cf45216 --- /dev/null +++ b/great_docs/assets/_extensions/tbl-preview/_tbl_preview_shortcode.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +"""CLI helper for the tbl-preview Quarto shortcode.""" + +from __future__ import annotations + +import argparse +import importlib.util +import sys +from pathlib import Path + + +def _load_tbl_preview(): + """Import tbl_preview without triggering great_docs.__init__.""" + try: + from great_docs._tbl_preview import tbl_preview + + return tbl_preview + except (ImportError, ModuleNotFoundError): + # Fall back to loading the module directly by file path. + # Walk up the directory tree until we find great_docs/_tbl_preview.py. 
+ here = Path(__file__).resolve().parent + p = here + while p != p.parent: + candidate = p / "great_docs" / "_tbl_preview.py" + if candidate.exists(): + spec = importlib.util.spec_from_file_location("_tbl_preview", candidate) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod.tbl_preview + p = p.parent + raise ImportError("Cannot find _tbl_preview.py") + + +def main() -> None: + parser = argparse.ArgumentParser(description="Render a table preview from a data file.") + parser.add_argument( + "file", help="Path to data file (CSV, TSV, JSONL, Parquet, Feather, Arrow IPC)" + ) + parser.add_argument("--columns", default=None, help="Comma-separated column names") + parser.add_argument("--n_head", type=int, default=5) + parser.add_argument("--n_tail", type=int, default=5) + parser.add_argument("--show_all", default="false") + parser.add_argument("--show_row_numbers", default="true") + parser.add_argument("--show_dtypes", default="true") + parser.add_argument("--show_dimensions", default="true") + parser.add_argument("--max_col_width", type=int, default=250) + parser.add_argument("--min_tbl_width", type=int, default=500) + parser.add_argument("--caption", default=None) + args = parser.parse_args() + + tbl_preview = _load_tbl_preview() + + columns = [c.strip() for c in args.columns.split(",")] if args.columns else None + + def _to_bool(s: str) -> bool: + return s.lower() in ("true", "1", "yes") + + result = tbl_preview( + data=args.file, + columns=columns, + n_head=args.n_head, + n_tail=args.n_tail, + show_all=_to_bool(args.show_all), + show_row_numbers=_to_bool(args.show_row_numbers), + show_dtypes=_to_bool(args.show_dtypes), + show_dimensions=_to_bool(args.show_dimensions), + max_col_width=args.max_col_width, + min_tbl_width=args.min_tbl_width, + caption=args.caption, + ) + + sys.stdout.write(result.as_html()) + + +if __name__ == "__main__": + main() diff --git a/great_docs/assets/_extensions/tbl-preview/tbl-preview.lua 
b/great_docs/assets/_extensions/tbl-preview/tbl-preview.lua new file mode 100644 index 00000000..54fc8b80 --- /dev/null +++ b/great_docs/assets/_extensions/tbl-preview/tbl-preview.lua @@ -0,0 +1,96 @@ +-- tbl-preview.lua — Quarto shortcode for data table previews +-- +-- Usage in .qmd files: +-- +-- {{< tbl-preview file="data/example.csv" >}} +-- {{< tbl-preview file="data.tsv" >}} +-- {{< tbl-preview file="data.jsonl" show_all="true" >}} +-- {{< tbl-preview file="data.parquet" n_head="10" n_tail="5" >}} +-- {{< tbl-preview file="data.csv" show_all="true" caption="My Dataset" >}} +-- +-- Calls the companion _tbl_preview_shortcode.py script, which imports +-- tbl_preview() from great_docs._tbl_preview and prints the resulting +-- HTML to stdout. + +local function kwarg_str(kwargs, key) + local raw = kwargs[key] + if raw == nil then return "" end + local s = pandoc.utils.stringify(raw) + return s or "" +end + +return { + ["tbl-preview"] = function(args, kwargs) + -- File path can be a positional arg or named kwarg + local file = kwarg_str(kwargs, "file") + if file == "" and #args > 0 then + file = pandoc.utils.stringify(args[1]) + end + + if file == "" then + return pandoc.RawBlock( + "html", + "" + ) + end + + -- Locate the helper script (lives alongside this .lua file) + local script_dir = debug.getinfo(1, "S").source:match("@?(.*/)") or "./" + local helper = script_dir .. "_tbl_preview_shortcode.py" + + -- Resolve relative file paths against the Quarto project root + -- (script_dir is /_extensions/tbl-preview/) + if file:sub(1, 1) ~= "/" then + local project_root = script_dir .. "../../" + file = project_root .. 
file + end + + -- Build CLI arguments + -- Use python3 for macOS compatibility (python may not exist) + local cmd_args = { "python3", helper, file } + + -- Forward optional keyword arguments + local forwarded = { + "columns", "n_head", "n_tail", "show_all", + "show_row_numbers", "show_dtypes", "show_dimensions", + "max_col_width", "min_tbl_width", "caption", + } + for _, key in ipairs(forwarded) do + local val = kwarg_str(kwargs, key) + if val ~= "" then + table.insert(cmd_args, "--" .. key) + table.insert(cmd_args, val) + end + end + + -- Build shell command (quote each argument) + local parts = {} + for _, arg in ipairs(cmd_args) do + local escaped = arg:gsub("'", "'\\''") + table.insert(parts, "'" .. escaped .. "'") + end + local cmd = table.concat(parts, " ") .. " 2>&1" + + local handle = io.popen(cmd) + if not handle then + return pandoc.RawBlock( + "html", + "<!-- tbl-preview: failed to launch helper script -->" + ) + end + + local result = handle:read("*a") + local success = handle:close() + + if not success or result == "" then + local msg = result ~= "" and result or "unknown error" + msg = msg:gsub("-->", "-- >") + return pandoc.RawBlock( + "html", + "<!-- tbl-preview error: " .. msg .. " -->" + ) + end + + return pandoc.RawBlock("html", result) + end +} diff --git a/pyproject.toml b/pyproject.toml index e3bb473e..095b4640 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,8 @@ dev = [ "twine>=3.4", "great_tables>=0.21.0", "pandas>=2.0", + "polars>=0.20", + "pyarrow>=12.0", ] docs = [ "jupyter>=1.0.0", diff --git a/test-packages/synthetic/catalog.py b/test-packages/synthetic/catalog.py index 54fb73fa..be50e9a8 100644 --- a/test-packages/synthetic/catalog.py +++ b/test-packages/synthetic/catalog.py @@ -357,6 +357,10 @@ # 176–177: Auto-include / no-auto-exclude discovery overrides "gdtest_auto_include", # 176 "gdtest_no_auto_exclude", # 177 + # 178: Table preview showcase + "gdtest_tbl_preview", # 178 + # 179: Table preview shortcode showcase + "gdtest_tbl_shortcode", # 179 ] @@ -2005,6 +2009,23 @@ "is set to true, so ALL names pass
through — none are automatically " "excluded. Tests complete bypass of the AUTO_EXCLUDE filter." ), + "gdtest_tbl_preview": ( + "Table preview showcase exercising tbl_preview() with twelve user-guide " + "pages: dict/list-of-dicts previews, Pandas and Polars DataFrames, " + "missing-value highlighting (None/NaN/Inf), column subsets, wide and " + "text-heavy tables, a full-options page, and file-based previews of " + "TSV, JSONL, Parquet, Feather/Arrow IPC, and in-memory PyArrow tables. " + "Tests badge rendering (Table/Pandas/Polars), head/tail splitting, " + "dark-mode CSS, HTML escaping, and side-by-side raw-DataFrame output." + ), + "gdtest_tbl_shortcode": ( + "tbl-preview Quarto shortcode showcase with CSV, TSV, and JSONL data " + "files in assets/. Five user-guide pages exercise every shortcode " + "parameter — caption, columns, n_head, n_tail, show_all, " + "show_row_numbers, show_dtypes, show_dimensions, max_col_width — " + "plus multi-table pages and wide-table horizontal scroll. No Python " + "code cells; all rendering is done via the shortcode." + ), + } diff --git a/test-packages/synthetic/specs/gdtest_tbl_preview.py b/test-packages/synthetic/specs/gdtest_tbl_preview.py new file mode 100644 index 00000000..56b4bac7 --- /dev/null +++ b/test-packages/synthetic/specs/gdtest_tbl_preview.py @@ -0,0 +1,842 @@ +""" +gdtest_tbl_preview — Table preview showcase. + +Dimensions: A1, B1, C1, D1, M2, G1 +Focus: Exercises the ``tbl_preview()`` function with many different table + shapes, data types, and display options. The user guide has twelve + pages: + + 1. Basic preview — dict and list-of-dicts, caption. + 2. Pandas tables — Pandas DataFrame, custom head/tail, show_all. + 3. Polars tables — Polars DataFrame, head-only. + 4. Missing values — None/NaN highlighting on/off, mixed types. + 5. Column options — column subsets, wide tables, hide row numbers/dtypes. + 6. All options — minimal/full chrome, custom widths, side-by-side. + 7. Text-heavy tables — long strings, ellipsis, max_col_width. + 8.
TSV files — tab-delimited file reading. + 9. JSONL files — newline-delimited JSON, .ndjson alias. + 10. Parquet files — Apache Parquet columnar format. + 11. Feather & Arrow IPC — Feather files, .arrow/.ipc extensions. + 12. PyArrow tables — in-memory pyarrow.Table objects. + + The API reference documents helper functions that generate sample + data for the previews. +""" + +SPEC = { + "name": "gdtest_tbl_preview", + "description": "Table preview showcase with diverse table types and options.", + "dimensions": ["A1", "B1", "C1", "D1", "M2", "G1"], + "pyproject_toml": { + "project": { + "name": "gdtest-tbl-preview", + "version": "0.1.0", + "description": "Showcase for the tbl_preview() table preview feature", + "dependencies": ["great_docs"], + }, + "build-system": { + "requires": ["setuptools"], + "build-backend": "setuptools.build_meta", + }, + }, + "config": {}, + "files": { + # ── Project root ────────────────────────────────────────────────── + "README.md": ( + "# gdtest-tbl-preview\n\n" + "A showcase site demonstrating the `tbl_preview()` function from\n" + "Great Docs. Each user-guide page exercises a different combination\n" + "of data sources, table shapes, and display options.\n" + ), + # ── Package source ──────────────────────────────────────────────── + "gdtest_tbl_preview/__init__.py": '''\ + """Sample data generators for table preview demos.""" + + __version__ = "0.1.0" + __all__ = [ + "sample_scores", + "sample_inventory", + "sample_wide", + "sample_missing", + "sample_types", + ] + + from .data import ( + sample_scores, + sample_inventory, + sample_wide, + sample_missing, + sample_types, + ) + ''', + "gdtest_tbl_preview/data.py": '''\ + """Functions that generate sample data for preview demos.""" + + from __future__ import annotations + + + def sample_scores(n: int = 20) -> dict[str, list]: + """ + Generate a student scores dataset. + + Parameters + ---------- + n + Number of rows. 
+ + Returns + ------- + dict[str, list] + Column-oriented dict with name, subject, score, grade, and + pass/fail columns. + + Examples + -------- + >>> data = sample_scores(5) + >>> len(data["name"]) + 5 + """ + import random + random.seed(42) + names = ["Alice", "Bob", "Charlie", "Diana", "Eve", + "Frank", "Grace", "Hank", "Iris", "Jack"] + subjects = ["Math", "Science", "English", "History", "Art"] + grades = ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "D", "F"] + rows_name = [random.choice(names) for _ in range(n)] + rows_subj = [random.choice(subjects) for _ in range(n)] + rows_score = [round(random.uniform(40, 100), 1) for _ in range(n)] + rows_grade = [random.choice(grades) for _ in range(n)] + rows_pass = [s >= 60.0 for s in rows_score] + return { + "name": rows_name, + "subject": rows_subj, + "score": rows_score, + "grade": rows_grade, + "passed": rows_pass, + } + + + def sample_inventory(n: int = 30) -> dict[str, list]: + """ + Generate a product inventory dataset. + + Parameters + ---------- + n + Number of rows. + + Returns + ------- + dict[str, list] + Column-oriented dict with product, category, price, stock, + and rating columns. 
+ + Examples + -------- + >>> data = sample_inventory(10) + >>> len(data["product"]) + 10 + """ + import random + random.seed(99) + products = [ + "Widget", "Gadget", "Doohickey", "Thingamajig", + "Gizmo", "Whatchamacallit", "Contraption", "Apparatus", + ] + categories = ["Electronics", "Tools", "Kitchen", "Garden", "Office"] + rows_prod = [random.choice(products) for _ in range(n)] + rows_cat = [random.choice(categories) for _ in range(n)] + rows_price = [round(random.uniform(5.0, 200.0), 2) for _ in range(n)] + rows_stock = [random.randint(0, 500) for _ in range(n)] + rows_rating = [round(random.uniform(1.0, 5.0), 1) for _ in range(n)] + return { + "product": rows_prod, + "category": rows_cat, + "price": rows_price, + "stock": rows_stock, + "rating": rows_rating, + } + + + def sample_wide(n_rows: int = 15, n_cols: int = 20) -> dict[str, list]: + """ + Generate a wide dataset with many columns. + + Parameters + ---------- + n_rows + Number of rows. + n_cols + Number of columns. + + Returns + ------- + dict[str, list] + Column-oriented dict with columns named ``col_001`` + through ``col_{n_cols:03d}``. + + Examples + -------- + >>> data = sample_wide(5, 8) + >>> len(data) + 8 + """ + import random + random.seed(7) + return { + f"col_{i+1:03d}": [round(random.gauss(0, 1), 3) for _ in range(n_rows)] + for i in range(n_cols) + } + + + def sample_missing(n: int = 15) -> dict[str, list]: + """ + Generate a dataset riddled with missing values. + + Parameters + ---------- + n + Number of rows. + + Returns + ------- + dict[str, list] + Column-oriented dict where roughly 25 percent of values are + ``None`` or ``float('nan')``. 
+ + Examples + -------- + >>> data = sample_missing(10) + >>> None in data["alpha"] + True + """ + import random + import math + random.seed(13) + + def _maybe_none(val): + return None if random.random() < 0.25 else val + + return { + "alpha": [_maybe_none(random.choice(["foo", "bar", "baz"])) for _ in range(n)], + "beta": [_maybe_none(round(random.gauss(50, 15), 2)) for _ in range(n)], + "gamma": [ + float("nan") if random.random() < 0.2 else random.randint(1, 100) + for _ in range(n) + ], + "delta": [_maybe_none(random.choice([True, False])) for _ in range(n)], + } + + + def sample_types() -> dict[str, list]: + """ + Generate a dataset that exercises many Python types. + + Returns + ------- + dict[str, list] + Six rows with int, float, bool, string, None, and large-number + columns. + + Examples + -------- + >>> data = sample_types() + >>> len(data["integer"]) + 6 + """ + return { + "integer": [0, 1, -42, 1_000_000, 2**31, None], + "floating": [0.0, 3.14, -2.718, 1e10, float("inf"), float("nan")], + "boolean": [True, False, True, False, None, True], + "text": ["hello", "world", "", "café", "bold", None], + "big_number": [10**18, 10**15, 10**12, 10**9, 10**6, 10**3], + } + ''', + # ── User guide pages (flat layout) ─────────────────────────────── + "user_guide/01-basic-preview.qmd": ( + "---\n" + "title: Basic Preview\n" + "---\n" + "\n" + "## Default Settings\n" + "\n" + "The simplest way to use `tbl_preview()` — pass a column-oriented\n" + "dict and let the defaults do the work.\n" + "\n" + "```{python}\n" + "from great_docs import tbl_preview\n" + "from gdtest_tbl_preview import sample_scores\n" + "\n" + "tbl_preview(sample_scores(20))\n" + "```\n" + "\n" + "## From a List of Dicts\n" + "\n" + "You can also pass a list of row dicts:\n" + "\n" + "```{python}\n" + "rows = [\n" + ' {"city": "Tokyo", "pop_m": 37.4, "country": "Japan"},\n' + ' {"city": "Delhi", "pop_m": 32.9, "country": "India"},\n' + ' {"city": "Shanghai", "pop_m": 29.2, "country": "China"},\n' + ' 
{"city": "São Paulo", "pop_m": 22.4, "country": "Brazil"},\n' + ' {"city": "Mexico City", "pop_m": 21.8, "country": "Mexico"},\n' + "]\n" + "tbl_preview(rows)\n" + "```\n" + "\n" + "## With a Caption\n" + "\n" + "```{python}\n" + "tbl_preview(\n" + " sample_scores(12),\n" + ' caption="Student Performance — Fall 2025",\n' + ")\n" + "```\n" + ), + "user_guide/02-pandas-tables.qmd": ( + "---\n" + "title: Pandas Tables\n" + "---\n" + "\n" + "## Pandas DataFrame\n" + "\n" + "Pass a Pandas DataFrame directly. The preview auto-detects the\n" + "library and shows a **Pandas** badge.\n" + "\n" + "```{python}\n" + "import pandas as pd\n" + "from great_docs import tbl_preview\n" + "\n" + "df = pd.DataFrame({\n" + ' "name": ["Alice", "Bob", "Charlie", "Diana", "Eve",\n' + ' "Frank", "Grace", "Hank", "Iris", "Jack",\n' + ' "Kate", "Leo", "Mia", "Noah", "Olivia"],\n' + ' "department": ["Eng", "Sales", "Eng", "HR", "Sales",\n' + ' "Eng", "HR", "Sales", "Eng", "HR",\n' + ' "Sales", "Eng", "HR", "Sales", "Eng"],\n' + ' "salary": [95000, 72000, 88000, 65000, 78000,\n' + " 105000, 62000, 81000, 92000, 58000,\n" + " 74000, 110000, 67000, 83000, 97000],\n" + ' "years": [5, 3, 7, 2, 4, 10, 1, 6, 8, 3, 4, 12, 2, 5, 9],\n' + "})\n" + "\n" + "tbl_preview(df)\n" + "```\n" + "\n" + "## Custom Head and Tail\n" + "\n" + "Show 8 rows from the top and 3 from the bottom:\n" + "\n" + "```{python}\n" + "tbl_preview(df, n_head=8, n_tail=3)\n" + "```\n" + "\n" + "## Show All Rows\n" + "\n" + "```{python}\n" + "tbl_preview(df, show_all=True)\n" + "```\n" + ), + "user_guide/03-polars-tables.qmd": ( + "---\n" + "title: Polars Tables\n" + "---\n" + "\n" + "## Polars DataFrame\n" + "\n" + "Polars DataFrames are detected automatically and show a blue\n" + "**Polars** badge with precise dtype labels.\n" + "\n" + "```{python}\n" + "import polars as pl\n" + "from great_docs import tbl_preview\n" + "\n" + "df = pl.DataFrame({\n" + ' "id": range(1, 26),\n' + ' "value": [x * 1.1 for x in range(1, 26)],\n' + ' 
"category": ["A", "B", "C", "D", "E"] * 5,\n' + ' "flag": [True, False] * 12 + [True],\n' + "})\n" + "\n" + "tbl_preview(df)\n" + "```\n" + "\n" + "## Head Only (No Tail)\n" + "\n" + "```{python}\n" + "tbl_preview(df, n_head=10, n_tail=0)\n" + "```\n" + ), + "user_guide/04-missing-values.qmd": ( + "---\n" + "title: Missing Values\n" + "---\n" + "\n" + "## Highlighted Missing Values\n" + "\n" + "By default, `None` and `NaN` values are highlighted in red:\n" + "\n" + "```{python}\n" + "from great_docs import tbl_preview\n" + "from gdtest_tbl_preview import sample_missing\n" + "\n" + "tbl_preview(sample_missing(15))\n" + "```\n" + "\n" + "## Without Highlighting\n" + "\n" + "Turn off missing-value highlighting with `highlight_missing=False`:\n" + "\n" + "```{python}\n" + "tbl_preview(sample_missing(15), highlight_missing=False)\n" + "```\n" + "\n" + "## Mixed Python Types\n" + "\n" + "Inf, NaN, None, empty strings, HTML-unsafe characters, and large\n" + "numbers:\n" + "\n" + "```{python}\n" + "from gdtest_tbl_preview import sample_types\n" + "\n" + "tbl_preview(sample_types(), show_all=True)\n" + "```\n" + ), + "user_guide/05-column-options.qmd": ( + "---\n" + "title: Column Options\n" + "---\n" + "\n" + "## Column Subset\n" + "\n" + "Select and reorder columns with the `columns` parameter:\n" + "\n" + "```{python}\n" + "from great_docs import tbl_preview\n" + "from gdtest_tbl_preview import sample_inventory\n" + "\n" + "data = sample_inventory(25)\n" + 'tbl_preview(data, columns=["product", "price", "rating"])\n' + "```\n" + "\n" + "## Wide Table\n" + "\n" + "A table with 20 columns overflows and scrolls horizontally:\n" + "\n" + "```{python}\n" + "from gdtest_tbl_preview import sample_wide\n" + "\n" + "tbl_preview(sample_wide(12, 20))\n" + "```\n" + "\n" + "## No Row Numbers\n" + "\n" + "```{python}\n" + "tbl_preview(\n" + " sample_inventory(10),\n" + " show_row_numbers=False,\n" + ")\n" + "```\n" + "\n" + "## No Dtype Labels\n" + "\n" + "```{python}\n" + 
"tbl_preview(\n" + " sample_inventory(10),\n" + " show_dtypes=False,\n" + ")\n" + "```\n" + ), + "user_guide/06-all-options.qmd": ( + "---\n" + "title: All Options\n" + "---\n" + "\n" + "## Minimal Chrome\n" + "\n" + "Turn off every optional element — no row numbers, no dtypes,\n" + "no dimension badges:\n" + "\n" + "```{python}\n" + "from great_docs import tbl_preview\n" + "from gdtest_tbl_preview import sample_scores\n" + "\n" + "tbl_preview(\n" + " sample_scores(8),\n" + " show_row_numbers=False,\n" + " show_dtypes=False,\n" + " show_dimensions=False,\n" + " show_all=True,\n" + ")\n" + "```\n" + "\n" + "## Full Chrome with Caption\n" + "\n" + "Everything enabled plus a caption:\n" + "\n" + "```{python}\n" + "tbl_preview(\n" + " sample_scores(50),\n" + " n_head=10,\n" + " n_tail=5,\n" + ' caption="Top & bottom of the class roster",\n' + ")\n" + "```\n" + "\n" + "## Custom Column Width\n" + "\n" + "Restrict columns to 120px max width:\n" + "\n" + "```{python}\n" + "tbl_preview(\n" + " sample_scores(15),\n" + " max_col_width=120,\n" + " min_tbl_width=400,\n" + ")\n" + "```\n" + "\n" + "## Side-by-Side Comparison\n" + "\n" + "Default Pandas output vs. 
`tbl_preview()` on the same data:\n" + "\n" + "::: {layout-ncol=2}\n" + "\n" + "```{python}\n" + "#| echo: false\n" + "import pandas as pd\n" + "df = pd.DataFrame(sample_scores(10))\n" + "df\n" + "```\n" + "\n" + "```{python}\n" + "#| echo: false\n" + "tbl_preview(df)\n" + "```\n" + "\n" + ":::\n" + ), + "user_guide/07-text-heavy-tables.qmd": ( + "---\n" + "title: Text-Heavy Tables\n" + "---\n" + "\n" + "## Long Strings (Default Width)\n" + "\n" + "Cells with very long text are capped at `max_col_width` (250px\n" + "by default) and show an ellipsis instead of wrapping.\n" + "\n" + "```{python}\n" + "from great_docs import tbl_preview\n" + "\n" + "data = {\n" + ' "id": [1, 2, 3, 4, 5],\n' + ' "title": [\n' + ' "A short title",\n' + ' "A moderately long title that tests mid-range widths",\n' + ' "This title is intentionally very long so that it will definitely exceed the maximum column width and trigger text-overflow ellipsis behavior in the rendered table cell",\n' + ' "Brief",\n' + ' "Another extremely verbose title string that goes on and on to stress-test the truncation and overflow handling in the preview table renderer",\n' + " ],\n" + ' "status": ["draft", "published", "review", "archived", "published"],\n' + "}\n" + "\n" + "tbl_preview(data, show_all=True)\n" + "```\n" + "\n" + "## Descriptions and Paragraphs\n" + "\n" + "Real-world data often has paragraph-length text in columns.\n" + "\n" + "```{python}\n" + "data = {\n" + ' "package": ["NumPy", "Pandas", "Polars", "Great Tables", "Pointblank"],\n' + ' "description": [\n' + ' "Fundamental package for scientific computing with Python. Provides N-dimensional arrays, linear algebra, Fourier transforms, and random number generation.",\n' + ' "Powerful data structures for data analysis, time series, and statistics. Built on NumPy with labeled axes, automatic alignment, and rich I/O.",\n' + ' "Lightning-fast DataFrame library in Rust with a Python API. 
Lazy evaluation, multi-threaded queries, and Apache Arrow memory format.",\n' + ' "Build beautiful, publication-quality tables in Python. Supports Polars and Pandas DataFrames with fine-grained styling, formatting, and export.",\n' + ' "Data validation library for Python. Define expectations, validate data, and generate detailed reports with table-level and column-level checks.",\n' + " ],\n" + ' "version": ["1.26.0", "2.2.0", "0.20.0", "0.15.0", "0.14.0"],\n' + "}\n" + "\n" + "tbl_preview(data, show_all=True)\n" + "```\n" + "\n" + "## Narrow Max Width (120px)\n" + "\n" + "Force aggressive truncation with a tight `max_col_width`:\n" + "\n" + "```{python}\n" + "tbl_preview(data, show_all=True, max_col_width=120)\n" + "```\n" + "\n" + "## Wide Max Width (500px)\n" + "\n" + "Allow generous room — long text is still capped, but more is visible:\n" + "\n" + "```{python}\n" + "tbl_preview(data, show_all=True, max_col_width=500)\n" + "```\n" + "\n" + "## Mixed Short and Long Columns\n" + "\n" + "Short numeric/code columns alongside verbose text — each column\n" + "gets its own computed width.\n" + "\n" + "```{python}\n" + "data = {\n" + ' "code": ["E001", "E002", "E003", "W001", "W002", "I001", "I002", "E004"],\n' + ' "severity": ["error", "error", "error", "warning", "warning", "info", "info", "error"],\n' + ' "message": [\n' + ' "Undefined variable: foobar",\n' + ' "Type mismatch: expected int, got str in argument `count` of function process_batch()",\n' + ' "Division by zero in expression total / n_items where n_items evaluates to 0",\n' + ' "Unused import: os (imported but never referenced in module)",\n' + ' "Variable `tmp` assigned on line 42 but never used anywhere in the function body",\n' + ' "Module docstring missing: consider adding a module-level docstring",\n' + ' "Line too long: 127 characters (max 120). 
Consider breaking this into multiple lines for readability",\n' + ' "Syntax error: unexpected token ) at position 34 in expression parse(input))",\n' + " ],\n" + ' "line": [12, 45, 78, 3, 42, 1, 99, 34],\n' + "}\n" + "\n" + "tbl_preview(data, show_all=True)\n" + "```\n" + ), + # ── File-format pages ───────────────────────────────────────────── + "user_guide/08-tsv-files.qmd": ( + "---\n" + "title: TSV Files\n" + "---\n" + "\n" + "## Read a TSV File\n" + "\n" + "`tbl_preview()` auto-detects `.tsv` and `.tab` files and reads\n" + "them with tab-delimited parsing.\n" + "\n" + "```{python}\n" + "#| echo: false\n" + "import pathlib\n" + "\n" + "tsv_path = pathlib.Path('assets/cities.tsv')\n" + "tsv_path.parent.mkdir(parents=True, exist_ok=True)\n" + "tsv_path.write_text(\n" + " 'city\\tcountry\\tpopulation\\tarea_km2\\n'\n" + " 'Tokyo\\tJapan\\t13960000\\t2194\\n'\n" + " 'Delhi\\tIndia\\t11030000\\t1484\\n'\n" + " 'Shanghai\\tChina\\t24870000\\t6341\\n'\n" + " 'São Paulo\\tBrazil\\t12330000\\t1521\\n'\n" + " 'Mexico City\\tMexico\\t9210000\\t1485\\n'\n" + " 'Cairo\\tEgypt\\t9540000\\t3085\\n'\n" + " 'Mumbai\\tIndia\\t12440000\\t603\\n'\n" + " 'Beijing\\tChina\\t21540000\\t16411\\n'\n" + ")\n" + "```\n" + "\n" + "```{python}\n" + "from great_docs import tbl_preview\n" + "\n" + "tbl_preview('assets/cities.tsv', show_all=True)\n" + "```\n" + "\n" + "The badge shows **TSV** and the header reports the correct\n" + "row and column counts.\n" + ), + "user_guide/09-jsonl-files.qmd": ( + "---\n" + "title: JSONL Files\n" + "---\n" + "\n" + "## Read a JSONL File\n" + "\n" + "Newline-delimited JSON (`.jsonl` / `.ndjson`) is a common\n" + "format for streaming data and log records.\n" + "\n" + "```{python}\n" + "#| echo: false\n" + "import pathlib, json\n" + "\n" + "records = [\n" + " {'timestamp': '2025-01-15T08:30:00', 'level': 'INFO', 'module': 'auth', 'message': 'User login successful'},\n" + " {'timestamp': '2025-01-15T08:31:12', 'level': 'WARNING', 'module': 'db', 'message': 
'Slow query detected (3.2s)'},\n" + " {'timestamp': '2025-01-15T08:32:45', 'level': 'ERROR', 'module': 'api', 'message': 'Request timeout on /v2/users'},\n" + " {'timestamp': '2025-01-15T08:33:01', 'level': 'INFO', 'module': 'cache', 'message': 'Cache miss for key user:42'},\n" + " {'timestamp': '2025-01-15T08:34:20', 'level': 'DEBUG', 'module': 'auth', 'message': 'Token refresh for session abc123'},\n" + " {'timestamp': '2025-01-15T08:35:55', 'level': 'ERROR', 'module': 'db', 'message': 'Connection pool exhausted'},\n" + " {'timestamp': '2025-01-15T08:36:10', 'level': 'INFO', 'module': 'api', 'message': 'Health check passed'},\n" + " {'timestamp': '2025-01-15T08:37:30', 'level': 'WARNING', 'module': 'auth', 'message': 'Failed login attempt from 192.168.1.100'},\n" + "]\n" + "\n" + "jsonl_path = pathlib.Path('assets/server_logs.jsonl')\n" + "jsonl_path.parent.mkdir(parents=True, exist_ok=True)\n" + "jsonl_path.write_text('\\n'.join(json.dumps(r) for r in records) + '\\n')\n" + "```\n" + "\n" + "```{python}\n" + "from great_docs import tbl_preview\n" + "\n" + "tbl_preview('assets/server_logs.jsonl', show_all=True)\n" + "```\n" + "\n" + "## NDJSON Extension\n" + "\n" + "The `.ndjson` extension is treated identically:\n" + "\n" + "```{python}\n" + "#| echo: false\n" + "import shutil\n" + "shutil.copy('assets/server_logs.jsonl', 'assets/server_logs.ndjson')\n" + "```\n" + "\n" + "```{python}\n" + "tbl_preview('assets/server_logs.ndjson', show_all=True)\n" + "```\n" + ), + "user_guide/10-parquet-files.qmd": ( + "---\n" + "title: Parquet Files\n" + "---\n" + "\n" + "## Read a Parquet File\n" + "\n" + "Apache Parquet is a columnar storage format popular in data\n" + "engineering workflows.\n" + "\n" + "```{python}\n" + "#| echo: false\n" + "import polars as pl, pathlib\n" + "\n" + "df = pl.DataFrame({\n" + " 'product': ['Widget', 'Gadget', 'Gizmo', 'Doohickey', 'Thingamajig'],\n" + " 'category': ['Electronics', 'Tools', 'Kitchen', 'Garden', 'Office'],\n" + " 'price': 
[29.99, 49.50, 12.00, 8.75, 199.99],\n" + " 'in_stock': [True, False, True, True, False],\n" + " 'rating': [4.5, 3.8, 4.9, 4.2, 2.1],\n" + "})\n" + "\n" + "pq_path = pathlib.Path('assets/products.parquet')\n" + "pq_path.parent.mkdir(parents=True, exist_ok=True)\n" + "df.write_parquet(str(pq_path))\n" + "```\n" + "\n" + "```{python}\n" + "from great_docs import tbl_preview\n" + "\n" + "tbl_preview('assets/products.parquet', show_all=True)\n" + "```\n" + "\n" + "The badge shows **Parquet** and dtype labels are preserved\n" + "from the original Polars schema.\n" + ), + "user_guide/11-feather-arrow-files.qmd": ( + "---\n" + "title: Feather & Arrow IPC Files\n" + "---\n" + "\n" + "## Feather File\n" + "\n" + "Feather (Apache Arrow IPC format) is fast for local analytics.\n" + "\n" + "```{python}\n" + "#| echo: false\n" + "import polars as pl, pathlib\n" + "\n" + "df = pl.DataFrame({\n" + " 'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve', 'Frank'],\n" + " 'department': ['Engineering', 'Marketing', 'Engineering', 'Sales', 'Marketing', 'Sales'],\n" + " 'salary': [95000, 72000, 105000, 68000, 88000, 71000],\n" + " 'years': [5, 3, 8, 2, 6, 4],\n" + "})\n" + "\n" + "feather_path = pathlib.Path('assets/employees.feather')\n" + "feather_path.parent.mkdir(parents=True, exist_ok=True)\n" + "df.write_ipc(str(feather_path))\n" + "```\n" + "\n" + "```{python}\n" + "from great_docs import tbl_preview\n" + "\n" + "tbl_preview('assets/employees.feather', show_all=True)\n" + "```\n" + "\n" + "## Arrow IPC Extension\n" + "\n" + "Files with `.arrow` or `.ipc` extensions are also read as\n" + "Arrow IPC, but get the **Arrow** badge instead of Feather:\n" + "\n" + "```{python}\n" + "#| echo: false\n" + "import shutil\n" + "shutil.copy('assets/employees.feather', 'assets/employees.arrow')\n" + "```\n" + "\n" + "```{python}\n" + "tbl_preview('assets/employees.arrow', show_all=True)\n" + "```\n" + ), + "user_guide/12-arrow-tables.qmd": ( + "---\n" + "title: PyArrow Tables\n" + "---\n" + 
"\n" + "## In-Memory Arrow Table\n" + "\n" + "`tbl_preview()` also accepts a `pyarrow.Table` directly —\n" + "no file needed.\n" + "\n" + "```{python}\n" + "import pyarrow as pa\n" + "from great_docs import tbl_preview\n" + "\n" + "tbl = pa.table({\n" + " 'city': ['Tokyo', 'Delhi', 'Shanghai', 'São Paulo', 'Mexico City',\n" + " 'Cairo', 'Mumbai', 'Beijing', 'Dhaka', 'Osaka'],\n" + " 'country': ['Japan', 'India', 'China', 'Brazil', 'Mexico',\n" + " 'Egypt', 'India', 'China', 'Bangladesh', 'Japan'],\n" + " 'population_m': [13.96, 11.03, 24.87, 12.33, 9.21,\n" + " 9.54, 12.44, 21.54, 8.91, 2.75],\n" + " 'area_km2': [2194, 1484, 6341, 1521, 1485,\n" + " 3085, 603, 16411, 306, 225],\n" + "})\n" + "\n" + "tbl_preview(tbl, show_all=True)\n" + "```\n" + "\n" + "## Arrow Table with Typed Columns\n" + "\n" + "PyArrow preserves rich type information — booleans, dates,\n" + "decimals — which `tbl_preview()` maps to short dtype labels.\n" + "\n" + "```{python}\n" + "import pyarrow as pa\n" + "from datetime import date\n" + "\n" + "tbl = pa.table({\n" + " 'event': ['Launch', 'Update', 'Hotfix', 'Deprecation'],\n" + " 'date': [date(2025, 1, 15), date(2025, 3, 1), date(2025, 3, 12), date(2025, 6, 30)],\n" + " 'critical': [True, False, True, False],\n" + " 'affected_users': [50000, 12000, 8500, 2000],\n" + "})\n" + "\n" + "tbl_preview(tbl, show_all=True)\n" + "```\n" + ), + }, + "expected": { + "detected_name": "gdtest-tbl-preview", + "detected_module": "gdtest_tbl_preview", + "detected_parser": "numpy", + "export_names": [ + "sample_scores", + "sample_inventory", + "sample_wide", + "sample_missing", + "sample_types", + ], + "num_exports": 5, + "section_titles": ["Functions"], + "has_user_guide": True, + }, +} diff --git a/test-packages/synthetic/specs/gdtest_tbl_shortcode.py b/test-packages/synthetic/specs/gdtest_tbl_shortcode.py new file mode 100644 index 00000000..55710e51 --- /dev/null +++ b/test-packages/synthetic/specs/gdtest_tbl_shortcode.py @@ -0,0 +1,232 @@ +""" 
+gdtest_tbl_shortcode — Exercise the {{< tbl-preview >}} Quarto shortcode. + +Dimensions: A1, B1, C4, D2, E6, F1, G1, H7 +Focus: The tbl-preview shortcode with data files in assets/ — CSV, TSV, + JSONL — exercising every shortcode parameter. No Python code cells; + all table rendering is done purely through the shortcode. + + 1. CSV basics — default shortcode, caption, column subset. + 2. TSV files — tab-delimited data via the shortcode. + 3. JSONL files — newline-delimited JSON via the shortcode. + 4. Shortcode options — head/tail, show_all, hide row numbers, + hide dtypes, hide dimensions, max_col_width. + 5. Multiple tables — several shortcodes on a single page. +""" + +SPEC = { + "name": "gdtest_tbl_shortcode", + "description": "tbl-preview Quarto shortcode with CSV, TSV, and JSONL data files.", + "dimensions": ["A1", "B1", "C4", "D2", "E6", "F1", "G1", "H7"], + "pyproject_toml": { + "project": { + "name": "gdtest-tbl-shortcode", + "version": "0.1.0", + "description": "Showcase for the tbl-preview Quarto shortcode", + "dependencies": ["great_docs"], + }, + "build-system": { + "requires": ["setuptools"], + "build-backend": "setuptools.build_meta", + }, + }, + "config": {}, + "files": { + # ── Project root ────────────────────────────────────────────────── + "README.md": ( + "# gdtest-tbl-shortcode\n\n" + "A showcase site demonstrating the `{{< tbl-preview >}}` Quarto\n" + "shortcode. 
All tables are rendered from data files in `assets/`\n" + "— no Python code cells needed.\n" + ), + # ── Python module (minimal) ────────────────────────────────────── + "gdtest_tbl_shortcode/__init__.py": ( + '"""Shortcode demo package."""\n' + "\n" + '__version__ = "0.1.0"\n' + '__all__ = ["describe"]\n' + "\n" + "\n" + "def describe(name: str) -> str:\n" + ' """Describe a dataset by name.\n' + "\n" + " Parameters\n" + " ----------\n" + " name\n" + " The dataset name.\n" + "\n" + " Returns\n" + " -------\n" + " str\n" + " A human-readable description.\n" + ' """\n' + ' return f"Dataset: {name}"\n' + ), + # ── Data files in assets/ ───────────────────────────────────────── + "assets/students.csv": ( + "name,subject,score,grade,passed\n" + "Alice,Math,95.5,A,true\n" + "Bob,Science,82.0,B,true\n" + "Charlie,English,71.3,C,true\n" + "Diana,History,60.0,D,true\n" + "Eve,Art,55.8,F,false\n" + "Frank,Math,88.2,B+,true\n" + "Grace,Science,79.9,C+,true\n" + "Hank,English,91.0,A-,true\n" + "Iris,History,66.4,D+,true\n" + "Jack,Art,73.7,C,true\n" + ), + "assets/products.tsv": ( + "product\tcategory\tprice\tstock\trating\n" + "Widget\tElectronics\t29.99\t150\t4.5\n" + "Gadget\tTools\t49.50\t80\t3.8\n" + "Gizmo\tKitchen\t12.00\t300\t4.9\n" + "Doohickey\tGarden\t8.75\t0\t4.2\n" + "Thingamajig\tOffice\t199.99\t25\t2.1\n" + "Contraption\tElectronics\t65.00\t44\t3.5\n" + "Apparatus\tTools\t120.00\t12\t4.7\n" + ), + "assets/server_logs.jsonl": ( + '{"timestamp":"2025-01-15T08:30:00","level":"INFO","module":"auth","message":"User login successful"}\n' + '{"timestamp":"2025-01-15T08:31:12","level":"WARNING","module":"db","message":"Slow query detected (3.2s)"}\n' + '{"timestamp":"2025-01-15T08:32:45","level":"ERROR","module":"api","message":"Request timeout on /v2/users"}\n' + '{"timestamp":"2025-01-15T08:33:01","level":"INFO","module":"cache","message":"Cache miss for key user:42"}\n' + '{"timestamp":"2025-01-15T08:34:20","level":"DEBUG","module":"auth","message":"Token refresh for 
session abc123"}\n' + '{"timestamp":"2025-01-15T08:35:55","level":"ERROR","module":"db","message":"Connection pool exhausted"}\n' + ), + "assets/wide_metrics.csv": ( + "metric,jan,feb,mar,apr,may,jun,jul,aug,sep,oct,nov,dec\n" + "revenue,120.5,135.2,128.7,142.3,155.8,148.9,162.1,170.4,165.3,178.2,185.6,192.0\n" + "costs,80.1,82.3,79.5,85.2,90.1,88.7,95.3,100.2,97.8,105.1,110.3,115.0\n" + "users,1200,1350,1280,1420,1558,1489,1621,1704,1653,1782,1856,1920\n" + "sessions,3500,3800,3600,4100,4500,4200,4800,5100,4900,5300,5600,5800\n" + "conversion,3.2,3.5,3.1,3.8,4.0,3.7,4.2,4.5,4.3,4.6,4.8,5.0\n" + ), + # ── User guide pages (flat layout) ──────────────────────────────── + "user_guide/01-csv-basics.qmd": ( + "---\n" + "title: CSV via Shortcode\n" + "---\n" + "\n" + "## Default Preview\n" + "\n" + "The simplest usage — just point at a CSV file:\n" + "\n" + '{{< tbl-preview file="assets/students.csv" >}}\n' + "\n" + "## With Caption\n" + "\n" + "Add a descriptive caption:\n" + "\n" + '{{< tbl-preview file="assets/students.csv" caption="Student Scores — Fall 2025" >}}\n' + "\n" + "## Column Subset\n" + "\n" + "Show only selected columns:\n" + "\n" + '{{< tbl-preview file="assets/students.csv" columns="name,score,grade" show_all="true" >}}\n' + ), + "user_guide/02-tsv-files.qmd": ( + "---\n" + "title: TSV via Shortcode\n" + "---\n" + "\n" + "## Product Inventory\n" + "\n" + "Tab-delimited files work just the same:\n" + "\n" + '{{< tbl-preview file="assets/products.tsv" show_all="true" >}}\n' + "\n" + "## Head Only\n" + "\n" + "Show just the first 3 rows:\n" + "\n" + '{{< tbl-preview file="assets/products.tsv" n_head="3" n_tail="0" >}}\n' + ), + "user_guide/03-jsonl-files.qmd": ( + "---\n" + "title: JSONL via Shortcode\n" + "---\n" + "\n" + "## Server Logs\n" + "\n" + "JSONL (newline-delimited JSON) files are auto-detected:\n" + "\n" + '{{< tbl-preview file="assets/server_logs.jsonl" show_all="true" >}}\n' + "\n" + "## With Narrow Columns\n" + "\n" + "Constrain column 
widths to see truncation:\n" + "\n" + '{{< tbl-preview file="assets/server_logs.jsonl" show_all="true" max_col_width="120" >}}\n' + ), + "user_guide/04-shortcode-options.qmd": ( + "---\n" + "title: Shortcode Options\n" + "---\n" + "\n" + "## Show All Rows\n" + "\n" + '{{< tbl-preview file="assets/students.csv" show_all="true" >}}\n' + "\n" + "## Custom Head/Tail Split\n" + "\n" + '{{< tbl-preview file="assets/students.csv" n_head="3" n_tail="2" >}}\n' + "\n" + "## Hide Row Numbers\n" + "\n" + '{{< tbl-preview file="assets/students.csv" show_all="true" show_row_numbers="false" >}}\n' + "\n" + "## Hide Dtype Labels\n" + "\n" + '{{< tbl-preview file="assets/students.csv" show_all="true" show_dtypes="false" >}}\n' + "\n" + "## Hide Dimensions Banner\n" + "\n" + '{{< tbl-preview file="assets/students.csv" show_all="true" show_dimensions="false" >}}\n' + "\n" + "## Minimal Chrome\n" + "\n" + "No row numbers, no dtypes, no dimensions — just the data:\n" + "\n" + '{{< tbl-preview file="assets/students.csv" show_all="true" ' + 'show_row_numbers="false" show_dtypes="false" show_dimensions="false" >}}\n' + ), + "user_guide/05-multiple-tables.qmd": ( + "---\n" + "title: Multiple Tables\n" + "---\n" + "\n" + "## Side-by-Side Comparisons\n" + "\n" + "Multiple shortcodes on one page, each with different files:\n" + "\n" + "### Students (CSV)\n" + "\n" + '{{< tbl-preview file="assets/students.csv" n_head="3" n_tail="0" caption="Top 3 Students" >}}\n' + "\n" + "### Products (TSV)\n" + "\n" + '{{< tbl-preview file="assets/products.tsv" n_head="3" n_tail="0" caption="Top 3 Products" >}}\n' + "\n" + "### Server Logs (JSONL)\n" + "\n" + '{{< tbl-preview file="assets/server_logs.jsonl" n_head="3" n_tail="0" caption="Recent Logs" >}}\n' + "\n" + "## Wide Table\n" + "\n" + "A 13-column metrics table with horizontal scroll:\n" + "\n" + '{{< tbl-preview file="assets/wide_metrics.csv" show_all="true" caption="Monthly Metrics 2025" >}}\n' + ), + }, + "expected": { + "detected_name": 
"gdtest-tbl-shortcode", + "detected_module": "gdtest_tbl_shortcode", + "detected_parser": "numpy", + "export_names": ["describe"], + "num_exports": 1, + "section_titles": ["Functions"], + "has_user_guide": True, + }, +} diff --git a/tests/test_tbl_preview.py b/tests/test_tbl_preview.py new file mode 100644 index 00000000..773d3a12 --- /dev/null +++ b/tests/test_tbl_preview.py @@ -0,0 +1,1734 @@ +"""Tests for the tbl_preview module.""" + +from __future__ import annotations + +import math +from pathlib import Path +from typing import Any + +import pytest + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_dict(n_rows: int = 10, n_cols: int = 3) -> dict[str, list]: + """Create a simple column-oriented dict for testing.""" + data: dict[str, list] = {} + for c in range(n_cols): + col_name = f"col_{c}" + data[col_name] = list(range(n_rows)) + return data + + +def _make_mixed_dict() -> dict[str, list]: + """Dict with mixed types including None and NaN.""" + return { + "name": ["Alice", "Bob", None, "Dave", "Eve"], + "score": [95.5, float("nan"), 88.0, None, 72.3], + "active": [True, False, True, None, False], + } + + +def _make_list_of_dicts(n: int = 5) -> list[dict]: + return [{"x": i, "y": f"val_{i}", "z": i * 1.1} for i in range(n)] + + +# --------------------------------------------------------------------------- +# TblPreview result class +# --------------------------------------------------------------------------- + + +class TestTblPreviewResult: + """Tests for the TblPreview wrapper class.""" + + def test_repr_html(self): + from great_docs._tbl_preview import TblPreview + + tp = TblPreview("
hello
") + assert tp._repr_html_() == "
hello
" + + def test_as_html(self): + from great_docs._tbl_preview import TblPreview + + tp = TblPreview("

test

") + assert tp.as_html() == "

test

" + + def test_save(self, tmp_path: Path): + from great_docs._tbl_preview import TblPreview + + tp = TblPreview("data
") + out = tmp_path / "out.html" + tp.save(out) + assert out.read_text(encoding="utf-8") == "data
" + + def test_repr(self): + from great_docs._tbl_preview import TblPreview + + tp = TblPreview("abcdef") + assert "6 chars" in repr(tp) + + +# --------------------------------------------------------------------------- +# Data normalization +# --------------------------------------------------------------------------- + + +class TestNormalizeData: + """Tests for data normalization from different sources.""" + + def test_dict_input(self): + from great_docs._tbl_preview import _normalize_data + + data = {"a": [1, 2, 3], "b": ["x", "y", "z"]} + names, dtypes, rows, n, tbl_type = _normalize_data(data) + assert names == ["a", "b"] + assert n == 3 + assert tbl_type == "dict" + assert len(rows) == 3 + assert rows[0] == [1, "x"] + + def test_list_of_dicts_input(self): + from great_docs._tbl_preview import _normalize_data + + data = [{"x": 1, "y": "a"}, {"x": 2, "y": "b"}] + names, dtypes, rows, n, tbl_type = _normalize_data(data) + assert names == ["x", "y"] + assert n == 2 + assert tbl_type == "dict" + + def test_empty_dict(self): + from great_docs._tbl_preview import _normalize_data + + names, dtypes, rows, n, tbl_type = _normalize_data({}) + assert names == [] + assert n == 0 + + def test_unsupported_type_raises(self): + from great_docs._tbl_preview import _normalize_data + + with pytest.raises(TypeError, match="Unsupported data type"): + _normalize_data(42) + + def test_csv_file(self, tmp_path: Path): + from great_docs._tbl_preview import _normalize_data + + csv = tmp_path / "test.csv" + csv.write_text("a,b,c\n1,x,3.0\n2,y,4.0\n") + names, dtypes, rows, n, tbl_type = _normalize_data(str(csv)) + assert "a" in names + assert n == 2 + assert tbl_type == "csv" + + def test_csv_path_object(self, tmp_path: Path): + from great_docs._tbl_preview import _normalize_data + + csv = tmp_path / "test.csv" + csv.write_text("col1,col2\n10,20\n30,40\n") + names, _, rows, n, tbl_type = _normalize_data(csv) # Path object + assert n == 2 + assert tbl_type == "csv" + + +class 
TestPolarsIntegration: + """Tests with Polars DataFrames (skipped if not installed).""" + + @pytest.fixture(autouse=True) + def _require_polars(self): + pytest.importorskip("polars") + + def test_polars_dataframe(self): + import polars as pl + from great_docs._tbl_preview import _normalize_data + + df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) + names, dtypes, rows, n, tbl_type = _normalize_data(df) + assert names == ["a", "b"] + assert n == 3 + assert tbl_type == "polars" + assert dtypes[0] in ("i64", "i32") + assert dtypes[1] == "str" + + +class TestPandasIntegration: + """Tests with Pandas DataFrames (skipped if not installed).""" + + @pytest.fixture(autouse=True) + def _require_pandas(self): + pytest.importorskip("pandas") + + def test_pandas_dataframe(self): + import pandas as pd + from great_docs._tbl_preview import _normalize_data + + df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) + names, dtypes, rows, n, tbl_type = _normalize_data(df) + assert names == ["a", "b"] + assert n == 3 + assert tbl_type == "pandas" + + +# --------------------------------------------------------------------------- +# Column subsetting +# --------------------------------------------------------------------------- + + +class TestColumnSubset: + def test_none_returns_all(self): + from great_docs._tbl_preview import _apply_column_subset + + names = ["a", "b", "c"] + dtypes = ["i64", "str", "f64"] + rows = [[1, "x", 1.0]] + out_names, out_dtypes, out_rows = _apply_column_subset(names, dtypes, rows, None) + assert out_names == names + + def test_subset(self): + from great_docs._tbl_preview import _apply_column_subset + + names = ["a", "b", "c"] + dtypes = ["i64", "str", "f64"] + rows = [[1, "x", 1.0], [2, "y", 2.0]] + out_names, _, out_rows = _apply_column_subset(names, dtypes, rows, ["c", "a"]) + assert out_names == ["c", "a"] + assert out_rows[0] == [1.0, 1] + + def test_invalid_column_raises(self): + from great_docs._tbl_preview import _apply_column_subset + + 
with pytest.raises(ValueError, match="not found"): + _apply_column_subset(["a", "b"], ["i64", "str"], [[1, "x"]], ["z"]) + + +# --------------------------------------------------------------------------- +# Head / tail split +# --------------------------------------------------------------------------- + + +class TestHeadTail: + def test_small_dataset_full(self): + from great_docs._tbl_preview import _compute_head_tail + + rows = [[i] for i in range(8)] + display, nums, is_full = _compute_head_tail(rows, 8, 5, 5, False) + assert is_full is True + assert len(display) == 8 + + def test_large_dataset_split(self): + from great_docs._tbl_preview import _compute_head_tail + + rows = [[i] for i in range(100)] + display, nums, is_full = _compute_head_tail(rows, 100, 5, 5, False) + assert is_full is False + assert len(display) == 10 + assert nums[:5] == [1, 2, 3, 4, 5] + assert nums[5:] == [96, 97, 98, 99, 100] + + def test_show_all(self): + from great_docs._tbl_preview import _compute_head_tail + + rows = [[i] for i in range(100)] + display, nums, is_full = _compute_head_tail(rows, 100, 5, 5, True) + assert is_full is True + assert len(display) == 100 + + def test_zero_tail(self): + from great_docs._tbl_preview import _compute_head_tail + + rows = [[i] for i in range(20)] + display, nums, is_full = _compute_head_tail(rows, 20, 5, 0, False) + assert is_full is False + assert len(display) == 5 + assert nums == [1, 2, 3, 4, 5] + + +# --------------------------------------------------------------------------- +# Cell formatting and escaping +# --------------------------------------------------------------------------- + + +class TestFormatting: + def test_format_none(self): + from great_docs._tbl_preview import _format_cell + + assert _format_cell(None) == "None" + + def test_format_bool(self): + from great_docs._tbl_preview import _format_cell + + assert _format_cell(True) == "True" + assert _format_cell(False) == "False" + + def test_format_nan(self): + from 
great_docs._tbl_preview import _format_cell + + assert _format_cell(float("nan")) == "NaN" + + def test_format_inf(self): + from great_docs._tbl_preview import _format_cell + + assert _format_cell(float("inf")) == "Inf" + assert _format_cell(float("-inf")) == "-Inf" + + def test_format_float(self): + from great_docs._tbl_preview import _format_cell + + assert _format_cell(3.14) == "3.14" + + def test_format_float_precision(self): + from great_docs._tbl_preview import _format_cell + + # IEEE 754 noise should be trimmed + assert _format_cell(3 * 1.1) == "3.3" + assert _format_cell(24.200000000000003) == "24.2" + + def test_format_string(self): + from great_docs._tbl_preview import _format_cell + + assert _format_cell("hello") == "hello" + + def test_escape_html(self): + from great_docs._tbl_preview import _escape + + assert _escape("bold") == "<b>bold</b>" + assert _escape('"quoted"') == ""quoted"" + + +# --------------------------------------------------------------------------- +# Missing value detection +# --------------------------------------------------------------------------- + + +class TestMissingDetection: + def test_none_is_missing(self): + from great_docs._tbl_preview import _is_missing + + assert _is_missing(None) is True + + def test_nan_is_missing(self): + from great_docs._tbl_preview import _is_missing + + assert _is_missing(float("nan")) is True + + def test_string_not_missing(self): + from great_docs._tbl_preview import _is_missing + + assert _is_missing("hello") is False + + def test_zero_not_missing(self): + from great_docs._tbl_preview import _is_missing + + assert _is_missing(0) is False + assert _is_missing(0.0) is False + + +# --------------------------------------------------------------------------- +# Alignment detection +# --------------------------------------------------------------------------- + + +class TestAlignments: + def test_numeric_right(self): + from great_docs._tbl_preview import _detect_alignments + + alignments = 
_detect_alignments(["i64", "f64", "u32"]) + assert all(a == "right" for a in alignments) + + def test_string_left(self): + from great_docs._tbl_preview import _detect_alignments + + alignments = _detect_alignments(["str", "cat", "bool"]) + assert all(a == "left" for a in alignments) + + def test_mixed(self): + from great_docs._tbl_preview import _detect_alignments + + alignments = _detect_alignments(["i64", "str", "f64"]) + assert alignments == ["right", "left", "right"] + + +# --------------------------------------------------------------------------- +# Dtype inference +# --------------------------------------------------------------------------- + + +class TestDtypeInference: + def test_int_list(self): + from great_docs._tbl_preview import _infer_dtype + + assert _infer_dtype([1, 2, 3]) == "i64" + + def test_float_list(self): + from great_docs._tbl_preview import _infer_dtype + + assert _infer_dtype([1.0, 2.0]) == "f64" + + def test_mixed_int_float(self): + from great_docs._tbl_preview import _infer_dtype + + assert _infer_dtype([1, 2.0]) == "f64" + + def test_bool_list(self): + from great_docs._tbl_preview import _infer_dtype + + assert _infer_dtype([True, False]) == "bool" + + def test_string_list(self): + from great_docs._tbl_preview import _infer_dtype + + assert _infer_dtype(["a", "b"]) == "str" + + def test_none_only(self): + from great_docs._tbl_preview import _infer_dtype + + assert _infer_dtype([None, None]) == "null" + + +# --------------------------------------------------------------------------- +# Number formatting +# --------------------------------------------------------------------------- + + +class TestNumberFormatting: + def test_small_number(self): + from great_docs._tbl_preview import _format_number + + assert _format_number(42) == "42" + + def test_thousands(self): + from great_docs._tbl_preview import _format_number + + assert _format_number(1234567) == "1,234,567" + + +# 
--------------------------------------------------------------------------- +# Full tbl_preview() function +# --------------------------------------------------------------------------- + + +class TestTblPreviewFunction: + """Integration tests for the main tbl_preview() entry point.""" + + def test_dict_basic(self): + from great_docs._tbl_preview import tbl_preview + + data = {"a": [1, 2, 3], "b": ["x", "y", "z"]} + result = tbl_preview(data) + html = result.as_html() + assert "gd-tbl-preview" in html + assert "gt_table" in html + assert "gt_col_heading" in html + + def test_list_of_dicts(self): + from great_docs._tbl_preview import tbl_preview + + data = [{"x": 1, "y": "a"}, {"x": 2, "y": "b"}] + result = tbl_preview(data) + assert "gt_table" in result.as_html() + + def test_head_tail_with_large_data(self): + from great_docs._tbl_preview import tbl_preview + + data = {"val": list(range(100))} + result = tbl_preview(data, n_head=3, n_tail=2) + html = result.as_html() + # Should show 5 data rows + divider + assert "gd-tbl-divider" in html + + def test_show_all(self): + from great_docs._tbl_preview import tbl_preview + + data = {"val": list(range(20))} + result = tbl_preview(data, show_all=True) + html = result.as_html() + # Divider class should not appear on any element + assert 'class="gd-tbl-divider"' not in html + + def test_column_subset(self): + from great_docs._tbl_preview import tbl_preview + + data = {"a": [1], "b": [2], "c": [3]} + result = tbl_preview(data, columns=["c", "a"]) + html = result.as_html() + assert 'id="c"' in html + assert 'id="a"' in html + + def test_caption(self): + from great_docs._tbl_preview import tbl_preview + + data = {"a": [1]} + result = tbl_preview(data, caption="My Table") + assert "My Table" in result.as_html() + + def test_no_dimensions(self): + from great_docs._tbl_preview import tbl_preview + + data = {"a": [1]} + result = tbl_preview(data, show_dimensions=False) + html = result.as_html() + # No badge spans should appear in 
the (CSS still has the class) + assert 'class="gd-tbl-badge' not in html + + def test_no_row_numbers(self): + from great_docs._tbl_preview import tbl_preview + + data = {"a": [1, 2]} + result = tbl_preview(data, show_row_numbers=False) + # No with rownum class in the body (CSS still defines the class) + assert 'class="gt_row gt_right gd-tbl-rownum"' not in result.as_html() + + def test_no_dtypes(self): + from great_docs._tbl_preview import tbl_preview + + data = {"a": [1]} + result = tbl_preview(data, show_dtypes=False) + # No dtype div should appear in column headings (CSS still defines the class) + assert 'class="gd-tbl-dtype"' not in result.as_html() + + def test_highlight_missing(self): + from great_docs._tbl_preview import tbl_preview + + data = {"a": [1, None, 3]} + result = tbl_preview(data, highlight_missing=True) + assert "gd-tbl-missing" in result.as_html() + + def test_no_highlight_missing(self): + from great_docs._tbl_preview import tbl_preview + + data = {"a": [1, None, 3]} + result = tbl_preview(data, highlight_missing=False) + # No missing class on any (CSS still defines the class) + assert 'gd-tbl-missing"' not in result.as_html() + + def test_custom_id(self): + from great_docs._tbl_preview import tbl_preview + + data = {"a": [1]} + result = tbl_preview(data, id="my-table") + assert 'id="gd-tbl-my-table"' in result.as_html() + + def test_limit_exceeded_raises(self): + from great_docs._tbl_preview import tbl_preview + + data = {"a": [1]} + with pytest.raises(ValueError, match="exceeds limit"): + tbl_preview(data, n_head=30, n_tail=30, limit=50) + + def test_html_escaping_in_data(self): + from great_docs._tbl_preview import tbl_preview + + data = {"col": [""]} + result = tbl_preview(data) + html = result.as_html() + assert "