diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8d0423c..5144040 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -5,7 +5,7 @@
repos:
# Ruff - replaces black, isort, flake8
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.7.4
+ rev: v0.15.2
hooks:
# Run the linter
- id: ruff
@@ -15,7 +15,7 @@ repos:
# mypy - type checking
- repo: https://github.com/pre-commit/mirrors-mypy
- rev: v1.13.0
+ rev: v1.19.1
hooks:
- id: mypy
additional_dependencies:
@@ -37,7 +37,7 @@ repos:
# Standard pre-commit hooks
- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v5.0.0
+ rev: v6.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
@@ -52,7 +52,7 @@ repos:
# Security scanning with bandit
- repo: https://github.com/PyCQA/bandit
- rev: 1.7.10
+ rev: 1.9.3
hooks:
- id: bandit
args: [-c, pyproject.toml, -r, src/]
diff --git a/src/tablesleuth/api/routers/gizmosql.py b/src/tablesleuth/api/routers/gizmosql.py
index 0249c76..3803cbf 100644
--- a/src/tablesleuth/api/routers/gizmosql.py
+++ b/src/tablesleuth/api/routers/gizmosql.py
@@ -350,15 +350,13 @@ def compare_performance(req: CompareRequest) -> dict[str, Any]:
profiler.register_delta_table_with_version(
"ver_b", req.path, int(req.id_b), storage_options=req.storage_options
)
- label_a, label_b = "ver_a", "ver_b"
+ analyzer = SnapshotPerformanceAnalyzer(profiler)
+ comparison = analyzer.compare_query_performance("ver_a", "ver_b", req.query)
+ return _serialize_comparison(comparison)
else:
raise ValueError(f"Unsupported format: {req.format}")
- analyzer = SnapshotPerformanceAnalyzer(profiler)
- comparison = analyzer.compare_query_performance(label_a, label_b, req.query)
- return _serialize_comparison(comparison)
-
except HTTPException:
raise
except ValueError as exc:
diff --git a/src/tablesleuth/cli/config_check.py b/src/tablesleuth/cli/config_check.py
index 9078d48..64fec32 100644
--- a/src/tablesleuth/cli/config_check.py
+++ b/src/tablesleuth/cli/config_check.py
@@ -122,7 +122,7 @@ def config_check(verbose: bool, with_gizmosql: bool) -> None:
"TABLESLEUTH_CATALOG_NAME": "Default catalog override",
"TABLESLEUTH_GIZMO_URI": "GizmoSQL URI override",
"TABLESLEUTH_GIZMO_USERNAME": "GizmoSQL username override",
- "TABLESLEUTH_GIZMO_PASSWORD": "GizmoSQL password override",
+ "TABLESLEUTH_GIZMO_PASSWORD": "GizmoSQL password override", # nosec B105
"PYICEBERG_HOME": "PyIceberg config directory",
}
diff --git a/src/tablesleuth/services/iceberg_manifest_patch.py b/src/tablesleuth/services/iceberg_manifest_patch.py
index 0945b2a..b2aed62 100644
--- a/src/tablesleuth/services/iceberg_manifest_patch.py
+++ b/src/tablesleuth/services/iceberg_manifest_patch.py
@@ -1,14 +1,30 @@
-"""Patch Iceberg snapshot metadata to work around DuckDB's uppercase file_format bug.
+"""Patch Iceberg snapshot metadata to work around DuckDB iceberg_scan() issues.
-DuckDB's iceberg_scan() rejects delete files whose file_format is stored as 'PARQUET'
-(uppercase) in the manifest avro — it only accepts 'parquet' (lowercase). This module
-creates a lightweight, temporary, locally-patched copy of the metadata chain:
+Three problems are corrected:
- metadata.json → patched temp copy (JSON rewrite, trivial)
- manifest-list.avro → patched temp copy (avro rewrite via fastavro, redirects
- delete-manifest paths to temp copies)
- delete manifests → fastavro-rewritten temp copies (file_format lowercased,
- handles null/Snappy/deflate codecs transparently)
+1. **Uppercase file_format** — DuckDB's ``iceberg_scan()`` rejects delete files whose
+ ``file_format`` is stored as ``'PARQUET'`` (uppercase); it only accepts ``'parquet'``.
+ Affected delete manifest avro files are re-encoded with the value lowercased.
+
+2. **Stale current-snapshot-id** — DuckDB applies delete files based on the table's
+ ``current-snapshot-id`` rather than the ``version =>`` argument. The patched
+ ``metadata.json`` always sets ``current-snapshot-id`` to the target snapshot.
+
+3. **Relative paths in local tables** — When an Iceberg table lives on the local
+ filesystem, manifest paths and data file paths stored in the metadata chain are
+ often relative. DuckDB resolves them from GizmoSQL's working directory, which
+ differs from the server's CWD on Windows. The patched metadata chain rewrites
+ all relative paths to absolute posix paths so DuckDB can find them regardless of
+ CWD.
+
+This module creates a lightweight, temporary, locally-patched copy of the metadata chain:
+
+ metadata.json → patched temp copy (JSON rewrite)
+ manifest-list.avro → patched temp copy (avro rewrite via fastavro; redirects
+ any re-encoded manifests and resolves relative paths)
+ manifests (all) → fastavro-rewritten temp copies when needed:
+ - data manifests: relative file_path resolved to absolute
+ - delete manifests: same + file_format lowercased
-Data manifests and all actual data / delete Parquet files are NOT copied or modified;
-they are referenced by their original S3 / local paths.
+Actual data and delete Parquet files are NOT copied or modified; they are referenced
+by their original S3 / local paths. Manifests are re-encoded only when patching is needed.
@@ -61,11 +77,26 @@ def _read_bytes(uri: str) -> bytes:
return Path(uri).read_bytes()
-def _patch_file_format_in_record(obj: Any) -> tuple[Any, bool]:
- """Recursively lowercase any file_format == 'PARQUET' field in a deserialized Avro object.
+def _is_relative_local_path(path: str) -> bool:
+ """Return True if *path* is a relative local filesystem path (not a URI, not absolute).
+
+ S3 URIs, file:// URIs, and absolute OS paths all return False.
+ """
+ if not path:
+ return False
+ if path.startswith(("s3://", "s3a://", "gs://", "az://", "abfs://", "file://")):
+ return False
+ return not Path(path).is_absolute()
+
+
+def _patch_manifest_record(obj: Any, fix_format: bool) -> tuple[Any, bool]:
+ """Recursively patch a deserialized Avro manifest record.
- Handles any nesting depth so it works regardless of whether the manifest uses
- the v1 or v2 Iceberg schema layout (direct record vs union-wrapped struct).
+ Two transformations:
+ - ``file_path`` fields whose value is a relative local path are resolved to
+ absolute posix paths (so DuckDB can open them from any CWD).
+ - ``file_format`` fields whose value is ``'PARQUET'`` are lowercased to
+ ``'parquet'`` when *fix_format* is True (delete manifests only).
Returns:
Tuple of (possibly-modified object, changed_flag).
@@ -74,11 +105,14 @@ def _patch_file_format_in_record(obj: Any) -> tuple[Any, bool]:
changed = False
new_obj: dict[str, Any] = {}
for k, v in obj.items():
- if k == "file_format" and isinstance(v, str) and v == "PARQUET":
+ if k == "file_path" and isinstance(v, str) and _is_relative_local_path(v):
+ new_obj[k] = Path(v).resolve().as_posix()
+ changed = True
+ elif k == "file_format" and fix_format and isinstance(v, str) and v == "PARQUET":
new_obj[k] = "parquet"
changed = True
else:
- new_v, sub_changed = _patch_file_format_in_record(v)
+ new_v, sub_changed = _patch_manifest_record(v, fix_format)
new_obj[k] = new_v
if sub_changed:
changed = True
@@ -87,7 +121,7 @@ def _patch_file_format_in_record(obj: Any) -> tuple[Any, bool]:
changed = False
new_list: list[Any] = []
for item in obj:
- new_item, sub_changed = _patch_file_format_in_record(item)
+ new_item, sub_changed = _patch_manifest_record(item, fix_format)
new_list.append(new_item)
if sub_changed:
changed = True
@@ -96,14 +130,17 @@ def _patch_file_format_in_record(obj: Any) -> tuple[Any, bool]:
return obj, False
-def _patch_delete_manifest(content: bytes) -> bytes:
- """Rewrite a delete manifest avro, lowercasing file_format 'PARQUET' → 'parquet'.
+def _patch_manifest(content: bytes, is_delete: bool) -> bytes:
+ """Patch a manifest avro file using fastavro round-tripping.
- Uses fastavro for proper Avro round-tripping so it works with null, Snappy,
- and deflate codecs. Falls back to binary substitution if fastavro fails.
+ For data manifests (``is_delete=False``): resolves relative ``file_path`` values.
+ For delete manifests (``is_delete=True``): also lowercases ``file_format``.
+
+ Falls back to binary ``b'PARQUET'`` → ``b'parquet'`` substitution for delete
+ manifests if fastavro fails (handles most null-codec avro files).
Returns the original bytes object unchanged if no patching was needed.
- Caller uses identity comparison (patched is content) to detect this.
+ Caller uses identity comparison (``patched is content``) to detect this.
"""
import fastavro
@@ -114,7 +151,7 @@ def _patch_delete_manifest(content: bytes) -> bytes:
changed = False
for record in reader:
- new_record, was_changed = _patch_file_format_in_record(record)
+ new_record, was_changed = _patch_manifest_record(record, fix_format=is_delete)
records.append(new_record)
if was_changed:
changed = True
@@ -124,27 +161,22 @@ def _patch_delete_manifest(content: bytes) -> bytes:
out = io.BytesIO()
fastavro.writer(out, fastavro.parse_schema(schema), records)
- patched = out.getvalue()
- logger.warning(
- "Patched delete manifest: lowercased %d file_format field(s)",
- sum(
- 1
- for r in records
- if isinstance(r.get("data_file"), dict)
- and r["data_file"].get("file_format") == "parquet"
- ),
- )
- return patched
+ return out.getvalue()
except Exception as exc:
logger.warning("fastavro round-trip failed (%s), falling back to binary patch", exc)
- if b"PARQUET" not in content:
- return content
- return content.replace(b"PARQUET", b"parquet")
+ if is_delete and b"PARQUET" in content:
+ return content.replace(b"PARQUET", b"parquet")
+ return content
def _rewrite_manifest_list(content: bytes, path_map: dict[str, str]) -> bytes:
- """Rewrite a manifest-list avro file, updating manifest_path values per path_map.
+ """Rewrite a manifest-list avro file, updating manifest_path values.
+
+ Two transformations are applied:
+ - Paths in *path_map* are replaced with their mapped values (patched manifests).
+ - Relative local paths not in path_map are resolved to absolute posix paths so
+ DuckDB can open them when the manifest-list lives in a temp directory.
Uses fastavro for proper avro round-tripping (path strings change length so
binary substitution is not safe here).
@@ -159,6 +191,9 @@ def _rewrite_manifest_list(content: bytes, path_map: dict[str, str]) -> bytes:
if orig in path_map:
record = dict(record) # type: ignore[arg-type]
record["manifest_path"] = path_map[orig]
+ elif _is_relative_local_path(orig):
+ record = dict(record) # type: ignore[arg-type]
+ record["manifest_path"] = Path(orig).resolve().as_posix()
records.append(record) # type: ignore[arg-type]
out = io.BytesIO()
@@ -178,21 +213,7 @@ def patched_iceberg_metadata(
) -> Generator[str, None, None]: # noqa: UP043
"""Yield a ``file://`` URI for a locally-patched copy of Iceberg snapshot metadata.
- Two problems are corrected:
-
- 1. **DuckDB delete-file format bug** — DuckDB's ``iceberg_scan()`` rejects delete
- manifests whose ``file_format`` is ``'PARQUET'`` (uppercase). Affected avro
- files are re-encoded with the value lowercased.
-
- 2. **Stale current-snapshot-id** — DuckDB applies delete files based on the
- table's ``current-snapshot-id`` rather than the ``version =>`` argument.
- When comparing an older snapshot against a newer one that introduced deletes,
- the older snapshot would incorrectly inherit those deletes. The patched
- metadata sets ``current-snapshot-id`` to the target snapshot so DuckDB only
- sees the delete files that belong to that particular snapshot.
-
- A temporary local ``metadata.json`` is always written (even when no avro
- re-encoding is needed) to carry the corrected ``current-snapshot-id``.
+ See module docstring for the three problems corrected.
Args:
native_table: PyIceberg ``Table`` instance (from ``IcebergTableInfo.native_table``).
@@ -209,8 +230,7 @@ def patched_iceberg_metadata(
snapshot_id,
)
- # Read the metadata JSON (always local-accessible since IcebergMetadataService
- # already resolved it; for S3 tables PyIceberg caches/fetches it).
+ # Read the metadata JSON.
try:
meta_bytes = _read_bytes(metadata_location)
except Exception as exc:
@@ -235,7 +255,7 @@ def patched_iceberg_metadata(
manifest_list_uri: str = target_snap["manifest-list"]
logger.debug("patched_iceberg_metadata: manifest_list_uri=%r", manifest_list_uri)
- # Read the manifest-list avro to discover which manifests are DELETE manifests.
+ # Read the manifest-list avro.
try:
ml_bytes = _read_bytes(manifest_list_uri)
except Exception as exc:
@@ -248,78 +268,71 @@ def patched_iceberg_metadata(
ml_reader = fastavro.reader(io.BytesIO(ml_bytes))
ml_records: list[dict] = list(ml_reader) # type: ignore[arg-type]
- # Identify DELETE manifests (content == 1 in the manifest-list schema).
- delete_manifest_uris = [r["manifest_path"] for r in ml_records if r.get("content", 0) == 1]
-
- logger.debug(
- "patched_iceberg_metadata: found %d delete manifest(s): %r",
- len(delete_manifest_uris),
- delete_manifest_uris,
- )
+ logger.debug("patched_iceberg_metadata: manifest-list has %d entries", len(ml_records))
- # Always create a patched metadata copy even when no delete manifests need format
- # fixing. DuckDB's iceberg_scan() applies delete files based on the table's
- # *current-snapshot-id* rather than on the version specified via ``version =>``.
- # Setting current-snapshot-id to the target snapshot in a local copy ensures
- # DuckDB only sees delete files that belong to that snapshot, preventing older
- # snapshots from inheriting delete records added by a newer current snapshot.
with tempfile.TemporaryDirectory(prefix="tablesleuth_iceberg_patch_") as tmpdir:
tmp = Path(tmpdir)
- path_map: dict[str, str] = {} # original URI → local posix path
+ # Maps original manifest URI → local posix path for any re-encoded manifest.
+ path_map: dict[str, str] = {}
+
+ # Process ALL manifests — both data (content=0) and delete (content=1).
+ # Data manifests may have relative file_path values; delete manifests may
+ # additionally have uppercase file_format values.
+ for idx, ml_record in enumerate(ml_records):
+ manifest_uri: str = ml_record.get("manifest_path", "")
+ is_delete = ml_record.get("content", 0) == 1
- for idx, del_uri in enumerate(delete_manifest_uris):
try:
- raw = _read_bytes(del_uri)
+ raw = _read_bytes(manifest_uri)
except Exception as exc:
- logger.warning("Cannot read delete manifest %r: %s", del_uri, exc)
+ logger.warning("Cannot read manifest %r: %s", manifest_uri, exc)
continue
- patched = _patch_delete_manifest(raw)
+ patched = _patch_manifest(raw, is_delete=is_delete)
if patched is raw:
- # No uppercase PARQUET found — no format patch needed for this manifest.
logger.debug(
- "patched_iceberg_metadata: delete manifest %r needed no patching", del_uri
+ "patched_iceberg_metadata: manifest %r needed no patching", manifest_uri
)
continue
- local_name = f"delete_manifest_{idx}.avro"
+ local_name = f"manifest_{idx}.avro"
local_path = tmp / local_name
local_path.write_bytes(patched)
# Use posix path (no file:// prefix) so DuckDB can open it directly.
- # file:///C:/... URIs get mangled by DuckDB's internal path stripping
- # (file:// stripped → /C:/... which is invalid on Windows).
local_posix = local_path.as_posix()
- path_map[del_uri] = local_posix
+ path_map[manifest_uri] = local_posix
logger.debug(
- "patched_iceberg_metadata: patched delete manifest %r → %s", del_uri, local_posix
+ "patched_iceberg_metadata: patched manifest %r → %s", manifest_uri, local_posix
)
- # Rewrite the manifest-list only when some delete manifests were re-encoded.
+ # Rewrite the manifest-list when: (a) some manifests were re-encoded, OR
+ # (b) it contains relative manifest_path values (Windows local tables).
+ needs_ml_rewrite = bool(path_map) or any(
+ _is_relative_local_path(r.get("manifest_path", "")) for r in ml_records
+ )
+
new_ml_posix: str | None = None
- if path_map:
+ if needs_ml_rewrite:
patched_ml = _rewrite_manifest_list(ml_bytes, path_map)
ml_path = tmp / "manifest_list.avro"
ml_path.write_bytes(patched_ml)
new_ml_posix = ml_path.as_posix()
logger.debug("patched_iceberg_metadata: rewrote manifest-list → %s", new_ml_posix)
- else:
- logger.debug(
- "patched_iceberg_metadata: %d delete manifest(s) found; none needed format patching",
- len(delete_manifest_uris),
- )
- # Always rewrite the metadata JSON so that:
- # 1. current-snapshot-id points to the target snapshot (not the table's
- # current HEAD), preventing DuckDB from applying newer delete files.
- # 2. If delete manifests were re-encoded, the manifest-list path is updated.
+ # Always rewrite metadata.json to:
+ # 1. Set current-snapshot-id to target snapshot (prevents DuckDB from
+ # applying delete files from newer snapshots to this one).
+ # 2. Point this snapshot's manifest-list to the patched copy (if any).
+ # 3. Resolve relative manifest-list paths for all other snapshots.
patched_meta = json.loads(json.dumps(metadata)) # deep copy
patched_meta["current-snapshot-id"] = snapshot_id
- if new_ml_posix is not None:
- for snap in patched_meta.get("snapshots", []):
- if snap.get("snapshot-id") == snapshot_id:
- snap["manifest-list"] = new_ml_posix
- break
+ for snap in patched_meta.get("snapshots", []):
+ ml = snap.get("manifest-list", "")
+ if snap.get("snapshot-id") == snapshot_id and new_ml_posix is not None:
+ snap["manifest-list"] = new_ml_posix
+ elif _is_relative_local_path(ml):
+ snap["manifest-list"] = Path(ml).resolve().as_posix()
meta_path = tmp / "metadata.json"
meta_path.write_text(json.dumps(patched_meta), encoding="utf-8")
diff --git a/src/tablesleuth/web/index.html b/src/tablesleuth/web/index.html
index a5333a5..384ba75 100644
--- a/src/tablesleuth/web/index.html
+++ b/src/tablesleuth/web/index.html
@@ -1 +1 @@
-
TableSleuth
TableSleuth
Forensic analysis for open table formats — Parquet, Apache Iceberg, Delta Lake.