diff --git a/pycharm_plugin/src/main/kotlin/com/projspec/toolwindow/HtmlContent.kt b/pycharm_plugin/src/main/kotlin/com/projspec/toolwindow/HtmlContent.kt
index 2e81e49..102ce2b 100644
--- a/pycharm_plugin/src/main/kotlin/com/projspec/toolwindow/HtmlContent.kt
+++ b/pycharm_plugin/src/main/kotlin/com/projspec/toolwindow/HtmlContent.kt
@@ -556,8 +556,18 @@ body { margin: 0; padding: 0; font-family: var(--vscode-font-family); color: var
}
}
function fmtAge(ts) {
- const days = Math.floor((Date.now() / 1000 - parseFloat(ts)) / 86400);
- if (days === 0) return 'today';
+ const secs = Math.floor(Date.now() / 1000 - parseFloat(ts));
+ if (secs < 0) return 'just now';
+ const days = Math.floor(secs / 86400);
+ if (days === 0) {
+ if (secs < 60) return 'just now';
+ if (secs < 3600) {
+ const m = Math.floor(secs / 60);
+ return m + ' minute' + (m !== 1 ? 's' : '') + ' ago';
+ }
+ const h = Math.floor(secs / 3600);
+ return h + ' hour' + (h !== 1 ? 's' : '') + ' ago';
+ }
if (days === 1) return 'yesterday';
if (days < 30) return days + ' days ago';
if (days < 365) return Math.floor(days / 30) + ' months ago';
@@ -641,6 +651,8 @@ body { margin: 0; padding: 0; font-family: var(--vscode-font-family); color: var
const by = project.last_modified_by != null ? project.last_modified_by : null;
metaParts.push('last modified ' + age + (by ? ' by ' + by : ''));
}
+ if (project.scanned_at != null)
+ metaParts.push('scanned ' + fmtAge(project.scanned_at));
if (metaParts.length > 0) {
const meta = document.createElement('div');
meta.className = 'meta';
@@ -914,15 +926,60 @@ body { margin: 0; padding: 0; font-family: var(--vscode-font-family); color: var
body.innerHTML = sanitizeHtml(html);
w.appendChild(body);
} else {
+ // Datasets (and other content) may carry rich previews in
+ // metadata.html_repr (an HTML fragment) and metadata.thumbnail
+ // (a data: image URL). Embed those rather than dumping their
+ // (often huge) raw strings into the YAML tree.
+ const meta = (kind === 'content' && data && typeof data === 'object'
+ && data.metadata && typeof data.metadata === 'object') ? data.metadata : null;
+ const htmlRepr = meta && typeof meta.html_repr === 'string' ? meta.html_repr : null;
+ const thumb = meta && typeof meta.thumbnail === 'string' ? meta.thumbnail : null;
+
const tree = document.createElement('div');
tree.className = 'tree yaml';
- tree.appendChild(renderYaml(stripKlass(data)));
+ tree.appendChild(renderYaml(stripPreview(stripKlass(data))));
w.appendChild(tree);
+
+ if (thumb) w.appendChild(thumbnailImg(thumb));
+ if (htmlRepr) {
+ const body = document.createElement('div');
+ body.className = 'widget-html';
+ body.innerHTML = sanitizeHtml(htmlRepr);
+ w.appendChild(body);
+ }
}
return w;
}
+ function stripPreview(obj) { // Return obj without metadata.html_repr / metadata.thumbnail; obj itself is never modified.
+ if (!obj || typeof obj !== 'object' || Array.isArray(obj)) return obj; // falsy values, non-objects and arrays are returned as-is
+ if (!obj.metadata || typeof obj.metadata !== 'object' || Array.isArray(obj.metadata)) return obj; // no plain-object metadata: nothing to strip
+ const meta = {}; // filtered copy of obj.metadata
+ let changed = false; // becomes true once a preview key has been dropped
+ for (const k of Object.keys(obj.metadata)) {
+ if (k === 'html_repr' || k === 'thumbnail') { changed = true; continue; } // leave the preview keys out of the copy
+ meta[k] = obj.metadata[k];
+ }
+ if (!changed) return obj; // no preview keys present: hand back the original object rather than a copy
+ const out = {}; // shallow copy of obj; only its metadata entry is replaced below
+ for (const k of Object.keys(obj)) out[k] = obj[k];
+ out.metadata = meta;
+ return out;
+ }
+
+ function thumbnailImg(src) { // Build a 'widget-html' wrapper div that holds an img element for src.
+ const wrap = document.createElement('div');
+ wrap.className = 'widget-html';
+ if (/^data:image\//i.test(src)) { // only src values starting with data:image/ (case-insensitive) are embedded
+ const img = document.createElement('img');
+ img.src = src;
+ img.alt = 'thumbnail';
+ wrap.appendChild(img);
+ }
+ return wrap; // for any other src the wrapper is returned with no children
+ }
+
function sanitizeHtml(html) {
const tpl = document.createElement('template');
tpl.innerHTML = String(html);
diff --git a/pyproject.toml b/pyproject.toml
index 3240168..380a1ea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,11 +28,12 @@ dependencies = [
"fsspec",
"click",
"jinja2",
+ "intake==2.1.0a2"
]
[project.optional-dependencies]
test = ["pytest", "pytest-cov", "django", "streamlit", "copier", "jinja2-time", "flask",
- "maturin", "uv", "briefcase"]
+ "maturin", "uv", "briefcase", "textual"]
qt = ["pyqt>5,<6", "pyqtwebengin>5,<6"]
textual = ["textual>=0.80"]
ipywidget = ["anywidget>=0.9", "ipywidgets>=8", "ipython"]
diff --git a/src/projspec/__main__.py b/src/projspec/__main__.py
index ada5cb6..47141bc 100755
--- a/src/projspec/__main__.py
+++ b/src/projspec/__main__.py
@@ -95,12 +95,6 @@ def version():
default=False,
help="JSON output, for projects only",
)
-@click.option(
- "--html-out",
- is_flag=True,
- default=False,
- help="HTML output, for projects only",
-)
@click.option(
"--walk", is_flag=True, help="Descend into child directories of each match"
)
@@ -112,7 +106,6 @@ def scan(
types,
xtypes,
json_out,
- html_out,
walk,
summary,
library,
@@ -146,8 +139,6 @@ def scan(
else:
if json_out:
print(json.dumps(proj.to_dict(compact=False)))
- elif html_out:
- print(proj._repr_html_())
else:
print(proj)
diff --git a/src/projspec/config.py b/src/projspec/config.py
index 7c0d2e5..9bf7d61 100644
--- a/src/projspec/config.py
+++ b/src/projspec/config.py
@@ -5,7 +5,7 @@
from typing import Any
-conf: dict[str, dict[str, Any]] = {}
+conf: dict[str, Any] = {}
default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/projspec")
@@ -33,12 +33,19 @@ def coerce(template, val):
def defaults():
return {
"library_path": f"{conf_dir()}/library.json",
+ "auto_rescan": 7 * 24 * 60 * 60, # one week, in seconds
"scan_types": [".py", ".yaml", ".yml", ".toml", ".json", ".md"],
"scan_max_files": 100,
"scan_max_size": 5 * 2**10,
"remote_artifact_status": False,
"capture_artifact_output": True,
"preferred_install_methods": ["conda", "pip"],
+ "data_min_fraction": 0.5,
+ "data_min_file_size": 1024 * 1024,
+ "data_min_total_size": 10 * 1024 * 1024,
+ "data_min_play_size": 1, # 64 * 1024,
+ "data_consolidate_min_group": 3,
+ "data_inspect_max_datasets": 50,
"excludes": [
"bld",
"build",
@@ -56,6 +63,11 @@ def defaults():
config_doc = {
"library_path": "location of persisted project objects",
+ "auto_rescan": (
+ "maximum age (seconds) of a project loaded from the library before it "
+ "is automatically rescanned and re-saved. Set to 0 to disable "
+ "automatic rescanning. Default is one week."
+ ),
"scan_types": "files extensions automatically read for scanning",
"scan_max_files": "don't scan files if more than this number in the project",
"scan_max_size": "don't scan files bigger than this (in bytes)",
@@ -68,6 +80,34 @@ def defaults():
"ordered list of preferred installer names for install_tool(), "
"e.g. ['uv', 'conda', 'pip']. Empty list uses the platform default."
),
+ "data_min_fraction": (
+ "fraction (0-1) of a project's total bytes that must be data files "
+ "before a code/other project is also reported as a DataProject. Data "
+ "below this fraction is only scanned if the project matches no other "
+ "type, or individual files exceed data_min_file_size."
+ ),
+ "data_min_file_size": (
+ "a single data file at or above this size (bytes) is considered "
+ "significant enough to scan even in an otherwise code project."
+ ),
+ "data_min_total_size": (
+ "minimum total size (bytes) of candidate data before a directory that "
+ "also matches another project type is additionally reported as a "
+ "DataProject (used together with data_min_fraction)."
+ ),
+ "data_min_play_size": (
+ "floor (bytes) below which even a directory that matches no other "
+ "project type is dismissed as toy/play data and not reported as a "
+ "DataProject."
+ ),
+ "data_consolidate_min_group": (
+ "minimum number of numbered/related files (e.g. 001.csv, 002.csv) that "
+ "are consolidated into a single dataset."
+ ),
+ "data_inspect_max_datasets": (
+ "do not run intake inspection if more than this many distinct datasets "
+ "are found in a directory (avoids huge scans)."
+ ),
"excludes": (
"directory names to skip when walking a project tree for child projects "
"and file statistics. Directories whose names start with '.' or '_' are "
diff --git a/src/projspec/content/__init__.py b/src/projspec/content/__init__.py
index 4c02338..fcf96d8 100644
--- a/src/projspec/content/__init__.py
+++ b/src/projspec/content/__init__.py
@@ -7,11 +7,16 @@
PipelineStage,
ServiceDependency,
)
-from projspec.content.data import TabularData, IntakeSource
+from projspec.content.data import (
+ Dataset,
+ FrictionlessData,
+ IntakeSource,
+ TabularData,
+)
from projspec.content.env_var import EnvironmentVariables
from projspec.content.environment import Environment, Stack, Precision
from projspec.content.executable import Command
-from projspec.content.metadata import DescriptiveMetadata, License
+from projspec.content.metadata import Citation, DescriptiveMetadata, License
from projspec.content.package import PythonPackage
from projspec.content.vcs import VCSInfo
@@ -22,10 +27,13 @@
"GithubAction",
"PipelineStage",
"ServiceDependency",
- "TabularData",
+ "Dataset",
+ "FrictionlessData",
"IntakeSource",
+ "TabularData",
"EnvironmentVariables",
"Command",
+ "Citation",
"License",
"DescriptiveMetadata",
"PythonPackage",
diff --git a/src/projspec/content/data.py b/src/projspec/content/data.py
index 3ebdf47..fc6997e 100644
--- a/src/projspec/content/data.py
+++ b/src/projspec/content/data.py
@@ -1,4 +1,9 @@
-"""Contents specifying datasets"""
+"""Content classes describing datasets found within a project.
+
+These describe data assets in a formal way, without loading the data. Most
+of them mirror the things that ``intake`` (v2, ``intake.readers``) can tell us
+about a URL/glob/list of files via :func:`intake.readers.inspect.inspect_dataset`.
+"""
from dataclasses import dataclass, field
@@ -7,107 +12,83 @@
@dataclass
class TabularData(BaseContent):
- """A tabular dataset, columns and rows
+ """A tabular (columnar) dataset, e.g. CSV/parquet/SQL.
- This lists loadable tabular files with defined schema, typically from formats such as
- JSON, CSV, and parquet.
+ ``schema`` is a free-form mapping describing the columns; its exact form
+ depends on where it was sourced (FrictionlessData resource schema, a
+ HuggingFace ``features`` block, or intake's ``datashape``).
"""
icon = "๐"
name: str
+ schema: dict = field(default_factory=dict)
metadata: dict = field(default_factory=dict)
- # allowed schema formats:
- # - dtype-like {fieldname: string-type}
- # - dtype-complex {fieldname: {...}}
- # - list like [{name:, ...}]
- # We may choose to normalise to just one of these eventually
- schema: dict | list = field(default_factory=dict)
@dataclass
-class IntakeSource(BaseContent):
- """A catalog of data assets, including basic properties (location) and how to load/process them.
+class FrictionlessData(BaseContent):
+ """A data resource described by the FrictionlessData standard.
- See https://intake.readthedocs.io/en/latest/
+ See https://datapackage.org/standard/data-resource/ .
"""
+ icon = "๐ชช"
+
+ name: str
+ schema: dict = field(default_factory=dict)
+
+
+@dataclass
+class IntakeSource(BaseContent):
+ """A named entry in an intake catalog."""
+
icon = "๐"
- # TODO: add better fields: args, driver/reader, metadata, description
name: str
@dataclass
-class DataResource(BaseContent):
- """A data resource found inside a data-only directory.
-
- Describes one logical dataset โ which may be a flat collection of files, a
- Hive-partitioned tree, an Iceberg/Delta table, a Zarr store, or any other
- recognised on-disk layout.
-
- The `path` field is a human-readable basename that identifies the resource:
-
- - Single file: `"data.csv"`
- - Multi-file series: `"part*.parquet"` (glob-style, common prefix + `*` + ext)
- - Directory-as-dataset (Hive partition, Zarr store, โฆ): `"year=2024/"`
-
- The `modality` field classifies the broad nature of the data using the
- vocabulary established by intake's `structure` tags and napari's layer
- type system:
-
- - `"tabular"` โ row/column data (CSV, Parquet, ORC, Excel, โฆ)
- - `"array"` โ N-dimensional arrays (NumPy, HDF5, NetCDF, Zarr, โฆ)
- - `"image"` โ 2-D/3-D images (PNG, JPEG, TIFF, DICOM, NIfTI, โฆ)
- - `"timeseries"` โ time-indexed signals (WAV, GRIB, โฆ)
- - `"geospatial"` โ vector/raster geodata (Shapefile, GeoJSON, GeoTIFF, โฆ)
- - `"model"` โ ML model weights (GGUF, SafeTensors, PyTorch, โฆ)
- - `"nested"` โ hierarchical / JSON-like (Avro, YAML, XML, โฆ)
- - `"document"` โ human-readable documents (PDF, DOCX, โฆ)
- - `"video"` โ video streams (MP4, AVI, โฆ)
- - `"archive"` โ compressed bundles (ZIP, tar.gz, โฆ)
- - `""` โ unknown / mixed
-
- The `schema` field is format-specific:
-
- - Tabular (Parquet, Arrow, CSV, โฆ): `{column_name: dtype_str, โฆ}`
- - Image / array: `{"width": int, "height": int, "channels": int, "mode": str}`
- - Audio: `{"sample_rate": int, "channels": int, "frames": int}`
- - HDF5 / Zarr / NetCDF: `{"variables": [...], "dims": {...}, "attrs": {...}}`
- - Unknown / library not available: `{}`
+class Dataset(BaseContent):
+ """A generic dataset discovered on disk and described by intake.
+
+ This is produced by :class:`projspec.proj.data_project.DataProject` after
+ scanning files/globs with :func:`intake.readers.inspect.inspect_dataset`.
+
+ The dataset's short identifying name is *not* stored on the object: a
+ :class:`DataProject` exposes its datasets as an ``AttrDict`` keyed by that
+ name (e.g. ``proj.contents.dataset["*.csv"]``), so duplicating it here
+ would be redundant.
+
+ Attributes
+ ----------
+ url:
+ The URL, glob or list of URLs that make up this dataset, relative to
+ (or rooted at) the project directory.
+ datatype:
+ The intake ``BaseData`` subclass name detected (e.g. ``"CSV"``,
+ ``"Parquet"``), or ``None`` if intake could not identify the type.
+ structure:
+ Structural tags reported by intake (e.g. ``["table"]``,
+ ``["array", "image"]``).
+ schema:
+ The ``datashape`` mapping returned by intake (columns/dtypes, dims,
+ etc.); empty if no reader could describe the data.
+ n_files:
+ Number of files that make up the dataset (after glob expansion).
+ total_size:
+ Total bytes across all files in the dataset, if known.
+ metadata:
+ Any other useful summary information from intake (shape, npartitions,
+ recommended readers, description, ...).
"""
- icon = "๐ฅ"
-
- path: str # basename (or glob pattern / dir/ ) identifying this resource
- format: str # canonical format string, e.g. "parquet", "csv", "png", "hdf5"
- modality: str = "" # broad data nature; see docstring for vocabulary
- layout: str = "" # "flat"|"hive"|"iceberg"|"delta"|"zarr_store"|"tiledarray"|""
- file_count: int = 0
- total_size: int = 0 # bytes; 0 when unknown (e.g. remote FS without size info)
- schema: dict | list = field(default_factory=dict)
- # full path to one representative file, for use by preview loaders
- sample_path: str = ""
- metadata: dict = field(default_factory=dict) # catch-all extras
- _html = None
-
- def __repr__(self) -> str:
- from projspec.content.data_html import repr_text
-
- return repr_text(self)
-
- def _repr_html_(self) -> str:
- """Jupyter rich display โ returns cached HTML, rendering on first call."""
- # TODO: this is probably not what we want jupyter to dysplay, but it's
- # convenient for now.
- if self._html is None:
- from projspec.content.data_html import repr_html
-
- self._html = repr_html(self)
- return self._html
-
- def to_dict(self, compact=False):
- d = super().to_dict(compact=compact)
- if not compact:
- d["_html"] = self._repr_html_()
- return d
+ icon = "๐๏ธ"
+
+ url: str | list[str] = ""
+ datatype: str | None = None
+ structure: list[str] = field(default_factory=list)
+ schema: dict = field(default_factory=dict)
+ n_files: int = 1
+ total_size: int | None = None
+ metadata: dict = field(default_factory=dict)
diff --git a/src/projspec/content/data_html.py b/src/projspec/content/data_html.py
deleted file mode 100644
index 530fb60..0000000
--- a/src/projspec/content/data_html.py
+++ /dev/null
@@ -1,632 +0,0 @@
-"""Text and HTML representations for DataResource.
-
-``repr_text`` โ plain-text one-liner for ``__repr__``.
-``repr_html`` โ rich HTML card for Jupyter's ``_repr_html_`` protocol.
-
-The HTML card has two sections:
-
-1. **Metadata table** โ name, format, modality, layout, file count, total size,
- schema (collapsed by default when it has many entries).
-
-2. **Preview** (optional) โ a lightweight peek at the actual data using
- whichever optional library is available for the format. The section is
- silently omitted when no suitable loader can be imported.
-
-All loader imports are guarded with ``try/except ImportError`` so that the
-representation degrades gracefully when optional dependencies are absent.
-"""
-
-from __future__ import annotations
-
-import base64
-import html as _html
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
- from projspec.content.data import DataResource
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-_MODALITY_ICON: dict[str, str] = {
- "tabular": "📊", # ๐
- "image": "🖼", # ๐ผ
- "array": "🧮", # ๐งฎ
- "timeseries": "📈", # ๐
- "geospatial": "🌍", # ๐
- "model": "🧠", # ๐ง
- "nested": "📂", # ๐
- "document": "📄", # ๐
- "video": "🎬", # ๐ฌ
- "archive": "📦", # ๐ฆ
- "": "🗂", # ๐
-}
-
-
-def _fmt_size(n: int) -> str:
- """Human-readable byte count."""
- if n <= 0:
- return "unknown"
- for unit in ("B", "KB", "MB", "GB", "TB"):
- if n < 1024:
- return f"{n:.1f} {unit}" if unit != "B" else f"{n} B"
- n /= 1024 # type: ignore[assignment]
- return f"{n:.1f} PB"
-
-
-def _esc(s: object) -> str:
- return _html.escape(str(s))
-
-
-# ---------------------------------------------------------------------------
-# Plain-text repr
-# ---------------------------------------------------------------------------
-
-
-def repr_text(dr: "DataResource") -> str:
- """One-line text representation of a DataResource."""
- size = _fmt_size(dr.total_size)
- schema_hint = ""
- if isinstance(dr.schema, dict) and dr.schema:
- keys = list(dr.schema)[:3]
- extra = f", +{len(dr.schema) - 3} more" if len(dr.schema) > 3 else ""
- schema_hint = f" [{', '.join(str(k) for k in keys)}{extra}]"
- elif isinstance(dr.schema, list) and dr.schema:
- schema_hint = f" [{len(dr.schema)} fields]"
-
- parts = [
- f"DataResource({dr.path!r}",
- f"format={dr.format!r}",
- ]
- if dr.modality:
- parts.append(f"modality={dr.modality!r}")
- if dr.layout and dr.layout not in ("flat", ""):
- parts.append(f"layout={dr.layout!r}")
- parts.append(f"files={dr.file_count}")
- parts.append(f"size={size}")
- if schema_hint:
- parts.append(f"schema={schema_hint.strip()}")
- return ", ".join(parts) + ")"
-
-
-# ---------------------------------------------------------------------------
-# HTML repr
-# ---------------------------------------------------------------------------
-
-# No inline styles โ class names are present for external styling by the
-# host environment (Jupyter, VS Code webview, etc.).
-_CARD_CSS = ""
-
-
-def repr_html(dr: "DataResource") -> str:
- """Rich HTML card representation of a DataResource."""
- icon = _MODALITY_ICON.get(dr.modality, _MODALITY_ICON[""])
- size_str = _fmt_size(dr.total_size)
-
- # ---- header ----
- modality_badge = (
- f'{_esc(dr.modality)}' if dr.modality else ""
- )
- format_badge = f'{_esc(dr.format)}'
- layout_badge = (
- f'{_esc(dr.layout)}'
- if dr.layout and dr.layout not in ("flat", "")
- else ""
- )
-
- header = (
- f'
"
- )
-
- # ---- metadata table ----
- meta_rows = [
- ("Files", str(dr.file_count)),
- ("Total size", size_str),
- ]
-
- meta_html_rows = "".join(
- f"| {_esc(k)} | {v} |
" for k, v in meta_rows
- )
- schema_html = _render_schema(dr.schema)
-
- meta_section = (
- f'"
- )
-
- # ---- preview ----
- preview_html = _build_preview(dr)
- preview_section = ""
- if preview_html:
- preview_section = (
- f''
- f'
Preview
'
- f"{preview_html}"
- f"
"
- )
-
- return (
- _CARD_CSS
- + f''
- + header
- + meta_section
- + preview_section
- + "
"
- )
-
-
-# ---------------------------------------------------------------------------
-# Schema rendering
-# ---------------------------------------------------------------------------
-
-
-def _render_schema(schema: dict | list) -> str:
- """Render schema as a collapsible HTML block."""
- if not schema:
- return ""
-
- if isinstance(schema, dict):
- # Tabular-style {col: dtype} or structural {"variables": [...], ...}
- rows = ""
- for k, v in schema.items():
- rows += f"| {_esc(k)} | {_esc(v)} |
"
- table = (
- f''
- f"| Field | Type / Value |
"
- f"{rows}"
- f"
"
- )
- n = len(schema)
- open_attr = "open" if n <= 8 else ""
- return (
- f''
- f'Schema ({n} {"field" if n == 1 else "fields"})
'
- f"{table} "
- )
-
- if isinstance(schema, list):
- # List-of-dicts (frictionless style) or plain list
- if schema and isinstance(schema[0], dict):
- # Render each dict as a row; use union of all keys as columns
- all_keys: list[str] = []
- for item in schema:
- for k in item:
- if k not in all_keys:
- all_keys.append(k)
- header_row = "".join(f"{_esc(k)} | " for k in all_keys)
- body_rows = ""
- for item in schema:
- cells = "".join(f"{_esc(item.get(k, ''))} | " for k in all_keys)
- body_rows += f"{cells}
"
- table = (
- f''
- f"{header_row}
{body_rows}
"
- )
- else:
- items_html = "".join(f"{_esc(s)}" for s in schema)
- table = f""
-
- n = len(schema)
- open_attr = "open" if n <= 8 else ""
- return (
- f''
- f'Schema ({n} {"field" if n == 1 else "fields"})
'
- f"{table} "
- )
-
- return ""
-
-
-# ---------------------------------------------------------------------------
-# Preview builders โ one function per modality family, all return HTML str
-# or None when no loader is available.
-# ---------------------------------------------------------------------------
-
-#: How many rows to show in tabular previews.
-_PREVIEW_ROWS = 5
-
-
-def _obj_to_preview_html(obj) -> str:
- """Return the richest HTML string available for *obj*.
-
- Tries ``_repr_html_()`` first (pandas DataFrame, polars DataFrame, xarray
- Dataset, โฆ), then falls back to ``__repr__``. The result is always
- wrapped in a ```` so callers can rely on valid HTML.
- """
- if hasattr(obj, "_repr_html_"):
- try:
- h = obj._repr_html_()
- if h:
- return f'
{h}
'
- except Exception:
- pass
- return f'
'
-
-
-def _build_preview(dr: "DataResource") -> str | None:
- """Return an HTML preview fragment, or None if not possible."""
- fmt = dr.format
- modality = dr.modality
- sample = dr.sample_path if dr.sample_path else None
-
- if sample is None:
- return None
-
- if modality == "tabular":
- return _preview_tabular(dr, sample)
- if modality == "image":
- return _preview_image(dr, sample)
- if modality == "array":
- return _preview_array(dr, sample)
- if modality == "timeseries" and fmt in ("wav", "flac", "mp3", "ogg"):
- return _preview_audio(dr, sample)
- return None
-
-
-# --- tabular ---
-
-
-def _preview_tabular(dr: "DataResource", path: str) -> str | None:
- fmt = dr.format
- fs = dr.proj.fs
-
- try:
- if fmt == "parquet":
- return _preview_parquet(fs, path)
- if fmt == "csv":
- return _preview_csv(fs, path)
- if fmt in ("tsv", "psv"):
- sep = "\t" if fmt == "tsv" else "|"
- return _preview_csv(fs, path, sep=sep)
- if fmt == "arrow":
- return _preview_arrow(fs, path)
- if fmt == "jsonlines":
- return _preview_jsonlines(fs, path)
- if fmt == "excel":
- return _preview_excel(fs, path)
- if fmt in ("sqlite", "duckdb"):
- return _preview_sql(fs, path, fmt)
- if fmt == "orc":
- return _preview_orc(fs, path)
- except Exception:
- pass
- return None
-
-
-def _preview_parquet(fs, path: str) -> str | None:
- """Read only the first row group (or N rows from it) โ no full file scan."""
- try:
- import pyarrow.parquet as pq
-
- with fs.open(path, "rb") as fh:
- pf = pq.ParquetFile(fh)
- # read_row_group reads one row group's pages, not the whole file
- batch = pf.read_row_group(0)
- if batch.num_rows > _PREVIEW_ROWS:
- batch = batch.slice(0, _PREVIEW_ROWS)
- # Convert to pandas so we get _repr_html_() for free
- df = batch.to_pandas()
- return _obj_to_preview_html(df)
- except ImportError:
- pass
- try:
- # polars can read a row-count-limited slice without decoding the rest
- import polars as pl
-
- with fs.open(path, "rb") as fh:
- df = pl.read_parquet(fh, n_rows=_PREVIEW_ROWS)
- return _obj_to_preview_html(df)
- except ImportError:
- pass
- return None
-
-
-def _preview_csv(fs, path: str, sep: str = ",") -> str | None:
- # pandas nrows= stops parsing after N data lines โ minimal I/O
- try:
- import pandas as pd
-
- with fs.open(path, "r", encoding="utf-8", errors="replace") as fh:
- df = pd.read_csv(fh, sep=sep, nrows=_PREVIEW_ROWS)
- return _obj_to_preview_html(df)
- except ImportError:
- pass
- try:
- import polars as pl
-
- with fs.open(path, "rb") as fh:
- df = pl.read_csv(fh, n_rows=_PREVIEW_ROWS, separator=sep)
- return _obj_to_preview_html(df)
- except ImportError:
- pass
- return None
-
-
-def _preview_arrow(fs, path: str) -> str | None:
- """Read only the first record batch โ no full file deserialisation."""
- try:
- import pyarrow.ipc as ipc
-
- with fs.open(path, "rb") as fh:
- try:
- # IPC file format: random-access; read just batch 0
- reader = ipc.open_file(fh)
- batch = reader.get_batch(0)
- except Exception:
- fh.seek(0)
- # IPC stream format: sequential; read just the first batch
- reader = ipc.open_stream(fh)
- batch = reader.read_next_batch()
- if batch.num_rows > _PREVIEW_ROWS:
- batch = batch.slice(0, _PREVIEW_ROWS)
- df = batch.to_pandas()
- return _obj_to_preview_html(df)
- except ImportError:
- pass
- return None
-
-
-def _preview_jsonlines(fs, path: str) -> str | None:
- # pandas nrows= stops reading after N lines
- try:
- import pandas as pd
-
- with fs.open(path, "r", encoding="utf-8", errors="replace") as fh:
- df = pd.read_json(fh, lines=True, nrows=_PREVIEW_ROWS)
- return _obj_to_preview_html(df)
- except ImportError:
- pass
- return None
-
-
-def _preview_excel(fs, path: str) -> str | None:
- # nrows= limits rows read from the sheet
- try:
- import pandas as pd
-
- with fs.open(path, "rb") as fh:
- df = pd.read_excel(fh, nrows=_PREVIEW_ROWS)
- return _obj_to_preview_html(df)
- except ImportError:
- pass
- return None
-
-
-def _preview_sql(fs, path: str, fmt: str) -> str | None:
- # SQLite/DuckDB: only works with a local path (not a remote FS)
- try:
- if getattr(fs, "protocol", "file") not in ("file", "local", ""):
- return None
- if fmt == "duckdb":
- try:
- import duckdb
-
- con = duckdb.connect(path, read_only=True)
- tables = con.execute("SHOW TABLES").fetchall()
- if not tables:
- return None
- tname = tables[0][0]
- df = con.execute(
- f'SELECT * FROM "{tname}" LIMIT {_PREVIEW_ROWS}'
- ).fetchdf()
- return _obj_to_preview_html(df)
- except ImportError:
- pass
- else:
- import sqlite3
- import pandas as pd
-
- con = sqlite3.connect(path)
- cur = con.cursor()
- cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
- tables = cur.fetchall()
- if not tables:
- return None
- tname = tables[0][0]
- df = pd.read_sql(f'SELECT * FROM "{tname}" LIMIT {_PREVIEW_ROWS}', con)
- return _obj_to_preview_html(df)
- except Exception:
- pass
- return None
-
-
-def _preview_orc(fs, path: str) -> str | None:
- try:
- import pyarrow.orc as orc
-
- with fs.open(path, "rb") as fh:
- table = orc.ORCFile(fh).read().slice(0, _PREVIEW_ROWS)
- df = table.to_pandas()
- return _obj_to_preview_html(df)
- except ImportError:
- pass
- return None
-
-
-# --- image ---
-
-
-def _preview_image(dr: "DataResource", path: str) -> str | None:
- try:
- from PIL import Image
- import io
-
- fs = dr.proj.fs
- with fs.open(path, "rb") as fh:
- raw: bytes = fh.read()
-
- img = Image.open(io.BytesIO(raw))
- img.thumbnail((600, 200))
-
- buf = io.BytesIO()
- # Save as PNG for lossless display regardless of source format
- rgb = img.convert("RGB") if img.mode not in ("RGB", "L", "RGBA") else img
- rgb.save(buf, format="PNG")
- b64 = base64.b64encode(buf.getvalue()).decode("ascii")
-
- w, h = img.size
- schema = dr.schema if isinstance(dr.schema, dict) else {}
- info = f"{schema.get('width', w)}ร{schema.get('height', h)}"
- if "mode" in schema:
- info += f", mode={schema['mode']}"
-
- return (
- f'

'
- f'
{_esc(info)}
'
- )
- except ImportError:
- pass
- except Exception:
- pass
- return None
-
-
-# --- array ---
-
-
-def _preview_array(dr: "DataResource", path: str) -> str | None:
- fmt = dr.format
- fs = dr.proj.fs
-
- if fmt == "numpy":
- return _preview_numpy(fs, path)
- if fmt == "hdf5":
- return _preview_hdf5(fs, path)
- if fmt == "netcdf":
- return _preview_netcdf(fs, path)
- if fmt == "zarr":
- return _preview_zarr(dr)
- return None
-
-
-def _array_info_html(info: dict) -> str:
- rows = "".join(
- f"
| {_esc(k)} | {_esc(v)} |
"
- for k, v in info.items()
- )
- return f'
'
-
-
-def _preview_numpy(fs, path: str) -> str | None:
- """Read only the .npy header to get shape/dtype, then load a minimal slice."""
- try:
- import numpy as np
- import numpy.lib.format as nf
- import io
-
- with fs.open(path, "rb") as fh:
- raw_header = fh.read(512) # header is always โค 512 bytes
-
- buf = io.BytesIO(raw_header)
- nf.read_magic(buf)
- # read_array_header_1_0 is the stable API across numpy versions;
- # newer numpy also exposes read_array_header โ try both.
- try:
- shape, _, dtype = nf.read_array_header_1_0(buf)
- except AttributeError:
- shape, _, dtype = nf.read_array_header(buf) # type: ignore[attr-defined]
-
- info: dict = {"shape": str(shape), "dtype": str(dtype)}
-
- # Load the full array only when it's small enough (โค 1 MB heuristic)
- # or when we can cheaply slice the first N rows.
- try:
- total_elements = 1
- for s in shape:
- total_elements *= s
- item_size = np.dtype(dtype).itemsize
- if total_elements * item_size <= 1_048_576:
- with fs.open(path, "rb") as fh:
- arr = np.load(io.BytesIO(fh.read()), allow_pickle=False)
- sliced = arr[:_PREVIEW_ROWS] if arr.ndim >= 1 else arr
- info["preview"] = repr(sliced)
- except Exception:
- pass
-
- return _array_info_html(info)
- except Exception:
- pass
- return None
-
-
-def _preview_hdf5(fs, path: str) -> str | None:
- """Open the HDF5 file and read only metadata โ no array data loaded."""
- try:
- import h5py
-
- with fs.open(path, "rb") as fh:
- with h5py.File(fh, "r") as f:
- keys = list(f.keys())[:8]
- info: dict = {"top-level keys": ", ".join(keys) or "(none)"}
- for k in keys[:3]:
- obj = f[k]
- if hasattr(obj, "shape"):
- info[k] = f"shape={obj.shape}, dtype={obj.dtype}"
- else:
- info[k] = f"group ({len(obj)} members)"
- return _array_info_html(info)
- except ImportError:
- pass
- return None
-
-
-def _preview_netcdf(fs, path: str) -> str | None:
- """Open the dataset lazily (no data loaded) and render its repr."""
- try:
- import xarray as xr
-
- with fs.open(path, "rb") as fh:
- # engine="scipy" reads lazily; no array data is decoded here
- ds = xr.open_dataset(fh, engine="scipy")
- # xarray Dataset has a rich _repr_html_()
- return _obj_to_preview_html(ds)
- except ImportError:
- pass
- return None
-
-
-def _preview_zarr(dr: "DataResource") -> str | None:
- """Use the schema cached at parse time โ zero extra I/O."""
- schema = dr.schema
- if not schema or not isinstance(schema, dict):
- return None
- info = {}
- if "arrays" in schema:
- info["arrays"] = ", ".join(str(a) for a in schema["arrays"][:8]) or "(none)"
- if "groups" in schema:
- info["groups"] = ", ".join(str(g) for g in schema["groups"][:8]) or "(none)"
- if "attrs" in schema:
- info["attrs"] = str(dict(list(schema["attrs"].items())[:4]))
- return _array_info_html(info) if info else None
-
-
-# --- audio ---
-
-
-def _preview_audio(dr: "DataResource", path: str) -> str | None:
- """Read only the audio file header โ no sample data loaded."""
- try:
- import soundfile as sf
-
- fs = dr.proj.fs
- with fs.open(path, "rb") as fh:
- info = sf.info(fh)
- details = {
- "sample rate": f"{info.samplerate} Hz",
- "channels": str(info.channels),
- "duration": f"{info.frames / info.samplerate:.2f} s",
- "format": info.format,
- "subtype": info.subtype,
- }
- return _array_info_html(details)
- except ImportError:
- pass
- return None
diff --git a/src/projspec/html.py b/src/projspec/html.py
deleted file mode 100644
index eb62124..0000000
--- a/src/projspec/html.py
+++ /dev/null
@@ -1,47 +0,0 @@
-def dict_to_html(data: dict, title="Data", open_level=2) -> str:
- """
- Convert a nested dictionary to expandable HTML using
tags.
-
- Args:
- data: The dictionary to convert
- title: Title for the details element
- open_level: whether to set elements as expanded; yes if > 0, and will
- decrement for inner levels.
-
- Returns:
- String containing HTML with expandable details elements
- """
- # With help from Claude Sonnet 4.
- if not isinstance(data, dict):
- return f"{data}"
-
- if not data:
- return ""
- open = "open" if open_level > 0 else "closed"
-
- html = [
- f'{title}
'
- ]
-
- for key, value in data.items():
- if isinstance(value, dict):
- html.append(dict_to_html(value, key, open_level - 1))
- elif isinstance(value, (list, tuple)):
- html.append(
- f'{key}
'
- )
- for i, item in enumerate(value):
- if isinstance(item, dict):
- html.append(dict_to_html(item, f"{key}[{i}]", open_level - 1))
- else:
- html.append(f' {item}
')
- html.append(" ")
- else:
- html.append(
- f'{key}: {value}
'
- )
-
- html.append(" ")
- return "".join(html)
diff --git a/src/projspec/library.py b/src/projspec/library.py
index db6bcb5..0b3f313 100644
--- a/src/projspec/library.py
+++ b/src/projspec/library.py
@@ -1,10 +1,12 @@
import json
import os
+import time
import fsspec
from projspec.config import get_conf
from projspec.proj import Project
+from projspec.utils import DEFAULT
class ProjectLibrary:
@@ -15,14 +17,28 @@ class ProjectLibrary:
# TODO: support for remote libraries
- def __init__(self, library_path: str | None = None, auto_save: bool = True):
- self.path = library_path or get_conf("library_path")
- self.entries: dict[str, Project] = {}
+ def __init__(
+ self,
+ library_path: str | None | type = DEFAULT,
+ auto_save: bool = True,
+ entries: dict | None = None,
+ ):
+ self.path = (
+ get_conf("library_path") if library_path is DEFAULT else library_path
+ )
+ self.entries: dict[str, Project] = {} if entries is None else entries
self.auto_save = auto_save
self.load()
def load(self):
- """Loads scanned project objects from JSON file"""
+ """Loads scanned project objects from JSON file.
+
+ Any entry whose last scan is older than the ``auto_rescan`` config
+ value (in seconds) is automatically rescanned and the refreshed
+ library is saved back. Set ``auto_rescan`` to 0 to disable this.
+ """
+ if self.path is None:
+ return
try:
with fsspec.open(self.path, "r") as f:
self.entries = {
@@ -30,6 +46,35 @@ def load(self):
}
except FileNotFoundError:
self.entries = {}
+ return
+ self._auto_rescan()
+
+    def _auto_rescan(self):
+        """Refresh entries whose last scan is at least ``auto_rescan`` seconds old.
+
+        The threshold is read from the ``auto_rescan`` config value; a falsy
+        or non-positive value disables the feature. Entries without a
+        ``scanned_at`` value are left untouched. A stale entry is rebuilt
+        from its own ``path`` and ``storage_options`` (with ``walk=False``)
+        and stored under the same key. When at least one entry was refreshed
+        and both ``auto_save`` and ``path`` are set, the library is saved.
+        """
+        threshold = get_conf("auto_rescan")
+        if not threshold or threshold <= 0:
+            return
+        reference = time.time()
+        refreshed_any = False
+        # Walk a snapshot: entries are replaced while we iterate.
+        for name, stale in list(self.entries.items()):
+            stamp = getattr(stale, "scanned_at", None)
+            if stamp is not None and not (reference - stamp) < threshold:
+                try:
+                    # Rebuild from the project's own location; the library
+                    # key is reused so the entry keeps its identity.
+                    replacement = Project(
+                        stale.path,
+                        storage_options=stale.storage_options,
+                        walk=False,
+                    )
+                except Exception:
+                    # Best effort: a project that can no longer be scanned
+                    # keeps its previous entry and must not break loading.
+                    continue
+                self.entries[name] = replacement
+                refreshed_any = True
+        if refreshed_any and self.auto_save and self.path is not None:
+            self.save()
def clear(self):
"""Clears scanned project objects from JSON file and memory"""
@@ -46,6 +91,8 @@ def add_entry(self, path: str, entry: Project):
def save(self):
"""Serialise the state of the scanned project objects to file"""
# don't catch
+ if self.path is None:
+ raise ValueError("Cannot save without .path set")
data = {k: v.to_dict(compact=False) for k, v in self.entries.items()}
with fsspec.open(self.path, "w") as f:
json.dump(data, f)
diff --git a/src/projspec/proj/__init__.py b/src/projspec/proj/__init__.py
index 328cd80..2f535b5 100644
--- a/src/projspec/proj/__init__.py
+++ b/src/projspec/proj/__init__.py
@@ -24,7 +24,7 @@
from projspec.proj.conda_package import CondaRecipe, RattlerRecipe
from projspec.proj.conda_project import CondaProject
from projspec.proj.conda_workspace import CondaWorkspace
-from projspec.proj.data_dir import Data
+from projspec.proj.data_project import DataProject
from projspec.proj.datapackage import DataPackage, DVCRepo
from projspec.proj.dataworkflows import (
Airflow,
@@ -43,6 +43,7 @@
from projspec.proj.golang import Golang
from projspec.proj.helm import HelmChart
from projspec.proj.hf import HuggingFaceRepo
+from projspec.proj.knowledge_catalog import KnowledgeCatalog
from projspec.proj.ide import JetbrainsIDE, NvidiaAIWorkbench, VSCode
from projspec.proj.infra import (
Ansible,
@@ -92,9 +93,9 @@
"CondaWorkspace",
"RattlerRecipe",
# Data
- "Data",
"DataPackage",
"DVCRepo",
+ "DataProject",
# Data/ML workflows
"Airflow",
"Dagster",
@@ -121,6 +122,8 @@
"HelmChart",
# HuggingFace
"HuggingFaceRepo",
+ # Knowledge
+ "KnowledgeCatalog",
# IDE
"AIEnabled",
"BackstageCatalog",
diff --git a/src/projspec/proj/_consolidate.py b/src/projspec/proj/_consolidate.py
new file mode 100644
index 0000000..9ee93f4
--- /dev/null
+++ b/src/projspec/proj/_consolidate.py
@@ -0,0 +1,258 @@
+"""Consolidate sets of related files into logical datasets.
+
+Intake can already recognise some directory-based datasets (hive-partitioned
+parquet, zarr, delta, …) by their characteristic contents. This module covers
+the complementary case where a directory holds *many individually-named files
+that obviously belong together*, e.g.::
+
+ 001.csv 002.csv 003.csv -> one CSV dataset
+ part-00000.parquet part-00001… -> one parquet dataset
+ data_2019.json data_2020.json -> one JSON dataset
+ green.gif red.gif blue.gif -> one GIF (image) dataset
+
+The output is a list of :class:`FileGroup` objects. Each group is either a
+single standalone file or a consolidated set, and exposes a ``glob`` (or list of
+members) suitable for handing straight to
+:func:`intake.readers.inspect.inspect_dataset`.
+
+The logic here is deliberately filesystem-agnostic: it operates on
+``(basename, size)`` pairs so it can be unit-tested without any I/O.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+from dataclasses import dataclass, field
+
+# A maximal run of digits anywhere in the stem - the most common way numbered
+# file series differ (001, 00001, 2020, ...).
+_DIGITS = re.compile(r"\d+")
+# Tokens for the "one differing token" heuristic (split on common separators).
+_SEP = re.compile(r"[._\- ]+")
+
+
+@dataclass
+class FileGroup:
+    """One dataset candidate: a lone file or several files treated as one.
+
+    Attributes
+    ----------
+    members:
+        Basenames that make up the group (callers are expected to pass them
+        sorted; this class does not reorder them).
+    ext:
+        Shared file extension, lower-case and including the dot, or ``""``.
+    total_size:
+        Combined size of the members in bytes, or ``None`` when unknown.
+    pattern:
+        Glob basename covering the members of a consolidated group
+        (e.g. ``"*.csv"`` or ``"part-*.parquet"``); for a single file, that
+        file's basename.
+    consolidated:
+        ``True`` when the group stands for more than one physical file.
+    """
+
+    members: list[str]
+    ext: str = ""
+    total_size: int | None = None
+    pattern: str = ""
+    consolidated: bool = False
+
+    @property
+    def name(self) -> str:
+        """Short label: the pattern if consolidated, else the first member."""
+        return self.pattern if self.consolidated else self.members[0]
+
+    def url(self, root: str) -> str | list[str]:
+        """Return the location(s) of this group underneath *root*.
+
+        Trailing slashes on *root* are dropped first. A non-consolidated
+        group yields ``root/<first member>``. A consolidated group yields
+        ``root/<pattern>`` when the pattern contains ``*``, and otherwise an
+        explicit list with one ``root/<member>`` URL per member.
+        """
+        base = root.rstrip("/")
+        if self.consolidated:
+            if self.pattern and "*" in self.pattern:
+                return f"{base}/{self.pattern}"
+            return [f"{base}/{member}" for member in self.members]
+        return f"{base}/{self.members[0]}"
+
+
+def _split_ext(name: str) -> tuple[str, str]:
+    """Return ``(stem, ext)`` for *name*; the extension is lower-cased.
+
+    A fixed set of compound suffixes (``.csv.gz``, ``.json.gz``, ``.tar.gz``,
+    ``.tar.bz2``, ``.tsv.gz``) is recognised case-insensitively and kept
+    whole, provided something is left over for the stem. Every other name is
+    split with :func:`os.path.splitext`.
+    """
+    compound_suffixes = (".csv.gz", ".json.gz", ".tar.gz", ".tar.bz2", ".tsv.gz")
+    folded = name.lower()
+    compound = next(
+        (s for s in compound_suffixes if folded.endswith(s) and len(name) > len(s)),
+        None,
+    )
+    if compound is not None:
+        # Slice the original (not the case-folded) name so the stem keeps
+        # its spelling.
+        return name[: len(name) - len(compound)], compound
+    base, suffix = os.path.splitext(name)
+    return base, suffix.lower()
+
+
+def _digit_pattern(stem: str) -> str | None:
+    """Replace every run of digits in *stem* with ``#``.
+
+    ``part-00001`` -> ``part-#``; ``data2020`` -> ``data#``. Each maximal run
+    becomes exactly one placeholder regardless of its length, so ``a1b2`` and
+    ``a30b44`` both map to ``a#b#``. Returns ``None`` when *stem* contains no
+    digits at all.
+    """
+    masked, replaced = _DIGITS.subn("#", stem)
+    return masked if replaced else None
+
+
+def _glob_from_digit_pattern(pattern: str) -> str:
+    """Turn a masked pattern (``part-#``) into a glob stem (``part-*``).
+
+    Every ``#`` placeholder becomes ``*``. The literal text between
+    placeholders is passed through :func:`glob.escape`, so glob
+    metacharacters that are part of the real file names (``[``, ``?``,
+    ``*``) are matched literally instead of being interpreted: ``report[#]``
+    becomes ``report[[]*]`` rather than ``report[*]``, which would match
+    none of the files it was derived from.
+
+    A ``#`` that was already present in the original stem cannot be told
+    apart from a placeholder and is also turned into ``*``.
+    """
+    # Local import: the module itself does not import ``glob``.
+    import glob
+
+    return "*".join(glob.escape(piece) for piece in pattern.split("#"))
+
+
+def _token_signature(stem: str) -> tuple[tuple[str, ...], int] | None:
+    """Split *stem* into tokens for the "one differing token" heuristic.
+
+    The stem is split on the ``_SEP`` separators and empty pieces are
+    dropped. The result is ``(tokens, number_of_tokens)``, or ``None`` when
+    no token remains. No position is blanked here; the caller builds the
+    "all tokens but one equal" keys from the returned tuple (this is what
+    lets non-numeric series such as ``green``/``red``/``blue`` be grouped).
+    """
+    pieces = tuple(filter(None, _SEP.split(stem)))
+    return (pieces, len(pieces)) if pieces else None
+
+
+def consolidate(
+    files: list[tuple[str, int | None]],
+    min_group: int = 3,
+    min_token_group: int = 2,
+) -> list[FileGroup]:
+    """Group a flat list of files into datasets.
+
+    The result is deterministic: names are always processed in sorted order
+    and equally sized candidate groups are ranked by their bucket key, so the
+    same input yields the same grouping in every process (iterating the
+    underlying ``set`` directly would make ties depend on string hash
+    randomisation).
+
+    Parameters
+    ----------
+    files:
+        ``[(basename, size_or_None), ...]`` for the files directly in a
+        directory (not directories, not recursive). If a basename occurs
+        more than once, the last size wins.
+    min_group:
+        Minimum number of files sharing a digit-masked pattern before they are
+        consolidated. Below this they stay available to the later passes.
+    min_token_group:
+        Minimum size for the (weaker) "one differing token" heuristic used for
+        non-numeric series like colour names.
+
+    Returns
+    -------
+    list[FileGroup]
+        One entry per resulting dataset, sorted by name. Files that match no
+        consolidation rule are returned as singleton, non-consolidated groups.
+    """
+    sizes: dict[str, int | None] = {n: s for n, s in files}
+    remaining = set(sizes)
+    groups: list[FileGroup] = []
+
+    # -- Pass 1: digit-run patterns within each extension -------------------
+    # key: (ext, digit_masked_stem) -> [names]; names are visited in sorted
+    # order, so every bucket list is already sorted.
+    digit_buckets: dict[tuple[str, str], list[str]] = {}
+    for name in sorted(remaining):
+        stem, ext = _split_ext(name)
+        pat = _digit_pattern(stem)
+        if pat is not None:
+            digit_buckets.setdefault((ext, pat), []).append(name)
+
+    for (ext, pat), members in digit_buckets.items():
+        if len(members) < min_group:
+            continue
+        remaining.difference_update(members)
+        # NOTE(review): ``ext`` is the lower-cased extension, so the glob
+        # will not literally match members spelled with an upper-case
+        # extension on a case-sensitive filesystem - confirm intended.
+        groups.append(
+            FileGroup(
+                members=members,
+                ext=ext,
+                total_size=_sum_sizes(members, sizes),
+                pattern=f"{_glob_from_digit_pattern(pat)}{ext}",
+                consolidated=True,
+            )
+        )
+
+    # -- Pass 2: "one differing token" within each extension ----------------
+    # Group stems that share all tokens but one (same token count).
+    # key: (ext, ntok, blanked_index, tokens_with_blank) -> [names]
+    token_buckets: dict[tuple[str, int, int, tuple[str, ...]], list[str]] = {}
+    for name in sorted(remaining):
+        stem, ext = _split_ext(name)
+        sig = _token_signature(stem)
+        if sig is None:
+            continue
+        tokens, ntok = sig
+        for i in range(ntok):
+            others = tokens[:i] + ("*",) + tokens[i + 1 :]
+            token_buckets.setdefault((ext, ntok, i, others), []).append(name)
+
+    # Largest buckets first so a file lands in its best group; the bucket
+    # key breaks ties so the choice between equally large buckets is stable.
+    ranked = sorted(token_buckets.items(), key=lambda kv: (-len(kv[1]), kv[0]))
+    for (ext, _ntok, _idx, others), candidates in ranked:
+        # A file claimed by an earlier (larger) bucket is no longer available.
+        members = [m for m in candidates if m in remaining]
+        if len(members) < min_token_group:
+            continue
+        remaining.difference_update(members)
+        # NOTE(review): the readable glob is rebuilt with "_" between tokens
+        # (e.g. "frame_*_left.png"); members that use another separator are
+        # not literally matched by it - confirm before using it as a glob.
+        groups.append(
+            FileGroup(
+                members=members,
+                ext=ext,
+                total_size=_sum_sizes(members, sizes),
+                pattern=f"{_normalise_token_glob(others)}{ext}",
+                consolidated=True,
+            )
+        )
+
+    # -- Pass 3: leftovers are standalone files -----------------------------
+    for name in sorted(remaining):
+        _, ext = _split_ext(name)
+        groups.append(
+            FileGroup(
+                members=[name],
+                ext=ext,
+                total_size=sizes.get(name),
+                pattern=name,
+                consolidated=False,
+            )
+        )
+
+    return sorted(groups, key=lambda g: g.name)
+
+
+def _normalise_token_glob(tokens: tuple[str, ...]) -> str:
+    """Join token glob pieces with ``_`` and squeeze runs of ``*``.
+
+    ``("*",)`` -> ``"*"``
+    ``("frame", "*", "left")`` -> ``"frame_*_left"``
+
+    The separators of the original file names are not available here, so
+    ``_`` is always used as the joiner.
+    NOTE(review): for names written with a different separator the result
+    is a readable label but not necessarily a glob that matches them.
+    """
+    squeezed: list[str] = []
+    for ch in "_".join(tokens):
+        # Drop a star that directly follows another star.
+        if ch == "*" and squeezed and squeezed[-1] == "*":
+            continue
+        squeezed.append(ch)
+    return "".join(squeezed)
+
+
+def _sum_sizes(members: list[str], sizes: dict[str, int | None]) -> int | None:
+    """Add up the sizes of *members*, looked up in *sizes*.
+
+    Returns ``None`` as soon as any member has no known size (missing from
+    *sizes* or mapped to ``None``); an empty *members* list sums to ``0``.
+    """
+    looked_up = [sizes.get(member) for member in members]
+    if any(size is None for size in looked_up):
+        return None
+    return sum(looked_up)
diff --git a/src/projspec/proj/base.py b/src/projspec/proj/base.py
index 25d5e7c..1766a51 100644
--- a/src/projspec/proj/base.py
+++ b/src/projspec/proj/base.py
@@ -3,18 +3,19 @@
import logging
import os
import stat
+import time
from collections.abc import Iterable
from itertools import chain
from functools import cached_property
import fsspec
import fsspec.implementations.local
-import projspec.utils
import toml
from projspec.config import get_conf
from projspec.utils import (
AttrDict,
+ DEFAULT,
IndentDumper,
PickleableTomlDecoder,
camel_to_snake,
@@ -34,6 +35,37 @@ def _fmt_size(n: int) -> str:
n /= 1024
+def _humanize_age(ts: float) -> str:
+    """Render a Unix timestamp as a relative "X ago" string.
+
+    Possible results: "just now" (under a minute, or a timestamp in the
+    future), "N minute(s) ago", "N hour(s) ago", "yesterday", "N days ago",
+    "N month(s) ago" (30-day months) and "N year(s) ago" (365-day years).
+    Units are singular when the count is 1, e.g. "1 month ago".
+
+    Parameters
+    ----------
+    ts:
+        Seconds since the epoch, as accepted by
+        ``datetime.datetime.fromtimestamp``.
+    """
+    import datetime
+
+    def _count(n: int, unit: str) -> str:
+        # "1 month ago" / "2 months ago"
+        return f"{n} {unit}{'' if n == 1 else 's'} ago"
+
+    age = datetime.datetime.now() - datetime.datetime.fromtimestamp(ts)
+    days = age.days
+    if days < 0:
+        # clock skew / future timestamp - treat as just now
+        return "just now"
+    if days == 0:
+        secs = int(age.total_seconds())
+        if secs < 60:
+            return "just now"
+        if secs < 3600:
+            return _count(secs // 60, "minute")
+        return _count(secs // 3600, "hour")
+    if days == 1:
+        return "yesterday"
+    if days < 30:
+        return _count(days, "day")
+    if days < 365:
+        return _count(days // 30, "month")
+    return _count(days // 365, "year")
+
+
class ParseFailed(ValueError):
"""Exception raised when parsing fails: a directory does not meet the given spec."""
@@ -315,6 +347,8 @@ def resolve(
types = set(camel_to_snake(_) for _ in types or ())
if types and types - set(registry):
raise ValueError(f"Unknown types: {set(types) - set(registry)}")
+ # record when this (re)scan happened
+ self.scanned_at = time.time()
# sorting to ensure consistency
for name in sorted(registry):
cls = registry[name]
@@ -410,26 +444,18 @@ def _stats_line(self) -> str:
# last modified
lm = self.last_modified
if lm is not None:
- import datetime
-
- age = datetime.datetime.now() - datetime.datetime.fromtimestamp(lm)
- days = age.days
- if days == 0:
- age_str = "today"
- elif days == 1:
- age_str = "yesterday"
- elif days < 30:
- age_str = f"{days} days ago"
- elif days < 365:
- age_str = f"{days // 30} months ago"
- else:
- age_str = f"{days // 365} year{'s' if days >= 730 else ''} ago"
+ age_str = _humanize_age(lm)
by = self.last_modified_by
if by:
parts.append(f"last modified {age_str} by {by}")
else:
parts.append(f"last modified {age_str}")
+ # when this project was last scanned
+ scanned_at = getattr(self, "scanned_at", None)
+ if scanned_at is not None:
+ parts.append(f"scanned {_humanize_age(scanned_at)}")
+
return " " + " ยท ".join(parts) if parts else ""
def __repr__(self):
@@ -557,17 +583,23 @@ def to_dict(self, compact=True) -> dict:
is_writable=self.is_writable,
last_modified=self.last_modified,
last_modified_by=self.last_modified_by,
+ scanned_at=self.scanned_at,
)
if not compact:
dic["klass"] = "project"
return dic.to_dict(compact=compact)
- def _repr_html_(self):
- from projspec.html import dict_to_html
+ def _ipython_display_(self):
+ """Auto-display as the interactive widget when possible.
- # TODO: add tooltips to docs or spec links
- # TODO: remove redundant information?
- return dict_to_html(self.to_dict(), title=self.url)
+ Falls back to a plain ``repr`` when ``anywidget`` /
+ ``ipywidgets`` is not available - Jupyter will then use the
+ normal text representation.
+ """
+ from projspec.library import ProjectLibrary
+
+ lib = ProjectLibrary(entries={"memory": self}, library_path=None)
+ lib._ipython_display_()
@staticmethod
def from_dict(dic):
@@ -583,6 +615,11 @@ def from_dict(dic):
proj.path = dic["url"]
proj.storage_options = dic["storage_options"]
proj.fs, proj.url = fsspec.url_to_fs(proj.path, **proj.storage_options)
+ scanned_at = dic.get("scanned_at")
+ try:
+ proj.scanned_at = float(scanned_at)
+ except (TypeError, ValueError):
+ proj.scanned_at = time.time()
# Restore cached tree stats so a round-tripped Project never re-walks.
# Keys default to None if absent (e.g. older serialised data).
proj.__dict__["_tree_stats"] = {
@@ -640,7 +677,7 @@ def make(self, qname: str, **kwargs):
art.make(**kwargs)
return art
- def add_to_library(self, path=None):
+ def add_to_library(self, path=DEFAULT):
"""Add this project to the current session library"""
# TODO: prevent overwrite?
from projspec.library import ProjectLibrary
diff --git a/src/projspec/proj/data_dir.py b/src/projspec/proj/data_dir.py
deleted file mode 100644
index c0172c9..0000000
--- a/src/projspec/proj/data_dir.py
+++ /dev/null
@@ -1,679 +0,0 @@
-"""ProjectSpec for bare data directories.
-
-Matches directories whose contents are predominantly data files (by extension or
-by a recognised on-disk layout such as Hive partitioning, Apache Iceberg, Delta
-Lake, or Zarr), with no requirement for any declarative metadata file.
-"""
-
-from __future__ import annotations
-
-import os
-import re
-from posixpath import basename as _basename
-
-from projspec.proj import ProjectSpec, ParseFailed
-from projspec.utils import AttrDict
-
-_EXT_TO_FORMAT: dict[str, tuple[str, str]] = {
- # Tabular / columnar -------------------------------------------------------
- ".csv": ("csv", "tabular"),
- ".tsv": ("tsv", "tabular"),
- ".psv": ("psv", "tabular"),
- ".parquet": ("parquet", "tabular"),
- ".parq": ("parquet", "tabular"),
- ".pq": ("parquet", "tabular"),
- ".arrow": ("arrow", "tabular"),
- ".ipc": ("arrow", "tabular"),
- ".feather": ("arrow", "tabular"), # Feather v1/v2 (magic: FEA1 / ARROW1)
- ".orc": ("orc", "tabular"),
- ".avro": ("avro", "tabular"),
- ".xls": ("excel", "tabular"),
- ".xlsx": ("excel", "tabular"),
- ".xlsm": ("excel", "tabular"),
- ".xlsb": ("excel", "tabular"),
- ".jsonl": ("jsonlines", "tabular"),
- ".ndjson": ("jsonlines", "tabular"),
- ".db": ("sqlite", "tabular"), # DuckDB / SQLite (disambiguated by magic)
- ".sqlite": ("sqlite", "tabular"),
- ".sqlitedb": ("sqlite", "tabular"),
- ".duckdb": ("duckdb", "tabular"),
- # Array / scientific -------------------------------------------------------
- ".npy": ("numpy", "array"),
- ".npz": ("numpy", "array"),
- ".hdf5": ("hdf5", "array"),
- ".hdf": ("hdf5", "array"),
- ".h5": ("hdf5", "array"),
- ".h4": ("hdf5", "array"),
- ".he5": ("hdf5", "array"),
- ".nc": ("netcdf", "array"),
- ".nc3": ("netcdf", "array"),
- ".nc4": ("netcdf", "array"),
- ".mat": ("matlab", "array"),
- ".fits": ("fits", "array"),
- ".grib": ("grib", "timeseries"),
- ".grb": ("grib", "timeseries"),
- ".grib2": ("grib", "timeseries"),
- ".grb2": ("grib", "timeseries"),
- ".asdf": ("asdf", "array"),
- ".zarr": ("zarr", "array"),
- # Image / biomedical imaging -----------------------------------------------
- ".png": ("png", "image"),
- ".jpg": ("jpeg", "image"),
- ".jpeg": ("jpeg", "image"),
- ".tif": ("tiff", "image"), # also geotiff โ ambiguous; image wins
- ".tiff": ("tiff", "image"),
- ".cog": ("tiff", "geospatial"), # Cloud-Optimised GeoTIFF
- ".bmp": ("bmp", "image"),
- ".gif": ("gif", "image"),
- ".webp": ("webp", "image"),
- ".dcm": ("dicom", "image"),
- ".dicom": ("dicom", "image"),
- ".nii": ("nifti", "image"),
- ".nrrd": ("nrrd", "image"),
- ".nhdr": ("nrrd", "image"),
- ".mha": ("metaimage", "image"),
- ".mhd": ("metaimage", "image"),
- ".svs": ("svs", "image"), # Aperio whole-slide image
- ".ndpi": ("ndpi", "image"), # Hamamatsu whole-slide image
- ".scn": ("scn", "image"), # Leica whole-slide image
- ".lsm": ("lsm", "image"), # Zeiss confocal
- ".exr": ("exr", "image"), # OpenEXR HDR
- ".qptiff": ("qptiff", "image"), # PerkinElmer whole-slide
- # Geospatial ---------------------------------------------------------------
- ".shp": ("shapefile", "geospatial"),
- ".shx": ("shapefile", "geospatial"),
- ".dbf": ("shapefile", "geospatial"),
- ".geojson": ("geojson", "geospatial"),
- ".gpkg": ("geopackage", "geospatial"),
- ".fgb": ("flatgeobuf", "geospatial"),
- ".kml": ("kml", "geospatial"),
- ".pmtiles": ("pmtiles", "geospatial"),
- # Audio --------------------------------------------------------------------
- ".wav": ("wav", "timeseries"),
- ".flac": ("flac", "timeseries"),
- ".mp3": ("mp3", "timeseries"),
- ".ogg": ("ogg", "timeseries"),
- # Video --------------------------------------------------------------------
- ".mp4": ("mp4", "video"),
- ".avi": ("avi", "video"),
- ".mov": ("mov", "video"),
- ".mkv": ("mkv", "video"),
- ".webm": ("webm", "video"),
- # ML model weights ---------------------------------------------------------
- ".safetensors": ("safetensors", "model"),
- ".gguf": ("gguf", "model"),
- ".pt": ("pytorch", "model"),
- ".pth": ("pytorch", "model"),
- ".onnx": ("onnx", "model"),
- ".tfrec": ("tfrecord", "model"),
- # Archive / bundle ---------------------------------------------------------
- ".pkl": ("pickle", "archive"),
- ".bin": ("binary", "archive"),
-}
-
-_DATA_EXTENSIONS: frozenset[str] = frozenset(_EXT_TO_FORMAT)
-
-# Magic-byte signatures (format, modality, offset, bytes_pattern).
-_MAGIC: list[tuple[str, str, int | None, bytes]] = [
- # Fixed-offset signatures
- ("dicom", "image", 128, b"DICM"), # DICOM preamble
- ("nifti", "image", 344, b"ni1\x00"), # NIfTI-1
- ("nifti", "image", 344, b"n+1\x00"), # NIfTI-1 single file
- ("duckdb", "tabular", 8, b"DUCK"),
- ("safetensors", "model", 8, b"{"), # SafeTensors JSON header
- ("wav", "timeseries", 8, b"WAVE"), # RIFFโฆWAVE
- # Offset-0 signatures
- ("parquet", "tabular", 0, b"PAR1"),
- ("hdf5", "array", 0, b"\x89HDF"),
- ("netcdf", "array", 0, b"CDF\x01"), # NetCDF classic
- ("netcdf", "array", 0, b"CDF\x02"), # NetCDF-64bit
- ("orc", "tabular", 0, b"ORC"),
- ("avro", "tabular", 0, b"Obj\x01"),
- ("arrow", "tabular", 0, b"ARROW1"), # IPC stream
- ("arrow", "tabular", 0, b"FEA1"), # Feather v1
- ("numpy", "array", 0, b"\x93NUMPY"),
- ("matlab", "array", 0, b"MATLAB"),
- ("fits", "array", 0, b"SIMPLE"),
- ("grib", "timeseries", 0, b"GRIB"),
- ("asdf", "array", 0, b"#ASDF"),
- ("flatgeobuf", "geospatial", 0, b"fgb"),
- ("gguf", "model", 0, b"GGUF"),
- ("png", "image", 0, b"\x89PNG"),
- ("jpeg", "image", 0, b"\xff\xd8\xff"),
- ("tiff", "image", 0, b"II*\x00"), # little-endian TIFF
- ("tiff", "image", 0, b"MM\x00*"), # big-endian TIFF
- ("sqlite", "tabular", 0, b"SQLite format"),
- ("shapefile", "geospatial", 0, b"\x00\x00\x27\x0a"),
- ("pmtiles", "geospatial", 0, b"PMTiles"),
-]
-
-# Regex that matches Hive-style partition directory names (e.g. "year=2024").
-_HIVE_DIR_RE = re.compile(r"^[^=]+=.+$")
-
-
-def _read_schema(path: str, fmt: str, fs) -> dict | list:
- """Return a best-effort schema dict/list for *path*, or {} on any failure."""
- try:
- if fmt == "parquet":
- try:
- import pyarrow.parquet as pq
-
- with fs.open(path, "rb") as fh:
- pf = pq.ParquetFile(fh)
- return {field.name: str(field.type) for field in pf.schema_arrow}
- except ImportError:
- pass
-
- elif fmt == "arrow":
- try:
- import pyarrow.ipc as ipc
-
- with fs.open(path, "rb") as fh:
- reader = ipc.open_file(fh)
- return {field.name: str(field.type) for field in reader.schema}
- except ImportError:
- pass
-
- elif fmt == "hdf5":
- try:
- import h5py
-
- with fs.open(path, "rb") as fh:
- with h5py.File(fh, "r") as ds:
- return {
- "variables": list(ds.keys()),
- "attrs": dict(ds.attrs),
- }
- except ImportError:
- pass
-
- elif fmt == "netcdf":
- try:
- import netCDF4 as nc # type: ignore[import]
-
- with fs.open(path, "rb") as fh:
- ds = nc.Dataset("in-mem", memory=fh.read())
- return {
- "variables": list(ds.variables.keys()),
- "dims": {k: len(v) for k, v in ds.dimensions.items()},
- }
- except ImportError:
- try:
- import xarray as xr # type: ignore[import]
-
- with fs.open(path, "rb") as fh:
- ds = xr.open_dataset(fh, engine="scipy")
- return {
- "variables": list(ds.data_vars),
- "dims": dict(ds.dims),
- }
- except ImportError:
- pass
-
- elif fmt in ("jpeg", "png", "bmp", "gif", "webp", "tiff"):
- try:
- from PIL import Image # type: ignore[import]
-
- with fs.open(path, "rb") as fh:
- img = Image.open(fh)
- img.load()
- mode = img.mode
- channels = len(img.getbands())
- return {
- "width": img.width,
- "height": img.height,
- "channels": channels,
- "mode": mode,
- }
- except ImportError:
- pass
-
- elif fmt in ("wav", "flac", "mp3", "ogg"):
- try:
- import soundfile as sf # type: ignore[import]
-
- with fs.open(path, "rb") as fh:
- info = sf.info(fh)
- return {
- "sample_rate": info.samplerate,
- "channels": info.channels,
- "frames": info.frames,
- }
- except ImportError:
- pass
-
- except Exception: # โ never let schema extraction abort parsing
- pass
-
- return {}
-
-
-def _filelist_dirs(filelist: list[dict]) -> list[dict]:
- """Return only directory entries from a filelist."""
- return [e for e in filelist if e.get("type", "") == "directory"]
-
-
-def _filelist_files(filelist: list[dict]) -> list[dict]:
- """Return only file entries from a filelist."""
- return [e for e in filelist if e.get("type", "") != "directory"]
-
-
-def _fmt_from_path(path: str) -> tuple[str, str] | None:
- """Return (format, modality) for *path* by extension, or None if unknown."""
- ext = os.path.splitext(path)[1].lower()
- return _EXT_TO_FORMAT.get(ext)
-
-
-def _identify_by_magic(path: str, fs) -> tuple[str, str] | None:
- """Return (format, modality) by probing *path*'s header bytes, or None.
-
- Reads up to 1 KiB. Checks fixed-offset patterns first (longer offsets
- first, to avoid short patterns shadowing longer ones), then scans for
- anywhere-patterns via re.search.
- """
- try:
- with fs.open(path, "rb") as fh:
- head = fh.read(1024)
- except Exception:
- return None
-
- for fmt, modality, offset, pattern in _MAGIC:
- if offset is None:
- if re.search(re.escape(pattern), head):
- return fmt, modality
- else:
- if head[offset : offset + len(pattern)] == pattern:
- return fmt, modality
- return None
-
-
-# Token that may vary across files in a series: digits, dashes, underscores, dots.
-# Alphabetic variation (e.g. "users" vs "orders") disqualifies collation.
-_SERIES_VAR_RE = re.compile(r"^[\d\-_.]+$")
-
-
-def _common_affix(stems: list[str]) -> tuple[str, str]:
- """Return the longest (prefix, suffix) shared by every stem in *stems*."""
- if not stems:
- return "", ""
- prefix = os.path.commonprefix(stems)
- # Reverse each stem to find common suffix via commonprefix trick
- rev = [s[::-1] for s in stems]
- suffix = os.path.commonprefix(rev)[::-1]
- # Ensure prefix and suffix don't overlap (can happen with a single-char stem)
- if len(prefix) + len(suffix) > min(len(s) for s in stems):
- suffix = ""
- return prefix, suffix
-
-
-def _group_by_naming_series(entries: list[dict]) -> list[list[dict]]:
- """Partition *entries* (same-format file list) into naming-series groups.
-
- Two or more files belong to the same series when their basenames (stems)
- differ only in a contiguous segment that consists solely of digits, dashes,
- underscores, or dots โ i.e. a numeric counter or a date component.
-
- A single file is always its own series (trivially consistent).
-
- Returns a list of groups, each group being a non-empty list of entries that
- share a common naming pattern.
- """
- if len(entries) <= 1:
- return [entries] if entries else []
-
- # Compute stems once
- stems = [os.path.splitext(_basename(e["name"]))[0] for e in entries]
-
- prefix, suffix = _common_affix(stems)
- plen, slen = len(prefix), len(suffix)
-
- # Extract the variable middle segment for each stem
- variables = []
- for stem in stems:
- mid = stem[plen : len(stem) - slen if slen else len(stem)]
- variables.append(mid)
-
- # All files form one series if:
- # 1. There is a non-trivial shared prefix OR suffix (at least 1 char), AND
- # 2. Every variable segment is numeric/date-like (no alphabetic chars)
- has_affix = plen >= 1 or slen >= 1
- all_numeric_var = all(_SERIES_VAR_RE.match(v) or v == "" for v in variables)
-
- if has_affix and all_numeric_var:
- return [entries]
-
- # Otherwise fall back: each file is its own "series" (separate resource)
- return [[e] for e in entries]
-
-
-# Notably absent: datapackage.json, catalog.yaml/yml, .dvc/ โ those belong
-# to projspec.proj.datapackage and are treated as compatible companions.
-_NON_DATA_SENTINELS: frozenset[str] = frozenset(
- {
- # Python
- "pyproject.toml",
- "setup.py",
- "setup.cfg",
- "hatch.toml",
- # Rust
- "Cargo.toml",
- # JavaScript / Node
- "package.json",
- # Go
- "go.mod",
- # Container / infra
- "Dockerfile",
- "docker-compose.yml",
- "docker-compose.yaml",
- # Helm
- "Chart.yaml",
- # Ruby / Java / .NET
- "Gemfile",
- "pom.xml",
- "build.gradle",
- "*.csproj",
- # R
- "DESCRIPTION",
- # Conda
- "environment.yml",
- "environment.yaml",
- "meta.yaml",
- # Pixi
- "pixi.toml",
- # Mkdocs / Sphinx / RTD
- "mkdocs.yml",
- "mkdocs.yaml",
- "conf.py",
- ".readthedocs.yaml",
- ".readthedocs.yml",
- # Scripts / notebooks that imply code-first dirs
- "Makefile",
- }
-)
-
-
-class Data(ProjectSpec):
- """A directory whose primary contents are data files.
-
- Matches on any of:
- - At least one file with an unambiguous data extension (CSV, Parquet, Arrow,
- HDF5, images, audio, etc.) โ without requiring a metadata sidecar.
- - A recognised directory layout: Hive partitioning (`key=value/` subdirs),
- Apache Iceberg (`metadata/` directory), Delta Lake (`_delta_log/`), or
- a Zarr store (`.zattrs` / `.zgroup` at the root).
-
- If no non-datapackage project signals are present in the directory the spec
- parses unconditionally. If sentinel files that indicate another project type
- (`pyproject.toml`, `Cargo.toml`, `package.json`, โฆ) are found, parsing
- succeeds only when the majority of bytes in the root file listing belong to
- recognised data files; otherwise `ParseFailed` is raised so that the
- directory is not double-counted as both a code project and a data project.
- """
-
- icon = "๐๏ธ"
-
- def match(self) -> bool:
- # Fast path: structural layout signals (no file-content inspection needed)
- if self._detect_layout():
- return True
- # Slow path: any top-level file with an unambiguous data extension
- return any(
- os.path.splitext(name)[1].lower() in _DATA_EXTENSIONS
- for name in self.proj.basenames
- )
-
- def parse(self) -> None:
- if self._has_non_data_sentinels():
- if not self._data_bytes_majority():
- raise ParseFailed(
- "Non-data project sentinels found and data files are not "
- "the majority of bytes โ skipping Data spec"
- )
-
- layout = self._detect_layout()
- resources: list
-
- if layout in ("hive", "iceberg", "delta"):
- resources = self._parse_layout_dirs(layout)
- # Delta/Iceberg also commonly store data files at the root level
- # alongside the log/metadata directory; collect those too.
- if layout in ("iceberg", "delta"):
- root_resources = self._parse_flat()
- resources = resources + root_resources
- elif layout in ("zarr_store", "tiledarray"):
- resources = [self._parse_zarr_root()]
- else:
- resources = self._parse_flat()
-
- if not resources:
- raise ParseFailed("No recognisable data files found")
-
- if len(resources) == 1:
- self._contents["data_resource"] = resources[0]
- else:
- self._contents["data_resource"] = AttrDict(
- {_safe_key(r.path): r for r in resources}
- )
-
- def _has_non_data_sentinels(self) -> bool:
- """Return True if any non-datapackage project sentinel is present."""
- basenames = self.proj.basenames
- return any(name in _NON_DATA_SENTINELS for name in basenames)
-
- def _data_bytes_majority(self) -> bool:
- """Return True if data files account for >50 % of root-listing bytes.
-
- Files with unknown / zero size are excluded from both totals so they
- do not unfairly skew the ratio.
- """
- total_bytes = 0
- data_bytes = 0
- for entry in self.proj.filelist:
- size = entry.get("size") or 0
- if size <= 0:
- continue
- total_bytes += size
- ext = os.path.splitext(entry["name"].rsplit("/", 1)[-1])[1].lower()
- if ext in _DATA_EXTENSIONS:
- data_bytes += size
- if total_bytes == 0:
- return False
- return data_bytes > total_bytes / 2
-
- def _detect_layout(self) -> str:
- """Return a layout string, or '' if none of the known layouts match.
-
- Uses the `contains` sentinel approach from intake: certain well-known
- files/directories at the root identify a directory as a logical dataset.
- """
- basenames = self.proj.basenames
- # Zarr store: .zattrs, .zgroup, or zarr.json at the root
- # (zarr.json is the Zarr v3 sentinel; .zattrs/.zgroup are v2)
- if any(s in basenames for s in (".zattrs", ".zgroup", "zarr.json")):
- return "zarr_store"
- dir_names = {_basename(e["name"]) for e in _filelist_dirs(self.proj.filelist)}
- # Delta Lake
- if "_delta_log" in dir_names:
- return "delta"
- # TileDB array directory
- if "__meta" in dir_names and "__schema" in dir_names:
- return "tiledarray"
- # Apache Iceberg: metadata/ directory present
- if "metadata" in dir_names:
- return "iceberg"
- # Partitioned Parquet: _metadata sentinel file at root (written by Spark/Dask)
- if "_metadata" in basenames:
- return "iceberg"
- # Hive: any top-level subdirectory whose name matches key=value
- if any(_HIVE_DIR_RE.match(d) for d in dir_names):
- return "hive"
- return ""
-
- def _resource_from_entries(
- self, entries: list[dict], fmt: str, modality: str, layout: str
- ):
- """Build a DataResource from a list of same-format file entries.
-
- The `path` field is set to:
-
- - Single file: the bare basename, e.g. `"data.csv"`.
- - Multi-file series: a glob pattern, e.g. `"part*.csv"`, built from
- the shared prefix/suffix of the basenames.
- """
- from projspec.content.data import DataResource
-
- full_paths = [e["name"] for e in entries]
- total_size = sum(e.get("size", 0) or 0 for e in entries)
- sample_path = full_paths[0] if full_paths else ""
- schema = _read_schema(sample_path, fmt, self.proj.fs) if sample_path else {}
-
- ext = os.path.splitext(_basename(full_paths[0]))[1] if full_paths else ""
-
- if len(entries) == 1:
- path = _basename(full_paths[0]) or fmt
- else:
- stems = [os.path.splitext(_basename(p))[0] for p in full_paths]
- prefix, suffix = _common_affix(stems)
- stem_pattern = (prefix.rstrip("-_.") or fmt) + "*" + suffix
- path = stem_pattern + ext
-
- return DataResource(
- proj=self.proj,
- path=path,
- format=fmt,
- modality=modality,
- layout=layout,
- file_count=len(entries),
- total_size=total_size,
- schema=schema,
- sample_path=sample_path,
- )
-
- def _parse_flat(self) -> list:
- """Group top-level files by format and naming series.
-
- Files of the same format are only collated into a single DataResource
- when they share a consistent naming schema โ i.e. their stems differ
- only in a numeric or date-like segment (e.g. `part0.csv`,
- `part1.csv` or `2024-02.tiff`, `2024-03.tiff`). Files whose
- stems vary in alphabetic content (e.g. `users.csv`, `orders.csv`)
- each become their own DataResource.
- """
- # First bucket by (fmt, modality)
- fmt_groups: dict[tuple[str, str], list[dict]] = {}
- for entry in _filelist_files(self.proj.filelist):
- fmt_info = _fmt_from_path(entry["name"])
- if fmt_info is None:
- continue
- fmt_groups.setdefault(fmt_info, []).append(entry)
-
- resources = []
- for (fmt, modality), entries in fmt_groups.items():
- # Split each format-group into naming series
- for series in _group_by_naming_series(entries):
- resources.append(
- self._resource_from_entries(series, fmt, modality, "flat")
- )
- return resources
-
- def _parse_layout_dirs(self, layout: str) -> list:
- """One DataResource per top-level subdirectory (partition / table root).
-
- Within each subdirectory the dominant format is determined, then files
- are checked for a consistent naming series before collating.
- """
- dir_entries = _filelist_dirs(self.proj.filelist)
- resources = []
- for dir_entry in dir_entries:
- dir_path = dir_entry["name"]
- dir_name = _basename(dir_path)
- # Skip hidden/internal dirs for iceberg/delta
- if layout in ("iceberg", "delta") and dir_name.startswith(
- ("metadata", "_delta_log", "_")
- ):
- continue
- # Enumerate files one level inside this subdirectory
- try:
- sub_filelist = self.proj.fs.ls(dir_path, detail=True)
- except Exception:
- continue
-
- sub_files = _filelist_files(sub_filelist)
- # Determine dominant (fmt, modality) by file count
- fmt_counts: dict[tuple[str, str], int] = {}
- for e in sub_files:
- fmt_info = _fmt_from_path(e["name"])
- if fmt_info:
- fmt_counts[fmt_info] = fmt_counts.get(fmt_info, 0) + 1
- if not fmt_counts:
- continue
- dominant = max(fmt_counts, key=lambda k: fmt_counts[k])
- dominant_fmt, dominant_modality = dominant
- dominant_files = [
- e for e in sub_files if _fmt_from_path(e["name"]) == dominant
- ]
- resource = self._resource_from_entries(
- dominant_files, dominant_fmt, dominant_modality, layout
- )
- # Override path with the directory basename + trailing slash
- # (partition dirs are already logically grouped by the directory)
- resource.path = dir_name + "/"
- resources.append(resource)
- return resources
-
- def _parse_zarr_root(self):
- """Describe the whole directory as a single array-store resource.
-
- Used for Zarr stores and TileDB arrays โ both are directory-as-dataset
- layouts with no individual data files at the root.
- """
- from projspec.content.data import DataResource
-
- url = self.proj.url
- layout = self._detect_layout()
- # TileDB directories are not Zarr; distinguish the format accordingly
- if layout == "tiledarray":
- fmt, modality = "tiledb", "array"
- schema: dict | list = {}
- else:
- fmt, modality = "zarr", "array"
- schema = {}
- try:
- import zarr # type: ignore[import]
-
- store = zarr.open(url, mode="r")
- schema = {
- "arrays": list(store.array_keys()),
- "groups": list(store.group_keys()),
- "attrs": dict(store.attrs),
- }
- except (ImportError, Exception):
- pass
-
- total_size = sum(
- e.get("size", 0) or 0 for e in _filelist_files(self.proj.filelist)
- )
- return DataResource(
- proj=self.proj,
- path=(_basename(url) or fmt) + "/",
- format=fmt,
- modality=modality,
- layout=layout,
- file_count=len(_filelist_files(self.proj.filelist)),
- total_size=total_size,
- schema=schema,
- sample_path="",
- )
-
-
-# ---------------------------------------------------------------------------
-# Utilities
-# ---------------------------------------------------------------------------
-
-
-def _safe_key(name: str) -> str:
- """Convert an arbitrary name to a valid Python identifier for AttrDict keys."""
- key = re.sub(r"[^0-9a-zA-Z_]", "_", name)
- if key and key[0].isdigit():
- key = "_" + key
- return key or "_unnamed"
diff --git a/src/projspec/proj/data_project.py b/src/projspec/proj/data_project.py
new file mode 100644
index 0000000..d930b2f
--- /dev/null
+++ b/src/projspec/proj/data_project.py
@@ -0,0 +1,434 @@
+"""The :class:`DataProject` project type.
+
+A *data project* is a directory that is wholly or substantially made up of
+data files (as opposed to source code, docs or config). Examples:
+
+* a directory of CSV/parquet/JSON files exported from a database,
+* a folder of images or arrays,
+* a code project that *also* ships a significant amount of bundled data.
+
+Detection policy
+----------------
+Scanning data is comparatively expensive (intake reads magic bytes / samples),
+so we only do it when the data is *worth* describing. Data is considered
+significant when **any** of the following holds:
+
+* the candidate data files make up at least ``data_min_fraction`` of the
+ project's total bytes, **and** their combined size is at least
+ ``data_min_total_size`` (guards against a project of tiny files);
+* at least one individual data file is at least ``data_min_file_size``
+ (a single big file is always worth describing);
+* the directory matched no other project type at all (a bare data dump), as
+  long as the data clears the lower ``data_min_play_size`` bar.
+
+Consolidation
+-------------
+Before handing files to intake, obviously-related files are grouped into a
+single dataset (see :mod:`projspec.proj._consolidate`):
+
+* numbered series — ``001.csv``, ``002.csv`` → ``*.csv``
+* spark/dask parts — ``part-00000.parquet`` … → ``part-*.parquet``
+* token series — ``green.gif``, ``red.gif`` → ``*.gif``
+
+Intake's own directory-dataset recognition (hive parquet, zarr, delta, โฆ) is
+preserved: such directories are inspected as a whole rather than file-by-file.
+
+Per-dataset significance
+------------------------
+Just as the whole directory must clear the significance bar above, the
+individual datasets within a data project are filtered too: a dataset whose
+size is less than ``data_min_fraction`` of the largest dataset is treated as
+incidental and dropped (see :meth:`DataProject._filter_small_datasets`). This
+mirrors the project-level fraction test so that a project dominated by one big
+dataset doesn't also report a handful of tiny, unrelated ones.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from projspec.config import get_conf
+from projspec.proj.base import ProjectSpec, ParseFailed
+from projspec.proj._consolidate import consolidate, FileGroup
+from projspec.utils import AttrDict
+
+logger = logging.getLogger("projspec.data_project")
+
+# Extensions that are *not* data: source code, build/config, docs.  A file
+# with any other extension is a candidate data file; dotfiles and names with
+# no extension at all are rejected separately by
+# ``DataProject._is_data_ext``.  Kept conservative on purpose - intake makes
+# the final call on whether something is real data.
+_NON_DATA_EXT = {
+    # python / compiled
+    ".py",
+    ".pyc",
+    ".pyi",
+    ".pyx",
+    ".pxd",
+    ".so",
+    ".pyd",
+    ".ipynb",
+    # other languages
+    ".c",
+    ".h",
+    ".cpp",
+    ".hpp",
+    ".cc",
+    ".rs",
+    ".go",
+    ".java",
+    ".kt",
+    ".scala",
+    ".js",
+    ".jsx",
+    ".ts",
+    ".tsx",
+    ".rb",
+    ".php",
+    ".swift",
+    ".m",
+    ".sh",
+    ".bash",
+    ".lua",
+    ".pl",
+    ".r",
+    ".jl",
+    # config / build / project metadata
+    ".toml",
+    ".cfg",
+    ".ini",
+    ".lock",
+    ".mk",
+    ".cmake",
+    ".gradle",
+    ".bazel",
+    ".dockerfile",
+    ".env",
+    ".editorconfig",
+    ".gitignore",
+    ".gitattributes",
+    # docs / web
+    ".md",
+    ".rst",
+    ".txt",
+    ".html",
+    ".htm",
+    ".css",
+    ".scss",
+    ".tex",
+    # NOTE: yaml/json extensions are deliberately *not* listed here.  They are
+    # ambiguous - often config but also data - so they stay candidate data and
+    # only count when they dominate (handled by the significance thresholds).
+}
+
+# Directory-based dataset markers intake understands; if any of these names is
+# present in the project root we inspect the whole directory as one dataset
+# rather than enumerating files.
+# NOTE(review): membership is tested against ``proj.basenames``.  At least
+# ``_delta_log`` is a directory name rather than a file, so confirm that
+# ``basenames`` lists directories as well as files.
+_DIR_DATASET_MARKERS = (
+    "_metadata",
+    "_common_metadata",
+    "_delta_log",
+    ".zgroup",
+    ".zarray",
+    "zarr.json",
+    "_latest.manifest",
+)
+
+
+class DataProject(ProjectSpec):
+ """A project that is wholly or substantially composed of data files.
+
+ Produces one :class:`projspec.content.data.Dataset` content object per
+ consolidated dataset found, populated from
+ :func:`intake.readers.inspect.inspect_dataset` where intake is available.
+ """
+
+ icon = "๐๏ธ"
+ spec_doc = (
+ "https://intake.readthedocs.io/en/latest/api2.html"
+ "#intake.readers.inspect.inspect_dataset"
+ )
+
+ # โโ helpers โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+    @staticmethod
+    def _is_data_ext(name: str) -> bool:
+        """Decide from a basename alone whether the file is candidate data.
+
+        Dotfiles and extension-less names are rejected, the compressed
+        tabular double extensions are accepted outright, and anything else
+        counts as data unless its final extension is in ``_NON_DATA_EXT``.
+        """
+        folded = name.lower()
+        # dotfiles are metadata; extension-less names are usually not a
+        # recognisable dataset
+        if folded.startswith(".") or "." not in folded:
+            return False
+        if folded.endswith((".csv.gz", ".json.gz", ".tsv.gz")):
+            return True
+        _, _, tail = folded.rpartition(".")
+        return f".{tail}" not in _NON_DATA_EXT
+
+    def _candidate_files(self) -> list[tuple[str, int | None]]:
+        """Collect ``(basename, size)`` for data-like files directly in the root.
+
+        Directory entries of the project listing are skipped; ``size`` is
+        whatever the listing reports for the file, or ``None`` when absent.
+        """
+        files = (e for e in self.proj.filelist if e.get("type") != "directory")
+        pairs = ((e["name"].rsplit("/", 1)[-1], e.get("size")) for e in files)
+        return [(base, size) for base, size in pairs if self._is_data_ext(base)]
+
+    def _has_dir_dataset(self) -> bool:
+        """True if the root carries one of the ``_DIR_DATASET_MARKERS`` names,
+        i.e. the root itself is an intake directory-dataset."""
+        present = self.proj.basenames
+        for marker in _DIR_DATASET_MARKERS:
+            if marker in present:
+                return True
+        return False
+
+ # โโ match โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+    def match(self) -> bool:
+        """Cheap check for a directory-dataset marker or any candidate data file.
+
+        Only names from the root listing are looked at here; the size and
+        fraction thresholds are applied later, in :meth:`parse`, so that
+        ``match`` never reads file contents.
+        """
+        return self._has_dir_dataset() or bool(self._candidate_files())
+
+ # โโ significance policy โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+    def _other_type_matches(self) -> bool:
+        """Cheaply test whether any *other* project type matches this directory.
+
+        ``parse`` runs in registry order, so ``self.proj.specs`` is not yet
+        complete when ``DataProject`` is parsed.  Instead every other
+        registered spec is instantiated against this project; constructing a
+        spec runs its ``match()`` and raises when it does not match.
+        ``match`` is contractually cheap (basename checks), so this adds
+        little cost and only happens once per directory that has candidate
+        data.
+
+        Returns:
+            ``True`` as soon as one spec that is neither this type nor a
+            ``ProjectExtra`` can be constructed, ``False`` if none can.
+        """
+        from projspec.proj.base import registry, ProjectExtra
+
+        for name, cls in registry.items():
+            # Skip ourselves both by registry key and by identity, so this
+            # check can never re-enter DataProject even if the key differs.
+            if name == "data_project" or cls is type(self):
+                continue
+            # ProjectExtra specs (licences, CI, intake catalogs, …) are
+            # cross-cutting add-ons, not standalone project types, so a match
+            # from one of them should not suppress a data project.
+            if issubclass(cls, ProjectExtra):
+                continue
+            try:
+                # Only success/failure matters; the instance is discarded.
+                cls(self.proj)  # __init__ runs match(), raises if no match
+            except Exception:
+                continue
+            logger.debug("DataProject deferring to %s for %s", name, self.proj.url)
+            return True
+        return False
+
+    def _is_significant(self, data_bytes: int, max_file: int) -> bool:
+        """Decide whether the candidate data is worth reporting as a project.
+
+        ``data_bytes`` is the combined size of the candidate files and
+        ``max_file`` the size of the largest one.  Three rules are tried in
+        order and the first that holds wins.
+        """
+        big_file = get_conf("data_min_file_size")
+        big_total = get_conf("data_min_total_size")
+        frac_floor = get_conf("data_min_fraction")
+        play_floor = get_conf("data_min_play_size")
+
+        # Rule 1: one large file is enough on its own.
+        if max_file >= big_file:
+            return True
+
+        # Rule 2: data makes up a large enough share of the project's bytes
+        # and is not trivially small in absolute terms.
+        project_bytes = self.proj.total_size or data_bytes
+        dominates = bool(project_bytes) and data_bytes / project_bytes >= frac_floor
+        if dominates and data_bytes >= big_total:
+            return True
+
+        # Rule 3: no other project type matched, so any data dump above the
+        # "play data" size counts; this bar is lower than data_min_total_size.
+        return data_bytes >= play_floor and not self._other_type_matches()
+
+    def _filter_small_datasets(self, datasets: list) -> list:
+        """Discard datasets that are tiny compared with the largest one.
+
+        ``datasets`` is a list of ``(name, Dataset)`` pairs.  A pair is kept
+        when its ``total_size`` divided by the largest ``total_size`` is at
+        least ``data_min_fraction``.
+
+        The input list is returned unchanged when it holds fewer than two
+        pairs, when any size is unknown (``None``), when the largest size is
+        not positive, or when the threshold would remove every pair.
+        """
+        if len(datasets) < 2:
+            return datasets
+
+        sizes = []
+        for _, ds in datasets:
+            size = getattr(ds, "total_size", None)
+            if size is None:
+                # fractions are meaningless with an unknown size
+                return datasets
+            sizes.append(size)
+
+        top = max(sizes)
+        if top <= 0:
+            return datasets
+
+        threshold = get_conf("data_min_fraction")
+        survivors = [
+            pair for pair, size in zip(datasets, sizes) if size / top >= threshold
+        ]
+        # never drop everything (e.g. a threshold above 1): keep the original
+        return survivors if survivors else datasets
+
+ # โโ parse โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+    def parse(self) -> None:
+        """Populate ``self._contents`` with one ``Dataset`` per dataset found.
+
+        Steps: gather candidate files, apply the significance policy (skipped
+        when the root is a directory-dataset), group files into datasets,
+        describe each group, drop groups without a datatype, drop groups that
+        are small relative to the largest, then key the survivors by name.
+
+        Raises:
+            ParseFailed: if the data is not significant, or if no described
+                dataset has a ``datatype``.
+        """
+        candidates = self._candidate_files()
+        has_dir_dataset = self._has_dir_dataset()
+
+        # Unknown sizes (None) count as zero bytes in both aggregates.
+        data_bytes = sum(s or 0 for _, s in candidates)
+        max_file = max((s or 0 for _, s in candidates), default=0)
+
+        if not has_dir_dataset and not self._is_significant(data_bytes, max_file):
+            raise ParseFailed("Data present but not a significant data project")
+
+        groups: list[FileGroup]
+        if has_dir_dataset:
+            # Let intake describe the whole directory as one dataset.
+            name = self.proj.url.rstrip("/").rsplit("/", 1)[-1] or "dataset"
+            groups = [
+                FileGroup(
+                    members=[],
+                    total_size=self.proj.total_size,
+                    pattern=name,
+                    consolidated=True,
+                )
+            ]
+            dir_dataset = True
+        else:
+            min_group = get_conf("data_consolidate_min_group")
+            groups = consolidate(candidates, min_group=min_group)
+            dir_dataset = False
+
+        if len(groups) > get_conf("data_inspect_max_datasets"):
+            logger.debug(
+                "Too many datasets (%d) in %s; describing without intake",
+                len(groups),
+                self.proj.url,
+            )
+            described = [self._describe_without_intake(g) for g in groups]
+        else:
+            described = [self._describe(g, dir_dataset=dir_dataset) for g in groups]
+
+        # Each entry is a (name, Dataset) pair. Only keep datasets that intake
+        # could assign a datatype to; datasets whose type could not be
+        # identified are not useful as data content.
+        # NOTE(review): _describe_without_intake always sets datatype=None, so
+        # every dataset from the "too many datasets" branch above (and every
+        # dataset when intake is unavailable) is removed here and parse ends in
+        # ParseFailed below - confirm this is the intended outcome.
+        described = [(name, ds) for name, ds in described if ds.datatype is not None]
+
+        # Drop datasets that are only a small fraction of the largest one,
+        # analogous to the project-level significance test.
+        described = self._filter_small_datasets(described)
+
+        if not described:
+            raise ParseFailed("No datasets with an identified datatype found")
+
+        # Datasets are keyed by their (unique) name; the name is therefore not
+        # duplicated as a field on the Dataset objects themselves.
+        datasets = AttrDict()
+        for name, ds in described:
+            key = name
+            # guard against the (rare) case of duplicate names: append
+            # "#2", "#3", ... until the key is free
+            n = 2
+            while key in datasets:
+                key = f"{name}#{n}"
+                n += 1
+            datasets[key] = ds
+        self._contents = AttrDict(dataset=datasets)
+
+ # โโ dataset description โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+    def _root_url(self) -> str:
+        """Return the project root with its filesystem protocol restored.
+
+        ``self.proj.url`` is the filesystem-specific path with the protocol
+        stripped (e.g. ``bucket/key`` for ``s3://bucket/key``).  Intake needs
+        the protocol to pick the right filesystem, so the project's own
+        filesystem object is asked to put it back.
+        """
+        project = self.proj
+        return project.fs.unstrip_protocol(project.url)
+
+    def _dataset_url(self, group: FileGroup, dir_dataset: bool):
+        """URL to inspect for ``group``: the project root itself for a
+        directory-dataset, otherwise the group's URL under that root."""
+        root = self._root_url()
+        return root if dir_dataset else group.url(root)
+
+    def _describe_without_intake(self, group: FileGroup):
+        """Describe ``group`` from its file names and sizes only (no I/O).
+
+        Returns a ``(name, Dataset)`` pair whose name becomes the key in the
+        project's ``contents.dataset`` mapping.  The datatype, structure,
+        schema and metadata are left empty.
+        """
+        from projspec.content.data import Dataset
+
+        label = group.name
+        # a group with no listed members still stands for one dataset
+        member_count = len(group.members) or 1
+        bare = Dataset(
+            proj=self.proj,
+            url=group.url(self._root_url()),
+            datatype=None,
+            structure=[],
+            schema={},
+            n_files=member_count,
+            total_size=group.total_size,
+            metadata={},
+        )
+        return label, bare
+
+    def _describe(self, group: FileGroup, dir_dataset: bool = False):
+        """Describe a single file-group as a Dataset, using intake if available.
+
+        Args:
+            group: the consolidated file group to describe.
+            dir_dataset: when true, the project root URL is inspected as one
+                dataset and ``group.pattern`` is used as its name.
+
+        Returns:
+            A ``(name, Dataset)`` pair.  If intake cannot be imported, if
+            ``inspect_dataset`` raises, or if it returns nothing, the result
+            of :meth:`_describe_without_intake` is returned instead.
+        """
+        from projspec.content.data import Dataset
+
+        url = self._dataset_url(group, dir_dataset)
+        info: dict | None = None
+        try:
+            from intake.readers.inspect import inspect_dataset
+
+            # storage_options keep remote access working.
+            # NOTE(review): an earlier comment also promised a size guard and
+            # a timeout, but neither is passed in this call - presumably they
+            # are enforced inside inspect_dataset; confirm.
+            info = inspect_dataset(
+                url,
+                storage_options=self.proj.storage_options or None,
+            )
+        except ImportError:
+            logger.debug("intake not installed; describing %s by name only", url)
+        except Exception as exc:  # never let a bad file abort the whole parse
+            logger.debug("inspect_dataset failed for %s: %s", url, exc)
+
+        if not info:
+            return self._describe_without_intake(group)
+
+        # Prefer intake's figures, falling back to what the listing gave us.
+        n_files = info.get("n_files") or (len(group.members) or 1)
+        total = info.get("file_size_bytes")
+        if total is None:
+            total = group.total_size
+
+        # Copy across only the optional keys intake actually filled in.
+        meta = {
+            k: info[k]
+            for k in (
+                "shape",
+                "npartitions",
+                "reader_used",
+                "description",
+                "html_repr",
+                "thumbnail",
+            )
+            if info.get(k) is not None
+        }
+        # report which readers intake thinks can load this, if any
+        readers = info.get("readers") or {}
+        if readers:
+            meta["readers"] = sorted(readers)
+
+        structure = info.get("structure") or set()
+        name = group.pattern if dir_dataset else group.name
+        return name, Dataset(
+            proj=self.proj,
+            url=url,
+            datatype=info.get("detected_type"),
+            # a set has no stable order, so sort it; other iterables keep theirs
+            structure=sorted(structure)
+            if isinstance(structure, set)
+            else list(structure),
+            schema=info.get("datashape") or {},
+            n_files=n_files,
+            total_size=total,
+            metadata=meta,
+        )
diff --git a/src/projspec/proj/knowledge_catalog.py b/src/projspec/proj/knowledge_catalog.py
new file mode 100644
index 0000000..bf31fca
--- /dev/null
+++ b/src/projspec/proj/knowledge_catalog.py
@@ -0,0 +1,208 @@
+"""The :class:`KnowledgeCatalog` project spec.
+
+Detects an *Open Knowledge Format* (OKF) bundle: a directory tree of markdown
+files with YAML frontmatter, where every non-reserved ``.md`` file is a
+"concept" carrying at least a ``type`` field. Two filenames are reserved at
+any level: ``index.md`` (directory listing) and ``log.md`` (update history).
+
+See https://github.com/GoogleCloudPlatform/knowledge-catalog/blob/main/okf/SPEC.md
+"""
+
+from __future__ import annotations
+
+import os
+from io import StringIO
+
+from projspec.proj import ProjectSpec
+from projspec.proj.base import ParseFailed
+from projspec.utils import AttrDict
+
+# Filenames with reserved meaning that are never concept documents (§3.1).
+_RESERVED = {"index.md", "log.md"}
+
+
+def _split_frontmatter(text: str | bytes) -> dict | None:
+    """Return the parsed YAML frontmatter block of a markdown document.
+
+    The frontmatter must open with a line consisting of ``---`` (optionally
+    preceded by a byte-order mark and blank lines) and end at the next line
+    consisting of ``---``.  Fences are matched per line, so ``\\r\\n`` line
+    endings, a closing fence on the last line without a trailing newline, and
+    ``---`` appearing at the end of a content line are all handled.
+
+    Args:
+        text: the document; ``bytes`` are decoded as UTF-8 with replacement.
+
+    Returns:
+        The frontmatter as a ``dict``, or ``None`` when there is no fenced
+        block at the start, the YAML cannot be loaded, or it is not a mapping.
+    """
+    if isinstance(text, bytes):
+        text = text.decode("utf-8", "replace")
+    # Normalise line endings so the fence comparison below is per line.
+    normalised = text.replace("\r\n", "\n").replace("\r", "\n")
+    lines = normalised.lstrip("\ufeff").lstrip().split("\n")
+    if lines[0].rstrip() != "---":
+        return None
+    closing = next(
+        (i for i, line in enumerate(lines[1:], start=1) if line.rstrip() == "---"),
+        None,
+    )
+    if closing is None:
+        return None
+
+    import yaml
+
+    try:
+        meta = yaml.safe_load("\n".join(lines[1:closing]))
+    except Exception:
+        # Best effort: malformed YAML (or a value the loader cannot construct)
+        # simply means this is not a conformant document.
+        return None
+    return meta if isinstance(meta, dict) else None
+
+
+class KnowledgeCatalog(ProjectSpec):
+ """An Open Knowledge Format (OKF) knowledge bundle.
+
+ An OKF bundle is a directory of markdown "concept" documents, each with a
+ YAML frontmatter block declaring a ``type``. Reserved ``index.md`` /
+ ``log.md`` files provide directory listings and update history.
+
+ Produces one :class:`projspec.content.metadata.DescriptiveMetadata` per
+ concept, keyed by its *concept ID* (the file path within the bundle with
+ the ``.md`` suffix removed, e.g. ``tables/orders``).
+ """
+
+ icon = "๐"
+ spec_doc = (
+ "https://github.com/GoogleCloudPlatform/knowledge-catalog/blob/main/okf/SPEC.md"
+ )
+
+    def match(self) -> bool:
+        """Cheap check: ``index.md`` exists at the root, together with either
+        another non-reserved markdown file or a visible subdirectory.
+
+        Whether the documents really are concepts (carry a ``type`` field) is
+        left to :meth:`parse`, which raises :class:`ParseFailed` if none
+        qualify, so a plain ``index.md`` from some other tool does not
+        register as an OKF bundle.
+        """
+        names = self.proj.basenames
+        if "index.md" not in names:
+            return False
+        # another markdown concept at the root...
+        if any(n.endswith(".md") and n not in _RESERVED for n in names):
+            return True
+        # ...or a subdirectory that might contain concepts
+        for entry in self.proj.filelist:
+            if entry.get("type") != "directory":
+                continue
+            leaf = str(entry["name"]).rstrip("/").rsplit("/", 1)[-1]
+            # hidden/dunder dirs do not count (handled like project walking)
+            if not leaf.startswith((".", "_")):
+                return True
+        return False
+
+    def _concept_files(self) -> list[str]:
+        """Sorted full paths of every non-reserved ``.md`` file in the bundle.
+
+        The search is a recursive glob under the project root; if globbing
+        raises, only the top-level listing is used.
+        """
+        base_url = self.proj.url.rstrip("/")
+        try:
+            # glob may return a list or (with detail) a dict keyed by path
+            found = list(self.proj.fs.glob(f"{base_url}/**/*.md"))
+        except Exception:
+            # fall back to the top-level listing if globbing isn't supported
+            found = [
+                full
+                for name, full in self.proj.basenames.items()
+                if name.endswith(".md")
+            ]
+        as_text = (str(p) for p in found)
+        return sorted(p for p in as_text if p.rsplit("/", 1)[-1] not in _RESERVED)
+
+    def _concept_id(self, full_path: str) -> str:
+        """Turn a full path into a concept ID: the path relative to the bundle
+        root, without its ``.md`` suffix."""
+        prefix = self.proj.url.rstrip("/") + "/"
+        suffix = ".md"
+        concept = full_path
+        if concept.startswith(prefix):
+            concept = concept[len(prefix) :]
+        return concept[: -len(suffix)] if concept.endswith(suffix) else concept
+
+    def parse(self) -> None:
+        """Read every candidate concept document and record the conformant ones.
+
+        A document counts as a concept when it has a parseable frontmatter
+        mapping with a non-empty ``type``.  Each concept becomes a
+        ``DescriptiveMetadata`` under ``contents.concept``, keyed by concept
+        ID.  If the root ``index.md`` declares ``okf_version`` it is recorded
+        under ``contents.descriptive_metadata``.
+
+        Files that cannot be opened or decoded are skipped rather than
+        aborting the whole parse.
+
+        Raises:
+            ParseFailed: if no document qualifies as a concept.
+        """
+        from projspec.content.metadata import DescriptiveMetadata
+
+        concepts = AttrDict()
+        for full in self._concept_files():
+            try:
+                with self.proj.fs.open(full, "rt") as f:
+                    text = f.read()
+            except (OSError, UnicodeError):
+                # unreadable, or not valid text in the stream's encoding:
+                # one bad file must not abort the whole bundle
+                continue
+            meta = _split_frontmatter(text)
+            if not meta:
+                # not a conformant concept document - skip
+                continue
+            type_ = meta.get("type")
+            if not type_ or not str(type_).strip():
+                # §9: every concept frontmatter must carry a non-empty `type`
+                continue
+
+            entry: dict[str, str] = {"type": str(type_)}
+            for field in ("title", "description", "resource", "timestamp"):
+                val = meta.get(field)
+                if val:
+                    entry[field] = str(val)
+            tags = meta.get("tags")
+            if tags:
+                # flatten a tag list into one comma-separated string
+                if isinstance(tags, (list, tuple)):
+                    entry["tags"] = ", ".join(str(t) for t in tags)
+                else:
+                    entry["tags"] = str(tags)
+
+            key = self._concept_id(full)
+            concepts[key] = DescriptiveMetadata(proj=self.proj, meta=entry)
+
+        if not concepts:
+            raise ParseFailed("No OKF concept documents with a 'type' field found")
+
+        # The bundle-root index.md may declare the OKF version it targets.
+        bundle_meta: dict[str, str] = {}
+        if "index.md" in self.proj.basenames:
+            try:
+                with self.proj.get_file("index.md") as f:
+                    idx = _split_frontmatter(f.read())
+            except (OSError, UnicodeError):
+                idx = None
+            if idx and idx.get("okf_version"):
+                bundle_meta["okf_version"] = str(idx["okf_version"])
+
+        contents = AttrDict(concept=concepts)
+        if bundle_meta:
+            contents["descriptive_metadata"] = DescriptiveMetadata(
+                proj=self.proj, meta=bundle_meta
+            )
+        self._contents = contents
+        self._artifacts = AttrDict()
+
+    @staticmethod
+    def _create(path: str) -> None:
+        """Scaffold a minimal but conformant OKF bundle inside ``path``.
+
+        Writes ``index.md``, ``log.md`` and ``overview.md`` as UTF-8.  The
+        directory itself is not created here.  The bundle name is the last
+        component of ``path`` (``"bundle"`` if that is empty).
+        """
+        import json
+
+        name = os.path.basename(path.rstrip("/")) or "bundle"
+        # A JSON string literal is also a valid YAML double-quoted scalar, so
+        # quoting keeps the frontmatter parseable when the directory name
+        # contains YAML-significant characters such as ":" or "#".
+        title = json.dumps(f"{name} overview", ensure_ascii=False)
+
+        with open(f"{path}/index.md", "w", encoding="utf-8") as f:
+            f.write(
+                "---\n"
+                'okf_version: "0.1"\n'
+                "---\n\n"
+                f"# {name}\n\n"
+                "* [Overview](overview.md) - what this bundle contains\n"
+            )
+
+        with open(f"{path}/log.md", "w", encoding="utf-8") as f:
+            f.write(
+                "# Update Log\n\n"
+                "## 2026-01-01\n"
+                "* **Initialization**: Created the bundle.\n"
+            )
+
+        with open(f"{path}/overview.md", "w", encoding="utf-8") as f:
+            f.write(
+                "---\n"
+                "type: Reference\n"
+                f"title: {title}\n"
+                "description: A short description of this knowledge bundle.\n"
+                "---\n\n"
+                f"# {name}\n\n"
+                "Free-form markdown describing the knowledge captured here.\n"
+            )
diff --git a/src/projspec/textapp/main.py b/src/projspec/textapp/main.py
index 5911bb4..bff7407 100644
--- a/src/projspec/textapp/main.py
+++ b/src/projspec/textapp/main.py
@@ -171,19 +171,9 @@ def _basename(url: str) -> str:
def _fmt_age(ts: float) -> str:
- import datetime
+ from projspec.proj.base import _humanize_age
- days = (datetime.datetime.now() - datetime.datetime.fromtimestamp(ts)).days
- if days == 0:
- return "today"
- if days == 1:
- return "yesterday"
- if days < 30:
- return f"{days} days ago"
- if days < 365:
- return f"{days // 30} months ago"
- yrs = days // 365
- return f"{yrs} year{'s' if yrs > 1 else ''} ago"
+ return _humanize_age(ts)
def _is_enum(v: Any) -> bool:
@@ -253,7 +243,15 @@ def _yaml_lines(
return [f"{pad}{_role('{}', 'muted')}"]
out = []
for k, v in data.items():
- if _is_enum(v):
+ # The web UIs embed these as live HTML / an image; a TUI can't, so
+ # show a short placeholder rather than dumping the huge raw string.
+ if k in ("html_repr", "thumbnail") and isinstance(v, str):
+ note = "HTML preview" if k == "html_repr" else "image thumbnail"
+ out.append(
+ f"{pad}{_role(str(k), 'field')}: "
+ f"{_role(f'<{note} available in graphical UI>', 'muted')}"
+ )
+ elif _is_enum(v):
out.append(
f"{pad}{_role(str(k), 'field')}: "
f"{_role(_enum_label(v, enums), 'enum')}"
@@ -713,6 +711,9 @@ def compose(self) -> ComposeResult:
age = _fmt_age(float(last_modified))
by = self.project.get("last_modified_by")
meta_parts.append("last modified " + age + (f" by {by}" if by else ""))
+ scanned_at = self.project.get("scanned_at")
+ if scanned_at is not None:
+ meta_parts.append("scanned " + _fmt_age(float(scanned_at)))
if meta_parts:
yield Static(" ยท ".join(meta_parts), classes="meta")
# Build the full list of chips first, then split into horizontal
diff --git a/src/projspec/utils.py b/src/projspec/utils.py
index fa70261..64dd015 100644
--- a/src/projspec/utils.py
+++ b/src/projspec/utils.py
@@ -15,6 +15,10 @@
logger = logging.getLogger("projspec")
+class DEFAULT:
+    # NOTE(review): empty class with no members; presumably used as a sentinel
+    # meaning "no value supplied" where None is itself a valid value - no
+    # usage is visible in this hunk, so confirm against the callers.
+    ...
+
+
class Enum(enum.Enum):
"""Named enum values, so that str(x) looks like the label."""
@@ -113,7 +117,10 @@ def from_dict(dic, proj=None):
if dic["klass"] == "project":
return Project.from_dict(dic)
category, name = dic.pop("klass")
- cls = get_cls(name, category)
+ try:
+ cls = get_cls(name, category)
+ except KeyError:
+ return None
if category == "enum":
return cls(dic["value"])
obj = object.__new__(cls)
diff --git a/src/projspec/webui/panel.js b/src/projspec/webui/panel.js
index 870e249..b1f3904 100644
--- a/src/projspec/webui/panel.js
+++ b/src/projspec/webui/panel.js
@@ -102,8 +102,18 @@
}
}
function fmtAge(ts) {
- const days = Math.floor((Date.now() / 1000 - parseFloat(ts)) / 86400);
- if (days === 0) return 'today';
+ const secs = Math.floor(Date.now() / 1000 - parseFloat(ts));
+ if (secs < 0) return 'just now';
+ const days = Math.floor(secs / 86400);
+ if (days === 0) {
+ if (secs < 60) return 'just now';
+ if (secs < 3600) {
+ const m = Math.floor(secs / 60);
+ return m + ' minute' + (m !== 1 ? 's' : '') + ' ago';
+ }
+ const h = Math.floor(secs / 3600);
+ return h + ' hour' + (h !== 1 ? 's' : '') + ' ago';
+ }
if (days === 1) return 'yesterday';
if (days < 30) return days + ' days ago';
if (days < 365) return Math.floor(days / 30) + ' months ago';
@@ -200,6 +210,8 @@
const by = project.last_modified_by != null ? project.last_modified_by : null;
metaParts.push('last modified ' + age + (by ? ' by ' + by : ''));
}
+ if (project.scanned_at != null)
+ metaParts.push('scanned ' + fmtAge(project.scanned_at));
if (metaParts.length > 0) {
const meta = document.createElement('div');
meta.className = 'meta';
@@ -456,14 +468,69 @@
body.innerHTML = sanitizeHtml(html);
w.appendChild(body);
} else {
+ // Datasets (and other content) may carry rich previews in
+ // ``metadata.html_repr`` (an HTML fragment) and
+ // ``metadata.thumbnail`` (a data: image URL). Embed those rather
+ // than dumping their (often huge) raw strings into the YAML tree.
+ const meta = (kind === 'content' && data && typeof data === 'object'
+ && data.metadata && typeof data.metadata === 'object') ? data.metadata : null;
+ const htmlRepr = meta && typeof meta.html_repr === 'string' ? meta.html_repr : null;
+ const thumb = meta && typeof meta.thumbnail === 'string' ? meta.thumbnail : null;
+
const tree = document.createElement('div');
tree.className = 'tree yaml';
- tree.appendChild(renderYaml(stripKlass(data)));
+ tree.appendChild(renderYaml(stripPreview(stripKlass(data))));
w.appendChild(tree);
+
+ if (thumb) w.appendChild(thumbnailImg(thumb));
+ if (htmlRepr) {
+ const body = document.createElement('div');
+ body.className = 'widget-html';
+ body.innerHTML = sanitizeHtml(htmlRepr);
+ w.appendChild(body);
+ }
}
return w;
}
+ /**
+ * Return a shallow copy of a content dict with the embedded-preview keys
+ * (``metadata.html_repr`` / ``metadata.thumbnail``) removed, so the YAML
+ * tree doesn't show their large raw strings - they are rendered as live
+ * HTML / an image instead.
+ */
+ function stripPreview(obj) {
+ if (!obj || typeof obj !== 'object' || Array.isArray(obj)) return obj;
+ if (!obj.metadata || typeof obj.metadata !== 'object' || Array.isArray(obj.metadata)) return obj;
+ const meta = {};
+ let changed = false;
+ for (const k of Object.keys(obj.metadata)) {
+ if (k === 'html_repr' || k === 'thumbnail') { changed = true; continue; }
+ meta[k] = obj.metadata[k];
+ }
+ if (!changed) return obj;
+ const out = {};
+ for (const k of Object.keys(obj)) out[k] = obj[k];
+ out.metadata = meta;
+ return out;
+ }
+
+ /**
+ * Build an <img> for a ``data:image/...`` thumbnail URL. Only accepts
+ * data: image URLs (never remote/javascript URLs).
+ */
+ function thumbnailImg(src) {
+ const wrap = document.createElement('div');
+ wrap.className = 'widget-html';
+ if (/^data:image\//i.test(src)) {
+ const img = document.createElement('img');
+ img.src = src;
+ img.alt = 'thumbnail';
+ wrap.appendChild(img);
+ }
+ return wrap;
+ }
+
/**
* Minimal HTML sanitisation for content-provided ``_html`` fragments.
* The markup originates from projspec itself, so we don't need a
diff --git a/tests/test_basic.py b/tests/test_basic.py
index 778d53b..b9f011d 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -1,5 +1,6 @@
import json
import pickle
+import time
import pytest
@@ -14,7 +15,32 @@ def test_basic(proj):
assert "src/projspec" in proj.children
assert repr(proj).count("\n") == 0
assert str(proj).count("\n") > 0
- proj._repr_html_()
+ proj._ipython_display_()
+
+
+def test_humanize_age():
+ from projspec.proj.base import _humanize_age
+
+ now = time.time()
+ assert _humanize_age(now) == "just now"
+ assert _humanize_age(now + 100) == "just now" # future / clock skew
+ assert _humanize_age(now - 5 * 60) == "5 minutes ago"
+ assert _humanize_age(now - 60) == "1 minute ago"
+ assert _humanize_age(now - 3 * 3600) == "3 hours ago"
+ assert _humanize_age(now - 1.5 * 86400) == "yesterday"
+ assert _humanize_age(now - 10 * 86400) == "10 days ago"
+ assert _humanize_age(now - 60 * 86400) == "2 months ago"
+ assert _humanize_age(now - 400 * 86400) == "1 year ago"
+ assert _humanize_age(now - 800 * 86400) == "2 years ago"
+
+
+def test_scanned_at_in_stats_line(proj):
+ # scanned_at should appear in the textual surfaces
+ assert "scanned " in proj._stats_line()
+ assert "scanned " in proj.text_summary()
+ assert "scanned " in str(proj)
+ # bare summary omits the stats line entirely
+ assert "scanned " not in proj.text_summary(bare=True)
def test_errors():
diff --git a/tests/test_data_html.py b/tests/test_data_html.py
deleted file mode 100644
index 2d6e6ea..0000000
--- a/tests/test_data_html.py
+++ /dev/null
@@ -1,449 +0,0 @@
-"""Tests for projspec.content.data_html — repr_text and repr_html.
-
-These tests use a mock DataResource to avoid needing real data files on disk
-for basic formatting checks, then run format-specific loader tests when the
-required optional libraries are available.
-"""
-
-from __future__ import annotations
-
-import io
-import os
-import tempfile
-from unittest.mock import MagicMock
-
-import pytest
-
-import projspec
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
-def _make_dr(
- path="mytable.parquet",
- fmt="parquet",
- modality="tabular",
- layout="flat",
- file_count=3,
- total_size=1024 * 512,
- schema=None,
- sample_path="",
- metadata=None,
-):
- """Build a DataResource backed by a real Project (the repo root) but with
- controlled field values."""
- from projspec.content.data import DataResource
-
- mock_proj = MagicMock(spec=projspec.Project)
- # Use a real local filesystem via fsspec
- import fsspec
-
- mock_proj.fs = fsspec.filesystem("file")
- mock_proj.url = "/tmp"
-
- return DataResource(
- proj=mock_proj,
- path=path,
- format=fmt,
- modality=modality,
- layout=layout,
- file_count=file_count,
- total_size=total_size,
- schema=schema or {},
- sample_path=sample_path,
- metadata=metadata or {},
- )
-
-
-# ---------------------------------------------------------------------------
-# repr_text tests
-# ---------------------------------------------------------------------------
-
-
-class TestReprText:
- def test_basic_fields_present(self):
- dr = _make_dr()
- text = repr(dr)
- assert "mytable.parquet" in text
- assert "parquet" in text
- assert "tabular" in text
- assert "files=3" in text
-
- def test_size_formatting(self):
- dr = _make_dr(total_size=1024)
- text = repr(dr)
- assert "KB" in text or "B" in text
-
- def test_size_zero(self):
- dr = _make_dr(total_size=0)
- text = repr(dr)
- assert "unknown" in text
-
- def test_schema_hint_dict(self):
- dr = _make_dr(schema={"col_a": "int64", "col_b": "float32", "col_c": "str"})
- text = repr(dr)
- assert "col_a" in text
-
- def test_schema_hint_many_fields(self):
- schema = {f"col_{i}": "int64" for i in range(10)}
- dr = _make_dr(schema=schema)
- text = repr(dr)
- assert "+7 more" in text
-
- def test_schema_hint_list(self):
- dr = _make_dr(schema=[{"name": "a"}, {"name": "b"}])
- text = repr(dr)
- assert "2 fields" in text
-
- def test_non_flat_layout_shown(self):
- dr = _make_dr(layout="hive")
- text = repr(dr)
- assert "hive" in text
-
- def test_flat_layout_hidden(self):
- dr = _make_dr(layout="flat")
- text = repr(dr)
- assert "layout" not in text
-
- def test_no_modality(self):
- dr = _make_dr(modality="")
- text = repr(dr)
- assert "modality" not in text
-
- def test_single_line(self):
- dr = _make_dr()
- text = repr(dr)
- assert "\n" not in text
-
- def test_path_shown(self):
- """repr_text must show the path field, not a separate name."""
- dr = _make_dr(path="part*.csv")
- text = repr(dr)
- assert "part*.csv" in text
-
- def test_dir_path_shown(self):
- dr = _make_dr(path="year=2024/")
- text = repr(dr)
- assert "year=2024/" in text
-
-
-# ---------------------------------------------------------------------------
-# repr_html tests
-# ---------------------------------------------------------------------------
-
-
-class TestReprHtml:
- def test_returns_string(self):
- dr = _make_dr()
- html = dr._repr_html_()
- assert isinstance(html, str)
- assert len(html) > 0
-
- def test_contains_path(self):
- dr = _make_dr(path="my_dataset.parquet")
- html = dr._repr_html_()
- assert "my_dataset.parquet" in html
-
- def test_contains_glob_path(self):
- dr = _make_dr(path="part*.parquet")
- html = dr._repr_html_()
- assert "part*.parquet" in html
-
- def test_contains_dir_path(self):
- dr = _make_dr(path="year=2024/")
- html = dr._repr_html_()
- assert "year=2024/" in html
-
- def test_contains_format_badge(self):
- dr = _make_dr(fmt="parquet")
- html = dr._repr_html_()
- assert "parquet" in html
-
- def test_contains_modality_badge(self):
- dr = _make_dr(modality="tabular")
- html = dr._repr_html_()
- assert "tabular" in html
-
- def test_contains_file_count(self):
- dr = _make_dr(file_count=7)
- html = dr._repr_html_()
- assert "7" in html
-
- def test_contains_size(self):
- dr = _make_dr(total_size=2048)
- html = dr._repr_html_()
- assert "KB" in html or "B" in html
-
- def test_schema_dict_rendered(self):
- dr = _make_dr(schema={"id": "int64", "name": "string"})
- html = dr._repr_html_()
- assert "id" in html
- assert "int64" in html
-
- def test_schema_list_of_dicts_rendered(self):
- dr = _make_dr(
- schema=[
- {"name": "id", "type": "integer"},
- {"name": "val", "type": "number"},
- ]
- )
- html = dr._repr_html_()
- assert "id" in html
- assert "integer" in html
-
- def test_schema_empty_no_details(self):
- dr = _make_dr(schema={})
- html = dr._repr_html_()
- assert "Schema" not in html
-
- def test_no_preview_section_without_sample_path(self):
- dr = _make_dr(sample_path="")
- html = dr._repr_html_()
- assert "Preview" not in html
-
- def test_layout_badge_shown_for_hive(self):
- dr = _make_dr(layout="hive")
- html = dr._repr_html_()
- assert "hive" in html
-
- def test_layout_badge_hidden_for_flat(self):
- dr = _make_dr(layout="flat")
- html = dr._repr_html_()
- assert 'ps-badge-gray">flat<' not in html
-
- def test_html_structure(self):
- dr = _make_dr()
- html = dr._repr_html_()
- assert "ps-data-card" in html
- assert "ps-data-card-header" in html
- assert "ps-data-meta" in html
-
- def test_icon_present_for_known_modality(self):
- dr = _make_dr(modality="image")
- html = dr._repr_html_()
- # Image icon is ๐ผ (🖼)
- assert "🖼" in html
-
- def test_icon_fallback_for_unknown_modality(self):
- dr = _make_dr(modality="")
- html = dr._repr_html_()
- # Fallback icon 🗂
- assert "🗂" in html
-
- def test_large_schema_collapsed(self):
- schema = {f"col_{i}": "int64" for i in range(20)}
- dr = _make_dr(schema=schema)
- html = dr._repr_html_()
- # details element should NOT have open attribute when >8 fields
- assert (
- "' in html
- )
-
- def test_small_schema_open(self):
- schema = {f"col_{i}": "int64" for i in range(4)}
- dr = _make_dr(schema=schema)
- html = dr._repr_html_()
- assert " with a dataframe class
- assert "dataframe" in html or "ps-df-wrap" in html
-
- def test_csv_preview_row_limit(self, tmp_path):
- """Only _PREVIEW_ROWS rows of data should appear, not all 50."""
- pytest.importorskip("pandas")
- import pandas as pd
-
- path = str(tmp_path / "big.csv")
- pd.DataFrame({"v": range(50)}).to_csv(path, index=False)
- dr = self._dr_for_file(path, "csv", "tabular")
- html = dr._repr_html_()
- # Extract just the preview section so CSS text doesn't interfere
- preview_start = html.find('')
- assert preview_start != -1, "no preview section found"
- preview_html = html[preview_start:]
- # The last row value (49) should not appear as a table cell
- assert "
49 | " not in preview_html
-
- def test_parquet_preview(self, tmp_path):
- pytest.importorskip("pyarrow")
- import pyarrow as pa
- import pyarrow.parquet as pq
-
- path = str(tmp_path / "data.parquet")
- table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
- pq.write_table(table, path)
- dr = self._dr_for_file(path, "parquet", "tabular")
- html = dr._repr_html_()
- assert "Preview" in html
- assert "
1 MB threshold
- np.save(path, np.zeros((512, 512), dtype="float64"))
- dr = self._dr_for_file(path, "numpy", "array")
- html = dr._repr_html_()
- assert "(512, 512)" in html # shape shown
- assert "float64" in html # dtype shown
- # The data slice key ("preview") should NOT appear in the info table;
- # check the table cell content rather than the CSS class names
- assert ">preview<" not in html # no | preview | row
-
-
-# ---------------------------------------------------------------------------
-# fmt_size helper
-# ---------------------------------------------------------------------------
-
-
-def test_fmt_size():
- from projspec.content.data_html import _fmt_size
-
- assert _fmt_size(0) == "unknown"
- assert _fmt_size(512) == "512 B"
- assert "KB" in _fmt_size(2048)
- assert "MB" in _fmt_size(2 * 1024 * 1024)
- assert "GB" in _fmt_size(3 * 1024**3)
diff --git a/tests/test_data_project.py b/tests/test_data_project.py
index 3dae345..f0d13b5 100644
--- a/tests/test_data_project.py
+++ b/tests/test_data_project.py
@@ -1,326 +1,594 @@
-import json
+"""Tests for the DataProject spec and the file-consolidation helper.
+
+The consolidation helper is filesystem-agnostic and tested directly on
+``(basename, size)`` lists. The DataProject spec is tested end-to-end by
+writing files into a tmpdir and constructing a real ``projspec.Project``.
+
+Intake may or may not be installed (and which readers are available varies),
+so the DataProject assertions only check things that do not depend on a
+specific reader being present: that the project is/ isn't detected, how files
+are consolidated, file counts and sizes. Where intake is available we also
+spot-check ``datatype``/``structure``.
+"""
+
import os
import pytest
import projspec
-from projspec.content.data import DataResource
-from projspec.utils import from_dict
-
-
-def _data_project(tmp_path):
- """Return a projspec.Project rooted at *tmp_path* (no walk needed)."""
- return projspec.Project(str(tmp_path))
-
-
-class TestDataDetection:
- def test_csv_detected(self, tmp_path):
- (tmp_path / "data.csv").write_text("x,y\n1,2\n3,4\n")
- proj = _data_project(tmp_path)
- assert "data" in proj.specs
-
- def test_parquet_detected(self, tmp_path):
- pytest.importorskip("pyarrow")
- import pyarrow as pa
- import pyarrow.parquet as pq
-
- pq.write_table(pa.table({"a": [1, 2]}), str(tmp_path / "t.parquet"))
- proj = _data_project(tmp_path)
- assert "data" in proj.specs
-
- def test_no_data_files_not_detected(self, tmp_path):
- (tmp_path / "README.md").write_text("hello")
- (tmp_path / "config.json").write_text("{}")
- proj = _data_project(tmp_path)
- assert "data" not in proj.specs
-
-
-class TestDataParse:
- def test_single_csv_resource(self, tmp_path):
- (tmp_path / "sales.csv").write_text("col1,col2\n1,a\n2,b\n")
- proj = _data_project(tmp_path)
- dr = proj.specs["data"].contents["data_resource"]
- assert isinstance(dr, DataResource)
- assert dr.path == "sales.csv"
- assert dr.format == "csv"
- assert dr.modality == "tabular"
- assert dr.file_count == 1
-
- def test_series_collated_to_glob_path(self, tmp_path):
- """part0.csv + part1.csv → path == 'part*.csv'"""
- for i in range(3):
- (tmp_path / f"part{i}.csv").write_text("x\n1\n")
- proj = _data_project(tmp_path)
- dr = proj.specs["data"].contents["data_resource"]
- assert isinstance(dr, DataResource)
- assert dr.path == "part*.csv"
- assert dr.file_count == 3
-
- def test_distinct_csv_files_separate_resources(self, tmp_path):
- """users.csv and orders.csv differ alphabetically → two resources."""
- (tmp_path / "users.csv").write_text("id\n1\n")
- (tmp_path / "orders.csv").write_text("id\n1\n")
- proj = _data_project(tmp_path)
- dr_map = proj.specs["data"].contents["data_resource"]
- # Two separate DataResource objects, keyed in an AttrDict
- assert len(dr_map) == 2
- paths = {dr_map[k].path for k in dr_map}
- assert "users.csv" in paths
- assert "orders.csv" in paths
-
- def test_sample_path_is_full_path(self, tmp_path):
- csv = tmp_path / "data.csv"
- csv.write_text("x\n1\n")
- proj = _data_project(tmp_path)
- dr = proj.specs["data"].contents["data_resource"]
- assert dr.sample_path == str(csv)
-
- def test_total_size_nonzero(self, tmp_path):
- content = "x,y\n" + "\n".join(f"{i},{i}" for i in range(20))
- (tmp_path / "nums.csv").write_text(content)
- proj = _data_project(tmp_path)
- dr = proj.specs["data"].contents["data_resource"]
- assert dr.total_size > 0
-
-
-class TestDataResourceToDict:
- def _make_dr(self, tmp_path):
- (tmp_path / "items.csv").write_text("id,val\n1,a\n2,b\n")
- proj = _data_project(tmp_path)
- return proj.specs["data"].contents["data_resource"]
-
- def test_compact_omits_klass(self, tmp_path):
- dr = self._make_dr(tmp_path)
- d = dr.to_dict(compact=True)
- assert "klass" not in d
-
- def test_compact_omits_html(self, tmp_path):
- """compact=True is for human/console output — _html must be absent."""
- dr = self._make_dr(tmp_path)
- d = dr.to_dict(compact=True)
- assert "_html" not in d
-
-
-class TestDataResourceRoundTrip:
- def _roundtrip(self, dr):
- """Serialise to JSON and rehydrate, returning the new DataResource."""
- d = dr.to_dict(compact=False)
- js = json.dumps(d)
- d2 = json.loads(js)
- return from_dict(d2, proj=dr.proj)
-
- def _make_dr(self, tmp_path):
- (tmp_path / "orders.csv").write_text("order_id,amount\n1,99\n2,42\n")
- proj = _data_project(tmp_path)
- return proj.specs["data"].contents["data_resource"]
-
- def test_roundtrip_returns_dataresource(self, tmp_path):
- dr2 = self._roundtrip(self._make_dr(tmp_path))
- assert isinstance(dr2, DataResource)
-
- def test_roundtrip_preserves_path(self, tmp_path):
- dr2 = self._roundtrip(self._make_dr(tmp_path))
- assert dr2.path == "orders.csv"
-
- def test_roundtrip_preserves_format(self, tmp_path):
- dr2 = self._roundtrip(self._make_dr(tmp_path))
- assert dr2.format == "csv"
-
- def test_roundtrip_preserves_modality(self, tmp_path):
- dr2 = self._roundtrip(self._make_dr(tmp_path))
- assert dr2.modality == "tabular"
-
- def test_roundtrip_preserves_file_count(self, tmp_path):
- dr2 = self._roundtrip(self._make_dr(tmp_path))
- assert dr2.file_count == 1
-
- def test_roundtrip_preserves_total_size(self, tmp_path):
- dr = self._make_dr(tmp_path)
- dr2 = self._roundtrip(dr)
- assert dr2.total_size == dr.total_size
-
- def test_roundtrip_preserves_schema(self, tmp_path):
- pytest.importorskip("pyarrow")
- import pyarrow as pa, pyarrow.parquet as pq
-
- pq.write_table(
- pa.table({"col_a": [1, 2, 3], "col_b": ["x", "y", "z"]}),
- str(tmp_path / "data.parquet"),
+from projspec.config import temp_conf
+from projspec.proj._consolidate import consolidate, FileGroup
+from projspec.proj.data_project import DataProject
+from projspec.content.data import Dataset, TabularData, IntakeSource
+
+try:
+ import intake.readers.inspect # noqa: F401
+
+ HAS_INTAKE = True
+except Exception: # pragma: no cover
+ HAS_INTAKE = False
+
+try:
+ import pandas as _pd # noqa: F401
+
+ HAS_PANDAS = True
+except Exception: # pragma: no cover
+ HAS_PANDAS = False
+
+try:
+ # importing here puts PIL in sys.modules so intake's check_imports (which
+ # uses importlib.metadata.distribution and falls back to sys.modules) finds
+ # it - Pillow's distribution name ("pillow") differs from the import name.
+ import PIL # noqa: F401
+ import numpy as _np # noqa: F401
+
+ HAS_PIL = True
+except Exception: # pragma: no cover
+ HAS_PIL = False
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+# Production-equivalent significance thresholds. Tests that depend on these
+# values set them explicitly via temp_conf so they do not rely on (and are not
+# broken by changes to) the config defaults.
+PROD_THRESHOLDS = dict(
+ data_min_fraction=0.5,
+ data_min_file_size=1024 * 1024,
+ data_min_total_size=10 * 1024 * 1024,
+ data_min_play_size=64 * 1024,
+)
+
+
+def write_data(tmpdir, files: dict[str, int | bytes]) -> str:
+ """Write files into *tmpdir*.
+
+ Values are either an int (number of zero bytes to write) or raw bytes.
+ """
+ path = str(tmpdir)
+ for rel, content in files.items():
+ full = os.path.join(path, rel)
+ os.makedirs(os.path.dirname(full), exist_ok=True)
+ data = content if isinstance(content, bytes) else b"\0" * content
+ with open(full, "wb") as f:
+ f.write(data)
+ return path
+
+
+def datasets(proj) -> dict[str, Dataset]:
+ """The ``name -> Dataset`` mapping for a project's data datasets."""
+ dp = proj.specs.get("data_project")
+ return dict(dp.contents.get("dataset", {})) if dp else {}
+
+
+def dataset_names(proj) -> set[str]:
+ """The set of dataset names (mapping keys) for a project."""
+ return set(datasets(proj))
+
+
+# ---------------------------------------------------------------------------
+# consolidate()
+# ---------------------------------------------------------------------------
+
+
+class TestConsolidate:
+ def test_numbered_csv_series(self):
+ files = [(f"{i:03d}.csv", 100) for i in range(1, 6)]
+ groups = consolidate(files)
+ assert len(groups) == 1
+ g = groups[0]
+ assert g.consolidated
+ assert g.pattern == "*.csv"
+ assert len(g.members) == 5
+ assert g.total_size == 500
+
+ def test_spark_parts(self):
+ files = [(f"part-{i:05d}.parquet", 10) for i in range(4)]
+ groups = consolidate(files)
+ assert len(groups) == 1
+ assert groups[0].pattern == "part-*.parquet"
+ assert groups[0].consolidated
+
+ def test_year_series(self):
+ files = [(f"data_{y}.json", 5) for y in range(2015, 2021)]
+ groups = consolidate(files)
+ assert len(groups) == 1
+ assert groups[0].pattern == "data_*.json"
+
+ def test_token_series_colours(self):
+ files = [("red.gif", 1), ("green.gif", 1), ("blue.gif", 1)]
+ groups = consolidate(files, min_token_group=2)
+ assert len(groups) == 1
+ assert groups[0].pattern == "*.gif"
+ assert groups[0].consolidated
+ assert sorted(groups[0].members) == ["blue.gif", "green.gif", "red.gif"]
+
+ def test_below_min_group_stays_standalone(self):
+ # only two numbered files, default min_group=3 -> not consolidated
+ files = [("001.csv", 10), ("002.csv", 10)]
+ groups = consolidate(files, min_group=3, min_token_group=99)
+ assert all(not g.consolidated for g in groups)
+ assert len(groups) == 2
+
+ def test_mixed_extensions_separate_groups(self):
+ files = [(f"{i:03d}.csv", 10) for i in range(5)]
+ files += [(f"{i:03d}.json", 10) for i in range(5)]
+ groups = consolidate(files)
+ patterns = sorted(g.pattern for g in groups)
+ assert patterns == ["*.csv", "*.json"]
+
+ def test_unrelated_files_standalone(self):
+ files = [("readme_data.bin", 10), ("schema.avro", 10)]
+ groups = consolidate(files, min_token_group=99)
+ assert all(not g.consolidated for g in groups)
+ assert {g.name for g in groups} == {"readme_data.bin", "schema.avro"}
+
+ def test_double_extension_grouping(self):
+ files = [(f"part{i}.csv.gz", 10) for i in range(5)]
+ groups = consolidate(files)
+ assert len(groups) == 1
+ assert groups[0].ext == ".csv.gz"
+ assert groups[0].consolidated
+
+ def test_url_glob_vs_list(self, tmp_path):
+ g = FileGroup(
+ members=["001.csv", "002.csv", "003.csv"],
+ ext=".csv",
+ pattern="*.csv",
+ consolidated=True,
)
- proj = _data_project(tmp_path)
- dr = proj.specs["data"].contents["data_resource"]
- dr2 = self._roundtrip(dr)
- assert dr2.schema == dr.schema
-
- def test_roundtrip_html_matches_original(self, tmp_path):
- """_repr_html_() on the rehydrated object must equal the original render."""
- dr = self._make_dr(tmp_path)
- html_original = dr._repr_html_()
- dr2 = self._roundtrip(dr)
- assert dr2._repr_html_() == html_original
-
- def test_roundtrip_html_cached_without_rerender(self, tmp_path):
- """After from_dict the HTML is already in _html — no re-render occurs."""
- dr = self._make_dr(tmp_path)
- html_original = dr._repr_html_()
- d = dr.to_dict(compact=False)
- d2 = json.loads(json.dumps(d))
- dr2 = from_dict(d2, proj=dr.proj)
-
- # Confirm _html is set directly on the instance (not via lazy render)
- assert (
- "_html" in dr2.__dict__
- ), "_html should be in instance __dict__ after from_dict"
- assert dr2.__dict__["_html"] == html_original
-
- def test_roundtrip_html_survives_missing_sample_path(self, tmp_path):
- """After rehydration, _repr_html_() must work even if sample_path
- no longer resolves (e.g. moved to a different machine)."""
- dr = self._make_dr(tmp_path)
- # Trigger render with a real file, then remove the file
- html_original = dr._repr_html_()
- os.remove(dr.sample_path)
-
- dr2 = self._roundtrip(dr)
- # sample_path is gone — but HTML was cached in the dict
- assert dr2._repr_html_() == html_original
-
-
-class TestDataConditionalParse:
- """Tests for the 'other project types present' guard in Data.parse()."""
-
- def _big_csv(self, path, rows=500):
- """Write a CSV large enough to dominate byte counts."""
- content = "id,value\n" + "\n".join(f"{i},{i * 2}" for i in range(rows))
- path.write_text(content)
-
- def test_pure_data_dir_no_sentinel(self, tmp_path):
- """No sentinel → Data always parsed regardless of byte ratios."""
- (tmp_path / "data.csv").write_text("x\n1\n")
- proj = _data_project(tmp_path)
- assert "data" in proj.specs
-
- def test_datapackage_companion_not_a_sentinel(self, tmp_path):
- """datapackage.json is a compatible companion — not a sentinel."""
- self._big_csv(tmp_path / "data.csv")
- (tmp_path / "datapackage.json").write_text('{"resources": []}')
- proj = _data_project(tmp_path)
- assert "data" in proj.specs
-
- def test_dvc_companion_not_a_sentinel(self, tmp_path):
- """catalog.yaml (IntakeCatalog / DVCRepo companion) is not a sentinel."""
- self._big_csv(tmp_path / "data.csv")
- (tmp_path / "catalog.yaml").write_text("sources: {}")
- proj = _data_project(tmp_path)
- assert "data" in proj.specs
-
- def test_sentinel_present_data_majority(self, tmp_path):
- """Sentinel is present, but data files are the majority of bytes → Data parsed."""
- self._big_csv(tmp_path / "data.csv") # large data file
- (tmp_path / "pyproject.toml").write_text(
- "[project]\nname='x'\n"
- ) # tiny sentinel
- proj = _data_project(tmp_path)
- assert "data" in proj.specs
-
- def test_sentinel_present_data_majority_parquet(self, tmp_path):
- pytest.importorskip("pyarrow")
- import pyarrow as pa, pyarrow.parquet as pq
-
- pq.write_table(
- pa.table({"x": list(range(1000)), "y": list(range(1000))}),
- str(tmp_path / "data.parquet"),
+ assert g.url("/data/foo") == "/data/foo/*.csv"
+ single = FileGroup(members=["only.csv"], ext=".csv", pattern="only.csv")
+ assert single.url("/data/foo") == "/data/foo/only.csv"
+
+ def test_size_unknown_propagates_none(self):
+ files = [("001.csv", None), ("002.csv", 10), ("003.csv", 10)]
+ groups = consolidate(files)
+ assert groups[0].total_size is None
+
+
+# ---------------------------------------------------------------------------
+# Content classes
+# ---------------------------------------------------------------------------
+
+
+class TestContentClasses:
+ def test_dataset_roundtrip(self, tmp_path):
+ proj = projspec.Project(str(tmp_path))
+ ds = Dataset(
+ proj=proj,
+ url=f"{proj.url}/*.csv",
+ datatype="CSV",
+ structure=["table"],
+ schema={"columns": ["a", "b"]},
+ n_files=3,
+ total_size=999,
+ metadata={"readers": ["DaskCSV"]},
)
- (tmp_path / "Cargo.toml").write_text('[package]\nname="x"\n')
- proj = _data_project(tmp_path)
- assert "data" in proj.specs
-
- # -- mixed dirs where non-data dominates --
-
- def test_sentinel_present_code_majority(self, tmp_path):
- """Sentinel present and code files dominate → Data spec suppressed."""
- # Large Python source file
- (tmp_path / "main.py").write_text("x = 1\n" * 5000)
- # Tiny CSV
- (tmp_path / "tiny.csv").write_text("a,b\n1,2\n")
- (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
- proj = _data_project(tmp_path)
- assert "data" not in proj.specs
-
- def test_sentinel_present_equal_split_not_majority(self, tmp_path):
- """Exactly 50/50 bytes is not a majority → Data suppressed."""
- payload = "x" * 1000
- (tmp_path / "code.py").write_text(payload)
- (tmp_path / "data.csv").write_text(payload)
- (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
- proj = _data_project(tmp_path)
- assert "data" not in proj.specs
-
- # -- helpers / unit tests for the private methods --
-
- def test_has_non_data_sentinels_true(self, tmp_path):
- from projspec.proj.data_dir import Data
-
- (tmp_path / "data.csv").write_text("x\n1\n")
- (tmp_path / "pyproject.toml").write_text("")
- proj = object.__new__(projspec.Project)
- import fsspec
-
- proj.fs = fsspec.filesystem("file")
- proj.url = str(tmp_path)
- proj.__dict__["basenames"] = {
- e["name"].rsplit("/", 1)[-1]: e["name"]
- for e in proj.fs.ls(str(tmp_path), detail=True)
- }
- proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
- inst = Data.__new__(Data)
- inst.proj = proj
- assert inst._has_non_data_sentinels() is True
-
- def test_has_non_data_sentinels_false(self, tmp_path):
- from projspec.proj.data_dir import Data
-
- (tmp_path / "data.csv").write_text("x\n1\n")
- proj = object.__new__(projspec.Project)
- import fsspec
-
- proj.fs = fsspec.filesystem("file")
- proj.url = str(tmp_path)
- proj.__dict__["basenames"] = {
- e["name"].rsplit("/", 1)[-1]: e["name"]
- for e in proj.fs.ls(str(tmp_path), detail=True)
- }
- proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
- inst = Data.__new__(Data)
- inst.proj = proj
- assert inst._has_non_data_sentinels() is False
-
- def test_data_bytes_majority_true(self, tmp_path):
- from projspec.proj.data_dir import Data
-
- self._big_csv(tmp_path / "data.csv")
- (tmp_path / "small.py").write_text("x = 1\n")
- proj = object.__new__(projspec.Project)
- import fsspec
-
- proj.fs = fsspec.filesystem("file")
- proj.url = str(tmp_path)
- proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
- inst = Data.__new__(Data)
- inst.proj = proj
- assert inst._data_bytes_majority() is True
-
- def test_data_bytes_majority_false(self, tmp_path):
- from projspec.proj.data_dir import Data
-
- (tmp_path / "main.py").write_text("x = 1\n" * 5000)
- (tmp_path / "tiny.csv").write_text("a\n1\n")
- proj = object.__new__(projspec.Project)
+ d = ds.to_dict(compact=False)
+ assert d["klass"] == ["content", "dataset"]
+ # the dataset name lives in the containing dict's key, not the object
+ assert "name" not in d
+ from projspec.utils import from_dict
+
+ ds2 = from_dict(d, proj=proj)
+ assert isinstance(ds2, Dataset)
+ assert ds2.datatype == "CSV"
+ assert ds2.n_files == 3
+
+ def test_tabular_and_intake_source_registered(self):
+ from projspec.content.base import registry
+
+ assert registry["tabular_data"] is TabularData
+ assert registry["intake_source"] is IntakeSource
+ assert registry["dataset"] is Dataset
+
+
+# ---------------------------------------------------------------------------
+# DataProject detection / significance
+# ---------------------------------------------------------------------------
+
+
+class TestDataProjectSignificance:
+ # Each test lays out files with write_data(), scans the directory with
+ # projspec.Project and checks whether "data_project" appears in proj.specs.
+ # NOTE(review): an int passed to write_data() looks like a file size in
+ # bytes (see the "> data_min_file_size (1MB)" remark below) - confirm
+ # against the helper, which is defined outside this hunk.
+ def test_pure_data_dir_detected(self, tmp_path):
+ # three numbered csvs, well above the play-data floor
+ with temp_conf(**PROD_THRESHOLDS):
+ write_data(tmp_path, {f"{i:03d}.csv": 100_000 for i in range(1, 4)})
+ proj = projspec.Project(str(tmp_path))
+ assert "data_project" in proj.specs
+ ds = datasets(proj)
+ # the three files are reported as one dataset keyed by the glob "*.csv"
+ assert len(ds) == 1
+ assert "*.csv" in ds
+ assert ds["*.csv"].n_files == 3
+
+ def test_tiny_play_data_rejected(self, tmp_path):
+ # same layout as above but with the value 20 instead of 100_000 per file
+ with temp_conf(**PROD_THRESHOLDS):
+ write_data(tmp_path, {f"{i:03d}.csv": 20 for i in range(1, 4)})
+ proj = projspec.Project(str(tmp_path))
+ assert "data_project" not in proj.specs
+
+ def test_big_single_file_in_code_project(self, tmp_path):
+ # python package + one big csv -> both python_code and data_project
+ with temp_conf(**PROD_THRESHOLDS):
+ write_data(
+ tmp_path,
+ {
+ "__init__.py": b"x = 1\n",
+ "big.csv": 2 * 1024 * 1024, # > data_min_file_size (1MB)
+ },
+ )
+ proj = projspec.Project(str(tmp_path))
+ assert "python_code" in proj.specs
+ assert "data_project" in proj.specs
+ ds = datasets(proj)
+ # a lone file is keyed by its own name rather than by a glob
+ assert "big.csv" in ds
+
+ def test_small_data_in_code_project_ignored(self, tmp_path):
+ # code files plus one tiny csv: only python_code is expected to match
+ with temp_conf(**PROD_THRESHOLDS):
+ write_data(
+ tmp_path,
+ {
+ "__init__.py": b"x = 1\n",
+ "main.py": b"print(1)\n" * 100,
+ "sample.csv": 200, # tiny
+ },
+ )
+ proj = projspec.Project(str(tmp_path))
+ assert "python_code" in proj.specs
+ assert "data_project" not in proj.specs
+
+ def test_fraction_rule_large_data_in_code_project(self, tmp_path):
+ # small code, large data -> data dominates by fraction and total size.
+ # Use a .csv so intake can identify a datatype (datasets with no
+ # identified datatype are dropped from the result).
+ with temp_conf(**PROD_THRESHOLDS):
+ write_data(
+ tmp_path,
+ {
+ "__init__.py": b"x = 1\n",
+ # 6-byte row repeated 4 * 1024 * 1024 times = 24 MiB of rows
+ "data.csv": b"a,b,c\n" + b"1,2,3\n" * (4 * 1024 * 1024), # >20MB
+ },
+ )
+ proj = projspec.Project(str(tmp_path))
+ assert "python_code" in proj.specs
+ assert "data_project" in proj.specs
+
+ def test_threshold_overridable_via_config(self, tmp_path):
+ # the same files are scanned twice under two different configurations
+ write_data(tmp_path, {f"{i:03d}.csv": 20 for i in range(1, 4)})
+ # with the production play-size floor: rejected
+ with temp_conf(**PROD_THRESHOLDS):
+ proj = projspec.Project(str(tmp_path))
+ assert "data_project" not in proj.specs
+ # with a tiny play-size floor it should be detected
+ with temp_conf(data_min_play_size=1):
+ proj = projspec.Project(str(tmp_path))
+ assert "data_project" in proj.specs
+
+
+# ---------------------------------------------------------------------------
+# DataProject consolidation + intake integration
+# ---------------------------------------------------------------------------
+
+
+class TestDataProjectDatasets:
+ # Checks on the mapping returned by datasets(proj): how files are grouped
+ # into datasets and what is recorded for each of them.
+ def test_image_series_consolidated(self, tmp_path):
+ # three GIF-headed files with unrelated names (red/green/blue)
+ with temp_conf(**PROD_THRESHOLDS):
+ write_data(
+ tmp_path,
+ {
+ f"{c}.gif": b"GIF89a" + b"\0" * 50_000
+ for c in ("red", "green", "blue")
+ },
+ )
+ proj = projspec.Project(str(tmp_path))
+ ds = datasets(proj)
+ # all three end up under the single key "*.gif"
+ assert len(ds) == 1
+ assert "*.gif" in ds
+ assert ds["*.gif"].n_files == 3
+
+ def test_directory_dataset_marker(self, tmp_path):
+ # a _metadata marker means intake treats the whole dir as one dataset
+ with temp_conf(**PROD_THRESHOLDS):
+ write_data(
+ tmp_path,
+ {
+ "_metadata": 100,
+ "part-0.parquet": b"PAR1" + b"\0" * 200_000,
+ "part-1.parquet": b"PAR1" + b"\0" * 200_000,
+ },
+ )
+ proj = projspec.Project(str(tmp_path))
+ assert "data_project" in proj.specs
+ ds = datasets(proj)
+ # whole directory described as a single dataset
+ assert len(ds) == 1
+
+ @pytest.mark.skipif(not HAS_INTAKE, reason="intake not installed")
+ def test_intake_identifies_csv(self, tmp_path):
+ # real CSV bytes (header + 50_000 rows per file) so the content can be typed
+ with temp_conf(**PROD_THRESHOLDS):
+ rows = b"a,b,c\n" + b"".join(b"1,2,3\n" for _ in range(50_000))
+ write_data(tmp_path, {f"{i:03d}.csv": rows for i in range(1, 4)})
+ proj = projspec.Project(str(tmp_path))
+ ds = datasets(proj)
+ assert len(ds) == 1
+ assert ds["*.csv"].datatype == "CSV"
+ assert "table" in ds["*.csv"].structure
+
+ def test_no_data_files_no_match(self, tmp_path):
+ # only a markdown file and a python file: no data_project expected
+ write_data(tmp_path, {"README.md": b"# hi\n", "setup.py": b"x=1\n"})
+ proj = projspec.Project(str(tmp_path))
+ assert "data_project" not in proj.specs
+
+ @pytest.mark.skipif(not HAS_INTAKE, reason="intake not installed")
+ def test_remote_url_keeps_protocol_for_intake(self):
+ """Regression: scanning a remote (protocol-prefixed) directory must
+ hand intake a protocol-qualified URL.
+
+ ``proj.url`` has the protocol stripped by ``fsspec.url_to_fs``; if that
+ bare path reaches intake it can't pick the filesystem and resolves no
+ files. The dataset URL handed to / stored by intake must keep the
+ protocol (e.g. ``memory://``).
+ """
import fsspec
- proj.fs = fsspec.filesystem("file")
- proj.url = str(tmp_path)
- proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
- inst = Data.__new__(Data)
- inst.proj = proj
- assert inst._data_bytes_majority() is False
+ fs = fsspec.filesystem("memory")
+ root = "/data_project_remote"
+ rows = b"a,b,c\n" + b"".join(b"1,2,3\n" for _ in range(50_000))
+ try:
+ # write three identical CSV files straight into the memory filesystem
+ for i in range(1, 4):
+ with fs.open(f"{root}/{i:03d}.csv", "wb") as f:
+ f.write(rows)
+
+ with temp_conf(data_min_play_size=1, data_min_fraction=0.5):
+ proj = projspec.Project(f"memory://{root}")
+ # the bare filesystem path has no protocol...
+ assert "://" not in proj.url
+ ds = datasets(proj)
+ assert "*.csv" in ds
+ # ...but intake was able to resolve and type the files, and the
+ # stored dataset URL is protocol-qualified.
+ assert ds["*.csv"].datatype == "CSV"
+ assert str(ds["*.csv"].url).startswith("memory://")
+ finally:
+ # remove the files written under `root`; a missing path is tolerated
+ try:
+ fs.rm(root, recursive=True)
+ except FileNotFoundError:
+ pass
+
+
+# ---------------------------------------------------------------------------
+# match() / _is_data_ext unit checks
+# ---------------------------------------------------------------------------
+
+
+class TestDataExt:
+    """Table-driven check of ``DataProject._is_data_ext`` on bare file names."""
+
+    @pytest.mark.parametrize(
+        "name,expected",
+        [
+            # names expected to be classified as data
+            ("data.csv", True),
+            ("table.parquet", True),
+            ("image.png", True),
+            ("archive.csv.gz", True),
+            # names expected to be classified as non-data
+            ("module.py", False),
+            ("README.md", False),
+            ("pyproject.toml", False),
+            (".gitignore", False),
+            ("Makefile", False),  # no extension
+            ("config.ini", False),
+        ],
+    )
+    def test_is_data_ext(self, name, expected):
+        verdict = DataProject._is_data_ext(name)
+        # identity check: the helper has to return a real bool
+        assert verdict is expected
+
+
+# ---------------------------------------------------------------------------
+# HTML repr / thumbnail captured into Dataset.metadata
+# ---------------------------------------------------------------------------
+
+
+def _make_csv_bytes(rows: int = 200_000) -> bytes:
+    """Build encoded CSV text: an ``a,b,c`` header plus ``rows`` records.
+
+    Record ``i`` reads ``i,2*i,val<i>``; records are newline-separated with no
+    trailing newline. The default row count is intended to make the file big
+    enough to clear the single-big-file significance threshold.
+    """
+    records = [f"{i},{i * 2},val{i}" for i in range(rows)]
+    text = "a,b,c\n" + "\n".join(records)
+    return text.encode()
+
+
+class TestDatasetHTMLOutput:
+ """The DataProject pipeline should carry intake's ``html_repr`` and
+ ``thumbnail`` through into ``Dataset.metadata`` when a reader discovers the
+ underlying object."""
+
+ @pytest.mark.skipif(not (HAS_INTAKE and HAS_PANDAS), reason="needs intake + pandas")
+ def test_html_repr_for_tabular(self, tmp_path):
+ # single file > data_min_file_size so it is described on its own and a
+ # single-file pandas reader can discover it
+ with temp_conf(**PROD_THRESHOLDS):
+ write_data(tmp_path, {"big.csv": _make_csv_bytes()})
+ proj = projspec.Project(str(tmp_path))
+ ds = datasets(proj)
+ assert len(ds) == 1
+ meta = ds["big.csv"].metadata
+ assert "PandasCSV" in meta.get("readers")
+ assert meta.get("html_repr"), "expected html_repr in Dataset.metadata"
+ # NOTE(review): the next line is truncated in this copy of the patch - the
+ # string literal is never closed and the header of the following test
+ # method is missing. The text between a "<" and a later ">" appears to
+ # have been dropped; restore it from the upstream commit.
+ assert " html_repr/thumbnail simply
+ # absent, never None-valued keys
+ with temp_conf(data_min_play_size=1):
+ rows = b"a,b,c\n" + b"1,2,3\n" * 10
+ write_data(tmp_path, {f"{i:03d}.csv": rows for i in range(5)})
+ proj = projspec.Project(str(tmp_path))
+ ds = datasets(proj)
+ assert ds, "expected a dataset"
+ for d in ds.values():
+ assert d.datatype is not None
+ # each preview key is either missing or holds a str
+ assert "html_repr" not in d.metadata or isinstance(
+ d.metadata["html_repr"], str
+ )
+ assert "thumbnail" not in d.metadata or isinstance(
+ d.metadata["thumbnail"], str
+ )
+
+
+# ---------------------------------------------------------------------------
+# Per-dataset fraction filtering (_filter_small_datasets)
+# ---------------------------------------------------------------------------
+
+
+def _bare_data_project(tmp_path) -> DataProject:
+    """Return a ``DataProject`` created with ``__new__`` (so ``__init__`` is
+    not run) whose ``proj`` attribute is a ``projspec.Project`` scanned from
+    ``tmp_path``. Meant for unit testing the pure-Python helper without
+    triggering match()/parse()."""
+    instance = DataProject.__new__(DataProject)
+    instance.proj = projspec.Project(str(tmp_path))
+    return instance
+
+
+def _ds(proj, name, size):
+    """Build the ``(name, Dataset)`` pair that
+    ``DataProject._filter_small_datasets`` consumes.
+
+    The dataset is a one-file CSV table located at ``<proj.url>/<name>`` whose
+    ``total_size`` is ``size``; schema and metadata are left empty.
+    """
+    fields = dict(
+        proj=proj,
+        url=f"{proj.url}/{name}",
+        datatype="CSV",
+        structure=["table"],
+        schema={},
+        n_files=1,
+        total_size=size,
+        metadata={},
+    )
+    dataset = Dataset(**fields)
+    return name, dataset
+
+
+def _kept_names(pairs):
+    """Return the first item (the name) of every two-item pair, in order."""
+    names = []
+    for name, _unused in pairs:
+        names.append(name)
+    return names
+
+
+class TestFilterSmallDatasets:
+ # The first six tests call DataProject._filter_small_datasets directly on
+ # hand-built (name, Dataset) pairs from _ds(); the last two scan real files.
+ # In every case data_min_fraction is set through temp_conf().
+ def test_drops_dataset_below_fraction_of_largest(self, tmp_path):
+ dp = _bare_data_project(tmp_path)
+ big = _ds(dp.proj, "big.csv", 1000)
+ small = _ds(dp.proj, "small.csv", 10) # 1% of largest
+ with temp_conf(data_min_fraction=0.5):
+ kept = dp._filter_small_datasets([big, small])
+ assert _kept_names(kept) == ["big.csv"]
+
+ def test_keeps_datasets_above_fraction(self, tmp_path):
+ dp = _bare_data_project(tmp_path)
+ a = _ds(dp.proj, "a.csv", 1000)
+ b = _ds(dp.proj, "b.csv", 800) # 80% of largest
+ with temp_conf(data_min_fraction=0.5):
+ kept = dp._filter_small_datasets([a, b])
+ # compared as sets: the order of the survivors is not checked
+ assert set(_kept_names(kept)) == {"a.csv", "b.csv"}
+
+ def test_single_dataset_never_filtered(self, tmp_path):
+ dp = _bare_data_project(tmp_path)
+ only = _ds(dp.proj, "only.csv", 1)
+ with temp_conf(data_min_fraction=0.5):
+ kept = dp._filter_small_datasets([only])
+ assert _kept_names(kept) == ["only.csv"]
+
+ def test_unknown_sizes_disable_filtering(self, tmp_path):
+ dp = _bare_data_project(tmp_path)
+ big = _ds(dp.proj, "big.csv", 1000)
+ # total_size=None stands for a dataset whose size is not known
+ unknown = _ds(dp.proj, "u.csv", None)
+ with temp_conf(data_min_fraction=0.5):
+ kept = dp._filter_small_datasets([big, unknown])
+ assert set(_kept_names(kept)) == {"big.csv", "u.csv"}
+
+ def test_never_drops_everything(self, tmp_path):
+ # an impossible threshold (>1) would exclude all -> fall back to all
+ dp = _bare_data_project(tmp_path)
+ a = _ds(dp.proj, "a.csv", 1000)
+ b = _ds(dp.proj, "b.csv", 1000)
+ with temp_conf(data_min_fraction=2.0):
+ kept = dp._filter_small_datasets([a, b])
+ assert set(_kept_names(kept)) == {"a.csv", "b.csv"}
+
+ def test_zero_fraction_keeps_all(self, tmp_path):
+ dp = _bare_data_project(tmp_path)
+ big = _ds(dp.proj, "big.csv", 1000)
+ tiny = _ds(dp.proj, "tiny.csv", 1)
+ with temp_conf(data_min_fraction=0.0):
+ kept = dp._filter_small_datasets([big, tiny])
+ assert set(_kept_names(kept)) == {"big.csv", "tiny.csv"}
+
+ @pytest.mark.skipif(not HAS_INTAKE, reason="intake not installed")
+ def test_end_to_end_drops_tiny_dataset(self, tmp_path):
+ # one large csv-series dataset and one tiny json file; the tiny one
+ # should be dropped as a small fraction of the largest.
+ big_rows = b"a,b,c\n" + b"1,2,3\n" * 20000 # large
+ with temp_conf(data_min_play_size=1, data_min_fraction=0.5):
+ write_data(
+ tmp_path,
+ {
+ **{f"{i:03d}.csv": big_rows for i in range(3)},
+ "tiny.json": b'{"x": 1}\n',
+ },
+ )
+ proj = projspec.Project(str(tmp_path))
+ names = dataset_names(proj)
+ assert "*.csv" in names
+ assert "tiny.json" not in names
+
+ @pytest.mark.skipif(not HAS_INTAKE, reason="intake not installed")
+ def test_end_to_end_keeps_similar_sized_datasets(self, tmp_path):
+ # two datasets of comparable size are both kept (neither is a small
+ # fraction of the other).
+ csv_rows = b"a,b,c\n" + b"1,2,3\n" * 20000
+ json_rows = b'{"x": 1}\n' * 20000
+ with temp_conf(data_min_play_size=1, data_min_fraction=0.5):
+ write_data(
+ tmp_path,
+ {
+ **{f"{i:03d}.csv": csv_rows for i in range(3)},
+ **{f"{i:03d}.json": json_rows for i in range(3)},
+ },
+ )
+ proj = projspec.Project(str(tmp_path))
+ names = dataset_names(proj)
+ assert "*.csv" in names
+ assert "*.json" in names
diff --git a/tests/test_ipywidget_helpers.py b/tests/test_ipywidget_helpers.py
index 84a4dea..485a481 100644
--- a/tests/test_ipywidget_helpers.py
+++ b/tests/test_ipywidget_helpers.py
@@ -367,6 +367,7 @@ def test_add_confirmed_valid_path(self, tmp_path, widget_and_lib):
widget, lib, url = widget_and_lib
new_proj = tmp_path / "newproj"
new_proj.mkdir()
+ (new_proj / "requirements.txt").write_text("numpy")
# Start with just the original entry
original_keys = set(lib.entries)
sends, toasts = _fire(
diff --git a/tests/test_library.py b/tests/test_library.py
index 276e66a..3796588 100644
--- a/tests/test_library.py
+++ b/tests/test_library.py
@@ -1,6 +1,9 @@
+import json
import os
+import time
from projspec import Project
+from projspec.config import temp_conf
from projspec.library import ProjectLibrary
here = os.path.abspath(os.path.dirname(__file__))
@@ -40,3 +43,88 @@ def test_filter(tmp_path):
# miss
assert not library.filter([("spec", "xx")])
+
+
+def test_scanned_at_set_on_scan(tmp_path):
+    """Constructing a Project stamps it with a float ``scanned_at`` that lies
+    between the wall-clock readings taken just before and just after."""
+    (tmp_path / "__init__.py").write_text("x = 1\n")
+    started = time.time()
+    project = Project(str(tmp_path), walk=False)
+    finished = time.time()
+    assert isinstance(project.scanned_at, float)
+    assert started <= project.scanned_at <= finished
+
+
+def test_scanned_at_serialised_and_roundtrips(tmp_path):
+    """``scanned_at`` is part of the non-compact dict form and comes back
+    unchanged, as a float, from ``Project.from_dict``."""
+    (tmp_path / "__init__.py").write_text("x = 1\n")
+    original = Project(str(tmp_path), walk=False)
+
+    serialised = original.to_dict(compact=False)
+    assert "scanned_at" in serialised
+
+    restored = Project.from_dict(serialised)
+    # round-trips back to the same numeric value (serialiser stringifies floats)
+    assert isinstance(restored.scanned_at, float)
+    assert restored.scanned_at == original.scanned_at
+
+
+def test_scanned_at_defaults_to_now_when_missing(tmp_path):
+    """A serialised project lacking ``scanned_at`` gets a timestamp close to
+    the moment ``Project.from_dict`` is called."""
+    (tmp_path / "__init__.py").write_text("x = 1\n")
+    serialised = Project(str(tmp_path), walk=False).to_dict(compact=False)
+    # simulate an older library without the field
+    serialised.pop("scanned_at")
+
+    before = time.time()
+    restored = Project.from_dict(serialised)
+    assert before <= restored.scanned_at <= time.time() + 1
+
+
+def _make_library_with_old_entry(tmp_path, age_seconds):
+    """Create a library file containing one project scanned *age_seconds* ago.
+
+    A minimal project directory is scanned and added to a ``ProjectLibrary``
+    backed by ``<tmp_path>/library.json``; the JSON file is then edited in
+    place so that every entry's ``scanned_at`` lies ``age_seconds`` in the past.
+
+    Returns a ``(fn, key)`` tuple: the path of the library file and the key
+    under which the project was added.
+    """
+    proj_dir = tmp_path / "proj"
+    proj_dir.mkdir()
+    (proj_dir / "__init__.py").write_text("x = 1\n")
+    fn = str(tmp_path / "library.json")
+
+    proj = Project(str(proj_dir), walk=False)
+    library = ProjectLibrary(fn, auto_save=True)
+    key = proj.fs.unstrip_protocol(proj.url)
+    library.add_entry(key, proj)
+
+    # Rewrite the saved scanned_at to be old. Both handles are closed by a
+    # context manager, so the rewritten JSON is flushed to disk before any
+    # caller re-opens the file (a bare open() relies on garbage collection).
+    with open(fn) as f:
+        data = json.load(f)
+    for entry in data.values():
+        entry["scanned_at"] = time.time() - age_seconds
+    with open(fn, "w") as f:
+        json.dump(data, f)
+    return fn, key
+
+
+def test_auto_rescan_refreshes_old_entry(tmp_path):
+    """Opening a library whose entry is older than ``auto_rescan`` yields a
+    fresh ``scanned_at`` both in memory and in the file on disk."""
+    fn, key = _make_library_with_old_entry(tmp_path, age_seconds=1000)
+
+    with temp_conf(auto_rescan=10):  # threshold below the entry's age
+        library = ProjectLibrary(fn)
+        # the stale entry was rescanned -> timestamp is fresh
+        assert library.entries[key].scanned_at >= time.time() - 5
+        # ...and the refreshed library was written back to disk
+        # (read through a context manager so the handle is closed)
+        with open(fn) as f:
+            data = json.load(f)
+        assert float(data[key]["scanned_at"]) >= time.time() - 5
+
+
+def test_auto_rescan_keeps_fresh_entry(tmp_path):
+ # The entry is 5 seconds old and the threshold is 1000, so the stored
+ # timestamp is expected to survive loading the library.
+ fn, key = _make_library_with_old_entry(tmp_path, age_seconds=5)
+
+ with temp_conf(auto_rescan=1000): # threshold well above the entry's age
+ library = ProjectLibrary(fn)
+ # fresh enough -> not rescanned, original (old) timestamp preserved
+ assert library.entries[key].scanned_at < time.time() - 1
+
+
+def test_auto_rescan_disabled_with_zero(tmp_path):
+    """With ``auto_rescan=0`` even a very old entry keeps the ``scanned_at``
+    value that was stored in the library file."""
+    fn, key = _make_library_with_old_entry(tmp_path, age_seconds=10_000)
+    # read the stored value through a context manager so the handle is closed
+    with open(fn) as f:
+        old = json.load(f)[key]["scanned_at"]
+
+    with temp_conf(auto_rescan=0):  # disabled entirely
+        library = ProjectLibrary(fn)
+        # the very old entry is kept as-is, never rescanned
+        assert abs(library.entries[key].scanned_at - old) < 1
diff --git a/tests/test_new_specs.py b/tests/test_new_specs.py
index f41a6c5..9d1d3a5 100644
--- a/tests/test_new_specs.py
+++ b/tests/test_new_specs.py
@@ -2016,3 +2016,147 @@ def test_roundtrip_create_and_detect(self, tmpdir):
# Re-scan so scanned_files picks up the new flow.py
proj2 = projspec.Project(path)
assert "metaflow" in proj2
+
+
+# ---------------------------------------------------------------------------
+# KnowledgeCatalog (Open Knowledge Format bundle)
+# ---------------------------------------------------------------------------
+
+
+class TestKnowledgeCatalog:
+ # Tests for the KnowledgeCatalog spec. FILES is the shared fixture tree
+ # handed to make_proj(): a root index.md, a log, two markdown documents
+ # whose frontmatter has a "type" field, and one document without
+ # frontmatter.
+ FILES = {
+ "index.md": '---\nokf_version: "0.1"\n---\n\n# My Bundle\n\n'
+ "* [Sales](datasets/sales.md) - sales data\n",
+ "log.md": "# Update Log\n\n## 2026-01-01\n* **Creation**: started.\n",
+ "datasets/sales.md": """\
+ ---
+ type: BigQuery Dataset
+ title: Sales
+ description: All sales-related tables.
+ tags: [sales, revenue]
+ timestamp: 2026-05-28T00:00:00Z
+ ---
+
+ The sales dataset.
+ """,
+ "tables/orders.md": """\
+ ---
+ type: BigQuery Table
+ title: Orders
+ resource: https://example.com/orders
+ ---
+
+ # Schema
+ """,
+ # not a concept: no frontmatter
+ "notes/random.md": "just some prose, no frontmatter\n",
+ }
+
+ def test_match_positive(self, tmpdir):
+ # the full fixture tree is recognised
+ proj = make_proj(tmpdir, self.FILES)
+ from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+ assert raw_spec(KnowledgeCatalog, proj).match()
+
+ def test_match_root_concept(self, tmpdir):
+ # index.md plus a concept at the root (no subdirs)
+ proj = make_proj(
+ tmpdir,
+ {
+ "index.md": "# Bundle\n",
+ "overview.md": "---\ntype: Reference\n---\nbody\n",
+ },
+ )
+ from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+ assert raw_spec(KnowledgeCatalog, proj).match()
+
+ def test_match_negative_no_index(self, tmpdir):
+ # a typed document alone, without index.md, must not match
+ proj = make_proj(tmpdir, {"tables/orders.md": "---\ntype: T\n---\n"})
+ from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+ assert not raw_spec(KnowledgeCatalog, proj).match()
+
+ def test_match_negative_empty(self, tmpdir):
+ # a project created from an empty file mapping must not match
+ proj = make_proj(tmpdir, {})
+ from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+ assert not raw_spec(KnowledgeCatalog, proj).match()
+
+ def test_parse_contents(self, tmpdir):
+ proj = make_proj(tmpdir, self.FILES)
+ from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+ spec = raw_spec(KnowledgeCatalog, proj)
+ spec.parse()
+ assert "concept" in spec._contents
+ concepts = spec._contents["concept"]
+ # keyed by concept ID (bundle-relative path, no .md)
+ assert set(concepts) == {"datasets/sales", "tables/orders"}
+
+ def test_parse_detail(self, tmpdir):
+ proj = make_proj(tmpdir, self.FILES)
+ from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+ spec = raw_spec(KnowledgeCatalog, proj)
+ spec.parse()
+ sales = spec._contents["concept"]["datasets/sales"].meta
+ assert sales["type"] == "BigQuery Dataset"
+ assert sales["title"] == "Sales"
+ # the frontmatter list [sales, revenue] is expected as a comma-joined string
+ assert sales["tags"] == "sales, revenue"
+ orders = spec._contents["concept"]["tables/orders"].meta
+ assert orders["type"] == "BigQuery Table"
+ assert orders["resource"] == "https://example.com/orders"
+
+ def test_parse_bundle_version(self, tmpdir):
+ proj = make_proj(tmpdir, self.FILES)
+ from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+ spec = raw_spec(KnowledgeCatalog, proj)
+ spec.parse()
+ # root index.md okf_version surfaces as bundle-level metadata
+ assert spec._contents["descriptive_metadata"].meta["okf_version"] == "0.1"
+
+ def test_parse_skips_non_typed_docs(self, tmpdir):
+ proj = make_proj(tmpdir, self.FILES)
+ from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+ spec = raw_spec(KnowledgeCatalog, proj)
+ spec.parse()
+ # notes/random.md has no frontmatter -> not a concept
+ assert "notes/random" not in spec._contents["concept"]
+
+ def test_parse_no_typed_concepts_raises(self, tmpdir):
+ # neither file carries frontmatter, so parse() is expected to raise
+ from projspec.proj.knowledge_catalog import KnowledgeCatalog
+ from projspec.proj import ParseFailed
+
+ proj = make_proj(
+ tmpdir,
+ {"index.md": "# index\n", "notes.md": "plain prose, no frontmatter\n"},
+ )
+ spec = raw_spec(KnowledgeCatalog, proj)
+ with pytest.raises(ParseFailed):
+ spec.parse()
+
+ def test_parse_requires_type_field(self, tmpdir):
+ # a markdown doc with frontmatter but no 'type' is not a concept
+ from projspec.proj.knowledge_catalog import KnowledgeCatalog
+ from projspec.proj import ParseFailed
+
+ proj = make_proj(
+ tmpdir,
+ {
+ "index.md": "# index\n",
+ "doc.md": "---\ntitle: No Type Here\n---\nbody\n",
+ },
+ )
+ spec = raw_spec(KnowledgeCatalog, proj)
+ with pytest.raises(ParseFailed):
+ spec.parse()
+
+ def test_roundtrip_create_and_detect(self, tmpdir):
+ # create the spec in an empty directory, then re-scan and expect a match
+ path = str(tmpdir)
+ proj = projspec.Project(path)
+ proj.create("KnowledgeCatalog")
+ proj2 = projspec.Project(path)
+ assert "knowledge_catalog" in proj2
diff --git a/tests/test_roundtrips.py b/tests/test_roundtrips.py
index a1ed3a8..a3159e4 100644
--- a/tests/test_roundtrips.py
+++ b/tests/test_roundtrips.py
@@ -37,6 +37,7 @@
"MDBook",
"RTD",
"BackstageCatalog",
+ "KnowledgeCatalog",
# CI/CD — file-only _create()
"GitHubActions",
"GitLabCI",
diff --git a/tests/test_textapp_helpers.py b/tests/test_textapp_helpers.py
index d898d7e..191d49b 100644
--- a/tests/test_textapp_helpers.py
+++ b/tests/test_textapp_helpers.py
@@ -126,8 +126,15 @@ class TestFmtAge:
def _ts(self, days_ago: float) -> float:
return time.time() - days_ago * 86400
- def test_today(self):
- assert _fmt_age(self._ts(0.1)) == "today"
+    def test_just_now(self):
+        """A timestamp five seconds in the past is reported as "just now"."""
+        five_seconds_ago = time.time() - 5
+        assert _fmt_age(five_seconds_ago) == "just now"
+
+    def test_minutes(self):
+        """A timestamp five minutes in the past is reported in minutes."""
+        five_minutes_ago = time.time() - 5 * 60
+        assert _fmt_age(five_minutes_ago) == "5 minutes ago"
+
+    def test_hours(self):
+        """A timestamp a tenth of a day old is reported in whole hours."""
+        # 0.1 days ~= 2.4 hours -> reported in hours, not "today"
+        a_tenth_of_a_day_ago = self._ts(0.1)
+        assert _fmt_age(a_tenth_of_a_day_ago) == "2 hours ago"
def test_yesterday(self):
assert _fmt_age(self._ts(1.5)) == "yesterday"
@@ -333,6 +340,23 @@ def test_indentation_increases_for_nested(self):
# Outer indent: lines_2 should have more leading spaces
assert lines_2[0].startswith(" " * 2)
+    def test_html_repr_shown_as_placeholder(self):
+        """A large ``metadata.html_repr`` value is replaced by a short note
+        instead of being dumped into the rendered lines."""
+        # The payload has to be non-empty: "" is a substring of every string,
+        # so `assert big not in combined` could never pass with an empty value.
+        big = "<table>" + "<tr><td>cell</td></tr>" * 500 + "</table>"
+        lines = _yaml_lines({"metadata": {"html_repr": big}}, {}, 0)
+        combined = " ".join(lines)
+        # the giant raw HTML must not be dumped; show a short note instead
+        assert big not in combined
+        assert "html_repr" in combined
+        assert "HTML preview" in combined
+
+    def test_thumbnail_shown_as_placeholder(self):
+        """A ``metadata.thumbnail`` data URL is replaced by a short note in
+        the rendered lines; the key itself stays visible."""
+        url = "data:image/png;base64," + "A" * 5000
+        rendered = " ".join(_yaml_lines({"metadata": {"thumbnail": url}}, {}, 0))
+        assert url not in rendered
+        for fragment in ("thumbnail", "image thumbnail"):
+            assert fragment in rendered
+
# ---------------------------------------------------------------------------
# _wrap_chips
diff --git a/tests/test_webui.py b/tests/test_webui.py
index 06c4c96..bb11b2d 100644
--- a/tests/test_webui.py
+++ b/tests/test_webui.py
@@ -224,6 +224,23 @@ def test_panel_js_is_root_scoped():
assert "window.projspecRoot" in js
+def test_panel_js_embeds_dataset_preview():
+ """The shared panel.js must embed a content's ``metadata.html_repr`` as
+ live HTML (via sanitizeHtml + innerHTML) and ``metadata.thumbnail`` as an
+ <img>, rather than dumping their raw strings into the YAML tree."""
+ js = get_panel_js()
+ # preview keys are pulled out of metadata
+ assert "meta.html_repr" in js
+ assert "meta.thumbnail" in js
+ # and removed from the YAML tree via stripPreview
+ assert "stripPreview" in js
+ assert "renderYaml(stripPreview(stripKlass(data)))" in js
+ # html_repr is embedded as sanitised HTML; thumbnail as a data: image
+ assert "sanitizeHtml(htmlRepr)" in js
+ assert "thumbnailImg" in js
+ assert "data:image/" in js
+
+
def test_make_cwd_uses_project_path_not_library_key(tmp_path, monkeypatch):
"""Regression: Make must use the stored ``Project.path`` as the
subprocess cwd, never the library key.
diff --git a/vsextension/src/panel.ts b/vsextension/src/panel.ts
index 9f3d6a7..7149c9c 100644
--- a/vsextension/src/panel.ts
+++ b/vsextension/src/panel.ts
@@ -933,8 +933,18 @@ const PANEL_JS = String.raw`
}
}
function fmtAge(ts) {
- const days = Math.floor((Date.now() / 1000 - parseFloat(ts)) / 86400);
- if (days === 0) return 'today';
+ const secs = Math.floor(Date.now() / 1000 - parseFloat(ts));
+ if (secs < 0) return 'just now';
+ const days = Math.floor(secs / 86400);
+ if (days === 0) {
+ if (secs < 60) return 'just now';
+ if (secs < 3600) {
+ const m = Math.floor(secs / 60);
+ return m + ' minute' + (m !== 1 ? 's' : '') + ' ago';
+ }
+ const h = Math.floor(secs / 3600);
+ return h + ' hour' + (h !== 1 ? 's' : '') + ' ago';
+ }
if (days === 1) return 'yesterday';
if (days < 30) return days + ' days ago';
if (days < 365) return Math.floor(days / 30) + ' months ago';
@@ -1018,6 +1028,8 @@ const PANEL_JS = String.raw`
const by = project.last_modified_by != null ? project.last_modified_by : null;
metaParts.push('last modified ' + age + (by ? ' by ' + by : ''));
}
+ if (project.scanned_at != null)
+ metaParts.push('scanned ' + fmtAge(project.scanned_at));
if (metaParts.length > 0) {
const meta = document.createElement('div');
meta.className = 'meta';
@@ -1309,15 +1321,70 @@ const PANEL_JS = String.raw`
body.innerHTML = sanitizeHtml(html);
w.appendChild(body);
} else {
+ // Datasets (and other content) may carry rich previews in
+ // metadata.html_repr (an HTML fragment) and metadata.thumbnail
+ // (a data: image URL). Embed those rather than dumping their
+ // (often huge) raw strings into the YAML tree.
+ const meta = (kind === 'content' && data && typeof data === 'object'
+ && data.metadata && typeof data.metadata === 'object') ? data.metadata : null;
+ const htmlRepr = meta && typeof meta.html_repr === 'string' ? meta.html_repr : null;
+ const thumb = meta && typeof meta.thumbnail === 'string' ? meta.thumbnail : null;
+
const tree = document.createElement('div');
tree.className = 'tree yaml';
- tree.appendChild(renderYaml(stripKlass(data)));
+ tree.appendChild(renderYaml(stripPreview(stripKlass(data))));
w.appendChild(tree);
+
+ if (thumb) w.appendChild(thumbnailImg(thumb));
+ if (htmlRepr) {
+ const body = document.createElement('div');
+ body.className = 'widget-html';
+ body.innerHTML = sanitizeHtml(htmlRepr);
+ w.appendChild(body);
+ }
}
return w;
}
+    /**
+     * Drop the embedded-preview entries (metadata.html_repr and
+     * metadata.thumbnail) from a content dict before it is shown as YAML.
+     *
+     * The input is never modified. When obj is not a plain object, has no
+     * plain-object metadata, or carries neither preview key, the very same
+     * obj is returned; otherwise a shallow copy with a filtered metadata
+     * object is returned.
+     */
+    function stripPreview(obj) {
+      const isPlainRecord = (value) =>
+        Boolean(value) && typeof value === 'object' && !Array.isArray(value);
+      if (!isPlainRecord(obj) || !isPlainRecord(obj.metadata)) return obj;
+
+      const source = obj.metadata;
+      const names = Object.keys(source);
+      const retained = names.filter((name) => name !== 'html_repr' && name !== 'thumbnail');
+      // nothing was filtered out: hand back the original untouched
+      if (retained.length === names.length) return obj;
+
+      const trimmed = {};
+      retained.forEach((name) => { trimmed[name] = source[name]; });
+      const copy = {};
+      Object.keys(obj).forEach((name) => { copy[name] = obj[name]; });
+      copy.metadata = trimmed;
+      return copy;
+    }
+
+    /**
+     * Build a 'widget-html' wrapper <div> holding an <img> for a
+     * data:image/... thumbnail URL. Only data: image URLs are accepted
+     * (never remote/javascript URLs); for anything else the wrapper is
+     * returned empty.
+     */
+    function thumbnailImg(src) {
+      const container = document.createElement('div');
+      container.className = 'widget-html';
+      const isInlineImage = /^data:image\//i.test(src);
+      if (!isInlineImage) return container;
+
+      const picture = document.createElement('img');
+      picture.src = src;
+      picture.alt = 'thumbnail';
+      container.appendChild(picture);
+      return container;
+    }
+
/**
* Minimal sanitisation of content-provided HTML. The markup comes from
* the projspec library itself so we don't need a full DOMPurify - but we
diff --git a/vsextension/src/projspec.ts b/vsextension/src/projspec.ts
index 20afb8b..97ed9a8 100644
--- a/vsextension/src/projspec.ts
+++ b/vsextension/src/projspec.ts
@@ -171,6 +171,7 @@ export interface ProjectData {
is_writable?: string;
last_modified?: string;
last_modified_by?: string;
+ scanned_at?: string;
}
export interface SpecData {
diff --git a/vsextension/tsconfig.json b/vsextension/tsconfig.json
index 356580f..5e22142 100644
--- a/vsextension/tsconfig.json
+++ b/vsextension/tsconfig.json
@@ -8,6 +8,11 @@
],
"sourceMap": true,
"rootDir": "src",
+ "types": [
+ "node",
+ "vscode",
+ "mocha"
+ ],
"strict": true, /* enable all strict type-checking options */
/* Additional Checks */
// "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */