diff --git a/pycharm_plugin/src/main/kotlin/com/projspec/toolwindow/HtmlContent.kt b/pycharm_plugin/src/main/kotlin/com/projspec/toolwindow/HtmlContent.kt
index 2e81e49..102ce2b 100644
--- a/pycharm_plugin/src/main/kotlin/com/projspec/toolwindow/HtmlContent.kt
+++ b/pycharm_plugin/src/main/kotlin/com/projspec/toolwindow/HtmlContent.kt
@@ -556,8 +556,18 @@ body { margin: 0; padding: 0; font-family: var(--vscode-font-family); color: var
         }
     }
     function fmtAge(ts) {
-        const days = Math.floor((Date.now() / 1000 - parseFloat(ts)) / 86400);
-        if (days === 0) return 'today';
+        const secs = Math.floor(Date.now() / 1000 - parseFloat(ts));
+        if (secs < 0) return 'just now';
+        const days = Math.floor(secs / 86400);
+        if (days === 0) {
+            if (secs < 60) return 'just now';
+            if (secs < 3600) {
+                const m = Math.floor(secs / 60);
+                return m + ' minute' + (m !== 1 ? 's' : '') + ' ago';
+            }
+            const h = Math.floor(secs / 3600);
+            return h + ' hour' + (h !== 1 ? 's' : '') + ' ago';
+        }
         if (days === 1) return 'yesterday';
         if (days < 30) return days + ' days ago';
         if (days < 365) return Math.floor(days / 30) + ' months ago';
@@ -641,6 +651,8 @@ body { margin: 0; padding: 0; font-family: var(--vscode-font-family); color: var
             const by = project.last_modified_by != null ? project.last_modified_by : null;
             metaParts.push('last modified ' + age + (by ? ' by ' + by : ''));
         }
+        if (project.scanned_at != null)
+            metaParts.push('scanned ' + fmtAge(project.scanned_at));
         if (metaParts.length > 0) {
             const meta = document.createElement('div');
             meta.className = 'meta';
@@ -914,15 +926,60 @@ body { margin: 0; padding: 0; font-family: var(--vscode-font-family); color: var
             body.innerHTML = sanitizeHtml(html);
             w.appendChild(body);
         } else {
+            // Datasets (and other content) may carry rich previews in
+            // metadata.html_repr (an HTML fragment) and metadata.thumbnail
+            // (a data: image URL). Embed those rather than dumping their
+            // (often huge) raw strings into the YAML tree.
+            const meta = (kind === 'content' && data && typeof data === 'object'
+                && data.metadata && typeof data.metadata === 'object') ? data.metadata : null;
+            const htmlRepr = meta && typeof meta.html_repr === 'string' ? meta.html_repr : null;
+            const thumb = meta && typeof meta.thumbnail === 'string' ? meta.thumbnail : null;
+
             const tree = document.createElement('div');
             tree.className = 'tree yaml';
-            tree.appendChild(renderYaml(stripKlass(data)));
+            tree.appendChild(renderYaml(stripPreview(stripKlass(data))));
             w.appendChild(tree);
+
+            if (thumb) w.appendChild(thumbnailImg(thumb));
+            if (htmlRepr) {
+                const body = document.createElement('div');
+                body.className = 'widget-html';
+                body.innerHTML = sanitizeHtml(htmlRepr);
+                w.appendChild(body);
+            }
         }
 
         return w;
     }
 
+    function stripPreview(obj) {
+        // Returns obj itself, or a shallow copy whose metadata omits string html_repr/thumbnail.
+        if (!obj || typeof obj !== 'object' || Array.isArray(obj)) return obj;
+        const src = obj.metadata;
+        if (!src || typeof src !== 'object' || Array.isArray(src)) return obj;
+        const meta = {};
+        let changed = false;
+        for (const k of Object.keys(src)) {
+            // Only string previews are embedded by the caller; keep any other value visible.
+            const preview = (k === 'html_repr' || k === 'thumbnail') && typeof src[k] === 'string';
+            if (preview) changed = true; else meta[k] = src[k];
+        }
+        if (!changed) return obj;
+        return Object.assign({}, obj, { metadata: meta });
+    }
+
+    function thumbnailImg(src) {
+        // Only data:image/ URLs become an <img>; anything else yields an empty wrapper.
+        const wrap = document.createElement('div');
+        wrap.className = 'widget-html';
+        if (!/^data:image\//i.test(src)) return wrap;
+        const img = document.createElement('img');
+        img.src = src;
+        img.alt = 'thumbnail';
+        wrap.appendChild(img);
+        return wrap;
+    }
+
     function sanitizeHtml(html) {
         const tpl = document.createElement('template');
         tpl.innerHTML = String(html);
diff --git a/pyproject.toml b/pyproject.toml
index 3240168..380a1ea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,11 +28,12 @@ dependencies = [
     "fsspec",
     "click",
     "jinja2",
+    "intake==2.1.0a2"
 ]
 
 [project.optional-dependencies]
 test = ["pytest", "pytest-cov", "django", "streamlit", "copier", "jinja2-time", "flask",
-    "maturin", "uv", "briefcase"]
+    "maturin", "uv", "briefcase", "textual"]
 qt = ["pyqt>5,<6", "pyqtwebengin>5,<6"]
 textual = ["textual>=0.80"]
 ipywidget = ["anywidget>=0.9", "ipywidgets>=8", "ipython"]
diff --git a/src/projspec/__main__.py b/src/projspec/__main__.py
index ada5cb6..47141bc 100755
--- a/src/projspec/__main__.py
+++ b/src/projspec/__main__.py
@@ -95,12 +95,6 @@ def version():
     default=False,
     help="JSON output, for projects only",
 )
-@click.option(
-    "--html-out",
-    is_flag=True,
-    default=False,
-    help="HTML output, for projects only",
-)
 @click.option(
     "--walk", is_flag=True, help="Descend into child directories of each match"
 )
@@ -112,7 +106,6 @@ def scan(
     types,
     xtypes,
     json_out,
-    html_out,
     walk,
     summary,
     library,
@@ -146,8 +139,6 @@ def scan(
             else:
                 if json_out:
                     print(json.dumps(proj.to_dict(compact=False)))
-                elif html_out:
-                    print(proj._repr_html_())
                 else:
                     print(proj)
 
diff --git a/src/projspec/config.py b/src/projspec/config.py
index 7c0d2e5..9bf7d61 100644
--- a/src/projspec/config.py
+++ b/src/projspec/config.py
@@ -5,7 +5,7 @@
 
 from typing import Any
 
-conf: dict[str, dict[str, Any]] = {}
+conf: dict[str, Any] = {}
 default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/projspec")
 
 
@@ -33,12 +33,19 @@ def coerce(template, val):
 def defaults():
     return {
         "library_path": f"{conf_dir()}/library.json",
+        "auto_rescan": 7 * 24 * 60 * 60,  # one week, in seconds
         "scan_types": [".py", ".yaml", ".yml", ".toml", ".json", ".md"],
         "scan_max_files": 100,
         "scan_max_size": 5 * 2**10,
         "remote_artifact_status": False,
         "capture_artifact_output": True,
         "preferred_install_methods": ["conda", "pip"],
+        "data_min_fraction": 0.5,
+        "data_min_file_size": 1024 * 1024,
+        "data_min_total_size": 10 * 1024 * 1024,
+        "data_min_play_size": 1,  # 64 * 1024,
+        "data_consolidate_min_group": 3,
+        "data_inspect_max_datasets": 50,
         "excludes": [
             "bld",
             "build",
@@ -56,6 +63,11 @@ def defaults():
 
 config_doc = {
     "library_path": "location of persisted project objects",
+    "auto_rescan": (
+        "maximum age (seconds) of a project loaded from the library before it "
+        "is automatically rescanned and re-saved. Set to 0 to disable "
+        "automatic rescanning. Default is one week."
+    ),
     "scan_types": "files extensions automatically read for scanning",
     "scan_max_files": "don't scan files if more than this number in the project",
     "scan_max_size": "don't scan files bigger than this (in bytes)",
@@ -68,6 +80,34 @@ def defaults():
         "ordered list of preferred installer names for install_tool(), "
         "e.g. ['uv', 'conda', 'pip']. Empty list uses the platform default."
     ),
+    "data_min_fraction": (
+        "fraction (0-1) of a project's total bytes that must be data files "
+        "before a code/other project is also reported as a DataProject. Data "
+        "below this fraction is only scanned if the project matches no other "
+        "type, or individual files exceed data_min_file_size."
+    ),
+    "data_min_file_size": (
+        "a single data file at or above this size (bytes) is considered "
+        "significant enough to scan even in an otherwise code project."
+    ),
+    "data_min_total_size": (
+        "minimum total size (bytes) of candidate data before a directory that "
+        "also matches another project type is additionally reported as a "
+        "DataProject (used together with data_min_fraction)."
+    ),
+    "data_min_play_size": (
+        "floor (bytes) below which even a directory that matches no other "
+        "project type is dismissed as toy/play data and not reported as a "
+        "DataProject."
+    ),
+    "data_consolidate_min_group": (
+        "minimum number of numbered/related files (e.g. 001.csv, 002.csv) that "
+        "are consolidated into a single dataset."
+    ),
+    "data_inspect_max_datasets": (
+        "do not run intake inspection if more than this many distinct datasets "
+        "are found in a directory (avoids huge scans)."
+    ),
     "excludes": (
         "directory names to skip when walking a project tree for child projects "
         "and file statistics. Directories whose names start with '.' or '_' are "
diff --git a/src/projspec/content/__init__.py b/src/projspec/content/__init__.py
index 4c02338..fcf96d8 100644
--- a/src/projspec/content/__init__.py
+++ b/src/projspec/content/__init__.py
@@ -7,11 +7,16 @@
     PipelineStage,
     ServiceDependency,
 )
-from projspec.content.data import TabularData, IntakeSource
+from projspec.content.data import (
+    Dataset,
+    FrictionlessData,
+    IntakeSource,
+    TabularData,
+)
 from projspec.content.env_var import EnvironmentVariables
 from projspec.content.environment import Environment, Stack, Precision
 from projspec.content.executable import Command
-from projspec.content.metadata import DescriptiveMetadata, License
+from projspec.content.metadata import Citation, DescriptiveMetadata, License
 from projspec.content.package import PythonPackage
 from projspec.content.vcs import VCSInfo
 
@@ -22,10 +27,13 @@
     "GithubAction",
     "PipelineStage",
     "ServiceDependency",
-    "TabularData",
+    "Dataset",
+    "FrictionlessData",
     "IntakeSource",
+    "TabularData",
     "EnvironmentVariables",
     "Command",
+    "Citation",
     "License",
     "DescriptiveMetadata",
     "PythonPackage",
diff --git a/src/projspec/content/data.py b/src/projspec/content/data.py
index 3ebdf47..fc6997e 100644
--- a/src/projspec/content/data.py
+++ b/src/projspec/content/data.py
@@ -1,4 +1,9 @@
-"""Contents specifying datasets"""
+"""Content classes describing datasets found within a project.
+
+These describe data assets in a formal way, without loading the data. Most
+of them mirror the things that ``intake`` (v2, ``intake.readers``) can tell us
+about a URL/glob/list of files via :func:`intake.readers.inspect.inspect_dataset`.
+"""
 
 from dataclasses import dataclass, field
 
@@ -7,107 +12,83 @@
 
 @dataclass
 class TabularData(BaseContent):
-    """A tabular dataset, columns and rows
+    """A tabular (columnar) dataset, e.g. CSV/parquet/SQL.
 
-    This lists loadable tabular files with defined schema, typically from formats such as
-    JSON, CSV, and parquet.
+    ``schema`` is a free-form mapping describing the columns; its exact form
+    depends on where it was sourced (FrictionlessData resource schema, a
+    HuggingFace ``features`` block, or intake's ``datashape``).
     """
 
     icon = "📊"
 
     name: str
+    schema: dict = field(default_factory=dict)
     metadata: dict = field(default_factory=dict)
-    # allowed schema formats:
-    #  - dtype-like {fieldname: string-type}
-    #  - dtype-complex {fieldname: {...}}
-    #  - list like [{name:, ...}]
-    # We may choose to normalise to just one of these eventually
-    schema: dict | list = field(default_factory=dict)
 
 
 @dataclass
-class IntakeSource(BaseContent):
-    """A catalog of data assets, including basic properties (location) and how to load/process them.
+class FrictionlessData(BaseContent):
+    """A data resource described by the FrictionlessData standard.
 
-    See https://intake.readthedocs.io/en/latest/
+    See https://datapackage.org/standard/data-resource/ .
     """
 
+    icon = "🪪"
+
+    name: str
+    schema: dict = field(default_factory=dict)
+
+
+@dataclass
+class IntakeSource(BaseContent):
+    """A named entry in an intake catalog."""
+
     icon = "📖"
 
-    # TODO: add better fields: args, driver/reader, metadata, description
     name: str
 
 
 @dataclass
-class DataResource(BaseContent):
-    """A data resource found inside a data-only directory.
-
-    Describes one logical dataset — which may be a flat collection of files, a
-    Hive-partitioned tree, an Iceberg/Delta table, a Zarr store, or any other
-    recognised on-disk layout.
-
-    The `path` field is a human-readable basename that identifies the resource:
-
-    - Single file: `"data.csv"`
-    - Multi-file series: `"part*.parquet"` (glob-style, common prefix + `*` + ext)
-    - Directory-as-dataset (Hive partition, Zarr store, …): `"year=2024/"`
-
-    The `modality` field classifies the broad nature of the data using the
-    vocabulary established by intake's `structure` tags and napari's layer
-    type system:
-
-    - `"tabular"`    — row/column data (CSV, Parquet, ORC, Excel, …)
-    - `"array"`      — N-dimensional arrays (NumPy, HDF5, NetCDF, Zarr, …)
-    - `"image"`      — 2-D/3-D images (PNG, JPEG, TIFF, DICOM, NIfTI, …)
-    - `"timeseries"` — time-indexed signals (WAV, GRIB, …)
-    - `"geospatial"` — vector/raster geodata (Shapefile, GeoJSON, GeoTIFF, …)
-    - `"model"`      — ML model weights (GGUF, SafeTensors, PyTorch, …)
-    - `"nested"`     — hierarchical / JSON-like (Avro, YAML, XML, …)
-    - `"document"`   — human-readable documents (PDF, DOCX, …)
-    - `"video"`      — video streams (MP4, AVI, …)
-    - `"archive"`    — compressed bundles (ZIP, tar.gz, …)
-    - `""`           — unknown / mixed
-
-    The `schema` field is format-specific:
-
-    - Tabular (Parquet, Arrow, CSV, …): `{column_name: dtype_str, …}`
-    - Image / array: `{"width": int, "height": int, "channels": int, "mode": str}`
-    - Audio: `{"sample_rate": int, "channels": int, "frames": int}`
-    - HDF5 / Zarr / NetCDF: `{"variables": [...], "dims": {...}, "attrs": {...}}`
-    - Unknown / library not available: `{}`
+class Dataset(BaseContent):
+    """A generic dataset discovered on disk and described by intake.
+
+    This is produced by :class:`projspec.proj.data_project.DataProject` after
+    scanning files/globs with :func:`intake.readers.inspect.inspect_dataset`.
+
+    The dataset's short identifying name is *not* stored on the object: a
+    :class:`DataProject` exposes its datasets as an ``AttrDict`` keyed by that
+    name (e.g. ``proj.contents.dataset["*.csv"]``), so duplicating it here
+    would be redundant.
+
+    Attributes
+    ----------
+    url:
+        The URL, glob or list of URLs that make up this dataset, relative to
+        (or rooted at) the project directory.
+    datatype:
+        The intake ``BaseData`` subclass name detected (e.g. ``"CSV"``,
+        ``"Parquet"``), or ``None`` if intake could not identify the type.
+    structure:
+        Structural tags reported by intake, stored as a list (e.g.
+        ``["table"]``, ``["array", "image"]``).
+    schema:
+        The ``datashape`` mapping returned by intake (columns/dtypes, dims,
+        etc.); empty if no reader could describe the data.
+    n_files:
+        Number of files that make up the dataset (after glob expansion).
+    total_size:
+        Total bytes across all files in the dataset, if known.
+    metadata:
+        Any other useful summary information from intake (shape, npartitions,
+        recommended readers, description, …).
     """
 
-    icon = "📥"
-
-    path: str  # basename (or glob pattern / dir/ ) identifying this resource
-    format: str  # canonical format string, e.g. "parquet", "csv", "png", "hdf5"
-    modality: str = ""  # broad data nature; see docstring for vocabulary
-    layout: str = ""  # "flat"|"hive"|"iceberg"|"delta"|"zarr_store"|"tiledarray"|""
-    file_count: int = 0
-    total_size: int = 0  # bytes; 0 when unknown (e.g. remote FS without size info)
-    schema: dict | list = field(default_factory=dict)
-    # full path to one representative file, for use by preview loaders
-    sample_path: str = ""
-    metadata: dict = field(default_factory=dict)  # catch-all extras
-    _html = None
-
-    def __repr__(self) -> str:
-        from projspec.content.data_html import repr_text
-
-        return repr_text(self)
-
-    def _repr_html_(self) -> str:
-        """Jupyter rich display — returns cached HTML, rendering on first call."""
-        # TODO: this is probably not what we want jupyter to dysplay, but it's
-        #  convenient for now.
-        if self._html is None:
-            from projspec.content.data_html import repr_html
-
-            self._html = repr_html(self)
-        return self._html
-
-    def to_dict(self, compact=False):
-        d = super().to_dict(compact=compact)
-        if not compact:
-            d["_html"] = self._repr_html_()
-        return d
+    icon = "🗃️"
+
+    url: str | list[str] = ""
+    datatype: str | None = None
+    structure: list[str] = field(default_factory=list)
+    schema: dict = field(default_factory=dict)
+    n_files: int = 1
+    total_size: int | None = None
+    metadata: dict = field(default_factory=dict)
diff --git a/src/projspec/content/data_html.py b/src/projspec/content/data_html.py
deleted file mode 100644
index 530fb60..0000000
--- a/src/projspec/content/data_html.py
+++ /dev/null
@@ -1,632 +0,0 @@
-"""Text and HTML representations for DataResource.
-
-``repr_text``  — plain-text one-liner for ``__repr__``.
-``repr_html``  — rich HTML card for Jupyter's ``_repr_html_`` protocol.
-
-The HTML card has two sections:
-
-1. **Metadata table** — name, format, modality, layout, file count, total size,
-   schema (collapsed by default when it has many entries).
-
-2. **Preview** (optional) — a lightweight peek at the actual data using
-   whichever optional library is available for the format.  The section is
-   silently omitted when no suitable loader can be imported.
-
-All loader imports are guarded with ``try/except ImportError`` so that the
-representation degrades gracefully when optional dependencies are absent.
-"""
-
-from __future__ import annotations
-
-import base64
-import html as _html
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from projspec.content.data import DataResource
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-_MODALITY_ICON: dict[str, str] = {
-    "tabular": "&#x1F4CA;",  # 📊
-    "image": "&#x1F5BC;",  # 🖼
-    "array": "&#x1F9EE;",  # 🧮
-    "timeseries": "&#x1F4C8;",  # 📈
-    "geospatial": "&#x1F30D;",  # 🌍
-    "model": "&#x1F9E0;",  # 🧠
-    "nested": "&#x1F4C2;",  # 📂
-    "document": "&#x1F4C4;",  # 📄
-    "video": "&#x1F3AC;",  # 🎬
-    "archive": "&#x1F4E6;",  # 📦
-    "": "&#x1F5C2;",  # 🗂
-}
-
-
-def _fmt_size(n: int) -> str:
-    """Human-readable byte count."""
-    if n <= 0:
-        return "unknown"
-    for unit in ("B", "KB", "MB", "GB", "TB"):
-        if n < 1024:
-            return f"{n:.1f} {unit}" if unit != "B" else f"{n} B"
-        n /= 1024  # type: ignore[assignment]
-    return f"{n:.1f} PB"
-
-
-def _esc(s: object) -> str:
-    return _html.escape(str(s))
-
-
-# ---------------------------------------------------------------------------
-# Plain-text repr
-# ---------------------------------------------------------------------------
-
-
-def repr_text(dr: "DataResource") -> str:
-    """One-line text representation of a DataResource."""
-    size = _fmt_size(dr.total_size)
-    schema_hint = ""
-    if isinstance(dr.schema, dict) and dr.schema:
-        keys = list(dr.schema)[:3]
-        extra = f", +{len(dr.schema) - 3} more" if len(dr.schema) > 3 else ""
-        schema_hint = f" [{', '.join(str(k) for k in keys)}{extra}]"
-    elif isinstance(dr.schema, list) and dr.schema:
-        schema_hint = f" [{len(dr.schema)} fields]"
-
-    parts = [
-        f"DataResource({dr.path!r}",
-        f"format={dr.format!r}",
-    ]
-    if dr.modality:
-        parts.append(f"modality={dr.modality!r}")
-    if dr.layout and dr.layout not in ("flat", ""):
-        parts.append(f"layout={dr.layout!r}")
-    parts.append(f"files={dr.file_count}")
-    parts.append(f"size={size}")
-    if schema_hint:
-        parts.append(f"schema={schema_hint.strip()}")
-    return ", ".join(parts) + ")"
-
-
-# ---------------------------------------------------------------------------
-# HTML repr
-# ---------------------------------------------------------------------------
-
-# No inline styles — class names are present for external styling by the
-# host environment (Jupyter, VS Code webview, etc.).
-_CARD_CSS = ""
-
-
-def repr_html(dr: "DataResource") -> str:
-    """Rich HTML card representation of a DataResource."""
-    icon = _MODALITY_ICON.get(dr.modality, _MODALITY_ICON[""])
-    size_str = _fmt_size(dr.total_size)
-
-    # ---- header ----
-    modality_badge = (
-        f'<span class="ps-badge">{_esc(dr.modality)}</span>' if dr.modality else ""
-    )
-    format_badge = f'<span class="ps-badge-gray">{_esc(dr.format)}</span>'
-    layout_badge = (
-        f'<span class="ps-badge-gray">{_esc(dr.layout)}</span>'
-        if dr.layout and dr.layout not in ("flat", "")
-        else ""
-    )
-
-    header = (
-        f'<div class="ps-data-card-header">'
-        f'<span class="ps-icon">{icon}</span>'
-        f'<span class="ps-name">{_esc(dr.path)}</span>'
-        f"{modality_badge}{format_badge}{layout_badge}"
-        f"</div>"
-    )
-
-    # ---- metadata table ----
-    meta_rows = [
-        ("Files", str(dr.file_count)),
-        ("Total size", size_str),
-    ]
-
-    meta_html_rows = "".join(
-        f"<tr><td>{_esc(k)}</td><td>{v}</td></tr>" for k, v in meta_rows
-    )
-    schema_html = _render_schema(dr.schema)
-
-    meta_section = (
-        f'<div class="ps-data-meta">'
-        f"<table>{meta_html_rows}</table>"
-        f"{schema_html}"
-        f"</div>"
-    )
-
-    # ---- preview ----
-    preview_html = _build_preview(dr)
-    preview_section = ""
-    if preview_html:
-        preview_section = (
-            f'<div class="ps-preview">'
-            f'<div class="ps-preview-title">Preview</div>'
-            f"{preview_html}"
-            f"</div>"
-        )
-
-    return (
-        _CARD_CSS
-        + f'<div class="ps-data-card">'
-        + header
-        + meta_section
-        + preview_section
-        + "</div>"
-    )
-
-
-# ---------------------------------------------------------------------------
-# Schema rendering
-# ---------------------------------------------------------------------------
-
-
-def _render_schema(schema: dict | list) -> str:
-    """Render schema as a collapsible HTML block."""
-    if not schema:
-        return ""
-
-    if isinstance(schema, dict):
-        # Tabular-style {col: dtype} or structural {"variables": [...], ...}
-        rows = ""
-        for k, v in schema.items():
-            rows += f"<tr><td>{_esc(k)}</td><td>{_esc(v)}</td></tr>"
-        table = (
-            f'<table class="ps-schema-table">'
-            f"<tr><th>Field</th><th>Type / Value</th></tr>"
-            f"{rows}"
-            f"</table>"
-        )
-        n = len(schema)
-        open_attr = "open" if n <= 8 else ""
-        return (
-            f'<details {open_attr} style="margin-top:6px">'
-            f'<summary class="ps-schema-toggle">Schema ({n} {"field" if n == 1 else "fields"})</summary>'
-            f"{table}</details>"
-        )
-
-    if isinstance(schema, list):
-        # List-of-dicts (frictionless style) or plain list
-        if schema and isinstance(schema[0], dict):
-            # Render each dict as a row; use union of all keys as columns
-            all_keys: list[str] = []
-            for item in schema:
-                for k in item:
-                    if k not in all_keys:
-                        all_keys.append(k)
-            header_row = "".join(f"<th>{_esc(k)}</th>" for k in all_keys)
-            body_rows = ""
-            for item in schema:
-                cells = "".join(f"<td>{_esc(item.get(k, ''))}</td>" for k in all_keys)
-                body_rows += f"<tr>{cells}</tr>"
-            table = (
-                f'<table class="ps-schema-table">'
-                f"<tr>{header_row}</tr>{body_rows}</table>"
-            )
-        else:
-            items_html = "".join(f"<li>{_esc(s)}</li>" for s in schema)
-            table = f"<ul style='margin:4px 0;padding-left:18px'>{items_html}</ul>"
-
-        n = len(schema)
-        open_attr = "open" if n <= 8 else ""
-        return (
-            f'<details {open_attr} style="margin-top:6px">'
-            f'<summary class="ps-schema-toggle">Schema ({n} {"field" if n == 1 else "fields"})</summary>'
-            f"{table}</details>"
-        )
-
-    return ""
-
-
-# ---------------------------------------------------------------------------
-# Preview builders — one function per modality family, all return HTML str
-# or None when no loader is available.
-# ---------------------------------------------------------------------------
-
-#: How many rows to show in tabular previews.
-_PREVIEW_ROWS = 5
-
-
-def _obj_to_preview_html(obj) -> str:
-    """Return the richest HTML string available for *obj*.
-
-    Tries ``_repr_html_()`` first (pandas DataFrame, polars DataFrame, xarray
-    Dataset, …), then falls back to ``__repr__``.  The result is always
-    wrapped in a ``<div>`` so callers can rely on valid HTML.
-    """
-    if hasattr(obj, "_repr_html_"):
-        try:
-            h = obj._repr_html_()
-            if h:
-                return f'<div class="ps-df-wrap">{h}</div>'
-        except Exception:
-            pass
-    return f'<div class="ps-df-wrap"><pre>{_esc(repr(obj))}</pre></div>'
-
-
-def _build_preview(dr: "DataResource") -> str | None:
-    """Return an HTML preview fragment, or None if not possible."""
-    fmt = dr.format
-    modality = dr.modality
-    sample = dr.sample_path if dr.sample_path else None
-
-    if sample is None:
-        return None
-
-    if modality == "tabular":
-        return _preview_tabular(dr, sample)
-    if modality == "image":
-        return _preview_image(dr, sample)
-    if modality == "array":
-        return _preview_array(dr, sample)
-    if modality == "timeseries" and fmt in ("wav", "flac", "mp3", "ogg"):
-        return _preview_audio(dr, sample)
-    return None
-
-
-# --- tabular ---
-
-
-def _preview_tabular(dr: "DataResource", path: str) -> str | None:
-    fmt = dr.format
-    fs = dr.proj.fs
-
-    try:
-        if fmt == "parquet":
-            return _preview_parquet(fs, path)
-        if fmt == "csv":
-            return _preview_csv(fs, path)
-        if fmt in ("tsv", "psv"):
-            sep = "\t" if fmt == "tsv" else "|"
-            return _preview_csv(fs, path, sep=sep)
-        if fmt == "arrow":
-            return _preview_arrow(fs, path)
-        if fmt == "jsonlines":
-            return _preview_jsonlines(fs, path)
-        if fmt == "excel":
-            return _preview_excel(fs, path)
-        if fmt in ("sqlite", "duckdb"):
-            return _preview_sql(fs, path, fmt)
-        if fmt == "orc":
-            return _preview_orc(fs, path)
-    except Exception:
-        pass
-    return None
-
-
-def _preview_parquet(fs, path: str) -> str | None:
-    """Read only the first row group (or N rows from it) — no full file scan."""
-    try:
-        import pyarrow.parquet as pq
-
-        with fs.open(path, "rb") as fh:
-            pf = pq.ParquetFile(fh)
-            # read_row_group reads one row group's pages, not the whole file
-            batch = pf.read_row_group(0)
-            if batch.num_rows > _PREVIEW_ROWS:
-                batch = batch.slice(0, _PREVIEW_ROWS)
-        # Convert to pandas so we get _repr_html_() for free
-        df = batch.to_pandas()
-        return _obj_to_preview_html(df)
-    except ImportError:
-        pass
-    try:
-        # polars can read a row-count-limited slice without decoding the rest
-        import polars as pl
-
-        with fs.open(path, "rb") as fh:
-            df = pl.read_parquet(fh, n_rows=_PREVIEW_ROWS)
-        return _obj_to_preview_html(df)
-    except ImportError:
-        pass
-    return None
-
-
-def _preview_csv(fs, path: str, sep: str = ",") -> str | None:
-    # pandas nrows= stops parsing after N data lines — minimal I/O
-    try:
-        import pandas as pd
-
-        with fs.open(path, "r", encoding="utf-8", errors="replace") as fh:
-            df = pd.read_csv(fh, sep=sep, nrows=_PREVIEW_ROWS)
-        return _obj_to_preview_html(df)
-    except ImportError:
-        pass
-    try:
-        import polars as pl
-
-        with fs.open(path, "rb") as fh:
-            df = pl.read_csv(fh, n_rows=_PREVIEW_ROWS, separator=sep)
-        return _obj_to_preview_html(df)
-    except ImportError:
-        pass
-    return None
-
-
-def _preview_arrow(fs, path: str) -> str | None:
-    """Read only the first record batch — no full file deserialisation."""
-    try:
-        import pyarrow.ipc as ipc
-
-        with fs.open(path, "rb") as fh:
-            try:
-                # IPC file format: random-access; read just batch 0
-                reader = ipc.open_file(fh)
-                batch = reader.get_batch(0)
-            except Exception:
-                fh.seek(0)
-                # IPC stream format: sequential; read just the first batch
-                reader = ipc.open_stream(fh)
-                batch = reader.read_next_batch()
-        if batch.num_rows > _PREVIEW_ROWS:
-            batch = batch.slice(0, _PREVIEW_ROWS)
-        df = batch.to_pandas()
-        return _obj_to_preview_html(df)
-    except ImportError:
-        pass
-    return None
-
-
-def _preview_jsonlines(fs, path: str) -> str | None:
-    # pandas nrows= stops reading after N lines
-    try:
-        import pandas as pd
-
-        with fs.open(path, "r", encoding="utf-8", errors="replace") as fh:
-            df = pd.read_json(fh, lines=True, nrows=_PREVIEW_ROWS)
-        return _obj_to_preview_html(df)
-    except ImportError:
-        pass
-    return None
-
-
-def _preview_excel(fs, path: str) -> str | None:
-    # nrows= limits rows read from the sheet
-    try:
-        import pandas as pd
-
-        with fs.open(path, "rb") as fh:
-            df = pd.read_excel(fh, nrows=_PREVIEW_ROWS)
-        return _obj_to_preview_html(df)
-    except ImportError:
-        pass
-    return None
-
-
-def _preview_sql(fs, path: str, fmt: str) -> str | None:
-    # SQLite/DuckDB: only works with a local path (not a remote FS)
-    try:
-        if getattr(fs, "protocol", "file") not in ("file", "local", ""):
-            return None
-        if fmt == "duckdb":
-            try:
-                import duckdb
-
-                con = duckdb.connect(path, read_only=True)
-                tables = con.execute("SHOW TABLES").fetchall()
-                if not tables:
-                    return None
-                tname = tables[0][0]
-                df = con.execute(
-                    f'SELECT * FROM "{tname}" LIMIT {_PREVIEW_ROWS}'
-                ).fetchdf()
-                return _obj_to_preview_html(df)
-            except ImportError:
-                pass
-        else:
-            import sqlite3
-            import pandas as pd
-
-            con = sqlite3.connect(path)
-            cur = con.cursor()
-            cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
-            tables = cur.fetchall()
-            if not tables:
-                return None
-            tname = tables[0][0]
-            df = pd.read_sql(f'SELECT * FROM "{tname}" LIMIT {_PREVIEW_ROWS}', con)
-            return _obj_to_preview_html(df)
-    except Exception:
-        pass
-    return None
-
-
-def _preview_orc(fs, path: str) -> str | None:
-    try:
-        import pyarrow.orc as orc
-
-        with fs.open(path, "rb") as fh:
-            table = orc.ORCFile(fh).read().slice(0, _PREVIEW_ROWS)
-        df = table.to_pandas()
-        return _obj_to_preview_html(df)
-    except ImportError:
-        pass
-    return None
-
-
-# --- image ---
-
-
-def _preview_image(dr: "DataResource", path: str) -> str | None:
-    try:
-        from PIL import Image
-        import io
-
-        fs = dr.proj.fs
-        with fs.open(path, "rb") as fh:
-            raw: bytes = fh.read()
-
-        img = Image.open(io.BytesIO(raw))
-        img.thumbnail((600, 200))
-
-        buf = io.BytesIO()
-        # Save as PNG for lossless display regardless of source format
-        rgb = img.convert("RGB") if img.mode not in ("RGB", "L", "RGBA") else img
-        rgb.save(buf, format="PNG")
-        b64 = base64.b64encode(buf.getvalue()).decode("ascii")
-
-        w, h = img.size
-        schema = dr.schema if isinstance(dr.schema, dict) else {}
-        info = f"{schema.get('width', w)}×{schema.get('height', h)}"
-        if "mode" in schema:
-            info += f", mode={schema['mode']}"
-
-        return (
-            f'<div><img class="ps-img-preview" src="data:image/png;base64,{b64}" '
-            f'alt="{_esc(dr.path)}" />'
-            f'<div style="font-size:11px;color:#666;margin-top:3px">{_esc(info)}</div></div>'
-        )
-    except ImportError:
-        pass
-    except Exception:
-        pass
-    return None
-
-
-# --- array ---
-
-
-def _preview_array(dr: "DataResource", path: str) -> str | None:
-    fmt = dr.format
-    fs = dr.proj.fs
-
-    if fmt == "numpy":
-        return _preview_numpy(fs, path)
-    if fmt == "hdf5":
-        return _preview_hdf5(fs, path)
-    if fmt == "netcdf":
-        return _preview_netcdf(fs, path)
-    if fmt == "zarr":
-        return _preview_zarr(dr)
-    return None
-
-
-def _array_info_html(info: dict) -> str:
-    rows = "".join(
-        f"<tr><td><strong>{_esc(k)}</strong></td><td>{_esc(v)}</td></tr>"
-        for k, v in info.items()
-    )
-    return f'<table class="ps-schema-table" style="margin-top:0">{rows}</table>'
-
-
-def _preview_numpy(fs, path: str) -> str | None:
-    """Read only the .npy header to get shape/dtype, then load a minimal slice."""
-    try:
-        import numpy as np
-        import numpy.lib.format as nf
-        import io
-
-        with fs.open(path, "rb") as fh:
-            raw_header = fh.read(512)  # header is always ≤ 512 bytes
-
-        buf = io.BytesIO(raw_header)
-        nf.read_magic(buf)
-        # read_array_header_1_0 is the stable API across numpy versions;
-        # newer numpy also exposes read_array_header — try both.
-        try:
-            shape, _, dtype = nf.read_array_header_1_0(buf)
-        except AttributeError:
-            shape, _, dtype = nf.read_array_header(buf)  # type: ignore[attr-defined]
-
-        info: dict = {"shape": str(shape), "dtype": str(dtype)}
-
-        # Load the full array only when it's small enough (≤ 1 MB heuristic)
-        # or when we can cheaply slice the first N rows.
-        try:
-            total_elements = 1
-            for s in shape:
-                total_elements *= s
-            item_size = np.dtype(dtype).itemsize
-            if total_elements * item_size <= 1_048_576:
-                with fs.open(path, "rb") as fh:
-                    arr = np.load(io.BytesIO(fh.read()), allow_pickle=False)
-                sliced = arr[:_PREVIEW_ROWS] if arr.ndim >= 1 else arr
-                info["preview"] = repr(sliced)
-        except Exception:
-            pass
-
-        return _array_info_html(info)
-    except Exception:
-        pass
-    return None
-
-
-def _preview_hdf5(fs, path: str) -> str | None:
-    """Open the HDF5 file and read only metadata — no array data loaded."""
-    try:
-        import h5py
-
-        with fs.open(path, "rb") as fh:
-            with h5py.File(fh, "r") as f:
-                keys = list(f.keys())[:8]
-                info: dict = {"top-level keys": ", ".join(keys) or "(none)"}
-                for k in keys[:3]:
-                    obj = f[k]
-                    if hasattr(obj, "shape"):
-                        info[k] = f"shape={obj.shape}, dtype={obj.dtype}"
-                    else:
-                        info[k] = f"group ({len(obj)} members)"
-        return _array_info_html(info)
-    except ImportError:
-        pass
-    return None
-
-
-def _preview_netcdf(fs, path: str) -> str | None:
-    """Open the dataset lazily (no data loaded) and render its repr."""
-    try:
-        import xarray as xr
-
-        with fs.open(path, "rb") as fh:
-            # engine="scipy" reads lazily; no array data is decoded here
-            ds = xr.open_dataset(fh, engine="scipy")
-        # xarray Dataset has a rich _repr_html_()
-        return _obj_to_preview_html(ds)
-    except ImportError:
-        pass
-    return None
-
-
-def _preview_zarr(dr: "DataResource") -> str | None:
-    """Use the schema cached at parse time — zero extra I/O."""
-    schema = dr.schema
-    if not schema or not isinstance(schema, dict):
-        return None
-    info = {}
-    if "arrays" in schema:
-        info["arrays"] = ", ".join(str(a) for a in schema["arrays"][:8]) or "(none)"
-    if "groups" in schema:
-        info["groups"] = ", ".join(str(g) for g in schema["groups"][:8]) or "(none)"
-    if "attrs" in schema:
-        info["attrs"] = str(dict(list(schema["attrs"].items())[:4]))
-    return _array_info_html(info) if info else None
-
-
-# --- audio ---
-
-
-def _preview_audio(dr: "DataResource", path: str) -> str | None:
-    """Read only the audio file header — no sample data loaded."""
-    try:
-        import soundfile as sf
-
-        fs = dr.proj.fs
-        with fs.open(path, "rb") as fh:
-            info = sf.info(fh)
-        details = {
-            "sample rate": f"{info.samplerate} Hz",
-            "channels": str(info.channels),
-            "duration": f"{info.frames / info.samplerate:.2f} s",
-            "format": info.format,
-            "subtype": info.subtype,
-        }
-        return _array_info_html(details)
-    except ImportError:
-        pass
-    return None
diff --git a/src/projspec/html.py b/src/projspec/html.py
deleted file mode 100644
index eb62124..0000000
--- a/src/projspec/html.py
+++ /dev/null
@@ -1,47 +0,0 @@
-def dict_to_html(data: dict, title="Data", open_level=2) -> str:
-    """
-    Convert a nested dictionary to expandable HTML using <details> tags.
-
-    Args:
-        data: The dictionary to convert
-        title: Title for the details element
-        open_level: whether to set elements as expanded; yes if > 0, and will
-            decrement for inner levels.
-
-    Returns:
-        String containing HTML with expandable details elements
-    """
-    # With help from Claude Sonnet 4.
-    if not isinstance(data, dict):
-        return f"<span>{data}</span>"
-
-    if not data:
-        return ""
-    open = "open" if open_level > 0 else "closed"
-
-    html = [
-        f'<details {open} style="margin-left: 20px; margin-bottom: 10px;"><summary style="cursor: pointer;'
-        f' color: #2c5aa0; padding: 5px;"><strong>{title}</strong></summary>'
-    ]
-
-    for key, value in data.items():
-        if isinstance(value, dict):
-            html.append(dict_to_html(value, key, open_level - 1))
-        elif isinstance(value, (list, tuple)):
-            html.append(
-                f'<details style="margin-left: 20px; margin-bottom: 10px;"><summary style="cursor: pointer;'
-                f' color: #2c5aa0; padding: 5px;"><strong>{key}</strong></summary>'
-            )
-            for i, item in enumerate(value):
-                if isinstance(item, dict):
-                    html.append(dict_to_html(item, f"{key}[{i}]", open_level - 1))
-                else:
-                    html.append(f'<div style=" margin: 5px 0;"> {item}</div>')
-            html.append("</details>")
-        else:
-            html.append(
-                f'<div style=" margin: 5px 0;"><strong>{key}:</strong> {value}</div>'
-            )
-
-    html.append("</details>")
-    return "".join(html)
diff --git a/src/projspec/library.py b/src/projspec/library.py
index db6bcb5..0b3f313 100644
--- a/src/projspec/library.py
+++ b/src/projspec/library.py
@@ -1,10 +1,12 @@
 import json
 import os
+import time
 
 import fsspec
 
 from projspec.config import get_conf
 from projspec.proj import Project
+from projspec.utils import DEFAULT
 
 
 class ProjectLibrary:
@@ -15,14 +17,28 @@ class ProjectLibrary:
 
     # TODO: support for remote libraries
 
-    def __init__(self, library_path: str | None = None, auto_save: bool = True):
-        self.path = library_path or get_conf("library_path")
-        self.entries: dict[str, Project] = {}
+    def __init__(
+        self,
+        library_path: str | None | type = DEFAULT,
+        auto_save: bool = True,
+        entries: dict | None = None,
+    ):
+        self.path = (
+            get_conf("library_path") if library_path is DEFAULT else library_path
+        )
+        self.entries: dict[str, Project] = {} if entries is None else entries
         self.auto_save = auto_save
         self.load()
 
     def load(self):
-        """Loads scanned project objects from JSON file"""
+        """Loads scanned project objects from JSON file.
+
+        Any entry whose last scan is older than the ``auto_rescan`` config
+        value (in seconds) is automatically rescanned and the refreshed
+        library is saved back. Set ``auto_rescan`` to 0 to disable this.
+        """
+        if self.path is None:
+            return
         try:
             with fsspec.open(self.path, "r") as f:
                 self.entries = {
@@ -30,6 +46,35 @@ def load(self):
                 }
         except FileNotFoundError:
             self.entries = {}
+            return
+        self._auto_rescan()
+
+    def _auto_rescan(self):
+        """Refresh library entries whose last scan exceeds ``auto_rescan``."""
+        threshold = get_conf("auto_rescan")
+        if not threshold or threshold <= 0:
+            return
+        started = time.time()
+        changed = False
+        # snapshot the items: entries are replaced while we iterate
+        for key, old in list(self.entries.items()):
+            stamp = getattr(old, "scanned_at", None)
+            if stamp is None or (started - stamp) < threshold:
+                continue
+            try:
+                # Same path and storage options as before; the result is
+                # stored under the existing key, so the key never drifts.
+                fresh = Project(
+                    old.path, storage_options=old.storage_options, walk=False
+                )
+            except Exception:
+                # an unreachable/changed project must not break library load
+                continue
+            else:
+                self.entries[key] = fresh
+                changed = True
+        if changed and self.auto_save and self.path is not None:
+            self.save()
 
     def clear(self):
         """Clears scanned project objects from JSON file and memory"""
@@ -46,6 +91,8 @@ def add_entry(self, path: str, entry: Project):
     def save(self):
         """Serialise the state of the scanned project objects to file"""
         # don't catch
+        if self.path is None:
+            raise ValueError("Cannot save without .path set")
         data = {k: v.to_dict(compact=False) for k, v in self.entries.items()}
         with fsspec.open(self.path, "w") as f:
             json.dump(data, f)
diff --git a/src/projspec/proj/__init__.py b/src/projspec/proj/__init__.py
index 328cd80..2f535b5 100644
--- a/src/projspec/proj/__init__.py
+++ b/src/projspec/proj/__init__.py
@@ -24,7 +24,7 @@
 from projspec.proj.conda_package import CondaRecipe, RattlerRecipe
 from projspec.proj.conda_project import CondaProject
 from projspec.proj.conda_workspace import CondaWorkspace
-from projspec.proj.data_dir import Data
+from projspec.proj.data_project import DataProject
 from projspec.proj.datapackage import DataPackage, DVCRepo
 from projspec.proj.dataworkflows import (
     Airflow,
@@ -43,6 +43,7 @@
 from projspec.proj.golang import Golang
 from projspec.proj.helm import HelmChart
 from projspec.proj.hf import HuggingFaceRepo
+from projspec.proj.knowledge_catalog import KnowledgeCatalog
 from projspec.proj.ide import JetbrainsIDE, NvidiaAIWorkbench, VSCode
 from projspec.proj.infra import (
     Ansible,
@@ -92,9 +93,9 @@
     "CondaWorkspace",
     "RattlerRecipe",
     # Data
-    "Data",
     "DataPackage",
     "DVCRepo",
+    "DataProject",
     # Data/ML workflows
     "Airflow",
     "Dagster",
@@ -121,6 +122,8 @@
     "HelmChart",
     # HuggingFace
     "HuggingFaceRepo",
+    # Knowledge
+    "KnowledgeCatalog",
     # IDE
     "AIEnabled",
     "BackstageCatalog",
diff --git a/src/projspec/proj/_consolidate.py b/src/projspec/proj/_consolidate.py
new file mode 100644
index 0000000..9ee93f4
--- /dev/null
+++ b/src/projspec/proj/_consolidate.py
@@ -0,0 +1,258 @@
+"""Consolidate sets of related files into logical datasets.
+
+Intake can already recognise some directory-based datasets (hive-partitioned
+parquet, zarr, delta, …) by their characteristic contents.  This module covers
+the complementary case where a directory holds *many individually-named files
+that obviously belong together*, e.g.::
+
+    001.csv 002.csv 003.csv          -> one CSV dataset
+    part-00000.parquet part-00001…   -> one parquet dataset
+    data_2019.json data_2020.json    -> one JSON dataset
+    green.gif red.gif blue.gif       -> one GIF (image) dataset
+
+The output is a list of :class:`FileGroup` objects.  Each group is either a
+single standalone file or a consolidated set, and exposes a ``glob`` (or list of
+members) suitable for handing straight to
+:func:`intake.readers.inspect.inspect_dataset`.
+
+The logic here is deliberately filesystem-agnostic: it operates on
+``(basename, size)`` pairs so it can be unit-tested without any I/O.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+from dataclasses import dataclass, field
+
+# A maximal run of digits anywhere in the stem - the most common way numbered
+# file series differ (001, 00001, 2020, ...).
+_DIGITS = re.compile(r"\d+")
+# Tokens for the "one differing token" heuristic (split on common separators).
+_SEP = re.compile(r"[._\- ]+")
+
+
+@dataclass
+class FileGroup:
+    """A standalone file or a consolidated set of related files.
+
+    Attributes
+    ----------
+    members:
+        Basenames belonging to this group, sorted.
+    ext:
+        Common file extension (lower-case, including the dot), or ``""``.
+    total_size:
+        Sum of the sizes of all members (bytes); ``None`` if unknown.
+    pattern:
+        For consolidated groups, a best-effort glob basename for the members
+        (e.g. ``"*.csv"`` or ``"part-*.parquet"``).  For a single file this is
+        just that file's basename.
+    consolidated:
+        ``True`` when this group represents more than one physical file.
+    """
+
+    members: list[str]
+    ext: str = ""
+    total_size: int | None = None
+    pattern: str = ""
+    consolidated: bool = False
+
+    @property
+    def name(self) -> str:
+        """A short identifying name for the group."""
+        if self.consolidated:
+            return self.pattern
+        return self.members[0]
+
+    def url(self, root: str) -> str | list[str]:
+        """Build the URL, glob or list of URLs (rooted at *root*) for intake."""
+        from fnmatch import fnmatchcase
+
+        root = root.rstrip("/")
+        if not self.consolidated:
+            return f"{root}/{self.members[0]}"
+        # The stored glob is best-effort (separators may have been guessed);
+        # use it only when it matches every member, else list them explicitly.
+        pat = self.pattern
+        if "*" in pat and all(fnmatchcase(m, pat) for m in self.members):
+            return f"{root}/{pat}"
+        return [f"{root}/{m}" for m in self.members]
+
+
+def _split_ext(name: str) -> tuple[str, str]:
+    """Return ``(stem, ext)`` for *name*; *ext* is lower-cased, with its dot.
+
+    Known compressed double extensions (``.csv.gz``, ``.tar.gz``, ...) count
+    as one extension, provided a non-empty stem is left in front of them.
+    """
+    doubles = (".csv.gz", ".json.gz", ".tar.gz", ".tar.bz2", ".tsv.gz")
+    hit = next((d for d in doubles if name.lower().endswith(d)), None)
+    if hit is not None and len(name) > len(hit):
+        return name[: -len(hit)], hit
+    stem, ext = os.path.splitext(name)
+    return stem, ext.lower()
+
+
+def _digit_pattern(stem: str) -> str | None:
+    """Replace every maximal digit run in *stem* with a single ``#``.
+
+    ``part-00001`` -> ``part-#``; ``data2020`` -> ``data#``; ``a1b2`` and
+    ``a3b4`` both give ``a#b#`` and therefore share a key.  Returns ``None``
+    when *stem* contains no digits at all.
+    """
+    # subn also reports how many runs were replaced; zero means "no digits".
+    masked, hits = _DIGITS.subn("#", stem)
+    return masked if hits else None
+
+
+def _glob_from_digit_pattern(pattern: str) -> str:
+    """Convert a ``#``-masked stem (``part-#``) to its glob form (``part-*``)."""
+    return "*".join(pattern.split("#"))
+
+
+def _token_signature(stem: str) -> tuple[tuple[str, ...], int] | None:
+    """Return ``(tokens, token_count)`` for the token heuristic.
+
+    *stem* is split on ``_SEP`` and empty pieces are dropped; ``None`` is
+    returned when nothing is left.  The caller blanks one position at a time
+    to bucket stems that differ in exactly one token (e.g. ``green`` /
+    ``red`` / ``blue``).
+    """
+    tokens = tuple(filter(None, _SEP.split(stem)))
+    if tokens:
+        return tokens, len(tokens)
+    return None
+
+
+def consolidate(
+    files: list[tuple[str, int | None]],
+    min_group: int = 3,
+    min_token_group: int = 2,
+) -> list[FileGroup]:
+    """Group a flat list of files into datasets.
+
+    Parameters
+    ----------
+    files:
+        ``[(basename, size_or_None), ...]`` for the files directly in a
+        directory (not directories, not recursive).
+    min_group:
+        Minimum number of files sharing a digit-masked pattern before they are
+        consolidated.  Below this they are emitted as standalone files.
+    min_token_group:
+        Minimum size for the (weaker) "one differing token" heuristic used for
+        non-numeric series like colour names.
+
+    Returns
+    -------
+    list[FileGroup]
+        One entry per resulting dataset, sorted by name.  Files that match no
+        consolidation rule are returned as singleton, non-consolidated groups.
+    """
+    sizes: dict[str, int | None] = dict(files)
+    remaining = set(sizes)
+    groups: list[FileGroup] = []
+
+    # ── Pass 1: digit-run patterns within each extension ──────────────────
+    # key: (ext, digit_masked_stem) -> [names]; sorted() = reproducible order
+    digit_buckets: dict[tuple[str, str], list[str]] = {}
+    for name in sorted(remaining):
+        stem, ext = _split_ext(name)
+        pat = _digit_pattern(stem)
+        if pat is not None:
+            digit_buckets.setdefault((ext, pat), []).append(name)
+
+    for (ext, pat), members in digit_buckets.items():
+        if len(members) >= min_group:
+            members = sorted(members)
+            remaining.difference_update(members)
+            glob_stem = _glob_from_digit_pattern(pat)
+            groups.append(
+                FileGroup(
+                    members=members,
+                    ext=ext,
+                    total_size=_sum_sizes(members, sizes),
+                    pattern=f"{glob_stem}{ext}",
+                    consolidated=True,
+                )
+            )
+
+    # ── Pass 2: "one differing token" within each extension ───────────────
+    # Group stems that share all tokens but one (same token count).
+    token_buckets: dict[tuple[str, int, int, tuple[str, ...]], list[str]] = {}
+    for name in sorted(remaining):
+        stem, ext = _split_ext(name)
+        sig = _token_signature(stem)
+        if sig is None:
+            continue
+        tokens, ntok = sig
+        # For each position, the key is (ext, ntok, blanked_index, other_tokens)
+        for i in range(ntok):
+            others = tokens[:i] + ("*",) + tokens[i + 1 :]
+            token_buckets.setdefault((ext, ntok, i, others), []).append(name)
+
+    used_in_token_pass: set[str] = set()
+    # Largest first; ties resolve by (sorted, hence stable) insertion order.
+    for (ext, _ntok, _idx, others), members in sorted(
+        token_buckets.items(), key=lambda kv: -len(kv[1])
+    ):
+        members = [m for m in members if m in remaining and m not in used_in_token_pass]
+        if len(members) >= min_token_group:
+            members = sorted(members)
+            used_in_token_pass.update(members)
+            remaining.difference_update(members)
+            # Build a readable glob like "*.gif" / "frame_*_left.png"; the
+            # original separators are not kept, so "_" is assumed.
+            pattern = _normalise_token_glob(others)
+            groups.append(
+                FileGroup(
+                    members=members,
+                    ext=ext,
+                    total_size=_sum_sizes(members, sizes),
+                    pattern=f"{pattern}{ext}",
+                    consolidated=True,
+                )
+            )
+
+    # ── Pass 3: leftovers are standalone files ────────────────────────────
+    for name in sorted(remaining):
+        _, ext = _split_ext(name)
+        groups.append(
+            FileGroup(
+                members=[name],
+                ext=ext,
+                total_size=sizes.get(name),
+                pattern=name,
+                consolidated=False,
+            )
+        )
+
+    return sorted(groups, key=lambda g: g.name)
+
+
+def _normalise_token_glob(tokens: tuple[str, ...]) -> str:
+    """Join glob tokens with ``_`` and squash repeated ``*`` into one.
+
+    ``("*",)``                 -> ``"*"``
+    ``("frame", "*", "left")`` -> ``"frame_*_left"`` (best-effort separator)
+    """
+    # The original separators were dropped during tokenisation, so "_" (the
+    # most common one) stands in for all of them.  The glob is therefore
+    # best-effort: names joined by other separators ("-", ".", space) may
+    # not match it.  A lone blank token simply yields "*".
+    joined = "_".join(tokens)
+    # Squash every run of two or more stars down to a single "*"; equivalent
+    # to replacing "**" with "*" until none is left.
+    squashed = re.sub(r"\*{2,}", "*", joined)
+    return squashed
+
+
+def _sum_sizes(members: list[str], sizes: dict[str, int | None]) -> int | None:
+    """Total bytes of *members*, or ``None`` if any size is unknown."""
+    # A member missing from *sizes* counts as unknown, like a None size.
+    looked_up = [sizes.get(m) for m in members]
+    if any(s is None for s in looked_up):
+        return None
+    # sum() of an empty list is 0, so an empty group has size 0.
+    return sum(looked_up)
diff --git a/src/projspec/proj/base.py b/src/projspec/proj/base.py
index 25d5e7c..1766a51 100644
--- a/src/projspec/proj/base.py
+++ b/src/projspec/proj/base.py
@@ -3,18 +3,19 @@
 import logging
 import os
 import stat
+import time
 from collections.abc import Iterable
 from itertools import chain
 from functools import cached_property
 
 import fsspec
 import fsspec.implementations.local
-import projspec.utils
 import toml
 
 from projspec.config import get_conf
 from projspec.utils import (
     AttrDict,
+    DEFAULT,
     IndentDumper,
     PickleableTomlDecoder,
     camel_to_snake,
@@ -34,6 +35,37 @@ def _fmt_size(n: int) -> str:
         n /= 1024
 
 
+def _humanize_age(ts: float) -> str:
+    """Render a Unix timestamp as a relative "X ago" string.
+
+    e.g. "just now", "5 minutes ago", "3 hours ago", "yesterday",
+    "4 days ago", "1 month ago", "2 months ago", "1 year ago".
+    """
+    # Epoch arithmetic (not naive local datetimes) so that a DST shift
+    # between *ts* and now cannot skew the age.
+    secs = int(time.time() - ts)
+    if secs < 0:
+        # clock skew / future timestamp - treat as just now
+        return "just now"
+    days = secs // 86400
+    if days == 0:
+        if secs < 60:
+            return "just now"
+        if secs < 3600:
+            mins = secs // 60
+            return f"{mins} minute{'s' if mins != 1 else ''} ago"
+        hours = secs // 3600
+        return f"{hours} hour{'s' if hours != 1 else ''} ago"
+    if days == 1:
+        return "yesterday"
+    if days < 30:
+        return f"{days} days ago"
+    if days < 365:
+        months = days // 30
+        return f"{months} month{'s' if months != 1 else ''} ago"
+    return f"{days // 365} year{'s' if days >= 730 else ''} ago"
+
+
 class ParseFailed(ValueError):
     """Exception raised when parsing fails: a directory does not meet the given spec."""
 
@@ -315,6 +347,8 @@ def resolve(
         types = set(camel_to_snake(_) for _ in types or ())
         if types and types - set(registry):
             raise ValueError(f"Unknown types: {set(types) - set(registry)}")
+        # record when this (re)scan happened
+        self.scanned_at = time.time()
         # sorting to ensure consistency
         for name in sorted(registry):
             cls = registry[name]
@@ -410,26 +444,18 @@ def _stats_line(self) -> str:
         # last modified
         lm = self.last_modified
         if lm is not None:
-            import datetime
-
-            age = datetime.datetime.now() - datetime.datetime.fromtimestamp(lm)
-            days = age.days
-            if days == 0:
-                age_str = "today"
-            elif days == 1:
-                age_str = "yesterday"
-            elif days < 30:
-                age_str = f"{days} days ago"
-            elif days < 365:
-                age_str = f"{days // 30} months ago"
-            else:
-                age_str = f"{days // 365} year{'s' if days >= 730 else ''} ago"
+            age_str = _humanize_age(lm)
             by = self.last_modified_by
             if by:
                 parts.append(f"last modified {age_str} by {by}")
             else:
                 parts.append(f"last modified {age_str}")
 
+        # when this project was last scanned
+        scanned_at = getattr(self, "scanned_at", None)
+        if scanned_at is not None:
+            parts.append(f"scanned {_humanize_age(scanned_at)}")
+
         return " " + " · ".join(parts) if parts else ""
 
     def __repr__(self):
@@ -557,17 +583,23 @@ def to_dict(self, compact=True) -> dict:
             is_writable=self.is_writable,
             last_modified=self.last_modified,
             last_modified_by=self.last_modified_by,
+            scanned_at=self.scanned_at,
         )
         if not compact:
             dic["klass"] = "project"
         return dic.to_dict(compact=compact)
 
-    def _repr_html_(self):
-        from projspec.html import dict_to_html
+    def _ipython_display_(self):
+        """Auto-display as the interactive widget when possible.
 
-        # TODO: add tooltips to docs or spec links
-        # TODO: remove redundant information?
-        return dict_to_html(self.to_dict(), title=self.url)
+        Falls back to a plain ``repr`` when ``anywidget`` /
+        ``ipywidgets`` is not available - Jupyter will then use the
+        normal text representation.
+        """
+        from projspec.library import ProjectLibrary
+
+        lib = ProjectLibrary(entries={"memory": self}, library_path=None)
+        lib._ipython_display_()
 
     @staticmethod
     def from_dict(dic):
@@ -583,6 +615,11 @@ def from_dict(dic):
         proj.path = dic["url"]
         proj.storage_options = dic["storage_options"]
         proj.fs, proj.url = fsspec.url_to_fs(proj.path, **proj.storage_options)
+        scanned_at = dic.get("scanned_at")
+        try:
+            proj.scanned_at = float(scanned_at)
+        except (TypeError, ValueError):
+            proj.scanned_at = time.time()
         # Restore cached tree stats so a round-tripped Project never re-walks.
         # Keys default to None if absent (e.g. older serialised data).
         proj.__dict__["_tree_stats"] = {
@@ -640,7 +677,7 @@ def make(self, qname: str, **kwargs):
         art.make(**kwargs)
         return art
 
-    def add_to_library(self, path=None):
+    def add_to_library(self, path=DEFAULT):
         """Add this project to the current session library"""
         # TODO: prevent overwrite?
         from projspec.library import ProjectLibrary
diff --git a/src/projspec/proj/data_dir.py b/src/projspec/proj/data_dir.py
deleted file mode 100644
index c0172c9..0000000
--- a/src/projspec/proj/data_dir.py
+++ /dev/null
@@ -1,679 +0,0 @@
-"""ProjectSpec for bare data directories.
-
-Matches directories whose contents are predominantly data files (by extension or
-by a recognised on-disk layout such as Hive partitioning, Apache Iceberg, Delta
-Lake, or Zarr), with no requirement for any declarative metadata file.
-"""
-
-from __future__ import annotations
-
-import os
-import re
-from posixpath import basename as _basename
-
-from projspec.proj import ProjectSpec, ParseFailed
-from projspec.utils import AttrDict
-
-_EXT_TO_FORMAT: dict[str, tuple[str, str]] = {
-    # Tabular / columnar -------------------------------------------------------
-    ".csv": ("csv", "tabular"),
-    ".tsv": ("tsv", "tabular"),
-    ".psv": ("psv", "tabular"),
-    ".parquet": ("parquet", "tabular"),
-    ".parq": ("parquet", "tabular"),
-    ".pq": ("parquet", "tabular"),
-    ".arrow": ("arrow", "tabular"),
-    ".ipc": ("arrow", "tabular"),
-    ".feather": ("arrow", "tabular"),  # Feather v1/v2 (magic: FEA1 / ARROW1)
-    ".orc": ("orc", "tabular"),
-    ".avro": ("avro", "tabular"),
-    ".xls": ("excel", "tabular"),
-    ".xlsx": ("excel", "tabular"),
-    ".xlsm": ("excel", "tabular"),
-    ".xlsb": ("excel", "tabular"),
-    ".jsonl": ("jsonlines", "tabular"),
-    ".ndjson": ("jsonlines", "tabular"),
-    ".db": ("sqlite", "tabular"),  # DuckDB / SQLite (disambiguated by magic)
-    ".sqlite": ("sqlite", "tabular"),
-    ".sqlitedb": ("sqlite", "tabular"),
-    ".duckdb": ("duckdb", "tabular"),
-    # Array / scientific -------------------------------------------------------
-    ".npy": ("numpy", "array"),
-    ".npz": ("numpy", "array"),
-    ".hdf5": ("hdf5", "array"),
-    ".hdf": ("hdf5", "array"),
-    ".h5": ("hdf5", "array"),
-    ".h4": ("hdf5", "array"),
-    ".he5": ("hdf5", "array"),
-    ".nc": ("netcdf", "array"),
-    ".nc3": ("netcdf", "array"),
-    ".nc4": ("netcdf", "array"),
-    ".mat": ("matlab", "array"),
-    ".fits": ("fits", "array"),
-    ".grib": ("grib", "timeseries"),
-    ".grb": ("grib", "timeseries"),
-    ".grib2": ("grib", "timeseries"),
-    ".grb2": ("grib", "timeseries"),
-    ".asdf": ("asdf", "array"),
-    ".zarr": ("zarr", "array"),
-    # Image / biomedical imaging -----------------------------------------------
-    ".png": ("png", "image"),
-    ".jpg": ("jpeg", "image"),
-    ".jpeg": ("jpeg", "image"),
-    ".tif": ("tiff", "image"),  # also geotiff — ambiguous; image wins
-    ".tiff": ("tiff", "image"),
-    ".cog": ("tiff", "geospatial"),  # Cloud-Optimised GeoTIFF
-    ".bmp": ("bmp", "image"),
-    ".gif": ("gif", "image"),
-    ".webp": ("webp", "image"),
-    ".dcm": ("dicom", "image"),
-    ".dicom": ("dicom", "image"),
-    ".nii": ("nifti", "image"),
-    ".nrrd": ("nrrd", "image"),
-    ".nhdr": ("nrrd", "image"),
-    ".mha": ("metaimage", "image"),
-    ".mhd": ("metaimage", "image"),
-    ".svs": ("svs", "image"),  # Aperio whole-slide image
-    ".ndpi": ("ndpi", "image"),  # Hamamatsu whole-slide image
-    ".scn": ("scn", "image"),  # Leica whole-slide image
-    ".lsm": ("lsm", "image"),  # Zeiss confocal
-    ".exr": ("exr", "image"),  # OpenEXR HDR
-    ".qptiff": ("qptiff", "image"),  # PerkinElmer whole-slide
-    # Geospatial ---------------------------------------------------------------
-    ".shp": ("shapefile", "geospatial"),
-    ".shx": ("shapefile", "geospatial"),
-    ".dbf": ("shapefile", "geospatial"),
-    ".geojson": ("geojson", "geospatial"),
-    ".gpkg": ("geopackage", "geospatial"),
-    ".fgb": ("flatgeobuf", "geospatial"),
-    ".kml": ("kml", "geospatial"),
-    ".pmtiles": ("pmtiles", "geospatial"),
-    # Audio --------------------------------------------------------------------
-    ".wav": ("wav", "timeseries"),
-    ".flac": ("flac", "timeseries"),
-    ".mp3": ("mp3", "timeseries"),
-    ".ogg": ("ogg", "timeseries"),
-    # Video --------------------------------------------------------------------
-    ".mp4": ("mp4", "video"),
-    ".avi": ("avi", "video"),
-    ".mov": ("mov", "video"),
-    ".mkv": ("mkv", "video"),
-    ".webm": ("webm", "video"),
-    # ML model weights ---------------------------------------------------------
-    ".safetensors": ("safetensors", "model"),
-    ".gguf": ("gguf", "model"),
-    ".pt": ("pytorch", "model"),
-    ".pth": ("pytorch", "model"),
-    ".onnx": ("onnx", "model"),
-    ".tfrec": ("tfrecord", "model"),
-    # Archive / bundle ---------------------------------------------------------
-    ".pkl": ("pickle", "archive"),
-    ".bin": ("binary", "archive"),
-}
-
-_DATA_EXTENSIONS: frozenset[str] = frozenset(_EXT_TO_FORMAT)
-
-# Magic-byte signatures (format, modality, offset, bytes_pattern).
-_MAGIC: list[tuple[str, str, int | None, bytes]] = [
-    # Fixed-offset signatures
-    ("dicom", "image", 128, b"DICM"),  # DICOM preamble
-    ("nifti", "image", 344, b"ni1\x00"),  # NIfTI-1
-    ("nifti", "image", 344, b"n+1\x00"),  # NIfTI-1 single file
-    ("duckdb", "tabular", 8, b"DUCK"),
-    ("safetensors", "model", 8, b"{"),  # SafeTensors JSON header
-    ("wav", "timeseries", 8, b"WAVE"),  # RIFF…WAVE
-    # Offset-0 signatures
-    ("parquet", "tabular", 0, b"PAR1"),
-    ("hdf5", "array", 0, b"\x89HDF"),
-    ("netcdf", "array", 0, b"CDF\x01"),  # NetCDF classic
-    ("netcdf", "array", 0, b"CDF\x02"),  # NetCDF-64bit
-    ("orc", "tabular", 0, b"ORC"),
-    ("avro", "tabular", 0, b"Obj\x01"),
-    ("arrow", "tabular", 0, b"ARROW1"),  # IPC stream
-    ("arrow", "tabular", 0, b"FEA1"),  # Feather v1
-    ("numpy", "array", 0, b"\x93NUMPY"),
-    ("matlab", "array", 0, b"MATLAB"),
-    ("fits", "array", 0, b"SIMPLE"),
-    ("grib", "timeseries", 0, b"GRIB"),
-    ("asdf", "array", 0, b"#ASDF"),
-    ("flatgeobuf", "geospatial", 0, b"fgb"),
-    ("gguf", "model", 0, b"GGUF"),
-    ("png", "image", 0, b"\x89PNG"),
-    ("jpeg", "image", 0, b"\xff\xd8\xff"),
-    ("tiff", "image", 0, b"II*\x00"),  # little-endian TIFF
-    ("tiff", "image", 0, b"MM\x00*"),  # big-endian TIFF
-    ("sqlite", "tabular", 0, b"SQLite format"),
-    ("shapefile", "geospatial", 0, b"\x00\x00\x27\x0a"),
-    ("pmtiles", "geospatial", 0, b"PMTiles"),
-]
-
-# Regex that matches Hive-style partition directory names (e.g. "year=2024").
-_HIVE_DIR_RE = re.compile(r"^[^=]+=.+$")
-
-
-def _read_schema(path: str, fmt: str, fs) -> dict | list:
-    """Return a best-effort schema dict/list for *path*, or {} on any failure."""
-    try:
-        if fmt == "parquet":
-            try:
-                import pyarrow.parquet as pq
-
-                with fs.open(path, "rb") as fh:
-                    pf = pq.ParquetFile(fh)
-                    return {field.name: str(field.type) for field in pf.schema_arrow}
-            except ImportError:
-                pass
-
-        elif fmt == "arrow":
-            try:
-                import pyarrow.ipc as ipc
-
-                with fs.open(path, "rb") as fh:
-                    reader = ipc.open_file(fh)
-                    return {field.name: str(field.type) for field in reader.schema}
-            except ImportError:
-                pass
-
-        elif fmt == "hdf5":
-            try:
-                import h5py
-
-                with fs.open(path, "rb") as fh:
-                    with h5py.File(fh, "r") as ds:
-                        return {
-                            "variables": list(ds.keys()),
-                            "attrs": dict(ds.attrs),
-                        }
-            except ImportError:
-                pass
-
-        elif fmt == "netcdf":
-            try:
-                import netCDF4 as nc  # type: ignore[import]
-
-                with fs.open(path, "rb") as fh:
-                    ds = nc.Dataset("in-mem", memory=fh.read())
-                    return {
-                        "variables": list(ds.variables.keys()),
-                        "dims": {k: len(v) for k, v in ds.dimensions.items()},
-                    }
-            except ImportError:
-                try:
-                    import xarray as xr  # type: ignore[import]
-
-                    with fs.open(path, "rb") as fh:
-                        ds = xr.open_dataset(fh, engine="scipy")
-                        return {
-                            "variables": list(ds.data_vars),
-                            "dims": dict(ds.dims),
-                        }
-                except ImportError:
-                    pass
-
-        elif fmt in ("jpeg", "png", "bmp", "gif", "webp", "tiff"):
-            try:
-                from PIL import Image  # type: ignore[import]
-
-                with fs.open(path, "rb") as fh:
-                    img = Image.open(fh)
-                    img.load()
-                    mode = img.mode
-                    channels = len(img.getbands())
-                    return {
-                        "width": img.width,
-                        "height": img.height,
-                        "channels": channels,
-                        "mode": mode,
-                    }
-            except ImportError:
-                pass
-
-        elif fmt in ("wav", "flac", "mp3", "ogg"):
-            try:
-                import soundfile as sf  # type: ignore[import]
-
-                with fs.open(path, "rb") as fh:
-                    info = sf.info(fh)
-                    return {
-                        "sample_rate": info.samplerate,
-                        "channels": info.channels,
-                        "frames": info.frames,
-                    }
-            except ImportError:
-                pass
-
-    except Exception:  # — never let schema extraction abort parsing
-        pass
-
-    return {}
-
-
-def _filelist_dirs(filelist: list[dict]) -> list[dict]:
-    """Return only directory entries from a filelist."""
-    return [e for e in filelist if e.get("type", "") == "directory"]
-
-
-def _filelist_files(filelist: list[dict]) -> list[dict]:
-    """Return only file entries from a filelist."""
-    return [e for e in filelist if e.get("type", "") != "directory"]
-
-
-def _fmt_from_path(path: str) -> tuple[str, str] | None:
-    """Return (format, modality) for *path* by extension, or None if unknown."""
-    ext = os.path.splitext(path)[1].lower()
-    return _EXT_TO_FORMAT.get(ext)
-
-
-def _identify_by_magic(path: str, fs) -> tuple[str, str] | None:
-    """Return (format, modality) by probing *path*'s header bytes, or None.
-
-    Reads up to 1 KiB.  Checks fixed-offset patterns first (longer offsets
-    first, to avoid short patterns shadowing longer ones), then scans for
-    anywhere-patterns via re.search.
-    """
-    try:
-        with fs.open(path, "rb") as fh:
-            head = fh.read(1024)
-    except Exception:
-        return None
-
-    for fmt, modality, offset, pattern in _MAGIC:
-        if offset is None:
-            if re.search(re.escape(pattern), head):
-                return fmt, modality
-        else:
-            if head[offset : offset + len(pattern)] == pattern:
-                return fmt, modality
-    return None
-
-
-# Token that may vary across files in a series: digits, dashes, underscores, dots.
-# Alphabetic variation (e.g. "users" vs "orders") disqualifies collation.
-_SERIES_VAR_RE = re.compile(r"^[\d\-_.]+$")
-
-
-def _common_affix(stems: list[str]) -> tuple[str, str]:
-    """Return the longest (prefix, suffix) shared by every stem in *stems*."""
-    if not stems:
-        return "", ""
-    prefix = os.path.commonprefix(stems)
-    # Reverse each stem to find common suffix via commonprefix trick
-    rev = [s[::-1] for s in stems]
-    suffix = os.path.commonprefix(rev)[::-1]
-    # Ensure prefix and suffix don't overlap (can happen with a single-char stem)
-    if len(prefix) + len(suffix) > min(len(s) for s in stems):
-        suffix = ""
-    return prefix, suffix
-
-
-def _group_by_naming_series(entries: list[dict]) -> list[list[dict]]:
-    """Partition *entries* (same-format file list) into naming-series groups.
-
-    Two or more files belong to the same series when their basenames (stems)
-    differ only in a contiguous segment that consists solely of digits, dashes,
-    underscores, or dots — i.e. a numeric counter or a date component.
-
-    A single file is always its own series (trivially consistent).
-
-    Returns a list of groups, each group being a non-empty list of entries that
-    share a common naming pattern.
-    """
-    if len(entries) <= 1:
-        return [entries] if entries else []
-
-    # Compute stems once
-    stems = [os.path.splitext(_basename(e["name"]))[0] for e in entries]
-
-    prefix, suffix = _common_affix(stems)
-    plen, slen = len(prefix), len(suffix)
-
-    # Extract the variable middle segment for each stem
-    variables = []
-    for stem in stems:
-        mid = stem[plen : len(stem) - slen if slen else len(stem)]
-        variables.append(mid)
-
-    # All files form one series if:
-    #   1. There is a non-trivial shared prefix OR suffix (at least 1 char), AND
-    #   2. Every variable segment is numeric/date-like (no alphabetic chars)
-    has_affix = plen >= 1 or slen >= 1
-    all_numeric_var = all(_SERIES_VAR_RE.match(v) or v == "" for v in variables)
-
-    if has_affix and all_numeric_var:
-        return [entries]
-
-    # Otherwise fall back: each file is its own "series" (separate resource)
-    return [[e] for e in entries]
-
-
-# Notably absent: datapackage.json, catalog.yaml/yml, .dvc/ — those belong
-# to projspec.proj.datapackage and are treated as compatible companions.
-_NON_DATA_SENTINELS: frozenset[str] = frozenset(
-    {
-        # Python
-        "pyproject.toml",
-        "setup.py",
-        "setup.cfg",
-        "hatch.toml",
-        # Rust
-        "Cargo.toml",
-        # JavaScript / Node
-        "package.json",
-        # Go
-        "go.mod",
-        # Container / infra
-        "Dockerfile",
-        "docker-compose.yml",
-        "docker-compose.yaml",
-        # Helm
-        "Chart.yaml",
-        # Ruby / Java / .NET
-        "Gemfile",
-        "pom.xml",
-        "build.gradle",
-        "*.csproj",
-        # R
-        "DESCRIPTION",
-        # Conda
-        "environment.yml",
-        "environment.yaml",
-        "meta.yaml",
-        # Pixi
-        "pixi.toml",
-        # Mkdocs / Sphinx / RTD
-        "mkdocs.yml",
-        "mkdocs.yaml",
-        "conf.py",
-        ".readthedocs.yaml",
-        ".readthedocs.yml",
-        # Scripts / notebooks that imply code-first dirs
-        "Makefile",
-    }
-)
-
-
-class Data(ProjectSpec):
-    """A directory whose primary contents are data files.
-
-    Matches on any of:
-    - At least one file with an unambiguous data extension (CSV, Parquet, Arrow,
-      HDF5, images, audio, etc.) — without requiring a metadata sidecar.
-    - A recognised directory layout: Hive partitioning (`key=value/` subdirs),
-      Apache Iceberg (`metadata/` directory), Delta Lake (`_delta_log/`), or
-      a Zarr store (`.zattrs` / `.zgroup` at the root).
-
-    If no non-datapackage project signals are present in the directory the spec
-    parses unconditionally.  If sentinel files that indicate another project type
-    (`pyproject.toml`, `Cargo.toml`, `package.json`, …) are found, parsing
-    succeeds only when the majority of bytes in the root file listing belong to
-    recognised data files; otherwise `ParseFailed` is raised so that the
-    directory is not double-counted as both a code project and a data project.
-    """
-
-    icon = "🗄️"
-
-    def match(self) -> bool:
-        # Fast path: structural layout signals (no file-content inspection needed)
-        if self._detect_layout():
-            return True
-        # Slow path: any top-level file with an unambiguous data extension
-        return any(
-            os.path.splitext(name)[1].lower() in _DATA_EXTENSIONS
-            for name in self.proj.basenames
-        )
-
-    def parse(self) -> None:
-        if self._has_non_data_sentinels():
-            if not self._data_bytes_majority():
-                raise ParseFailed(
-                    "Non-data project sentinels found and data files are not "
-                    "the majority of bytes — skipping Data spec"
-                )
-
-        layout = self._detect_layout()
-        resources: list
-
-        if layout in ("hive", "iceberg", "delta"):
-            resources = self._parse_layout_dirs(layout)
-            # Delta/Iceberg also commonly store data files at the root level
-            # alongside the log/metadata directory; collect those too.
-            if layout in ("iceberg", "delta"):
-                root_resources = self._parse_flat()
-                resources = resources + root_resources
-        elif layout in ("zarr_store", "tiledarray"):
-            resources = [self._parse_zarr_root()]
-        else:
-            resources = self._parse_flat()
-
-        if not resources:
-            raise ParseFailed("No recognisable data files found")
-
-        if len(resources) == 1:
-            self._contents["data_resource"] = resources[0]
-        else:
-            self._contents["data_resource"] = AttrDict(
-                {_safe_key(r.path): r for r in resources}
-            )
-
-    def _has_non_data_sentinels(self) -> bool:
-        """Return True if any non-datapackage project sentinel is present."""
-        basenames = self.proj.basenames
-        return any(name in _NON_DATA_SENTINELS for name in basenames)
-
-    def _data_bytes_majority(self) -> bool:
-        """Return True if data files account for >50 % of root-listing bytes.
-
-        Files with unknown / zero size are excluded from both totals so they
-        do not unfairly skew the ratio.
-        """
-        total_bytes = 0
-        data_bytes = 0
-        for entry in self.proj.filelist:
-            size = entry.get("size") or 0
-            if size <= 0:
-                continue
-            total_bytes += size
-            ext = os.path.splitext(entry["name"].rsplit("/", 1)[-1])[1].lower()
-            if ext in _DATA_EXTENSIONS:
-                data_bytes += size
-        if total_bytes == 0:
-            return False
-        return data_bytes > total_bytes / 2
-
-    def _detect_layout(self) -> str:
-        """Return a layout string, or '' if none of the known layouts match.
-
-        Uses the `contains` sentinel approach from intake: certain well-known
-        files/directories at the root identify a directory as a logical dataset.
-        """
-        basenames = self.proj.basenames
-        # Zarr store: .zattrs, .zgroup, or zarr.json at the root
-        # (zarr.json is the Zarr v3 sentinel; .zattrs/.zgroup are v2)
-        if any(s in basenames for s in (".zattrs", ".zgroup", "zarr.json")):
-            return "zarr_store"
-        dir_names = {_basename(e["name"]) for e in _filelist_dirs(self.proj.filelist)}
-        # Delta Lake
-        if "_delta_log" in dir_names:
-            return "delta"
-        # TileDB array directory
-        if "__meta" in dir_names and "__schema" in dir_names:
-            return "tiledarray"
-        # Apache Iceberg: metadata/ directory present
-        if "metadata" in dir_names:
-            return "iceberg"
-        # Partitioned Parquet: _metadata sentinel file at root (written by Spark/Dask)
-        if "_metadata" in basenames:
-            return "iceberg"
-        # Hive: any top-level subdirectory whose name matches key=value
-        if any(_HIVE_DIR_RE.match(d) for d in dir_names):
-            return "hive"
-        return ""
-
-    def _resource_from_entries(
-        self, entries: list[dict], fmt: str, modality: str, layout: str
-    ):
-        """Build a DataResource from a list of same-format file entries.
-
-        The `path` field is set to:
-
-        - Single file: the bare basename, e.g. `"data.csv"`.
-        - Multi-file series: a glob pattern, e.g. `"part*.csv"`, built from
-          the shared prefix/suffix of the basenames.
-        """
-        from projspec.content.data import DataResource
-
-        full_paths = [e["name"] for e in entries]
-        total_size = sum(e.get("size", 0) or 0 for e in entries)
-        sample_path = full_paths[0] if full_paths else ""
-        schema = _read_schema(sample_path, fmt, self.proj.fs) if sample_path else {}
-
-        ext = os.path.splitext(_basename(full_paths[0]))[1] if full_paths else ""
-
-        if len(entries) == 1:
-            path = _basename(full_paths[0]) or fmt
-        else:
-            stems = [os.path.splitext(_basename(p))[0] for p in full_paths]
-            prefix, suffix = _common_affix(stems)
-            stem_pattern = (prefix.rstrip("-_.") or fmt) + "*" + suffix
-            path = stem_pattern + ext
-
-        return DataResource(
-            proj=self.proj,
-            path=path,
-            format=fmt,
-            modality=modality,
-            layout=layout,
-            file_count=len(entries),
-            total_size=total_size,
-            schema=schema,
-            sample_path=sample_path,
-        )
-
-    def _parse_flat(self) -> list:
-        """Group top-level files by format and naming series.
-
-        Files of the same format are only collated into a single DataResource
-        when they share a consistent naming schema — i.e. their stems differ
-        only in a numeric or date-like segment (e.g. `part0.csv`,
-        `part1.csv` or `2024-02.tiff`, `2024-03.tiff`).  Files whose
-        stems vary in alphabetic content (e.g. `users.csv`, `orders.csv`)
-        each become their own DataResource.
-        """
-        # First bucket by (fmt, modality)
-        fmt_groups: dict[tuple[str, str], list[dict]] = {}
-        for entry in _filelist_files(self.proj.filelist):
-            fmt_info = _fmt_from_path(entry["name"])
-            if fmt_info is None:
-                continue
-            fmt_groups.setdefault(fmt_info, []).append(entry)
-
-        resources = []
-        for (fmt, modality), entries in fmt_groups.items():
-            # Split each format-group into naming series
-            for series in _group_by_naming_series(entries):
-                resources.append(
-                    self._resource_from_entries(series, fmt, modality, "flat")
-                )
-        return resources
-
-    def _parse_layout_dirs(self, layout: str) -> list:
-        """One DataResource per top-level subdirectory (partition / table root).
-
-        Within each subdirectory the dominant format is determined, then files
-        are checked for a consistent naming series before collating.
-        """
-        dir_entries = _filelist_dirs(self.proj.filelist)
-        resources = []
-        for dir_entry in dir_entries:
-            dir_path = dir_entry["name"]
-            dir_name = _basename(dir_path)
-            # Skip hidden/internal dirs for iceberg/delta
-            if layout in ("iceberg", "delta") and dir_name.startswith(
-                ("metadata", "_delta_log", "_")
-            ):
-                continue
-            # Enumerate files one level inside this subdirectory
-            try:
-                sub_filelist = self.proj.fs.ls(dir_path, detail=True)
-            except Exception:
-                continue
-
-            sub_files = _filelist_files(sub_filelist)
-            # Determine dominant (fmt, modality) by file count
-            fmt_counts: dict[tuple[str, str], int] = {}
-            for e in sub_files:
-                fmt_info = _fmt_from_path(e["name"])
-                if fmt_info:
-                    fmt_counts[fmt_info] = fmt_counts.get(fmt_info, 0) + 1
-            if not fmt_counts:
-                continue
-            dominant = max(fmt_counts, key=lambda k: fmt_counts[k])
-            dominant_fmt, dominant_modality = dominant
-            dominant_files = [
-                e for e in sub_files if _fmt_from_path(e["name"]) == dominant
-            ]
-            resource = self._resource_from_entries(
-                dominant_files, dominant_fmt, dominant_modality, layout
-            )
-            # Override path with the directory basename + trailing slash
-            # (partition dirs are already logically grouped by the directory)
-            resource.path = dir_name + "/"
-            resources.append(resource)
-        return resources
-
-    def _parse_zarr_root(self):
-        """Describe the whole directory as a single array-store resource.
-
-        Used for Zarr stores and TileDB arrays — both are directory-as-dataset
-        layouts with no individual data files at the root.
-        """
-        from projspec.content.data import DataResource
-
-        url = self.proj.url
-        layout = self._detect_layout()
-        # TileDB directories are not Zarr; distinguish the format accordingly
-        if layout == "tiledarray":
-            fmt, modality = "tiledb", "array"
-            schema: dict | list = {}
-        else:
-            fmt, modality = "zarr", "array"
-            schema = {}
-            try:
-                import zarr  # type: ignore[import]
-
-                store = zarr.open(url, mode="r")
-                schema = {
-                    "arrays": list(store.array_keys()),
-                    "groups": list(store.group_keys()),
-                    "attrs": dict(store.attrs),
-                }
-            except (ImportError, Exception):
-                pass
-
-        total_size = sum(
-            e.get("size", 0) or 0 for e in _filelist_files(self.proj.filelist)
-        )
-        return DataResource(
-            proj=self.proj,
-            path=(_basename(url) or fmt) + "/",
-            format=fmt,
-            modality=modality,
-            layout=layout,
-            file_count=len(_filelist_files(self.proj.filelist)),
-            total_size=total_size,
-            schema=schema,
-            sample_path="",
-        )
-
-
-# ---------------------------------------------------------------------------
-# Utilities
-# ---------------------------------------------------------------------------
-
-
-def _safe_key(name: str) -> str:
-    """Convert an arbitrary name to a valid Python identifier for AttrDict keys."""
-    key = re.sub(r"[^0-9a-zA-Z_]", "_", name)
-    if key and key[0].isdigit():
-        key = "_" + key
-    return key or "_unnamed"
diff --git a/src/projspec/proj/data_project.py b/src/projspec/proj/data_project.py
new file mode 100644
index 0000000..d930b2f
--- /dev/null
+++ b/src/projspec/proj/data_project.py
@@ -0,0 +1,434 @@
+"""The :class:`DataProject` project type.
+
+A *data project* is a directory that is wholly or substantially made up of
+data files (as opposed to source code, docs or config).  Examples:
+
+* a directory of CSV/parquet/JSON files exported from a database,
+* a folder of images or arrays,
+* a code project that *also* ships a significant amount of bundled data.
+
+Detection policy
+----------------
+Scanning data is comparatively expensive (intake reads magic bytes / samples),
+so we only do it when the data is *worth* describing.  Data is considered
+significant when **any** of the following holds:
+
+* the candidate data files make up at least ``data_min_fraction`` of the
+  project's total bytes, **and** their combined size is at least
+  ``data_min_total_size`` (guards against a project of tiny files);
+* at least one individual data file is at least ``data_min_file_size``
+  (a single big file is always worth describing);
+* the directory matched no other project type at all (a bare data dump), as
+  long as the data clears ``data_min_total_size``.
+
+Consolidation
+-------------
+Before handing files to intake, obviously-related files are grouped into a
+single dataset (see :mod:`projspec.proj._consolidate`):
+
+* numbered series – ``001.csv``, ``002.csv`` → ``*.csv``
+* spark/dask parts – ``part-00000.parquet`` … → ``part-*.parquet``
+* token series – ``green.gif``, ``red.gif`` → ``*.gif``
+
+Intake's own directory-dataset recognition (hive parquet, zarr, delta, …) is
+preserved: such directories are inspected as a whole rather than file-by-file.
+
+Per-dataset significance
+------------------------
+Just as the whole directory must clear the significance bar above, the
+individual datasets within a data project are filtered too: a dataset whose
+size is less than ``data_min_fraction`` of the largest dataset is treated as
+incidental and dropped (see :meth:`DataProject._filter_small_datasets`).  This
+mirrors the project-level fraction test so that a project dominated by one big
+dataset doesn't also report a handful of tiny, unrelated ones.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from projspec.config import get_conf
+from projspec.proj.base import ProjectSpec, ParseFailed
+from projspec.proj._consolidate import consolidate, FileGroup
+from projspec.utils import AttrDict
+
+logger = logging.getLogger("projspec.data_project")
+
+# Extensions that are *not* data: source code, build/config, docs.  Anything
+# else (or no extension) is a candidate data file.  Kept conservative on
+# purpose - intake makes the final call on whether something is real data.
+_NON_DATA_EXT = {
+    # python / compiled
+    ".py",
+    ".pyc",
+    ".pyi",
+    ".pyx",
+    ".pxd",
+    ".so",
+    ".pyd",
+    ".ipynb",
+    # other languages
+    ".c",
+    ".h",
+    ".cpp",
+    ".hpp",
+    ".cc",
+    ".rs",
+    ".go",
+    ".java",
+    ".kt",
+    ".scala",
+    ".js",
+    ".jsx",
+    ".ts",
+    ".tsx",
+    ".rb",
+    ".php",
+    ".swift",
+    ".m",
+    ".sh",
+    ".bash",
+    ".lua",
+    ".pl",
+    ".r",
+    ".jl",
+    # config / build / project metadata
+    ".toml",
+    ".cfg",
+    ".ini",
+    ".lock",
+    ".mk",
+    ".cmake",
+    ".gradle",
+    ".bazel",
+    ".dockerfile",
+    ".env",
+    ".editorconfig",
+    ".gitignore",
+    ".gitattributes",
+    # docs / web
+    ".md",
+    ".rst",
+    ".txt",
+    ".html",
+    ".htm",
+    ".css",
+    ".scss",
+    ".tex",
+    # NOTE: yaml/json are deliberately NOT listed - they are often config but
+    # also data; they stay candidates and count only when the thresholds pass.
+}
+
+# Directory-based dataset markers intake understands; if present we inspect the
+# whole directory rather than enumerating files.
+_DIR_DATASET_MARKERS = (
+    "_metadata",
+    "_common_metadata",
+    "_delta_log",
+    ".zgroup",
+    ".zarray",
+    "zarr.json",
+    "_latest.manifest",
+)
+
+
+class DataProject(ProjectSpec):
+    """A project that is wholly or substantially composed of data files.
+
+    Produces one :class:`projspec.content.data.Dataset` content object per
+    consolidated dataset found, populated from
+    :func:`intake.readers.inspect.inspect_dataset` where intake is available.
+    """
+
+    icon = "🗃️"
+    spec_doc = (
+        "https://intake.readthedocs.io/en/latest/api2.html"
+        "#intake.readers.inspect.inspect_dataset"
+    )
+
+    # ── helpers ───────────────────────────────────────────────────────────
+    @staticmethod
+    def _is_data_ext(name: str) -> bool:
+        """Return True when basename *name* is a candidate data file.
+
+        Dotfiles and extension-less names are rejected, gzipped CSV/JSON/TSV
+        is accepted, and otherwise the last suffix must not be non-data.
+        """
+        folded = name.lower()
+        if folded.startswith(".") or "." not in folded:
+            return False
+        if folded.endswith((".csv.gz", ".json.gz", ".tsv.gz")):
+            return True
+        return ("." + folded.rpartition(".")[2]) not in _NON_DATA_EXT
+
+    def _candidate_files(self) -> list[tuple[str, int | None]]:
+        """Collect ``(basename, size)`` for each data-like root-listing file.
+
+        Directory entries are skipped; ``size`` is ``None`` if not reported.
+        """
+        # Lazy pipeline: drop directories, strip to basenames, then filter.
+        files = (e for e in self.proj.filelist if e.get("type") != "directory")
+        named = ((e["name"].rsplit("/", 1)[-1], e.get("size")) for e in files)
+        is_data = self._is_data_ext
+        return [pair for pair in named if is_data(pair[0])]
+
+    def _has_dir_dataset(self) -> bool:
+        """Whether the root holds any directory-dataset marker (hive, zarr…)."""
+        return not set(_DIR_DATASET_MARKERS).isdisjoint(self.proj.basenames)
+
+    # ── match ─────────────────────────────────────────────────────────────
+    def match(self) -> bool:
+        """Report whether the root has a dir-dataset marker or any data file.
+
+        Only the directory listing is consulted - file contents are never
+        read - so this stays cheap.  Whether the data is *significant*
+        (size/fraction) is decided later, in :meth:`parse`.
+        """
+        # a marker short-circuits: the file scan is skipped in that case
+        return self._has_dir_dataset() or bool(self._candidate_files())
+
+    # ── significance policy ────────────────────────────────────────────────
+    def _other_type_matches(self) -> bool:
+        """Report whether a standalone spec other than this one matches.
+
+        Every registry class is instantiated against ``self.proj``, skipping
+        the ``"data_project"`` entry and ``ProjectExtra`` subclasses (licences,
+        CI, intake catalogs, …): those are cross-cutting add-ons, not standalone
+        project types, so they must not suppress a data project.  Construction
+        runs the spec's ``match()`` and raises when it does not match.
+
+        ``parse`` runs in registry order, so ``self.proj.specs`` is still
+        incomplete here; hence the re-match instead of a lookup.
+        """
+        from projspec.proj.base import registry, ProjectExtra
+
+        others = (
+            (key, spec)
+            for key, spec in registry.items()
+            if key != "data_project" and not issubclass(spec, ProjectExtra)
+        )
+        for key, spec in others:
+            try:
+                spec(self.proj)
+            except Exception:  # any failure to construct counts as "no match"
+                continue
+            logger.debug("DataProject deferring to %s for %s", key, self.proj.url)
+            return True
+        return False
+
+    def _is_significant(self, data_bytes: int, max_file: int) -> bool:
+        """Decide whether the candidate data makes this a data project.
+
+        ``data_bytes`` is the summed size of the candidate files and ``max_file``
+        the largest single one.  The first of three rules to pass decides.
+        """
+        file_floor = get_conf("data_min_file_size")
+        total_floor = get_conf("data_min_total_size")
+        frac_floor = get_conf("data_min_fraction")
+        play_floor = get_conf("data_min_play_size")
+
+        # 1. a single big file is always worth describing
+        if max_file >= file_floor:
+            return True
+        # 2. data dominates the project by byte fraction and is not trivially
+        #    small; with no known project size, data_bytes stands in for it
+        project_bytes = self.proj.total_size or data_bytes
+        share_ok = bool(project_bytes) and data_bytes / project_bytes >= frac_floor
+        if share_ok and data_bytes >= total_floor:
+            return True
+        # 3. otherwise any more-than-play data counts, provided no other
+        #    project type claims the directory (a lower bar than rule 2's)
+        return data_bytes >= play_floor and not self._other_type_matches()
+
+    def _filter_small_datasets(self, datasets: list) -> list:
+        """Discard datasets that are tiny next to the largest one.
+
+        ``datasets`` is a list of ``(name, Dataset)`` pairs, the form used
+        while :meth:`parse` assembles its output.  A pair is kept when its
+        dataset's ``total_size`` is at least ``data_min_fraction`` of the
+        largest ``total_size`` in the list - the per-dataset counterpart of
+        the directory-level test in :meth:`_is_significant`; anything smaller
+        is treated as incidental.
+
+        The input list is returned unchanged when no fractions can be
+        reasoned about (fewer than two datasets, a missing or ``None`` size,
+        or a largest size that is not positive) and when the threshold would
+        exclude every dataset (e.g. ``data_min_fraction`` > 1).
+        """
+        if len(datasets) < 2:
+            return datasets
+        byte_counts = []
+        for _, dataset in datasets:
+            nbytes = getattr(dataset, "total_size", None)
+            if nbytes is None:
+                return datasets
+            byte_counts.append(nbytes)
+        biggest = max(byte_counts)
+        if biggest <= 0:
+            return datasets
+        threshold = get_conf("data_min_fraction")
+        survivors = [
+            pair
+            for pair, nbytes in zip(datasets, byte_counts)
+            if nbytes / biggest >= threshold
+        ]
+        return survivors or datasets
+
+    # ── parse ──────────────────────────────────────────────────────────────
+    def parse(self) -> None:
+        """Set ``self._contents`` to a ``dataset`` mapping of name -> Dataset.
+
+        Raises :class:`ParseFailed` if the data is not significant, or if no
+        dataset with an identified datatype remains after filtering.
+        """
+        candidates = self._candidate_files()
+        has_dir_dataset = self._has_dir_dataset()
+        data_bytes = sum(s or 0 for _, s in candidates)
+        max_file = max((s or 0 for _, s in candidates), default=0)
+
+        if not has_dir_dataset and not self._is_significant(data_bytes, max_file):
+            raise ParseFailed("Data present but not a significant data project")
+
+        groups: list[FileGroup]
+        if has_dir_dataset:
+            # Let intake describe the whole directory as one dataset.
+            name = self.proj.url.rstrip("/").rsplit("/", 1)[-1] or "dataset"
+            groups = [
+                FileGroup(
+                    members=[],
+                    total_size=self.proj.total_size,
+                    pattern=name,
+                    consolidated=True,
+                )
+            ]
+            dir_dataset = True
+        else:
+            min_group = get_conf("data_consolidate_min_group")
+            groups = consolidate(candidates, min_group=min_group)
+            dir_dataset = False
+
+        if len(groups) > get_conf("data_inspect_max_datasets"):
+            logger.debug(
+                "Too many datasets (%d) in %s; describing without intake",
+                len(groups),
+                self.proj.url,
+            )
+            # NOTE(review): _describe_without_intake sets datatype=None, so the
+            # filter below drops all of these -> ParseFailed. Confirm intended.
+            described = [self._describe_without_intake(g) for g in groups]
+        else:
+            described = [self._describe(g, dir_dataset=dir_dataset) for g in groups]
+
+        # Keep only the (name, Dataset) pairs that were assigned a datatype.
+        described = [(name, ds) for name, ds in described if ds.datatype is not None]
+        # Then drop datasets that are a small fraction of the largest one.
+        described = self._filter_small_datasets(described)
+        if not described:
+            raise ParseFailed("No datasets with an identified datatype found")
+
+        # Datasets are keyed by their (unique) name, so it is not also a field.
+        datasets = AttrDict()
+        for name, ds in described:
+            key = name
+            # guard against the (rare) case of duplicate names
+            n = 2
+            while key in datasets:
+                key = f"{name}#{n}"
+                n += 1
+            datasets[key] = ds
+        self._contents = AttrDict(dataset=datasets)
+
+    # ── dataset description ─────────────────────────────────────────────────
+    def _root_url(self) -> str:
+        """Return the project root URL with its protocol prefix restored.
+
+        ``self.proj.url`` has the protocol stripped (e.g. ``bucket/key`` for
+        ``s3://bucket/key``); intake needs it to pick the right filesystem,
+        and dataset URLs are built from the qualified form.
+        """
+        fs, stripped = self.proj.fs, self.proj.url
+        return fs.unstrip_protocol(stripped)
+
+    def _dataset_url(self, group: FileGroup, dir_dataset: bool):
+        """URL to inspect: the root for a dir-dataset, else the group's own."""
+        root = self._root_url()
+        return root if dir_dataset else group.url(root)
+
+    def _describe_without_intake(self, group: FileGroup):
+        """Describe *group* from filename info alone (no I/O, no datatype).
+
+        Returns a ``(name, Dataset)`` pair; the name keys ``contents.dataset``.
+        """
+        from projspec.content.data import Dataset
+
+        bare = Dataset(
+            proj=self.proj,
+            url=group.url(self._root_url()),
+            datatype=None,
+            structure=[],
+            schema={},
+            n_files=len(group.members) or 1,
+            total_size=group.total_size,
+            metadata={},
+        )
+        return group.name, bare
+
+    def _describe(self, group: FileGroup, dir_dataset: bool = False):
+        """Return ``(name, Dataset)`` for one file-group, using intake if available."""
+        from projspec.content.data import Dataset
+
+        url = self._dataset_url(group, dir_dataset)
+        info: dict | None = None
+        try:
+            from intake.readers.inspect import inspect_dataset
+
+            # storage_options keep remote access working.  NOTE(review): no size
+            # guard or timeout is passed here; confirm intake applies its own.
+            info = inspect_dataset(
+                url,
+                storage_options=self.proj.storage_options or None,
+            )
+        except ImportError:
+            logger.debug("intake not installed; describing %s by name only", url)
+        except Exception as exc:  # never let a bad file abort the whole parse
+            logger.debug("inspect_dataset failed for %s: %s", url, exc)
+        # intake missing, failed, or returned nothing: fall back to filename info
+        if not info:
+            return self._describe_without_intake(group)
+
+        n_files = info.get("n_files") or (len(group.members) or 1)
+        total = info.get("file_size_bytes")
+        if total is None:
+            total = group.total_size
+
+        meta = {
+            k: info[k]
+            for k in (
+                "shape",
+                "npartitions",
+                "reader_used",
+                "description",
+                "html_repr",
+                "thumbnail",
+            )
+            if info.get(k) is not None
+        }
+        # report which readers intake thinks can load this, if any
+        readers = info.get("readers") or {}
+        if readers:
+            meta["readers"] = sorted(readers)
+        # a set is sorted for a stable order; any other iterable is listed as-is
+        structure = info.get("structure") or set()
+        name = group.pattern if dir_dataset else group.name
+        return name, Dataset(
+            proj=self.proj,
+            url=url,
+            datatype=info.get("detected_type"),
+            structure=sorted(structure)
+            if isinstance(structure, set)
+            else list(structure),
+            schema=info.get("datashape") or {},
+            n_files=n_files,
+            total_size=total,
+            metadata=meta,
+        )
diff --git a/src/projspec/proj/knowledge_catalog.py b/src/projspec/proj/knowledge_catalog.py
new file mode 100644
index 0000000..bf31fca
--- /dev/null
+++ b/src/projspec/proj/knowledge_catalog.py
@@ -0,0 +1,208 @@
+"""The :class:`KnowledgeCatalog` project spec.
+
+Detects an *Open Knowledge Format* (OKF) bundle: a directory tree of markdown
+files with YAML frontmatter, where every non-reserved ``.md`` file is a
+"concept" carrying at least a ``type`` field.  Two filenames are reserved at
+any level: ``index.md`` (directory listing) and ``log.md`` (update history).
+
+See https://github.com/GoogleCloudPlatform/knowledge-catalog/blob/main/okf/SPEC.md
+"""
+
+from __future__ import annotations
+
+import os
+from io import StringIO
+
+from projspec.proj import ProjectSpec
+from projspec.proj.base import ParseFailed
+from projspec.utils import AttrDict
+
+# Filenames with reserved meaning that are never concept documents (§3.1).
+_RESERVED = {"index.md", "log.md"}
+
+
+def _split_frontmatter(text: str | bytes) -> dict | None:
+    """Return the YAML frontmatter mapping at the start of a markdown document.
+
+    The block must open with a line holding only ``---`` (blank lines may
+    precede it) and close with another such line.  CRLF line endings and a
+    closing fence that is the file's last line, without a trailing newline,
+    are accepted.  ``bytes`` input is decoded as UTF-8.  Returns ``None``
+    when there is no such block or it does not parse to a mapping.
+    """
+    if isinstance(text, bytes):
+        text = text.decode("utf-8", "replace")
+    lines = text.lstrip().split("\n")
+    if lines[0].rstrip() != "---":
+        return None
+    # the closing fence is the next line that holds only '---'
+    end = next((i for i, ln in enumerate(lines) if i and ln.rstrip() == "---"), 0)
+    if not end:
+        return None
+    import yaml  # deferred so documents without frontmatter never need it
+
+    try:
+        meta = yaml.safe_load("\n".join(lines[1:end]))
+    except Exception:  # PyYAML may raise more than YAMLError (e.g. bad dates)
+        return None
+    return meta if isinstance(meta, dict) else None
+
+
+class KnowledgeCatalog(ProjectSpec):
+    """An Open Knowledge Format (OKF) knowledge bundle.
+
+    An OKF bundle is a directory of markdown "concept" documents, each with a
+    YAML frontmatter block declaring a ``type``.  Reserved ``index.md`` /
+    ``log.md`` files provide directory listings and update history.
+
+    Produces one :class:`projspec.content.metadata.DescriptiveMetadata` per
+    concept, keyed by its *concept ID* (the file path within the bundle with
+    the ``.md`` suffix removed, e.g. ``tables/orders``).
+    """
+
+    icon = "📚"
+    spec_doc = (
+        "https://github.com/GoogleCloudPlatform/knowledge-catalog/blob/main/okf/SPEC.md"
+    )
+
+    def match(self) -> bool:
+        """Cheaply test for a possible OKF bundle, from the root listing alone.
+
+        Requires the reserved ``index.md`` plus either another, non-reserved
+        markdown file at the root or a subdirectory (not hidden/underscore)
+        that might hold concepts.  Checking that concepts really carry a
+        ``type`` is deferred to :meth:`parse`, which raises :class:`ParseFailed`
+        if none do, so another tool's ``index.md`` is not taken for a bundle.
+        """
+        names = self.proj.basenames
+        if "index.md" not in names:
+            return False
+        # another markdown concept at the root...
+        if any(n.endswith(".md") and n not in _RESERVED for n in names):
+            return True
+        # ...or a subdirectory that is neither hidden nor underscore-prefixed
+        for entry in self.proj.filelist:
+            if entry.get("type") != "directory":
+                continue
+            leaf = str(entry["name"]).rstrip("/").rsplit("/", 1)[-1]
+            if not leaf.startswith((".", "_")):
+                return True
+        return False
+
+    def _concept_files(self) -> list[str]:
+        """Sorted full paths of every non-reserved ``.md`` file in the bundle.
+
+        The bundle is globbed recursively; if the filesystem's ``glob`` raises,
+        only the markdown files of the top-level listing are considered.
+        """
+        bundle = self.proj.url.rstrip("/")
+        try:
+            # list() yields the paths whether glob returns a list or a dict
+            found = list(self.proj.fs.glob(f"{bundle}/**/*.md"))
+        except Exception:
+            found = [
+                path
+                for base, path in self.proj.basenames.items()
+                if base.endswith(".md")
+            ]
+        return sorted(
+            path
+            for path in map(str, found)
+            if path.rsplit("/", 1)[-1] not in _RESERVED
+        )
+
+    def _concept_id(self, full_path: str) -> str:
+        """Derive a concept ID: the path inside the bundle, minus ``.md``."""
+        prefix = self.proj.url.rstrip("/") + "/"
+        # paths outside the bundle root are kept whole
+        start = len(prefix) if full_path.startswith(prefix) else 0
+        inside = full_path[start:]
+        return inside[:-3] if inside.endswith(".md") else inside
+
+    def parse(self) -> None:
+        """Collect concept frontmatter into ``self._contents``.
+
+        Raises :class:`ParseFailed` if no document yields a non-empty ``type``.
+        """
+        from projspec.content.metadata import DescriptiveMetadata
+        concepts = AttrDict()
+        for full in self._concept_files():
+            try:
+                with self.proj.fs.open(full, "rt") as f:
+                    text = f.read()
+            except OSError:
+                continue
+            meta = _split_frontmatter(text)
+            if not meta:
+                # not a conformant concept document - skip
+                continue
+            type_ = meta.get("type")
+            if not type_ or not str(type_).strip():
+                # §9: every concept frontmatter must carry a non-empty `type`
+                continue
+            entry: dict[str, str] = {"type": str(type_)}
+            for field in ("title", "description", "resource", "timestamp"):
+                val = meta.get(field)
+                if val:
+                    entry[field] = str(val)
+            tags = meta.get("tags")
+            if tags:
+                if isinstance(tags, (list, tuple)):
+                    entry["tags"] = ", ".join(str(t) for t in tags)
+                else:
+                    entry["tags"] = str(tags)
+            key = self._concept_id(full)
+            concepts[key] = DescriptiveMetadata(proj=self.proj, meta=entry)
+
+        if not concepts:
+            raise ParseFailed("No OKF concept documents with a 'type' field found")
+
+        # The bundle-root index.md may declare the OKF version it targets.
+        bundle_meta: dict[str, str] = {}
+        if "index.md" in self.proj.basenames:
+            try:
+                with self.proj.get_file("index.md") as f:
+                    idx = _split_frontmatter(f.read())
+            except OSError:
+                idx = None
+            if idx and idx.get("okf_version"):
+                bundle_meta["okf_version"] = str(idx["okf_version"])
+        contents = AttrDict(concept=concepts)
+        if bundle_meta:
+            contents["descriptive_metadata"] = DescriptiveMetadata(
+                proj=self.proj, meta=bundle_meta
+            )
+        self._contents = contents
+        self._artifacts = AttrDict()
+
+    @staticmethod
+    def _create(path: str) -> None:
+        """Scaffold a minimal conformant OKF bundle (UTF-8 files) in *path*."""
+        name = os.path.basename(path.rstrip("/")) or "bundle"
+        files = {
+            "index.md": (
+                "---\n"
+                'okf_version: "0.1"\n'
+                "---\n\n"
+                f"# {name}\n\n"
+                "* [Overview](overview.md) - what this bundle contains\n"
+            ),
+            "log.md": (
+                "# Update Log\n\n"
+                "## 2026-01-01\n"
+                "* **Initialization**: Created the bundle.\n"
+            ),
+            "overview.md": (
+                "---\n"
+                "type: Reference\n"
+                f"title: {name} overview\n"
+                "description: A short description of this knowledge bundle.\n"
+                "---\n\n"
+                f"# {name}\n\n"
+                "Free-form markdown describing the knowledge captured here.\n"
+            ),
+        }
+        # explicit UTF-8: the name may be non-ASCII, whatever the locale default
+        for fname, text in files.items():
+            with open(f"{path}/{fname}", "w", encoding="utf-8") as f:
+                f.write(text)
diff --git a/src/projspec/textapp/main.py b/src/projspec/textapp/main.py
index 5911bb4..bff7407 100644
--- a/src/projspec/textapp/main.py
+++ b/src/projspec/textapp/main.py
@@ -171,19 +171,9 @@ def _basename(url: str) -> str:
 
 
 def _fmt_age(ts: float) -> str:
-    import datetime
+    from projspec.proj.base import _humanize_age
 
-    days = (datetime.datetime.now() - datetime.datetime.fromtimestamp(ts)).days
-    if days == 0:
-        return "today"
-    if days == 1:
-        return "yesterday"
-    if days < 30:
-        return f"{days} days ago"
-    if days < 365:
-        return f"{days // 30} months ago"
-    yrs = days // 365
-    return f"{yrs} year{'s' if yrs > 1 else ''} ago"
+    return _humanize_age(ts)
 
 
 def _is_enum(v: Any) -> bool:
@@ -253,7 +243,15 @@ def _yaml_lines(
             return [f"{pad}{_role('{}', 'muted')}"]
         out = []
         for k, v in data.items():
-            if _is_enum(v):
+            # The web UIs embed these as live HTML / an image; a TUI can't, so
+            # show a short placeholder rather than dumping the huge raw string.
+            if k in ("html_repr", "thumbnail") and isinstance(v, str):
+                note = "HTML preview" if k == "html_repr" else "image thumbnail"
+                out.append(
+                    f"{pad}{_role(str(k), 'field')}: "
+                    f"{_role(f'<{note} available in graphical UI>', 'muted')}"
+                )
+            elif _is_enum(v):
                 out.append(
                     f"{pad}{_role(str(k), 'field')}: "
                     f"{_role(_enum_label(v, enums), 'enum')}"
@@ -713,6 +711,9 @@ def compose(self) -> ComposeResult:
             age = _fmt_age(float(last_modified))
             by = self.project.get("last_modified_by")
             meta_parts.append("last modified " + age + (f" by {by}" if by else ""))
+        scanned_at = self.project.get("scanned_at")
+        if scanned_at is not None:
+            meta_parts.append("scanned " + _fmt_age(float(scanned_at)))
         if meta_parts:
             yield Static(" · ".join(meta_parts), classes="meta")
         # Build the full list of chips first, then split into horizontal
diff --git a/src/projspec/utils.py b/src/projspec/utils.py
index fa70261..64dd015 100644
--- a/src/projspec/utils.py
+++ b/src/projspec/utils.py
@@ -15,6 +15,10 @@
 logger = logging.getLogger("projspec")
 
 
+class DEFAULT:
+    ...
+
+
 class Enum(enum.Enum):
     """Named enum values, so that str(x) looks like the label."""
 
@@ -113,7 +117,10 @@ def from_dict(dic, proj=None):
             if dic["klass"] == "project":
                 return Project.from_dict(dic)
             category, name = dic.pop("klass")
-            cls = get_cls(name, category)
+            try:
+                cls = get_cls(name, category)
+            except KeyError:
+                return None
             if category == "enum":
                 return cls(dic["value"])
             obj = object.__new__(cls)
diff --git a/src/projspec/webui/panel.js b/src/projspec/webui/panel.js
index 870e249..b1f3904 100644
--- a/src/projspec/webui/panel.js
+++ b/src/projspec/webui/panel.js
@@ -102,8 +102,18 @@
         }
     }
     function fmtAge(ts) {
-        const days = Math.floor((Date.now() / 1000 - parseFloat(ts)) / 86400);
-        if (days === 0) return 'today';
+        const secs = Math.floor(Date.now() / 1000 - parseFloat(ts));
+        if (secs < 0) return 'just now';
+        const days = Math.floor(secs / 86400);
+        if (days === 0) {
+            if (secs < 60) return 'just now';
+            if (secs < 3600) {
+                const m = Math.floor(secs / 60);
+                return m + ' minute' + (m !== 1 ? 's' : '') + ' ago';
+            }
+            const h = Math.floor(secs / 3600);
+            return h + ' hour' + (h !== 1 ? 's' : '') + ' ago';
+        }
         if (days === 1) return 'yesterday';
         if (days < 30) return days + ' days ago';
         if (days < 365) return Math.floor(days / 30) + ' months ago';
@@ -200,6 +210,8 @@
             const by = project.last_modified_by != null ? project.last_modified_by : null;
             metaParts.push('last modified ' + age + (by ? ' by ' + by : ''));
         }
+        if (project.scanned_at != null)
+            metaParts.push('scanned ' + fmtAge(project.scanned_at));
         if (metaParts.length > 0) {
             const meta = document.createElement('div');
             meta.className = 'meta';
@@ -456,14 +468,69 @@
             body.innerHTML = sanitizeHtml(html);
             w.appendChild(body);
         } else {
+            // Datasets (and other content) may carry rich previews in
+            // ``metadata.html_repr`` (an HTML fragment) and
+            // ``metadata.thumbnail`` (a data: image URL). Embed those rather
+            // than dumping their (often huge) raw strings into the YAML tree.
+            const meta = (kind === 'content' && data && typeof data === 'object'
+                && data.metadata && typeof data.metadata === 'object') ? data.metadata : null;
+            const htmlRepr = meta && typeof meta.html_repr === 'string' ? meta.html_repr : null;
+            const thumb = meta && typeof meta.thumbnail === 'string' ? meta.thumbnail : null;
+
             const tree = document.createElement('div');
             tree.className = 'tree yaml';
-            tree.appendChild(renderYaml(stripKlass(data)));
+            tree.appendChild(renderYaml(stripPreview(stripKlass(data))));
             w.appendChild(tree);
+
+            if (thumb) w.appendChild(thumbnailImg(thumb));
+            if (htmlRepr) {
+                const body = document.createElement('div');
+                body.className = 'widget-html';
+                body.innerHTML = sanitizeHtml(htmlRepr);
+                w.appendChild(body);
+            }
         }
         return w;
     }
 
+    /**
+     * Return a shallow copy of a content dict with the embedded-preview keys
+     * (``metadata.html_repr`` / ``metadata.thumbnail``) removed, so the YAML
+     * tree doesn't show their large raw strings - they are rendered as live
+     * HTML / an image instead.
+     */
+    function stripPreview(obj) {
+        const isDict = (v) => !!v && typeof v === 'object' && !Array.isArray(v);
+        if (!isDict(obj) || !isDict(obj.metadata)) return obj;
+        const allKeys = Object.keys(obj.metadata);
+        const keptKeys = allKeys.filter((k) => k !== 'html_repr' && k !== 'thumbnail');
+        // Nothing to hide: hand back the very same object, uncopied.
+        if (keptKeys.length === allKeys.length) return obj;
+        const slimMeta = {};
+        keptKeys.forEach((k) => { slimMeta[k] = obj.metadata[k]; });
+        // Shallow copy, then swap in the trimmed metadata.
+        const copy = {};
+        Object.keys(obj).forEach((k) => { copy[k] = obj[k]; });
+        copy.metadata = slimMeta;
+        return copy;
+    }
+
+    /**
+     * Build an <img> for a ``data:image/...`` thumbnail URL. Only accepts
+     * data: image URLs (never remote/javascript URLs).
+     */
+    function thumbnailImg(src) {
+        const holder = document.createElement('div');
+        holder.className = 'widget-html';
+        // Anything that is not an inline data: image yields an empty wrapper.
+        if (!/^data:image\//i.test(src)) return holder;
+        const picture = document.createElement('img');
+        picture.alt = 'thumbnail';
+        picture.src = src;
+        holder.appendChild(picture);
+        return holder;
+    }
+
     /**
      * Minimal HTML sanitisation for content-provided ``_html`` fragments.
      * The markup originates from projspec itself, so we don't need a
diff --git a/tests/test_basic.py b/tests/test_basic.py
index 778d53b..b9f011d 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -1,5 +1,6 @@
 import json
 import pickle
+import time
 
 import pytest
 
@@ -14,7 +15,32 @@ def test_basic(proj):
     assert "src/projspec" in proj.children
     assert repr(proj).count("\n") == 0
     assert str(proj).count("\n") > 0
-    proj._repr_html_()
+    proj._ipython_display_()
+
+
+def test_humanize_age():
+    """Each age bucket of ``_humanize_age``, from 'just now' up to years."""
+    from projspec.proj.base import _humanize_age
+    now = time.time()
+    assert _humanize_age(now) == "just now"
+    assert _humanize_age(now + 100) == "just now"  # future / clock skew
+    assert _humanize_age(now - 5 * 60) == "5 minutes ago"
+    assert _humanize_age(now - 60) == "1 minute ago"
+    assert _humanize_age(now - 3 * 3600) == "3 hours ago"
+    assert _humanize_age(now - 1.5 * 86400) == "yesterday"
+    assert _humanize_age(now - 10 * 86400) == "10 days ago"
+    assert _humanize_age(now - 60 * 86400) == "2 months ago"
+    assert _humanize_age(now - 400 * 86400) == "1 year ago"
+    assert _humanize_age(now - 800 * 86400) == "2 years ago"
+
+
+def test_scanned_at_in_stats_line(proj):
+    """``scanned_at`` should appear in the textual surfaces of a project."""
+    assert "scanned " in proj._stats_line()
+    assert "scanned " in proj.text_summary()
+    assert "scanned " in str(proj)
+    # bare summary omits the stats line entirely
+    assert "scanned " not in proj.text_summary(bare=True)
 
 
 def test_errors():
diff --git a/tests/test_data_html.py b/tests/test_data_html.py
deleted file mode 100644
index 2d6e6ea..0000000
--- a/tests/test_data_html.py
+++ /dev/null
@@ -1,449 +0,0 @@
-"""Tests for projspec.content.data_html — repr_text and repr_html.
-
-These tests use a mock DataResource to avoid needing real data files on disk
-for basic formatting checks, then run format-specific loader tests when the
-required optional libraries are available.
-"""
-
-from __future__ import annotations
-
-import io
-import os
-import tempfile
-from unittest.mock import MagicMock
-
-import pytest
-
-import projspec
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
-def _make_dr(
-    path="mytable.parquet",
-    fmt="parquet",
-    modality="tabular",
-    layout="flat",
-    file_count=3,
-    total_size=1024 * 512,
-    schema=None,
-    sample_path="",
-    metadata=None,
-):
-    """Build a DataResource backed by a real Project (the repo root) but with
-    controlled field values."""
-    from projspec.content.data import DataResource
-
-    mock_proj = MagicMock(spec=projspec.Project)
-    # Use a real local filesystem via fsspec
-    import fsspec
-
-    mock_proj.fs = fsspec.filesystem("file")
-    mock_proj.url = "/tmp"
-
-    return DataResource(
-        proj=mock_proj,
-        path=path,
-        format=fmt,
-        modality=modality,
-        layout=layout,
-        file_count=file_count,
-        total_size=total_size,
-        schema=schema or {},
-        sample_path=sample_path,
-        metadata=metadata or {},
-    )
-
-
-# ---------------------------------------------------------------------------
-# repr_text tests
-# ---------------------------------------------------------------------------
-
-
-class TestReprText:
-    def test_basic_fields_present(self):
-        dr = _make_dr()
-        text = repr(dr)
-        assert "mytable.parquet" in text
-        assert "parquet" in text
-        assert "tabular" in text
-        assert "files=3" in text
-
-    def test_size_formatting(self):
-        dr = _make_dr(total_size=1024)
-        text = repr(dr)
-        assert "KB" in text or "B" in text
-
-    def test_size_zero(self):
-        dr = _make_dr(total_size=0)
-        text = repr(dr)
-        assert "unknown" in text
-
-    def test_schema_hint_dict(self):
-        dr = _make_dr(schema={"col_a": "int64", "col_b": "float32", "col_c": "str"})
-        text = repr(dr)
-        assert "col_a" in text
-
-    def test_schema_hint_many_fields(self):
-        schema = {f"col_{i}": "int64" for i in range(10)}
-        dr = _make_dr(schema=schema)
-        text = repr(dr)
-        assert "+7 more" in text
-
-    def test_schema_hint_list(self):
-        dr = _make_dr(schema=[{"name": "a"}, {"name": "b"}])
-        text = repr(dr)
-        assert "2 fields" in text
-
-    def test_non_flat_layout_shown(self):
-        dr = _make_dr(layout="hive")
-        text = repr(dr)
-        assert "hive" in text
-
-    def test_flat_layout_hidden(self):
-        dr = _make_dr(layout="flat")
-        text = repr(dr)
-        assert "layout" not in text
-
-    def test_no_modality(self):
-        dr = _make_dr(modality="")
-        text = repr(dr)
-        assert "modality" not in text
-
-    def test_single_line(self):
-        dr = _make_dr()
-        text = repr(dr)
-        assert "\n" not in text
-
-    def test_path_shown(self):
-        """repr_text must show the path field, not a separate name."""
-        dr = _make_dr(path="part*.csv")
-        text = repr(dr)
-        assert "part*.csv" in text
-
-    def test_dir_path_shown(self):
-        dr = _make_dr(path="year=2024/")
-        text = repr(dr)
-        assert "year=2024/" in text
-
-
-# ---------------------------------------------------------------------------
-# repr_html tests
-# ---------------------------------------------------------------------------
-
-
-class TestReprHtml:
-    def test_returns_string(self):
-        dr = _make_dr()
-        html = dr._repr_html_()
-        assert isinstance(html, str)
-        assert len(html) > 0
-
-    def test_contains_path(self):
-        dr = _make_dr(path="my_dataset.parquet")
-        html = dr._repr_html_()
-        assert "my_dataset.parquet" in html
-
-    def test_contains_glob_path(self):
-        dr = _make_dr(path="part*.parquet")
-        html = dr._repr_html_()
-        assert "part*.parquet" in html
-
-    def test_contains_dir_path(self):
-        dr = _make_dr(path="year=2024/")
-        html = dr._repr_html_()
-        assert "year=2024/" in html
-
-    def test_contains_format_badge(self):
-        dr = _make_dr(fmt="parquet")
-        html = dr._repr_html_()
-        assert "parquet" in html
-
-    def test_contains_modality_badge(self):
-        dr = _make_dr(modality="tabular")
-        html = dr._repr_html_()
-        assert "tabular" in html
-
-    def test_contains_file_count(self):
-        dr = _make_dr(file_count=7)
-        html = dr._repr_html_()
-        assert "7" in html
-
-    def test_contains_size(self):
-        dr = _make_dr(total_size=2048)
-        html = dr._repr_html_()
-        assert "KB" in html or "B" in html
-
-    def test_schema_dict_rendered(self):
-        dr = _make_dr(schema={"id": "int64", "name": "string"})
-        html = dr._repr_html_()
-        assert "id" in html
-        assert "int64" in html
-
-    def test_schema_list_of_dicts_rendered(self):
-        dr = _make_dr(
-            schema=[
-                {"name": "id", "type": "integer"},
-                {"name": "val", "type": "number"},
-            ]
-        )
-        html = dr._repr_html_()
-        assert "id" in html
-        assert "integer" in html
-
-    def test_schema_empty_no_details(self):
-        dr = _make_dr(schema={})
-        html = dr._repr_html_()
-        assert "Schema" not in html
-
-    def test_no_preview_section_without_sample_path(self):
-        dr = _make_dr(sample_path="")
-        html = dr._repr_html_()
-        assert "Preview" not in html
-
-    def test_layout_badge_shown_for_hive(self):
-        dr = _make_dr(layout="hive")
-        html = dr._repr_html_()
-        assert "hive" in html
-
-    def test_layout_badge_hidden_for_flat(self):
-        dr = _make_dr(layout="flat")
-        html = dr._repr_html_()
-        assert 'ps-badge-gray">flat<' not in html
-
-    def test_html_structure(self):
-        dr = _make_dr()
-        html = dr._repr_html_()
-        assert "ps-data-card" in html
-        assert "ps-data-card-header" in html
-        assert "ps-data-meta" in html
-
-    def test_icon_present_for_known_modality(self):
-        dr = _make_dr(modality="image")
-        html = dr._repr_html_()
-        # Image icon is 🖼 (&#x1F5BC;)
-        assert "&#x1F5BC;" in html
-
-    def test_icon_fallback_for_unknown_modality(self):
-        dr = _make_dr(modality="")
-        html = dr._repr_html_()
-        # Fallback icon &#x1F5C2;
-        assert "&#x1F5C2;" in html
-
-    def test_large_schema_collapsed(self):
-        schema = {f"col_{i}": "int64" for i in range(20)}
-        dr = _make_dr(schema=schema)
-        html = dr._repr_html_()
-        # details element should NOT have open attribute when >8 fields
-        assert (
-            "<details  style" in html
-            or 'details  style="margin-top:6px"' in html
-            or 'details style="margin-top:6px">' in html
-        )
-
-    def test_small_schema_open(self):
-        schema = {f"col_{i}": "int64" for i in range(4)}
-        dr = _make_dr(schema=schema)
-        html = dr._repr_html_()
-        assert "<details open" in html
-
-
-# ---------------------------------------------------------------------------
-# Live preview tests — skipped when optional dependencies are absent
-# ---------------------------------------------------------------------------
-
-
-class TestLivePreviews:
-    """Tests that write real files and verify the preview HTML is produced."""
-
-    def _dr_for_file(self, path, fmt, modality):
-        """Create a DataResource pointing at a real local file."""
-        from projspec.content.data import DataResource
-        import fsspec
-
-        mock_proj = MagicMock(spec=projspec.Project)
-        mock_proj.fs = fsspec.filesystem("file")
-        mock_proj.url = os.path.dirname(path)
-        return DataResource(
-            proj=mock_proj,
-            path=os.path.basename(path),
-            format=fmt,
-            modality=modality,
-            layout="flat",
-            file_count=1,
-            total_size=os.path.getsize(path),
-            schema={},
-            sample_path=path,
-        )
-
-    def test_csv_preview(self, tmp_path):
-        pd = pytest.importorskip("pandas")
-        import pandas as pd
-
-        path = str(tmp_path / "data.csv")
-        pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}).to_csv(path, index=False)
-        dr = self._dr_for_file(path, "csv", "tabular")
-        html = dr._repr_html_()
-        assert "Preview" in html
-        assert "<table" in html
-        assert "x" in html
-        assert "y" in html
-
-    def test_csv_preview_uses_repr_html(self, tmp_path):
-        """Preview HTML should come from pandas html, not hand-rolled."""
-        pytest.importorskip("pandas")
-        import pandas as pd
-
-        path = str(tmp_path / "data.csv")
-        pd.DataFrame({"x": range(20), "y": range(20)}).to_csv(path, index=False)
-        dr = self._dr_for_file(path, "csv", "tabular")
-        html = dr._repr_html_()
-        # pandas wraps its table in a <div> with a dataframe class
-        assert "dataframe" in html or "ps-df-wrap" in html
-
-    def test_csv_preview_row_limit(self, tmp_path):
-        """Only _PREVIEW_ROWS rows of data should appear, not all 50."""
-        pytest.importorskip("pandas")
-        import pandas as pd
-
-        path = str(tmp_path / "big.csv")
-        pd.DataFrame({"v": range(50)}).to_csv(path, index=False)
-        dr = self._dr_for_file(path, "csv", "tabular")
-        html = dr._repr_html_()
-        # Extract just the preview section so CSS text doesn't interfere
-        preview_start = html.find('<div class="ps-preview">')
-        assert preview_start != -1, "no preview section found"
-        preview_html = html[preview_start:]
-        # The last row value (49) should not appear as a table cell
-        assert "<td>49</td>" not in preview_html
-
-    def test_parquet_preview(self, tmp_path):
-        pytest.importorskip("pyarrow")
-        import pyarrow as pa
-        import pyarrow.parquet as pq
-
-        path = str(tmp_path / "data.parquet")
-        table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
-        pq.write_table(table, path)
-        dr = self._dr_for_file(path, "parquet", "tabular")
-        html = dr._repr_html_()
-        assert "Preview" in html
-        assert "<table" in html
-        assert "a" in html
-
-    def test_parquet_preview_uses_pandas_repr(self, tmp_path):
-        """Parquet preview must go through pandas html, not raw arrow HTML."""
-        pytest.importorskip("pyarrow")
-        import pyarrow as pa
-        import pyarrow.parquet as pq
-
-        path = str(tmp_path / "data.parquet")
-        table = pa.table({"col_a": range(10), "col_b": list("abcdefghij")})
-        pq.write_table(table, path)
-        dr = self._dr_for_file(path, "parquet", "tabular")
-        html = dr._repr_html_()
-        # pandas DataFrame.html includes class="dataframe"
-        assert "dataframe" in html
-
-    def test_parquet_preview_row_limit(self, tmp_path):
-        """Parquet preview reads only one row group and slices to _PREVIEW_ROWS."""
-        pytest.importorskip("pyarrow")
-        import pyarrow as pa
-        import pyarrow.parquet as pq
-        from projspec.content.data_html import _PREVIEW_ROWS
-
-        n_rows = 100
-        path = str(tmp_path / "large.parquet")
-        # Use a column whose values are unique strings unlikely to appear in CSS
-        values = [f"row_{i:04d}" for i in range(n_rows)]
-        pq.write_table(pa.table({"label": values}), path)
-        dr = self._dr_for_file(path, "parquet", "tabular")
-        html = dr._repr_html_()
-        assert "row_0000" in html  # first row present
-        assert "row_0099" not in html  # last row absent
-
-    def test_arrow_ipc_preview(self, tmp_path):
-        """Arrow IPC file: reads only the first batch, converts via pandas."""
-        pytest.importorskip("pyarrow")
-        import pyarrow as pa
-        import pyarrow.ipc as ipc
-
-        path = str(tmp_path / "data.arrow")
-        table = pa.table({"x": [10, 20, 30], "y": ["a", "b", "c"]})
-        with pa.OSFile(path, "wb") as sink:
-            with ipc.new_file(sink, table.schema) as writer:
-                writer.write_table(table)
-        dr = self._dr_for_file(path, "arrow", "tabular")
-        html = dr._repr_html_()
-        assert "Preview" in html
-        assert "dataframe" in html
-        assert "x" in html
-
-    def test_image_preview(self, tmp_path):
-        pytest.importorskip("PIL")
-        from PIL import Image
-
-        path = str(tmp_path / "test.png")
-        img = Image.new("RGB", (64, 64), color=(128, 0, 200))
-        img.save(path)
-        dr = self._dr_for_file(path, "png", "image")
-        html = dr._repr_html_()
-        assert "Preview" in html
-        assert "data:image/png;base64," in html
-
-    def test_numpy_preview(self, tmp_path):
-        np = pytest.importorskip("numpy")
-        import numpy as np
-
-        path = str(tmp_path / "arr.npy")
-        np.save(path, np.arange(20).reshape(4, 5))
-        dr = self._dr_for_file(path, "numpy", "array")
-        html = dr._repr_html_()
-        assert "Preview" in html
-        assert "shape" in html
-
-    def test_numpy_preview_reads_header_shape(self, tmp_path):
-        """The shape reported in the preview must match the actual array shape."""
-        np = pytest.importorskip("numpy")
-        import numpy as np
-
-        path = str(tmp_path / "arr.npy")
-        arr = np.zeros((7, 3), dtype="float32")
-        np.save(path, arr)
-        dr = self._dr_for_file(path, "numpy", "array")
-        html = dr._repr_html_()
-        assert "(7, 3)" in html
-        assert "float32" in html
-
-    def test_numpy_large_array_no_full_load(self, tmp_path):
-        """Arrays above the 1 MB threshold should show shape/dtype without a data slice."""
-        np = pytest.importorskip("numpy")
-        import numpy as np
-
-        path = str(tmp_path / "big.npy")
-        # 512 * 512 * float64 = 2 MB > 1 MB threshold
-        np.save(path, np.zeros((512, 512), dtype="float64"))
-        dr = self._dr_for_file(path, "numpy", "array")
-        html = dr._repr_html_()
-        assert "(512, 512)" in html  # shape shown
-        assert "float64" in html  # dtype shown
-        # The data slice key ("preview") should NOT appear in the info table;
-        # check the table cell content rather than the CSS class names
-        assert ">preview<" not in html  # no <td>preview</td> row
-
-
-# ---------------------------------------------------------------------------
-# fmt_size helper
-# ---------------------------------------------------------------------------
-
-
-def test_fmt_size():
-    from projspec.content.data_html import _fmt_size
-
-    assert _fmt_size(0) == "unknown"
-    assert _fmt_size(512) == "512 B"
-    assert "KB" in _fmt_size(2048)
-    assert "MB" in _fmt_size(2 * 1024 * 1024)
-    assert "GB" in _fmt_size(3 * 1024**3)
diff --git a/tests/test_data_project.py b/tests/test_data_project.py
index 3dae345..f0d13b5 100644
--- a/tests/test_data_project.py
+++ b/tests/test_data_project.py
@@ -1,326 +1,594 @@
-import json
+"""Tests for the DataProject spec and the file-consolidation helper.
+
+The consolidation helper is filesystem-agnostic and tested directly on
+``(basename, size)`` lists.  The DataProject spec is tested end-to-end by
+writing files into a tmpdir and constructing a real ``projspec.Project``.
+
+Intake may or may not be installed (and which readers are available varies),
+so the DataProject assertions only check things that do not depend on a
+specific reader being present: that the project is/isn't detected, how files
+are consolidated, file counts and sizes.  Where intake is available we also
+spot-check ``datatype``/``structure``.
+"""
+
 import os
 
 import pytest
 
 import projspec
-from projspec.content.data import DataResource
-from projspec.utils import from_dict
-
-
-def _data_project(tmp_path):
-    """Return a projspec.Project rooted at *tmp_path* (no walk needed)."""
-    return projspec.Project(str(tmp_path))
-
-
-class TestDataDetection:
-    def test_csv_detected(self, tmp_path):
-        (tmp_path / "data.csv").write_text("x,y\n1,2\n3,4\n")
-        proj = _data_project(tmp_path)
-        assert "data" in proj.specs
-
-    def test_parquet_detected(self, tmp_path):
-        pytest.importorskip("pyarrow")
-        import pyarrow as pa
-        import pyarrow.parquet as pq
-
-        pq.write_table(pa.table({"a": [1, 2]}), str(tmp_path / "t.parquet"))
-        proj = _data_project(tmp_path)
-        assert "data" in proj.specs
-
-    def test_no_data_files_not_detected(self, tmp_path):
-        (tmp_path / "README.md").write_text("hello")
-        (tmp_path / "config.json").write_text("{}")
-        proj = _data_project(tmp_path)
-        assert "data" not in proj.specs
-
-
-class TestDataParse:
-    def test_single_csv_resource(self, tmp_path):
-        (tmp_path / "sales.csv").write_text("col1,col2\n1,a\n2,b\n")
-        proj = _data_project(tmp_path)
-        dr = proj.specs["data"].contents["data_resource"]
-        assert isinstance(dr, DataResource)
-        assert dr.path == "sales.csv"
-        assert dr.format == "csv"
-        assert dr.modality == "tabular"
-        assert dr.file_count == 1
-
-    def test_series_collated_to_glob_path(self, tmp_path):
-        """part0.csv + part1.csv → path == 'part*.csv'"""
-        for i in range(3):
-            (tmp_path / f"part{i}.csv").write_text("x\n1\n")
-        proj = _data_project(tmp_path)
-        dr = proj.specs["data"].contents["data_resource"]
-        assert isinstance(dr, DataResource)
-        assert dr.path == "part*.csv"
-        assert dr.file_count == 3
-
-    def test_distinct_csv_files_separate_resources(self, tmp_path):
-        """users.csv and orders.csv differ alphabetically → two resources."""
-        (tmp_path / "users.csv").write_text("id\n1\n")
-        (tmp_path / "orders.csv").write_text("id\n1\n")
-        proj = _data_project(tmp_path)
-        dr_map = proj.specs["data"].contents["data_resource"]
-        # Two separate DataResource objects, keyed in an AttrDict
-        assert len(dr_map) == 2
-        paths = {dr_map[k].path for k in dr_map}
-        assert "users.csv" in paths
-        assert "orders.csv" in paths
-
-    def test_sample_path_is_full_path(self, tmp_path):
-        csv = tmp_path / "data.csv"
-        csv.write_text("x\n1\n")
-        proj = _data_project(tmp_path)
-        dr = proj.specs["data"].contents["data_resource"]
-        assert dr.sample_path == str(csv)
-
-    def test_total_size_nonzero(self, tmp_path):
-        content = "x,y\n" + "\n".join(f"{i},{i}" for i in range(20))
-        (tmp_path / "nums.csv").write_text(content)
-        proj = _data_project(tmp_path)
-        dr = proj.specs["data"].contents["data_resource"]
-        assert dr.total_size > 0
-
-
-class TestDataResourceToDict:
-    def _make_dr(self, tmp_path):
-        (tmp_path / "items.csv").write_text("id,val\n1,a\n2,b\n")
-        proj = _data_project(tmp_path)
-        return proj.specs["data"].contents["data_resource"]
-
-    def test_compact_omits_klass(self, tmp_path):
-        dr = self._make_dr(tmp_path)
-        d = dr.to_dict(compact=True)
-        assert "klass" not in d
-
-    def test_compact_omits_html(self, tmp_path):
-        """compact=True is for human/console output — _html must be absent."""
-        dr = self._make_dr(tmp_path)
-        d = dr.to_dict(compact=True)
-        assert "_html" not in d
-
-
-class TestDataResourceRoundTrip:
-    def _roundtrip(self, dr):
-        """Serialise to JSON and rehydrate, returning the new DataResource."""
-        d = dr.to_dict(compact=False)
-        js = json.dumps(d)
-        d2 = json.loads(js)
-        return from_dict(d2, proj=dr.proj)
-
-    def _make_dr(self, tmp_path):
-        (tmp_path / "orders.csv").write_text("order_id,amount\n1,99\n2,42\n")
-        proj = _data_project(tmp_path)
-        return proj.specs["data"].contents["data_resource"]
-
-    def test_roundtrip_returns_dataresource(self, tmp_path):
-        dr2 = self._roundtrip(self._make_dr(tmp_path))
-        assert isinstance(dr2, DataResource)
-
-    def test_roundtrip_preserves_path(self, tmp_path):
-        dr2 = self._roundtrip(self._make_dr(tmp_path))
-        assert dr2.path == "orders.csv"
-
-    def test_roundtrip_preserves_format(self, tmp_path):
-        dr2 = self._roundtrip(self._make_dr(tmp_path))
-        assert dr2.format == "csv"
-
-    def test_roundtrip_preserves_modality(self, tmp_path):
-        dr2 = self._roundtrip(self._make_dr(tmp_path))
-        assert dr2.modality == "tabular"
-
-    def test_roundtrip_preserves_file_count(self, tmp_path):
-        dr2 = self._roundtrip(self._make_dr(tmp_path))
-        assert dr2.file_count == 1
-
-    def test_roundtrip_preserves_total_size(self, tmp_path):
-        dr = self._make_dr(tmp_path)
-        dr2 = self._roundtrip(dr)
-        assert dr2.total_size == dr.total_size
-
-    def test_roundtrip_preserves_schema(self, tmp_path):
-        pytest.importorskip("pyarrow")
-        import pyarrow as pa, pyarrow.parquet as pq
-
-        pq.write_table(
-            pa.table({"col_a": [1, 2, 3], "col_b": ["x", "y", "z"]}),
-            str(tmp_path / "data.parquet"),
+from projspec.config import temp_conf
+from projspec.proj._consolidate import consolidate, FileGroup
+from projspec.proj.data_project import DataProject
+from projspec.content.data import Dataset, TabularData, IntakeSource
+
+try:
+    import intake.readers.inspect  # noqa: F401
+
+    HAS_INTAKE = True
+except Exception:  # pragma: no cover
+    HAS_INTAKE = False
+
+try:
+    import pandas as _pd  # noqa: F401
+
+    HAS_PANDAS = True
+except Exception:  # pragma: no cover
+    HAS_PANDAS = False
+
+try:
+    # importing here puts PIL in sys.modules so intake's check_imports (which
+    # uses importlib.metadata.distribution and falls back to sys.modules) finds
+    # it - Pillow's distribution name ("pillow") differs from the import name.
+    import PIL  # noqa: F401
+    import numpy as _np  # noqa: F401
+
+    HAS_PIL = True
+except Exception:  # pragma: no cover
+    HAS_PIL = False
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+# Production-equivalent significance thresholds. Tests that depend on these
+# values set them explicitly via temp_conf so they do not rely on (and are not
+# broken by changes to) the config defaults.
+PROD_THRESHOLDS = dict(
+    data_min_fraction=0.5,
+    data_min_file_size=1024 * 1024,
+    data_min_total_size=10 * 1024 * 1024,
+    data_min_play_size=64 * 1024,
+)
+
+
+def write_data(tmpdir, files: dict[str, int | bytes]) -> str:
+    """Write files into *tmpdir*.
+
+    Values are either an int (number of zero bytes to write) or raw bytes.
+    """
+    path = str(tmpdir)
+    for rel, content in files.items():
+        full = os.path.join(path, rel)
+        os.makedirs(os.path.dirname(full), exist_ok=True)
+        data = content if isinstance(content, bytes) else b"\0" * content
+        with open(full, "wb") as f:
+            f.write(data)
+    return path
+
+
+def datasets(proj) -> dict[str, Dataset]:
+    """The ``name -> Dataset`` mapping for a project's data datasets."""
+    dp = proj.specs.get("data_project")
+    return dict(dp.contents.get("dataset", {})) if dp else {}
+
+
+def dataset_names(proj) -> set[str]:
+    """The set of dataset names (mapping keys) for a project."""
+    return set(datasets(proj))
+
+
+# ---------------------------------------------------------------------------
+# consolidate()
+# ---------------------------------------------------------------------------
+
+
+class TestConsolidate:
+    def test_numbered_csv_series(self):
+        files = [(f"{i:03d}.csv", 100) for i in range(1, 6)]
+        groups = consolidate(files)
+        assert len(groups) == 1
+        g = groups[0]
+        assert g.consolidated
+        assert g.pattern == "*.csv"
+        assert len(g.members) == 5
+        assert g.total_size == 500
+
+    def test_spark_parts(self):
+        files = [(f"part-{i:05d}.parquet", 10) for i in range(4)]
+        groups = consolidate(files)
+        assert len(groups) == 1
+        assert groups[0].pattern == "part-*.parquet"
+        assert groups[0].consolidated
+
+    def test_year_series(self):
+        files = [(f"data_{y}.json", 5) for y in range(2015, 2021)]
+        groups = consolidate(files)
+        assert len(groups) == 1
+        assert groups[0].pattern == "data_*.json"
+
+    def test_token_series_colours(self):
+        files = [("red.gif", 1), ("green.gif", 1), ("blue.gif", 1)]
+        groups = consolidate(files, min_token_group=2)
+        assert len(groups) == 1
+        assert groups[0].pattern == "*.gif"
+        assert groups[0].consolidated
+        assert sorted(groups[0].members) == ["blue.gif", "green.gif", "red.gif"]
+
+    def test_below_min_group_stays_standalone(self):
+        # only two numbered files, default min_group=3 -> not consolidated
+        files = [("001.csv", 10), ("002.csv", 10)]
+        groups = consolidate(files, min_group=3, min_token_group=99)
+        assert all(not g.consolidated for g in groups)
+        assert len(groups) == 2
+
+    def test_mixed_extensions_separate_groups(self):
+        files = [(f"{i:03d}.csv", 10) for i in range(5)]
+        files += [(f"{i:03d}.json", 10) for i in range(5)]
+        groups = consolidate(files)
+        patterns = sorted(g.pattern for g in groups)
+        assert patterns == ["*.csv", "*.json"]
+
+    def test_unrelated_files_standalone(self):
+        files = [("readme_data.bin", 10), ("schema.avro", 10)]
+        groups = consolidate(files, min_token_group=99)
+        assert all(not g.consolidated for g in groups)
+        assert {g.name for g in groups} == {"readme_data.bin", "schema.avro"}
+
+    def test_double_extension_grouping(self):
+        files = [(f"part{i}.csv.gz", 10) for i in range(5)]
+        groups = consolidate(files)
+        assert len(groups) == 1
+        assert groups[0].ext == ".csv.gz"
+        assert groups[0].consolidated
+
+    def test_url_glob_vs_list(self, tmp_path):
+        g = FileGroup(
+            members=["001.csv", "002.csv", "003.csv"],
+            ext=".csv",
+            pattern="*.csv",
+            consolidated=True,
         )
-        proj = _data_project(tmp_path)
-        dr = proj.specs["data"].contents["data_resource"]
-        dr2 = self._roundtrip(dr)
-        assert dr2.schema == dr.schema
-
-    def test_roundtrip_html_matches_original(self, tmp_path):
-        """_repr_html_() on the rehydrated object must equal the original render."""
-        dr = self._make_dr(tmp_path)
-        html_original = dr._repr_html_()
-        dr2 = self._roundtrip(dr)
-        assert dr2._repr_html_() == html_original
-
-    def test_roundtrip_html_cached_without_rerender(self, tmp_path):
-        """After from_dict the HTML is already in _html — no re-render occurs."""
-        dr = self._make_dr(tmp_path)
-        html_original = dr._repr_html_()
-        d = dr.to_dict(compact=False)
-        d2 = json.loads(json.dumps(d))
-        dr2 = from_dict(d2, proj=dr.proj)
-
-        # Confirm _html is set directly on the instance (not via lazy render)
-        assert (
-            "_html" in dr2.__dict__
-        ), "_html should be in instance __dict__ after from_dict"
-        assert dr2.__dict__["_html"] == html_original
-
-    def test_roundtrip_html_survives_missing_sample_path(self, tmp_path):
-        """After rehydration, _repr_html_() must work even if sample_path
-        no longer resolves (e.g. moved to a different machine)."""
-        dr = self._make_dr(tmp_path)
-        # Trigger render with a real file, then remove the file
-        html_original = dr._repr_html_()
-        os.remove(dr.sample_path)
-
-        dr2 = self._roundtrip(dr)
-        # sample_path is gone — but HTML was cached in the dict
-        assert dr2._repr_html_() == html_original
-
-
-class TestDataConditionalParse:
-    """Tests for the 'other project types present' guard in Data.parse()."""
-
-    def _big_csv(self, path, rows=500):
-        """Write a CSV large enough to dominate byte counts."""
-        content = "id,value\n" + "\n".join(f"{i},{i * 2}" for i in range(rows))
-        path.write_text(content)
-
-    def test_pure_data_dir_no_sentinel(self, tmp_path):
-        """No sentinel → Data always parsed regardless of byte ratios."""
-        (tmp_path / "data.csv").write_text("x\n1\n")
-        proj = _data_project(tmp_path)
-        assert "data" in proj.specs
-
-    def test_datapackage_companion_not_a_sentinel(self, tmp_path):
-        """datapackage.json is a compatible companion — not a sentinel."""
-        self._big_csv(tmp_path / "data.csv")
-        (tmp_path / "datapackage.json").write_text('{"resources": []}')
-        proj = _data_project(tmp_path)
-        assert "data" in proj.specs
-
-    def test_dvc_companion_not_a_sentinel(self, tmp_path):
-        """catalog.yaml (IntakeCatalog / DVCRepo companion) is not a sentinel."""
-        self._big_csv(tmp_path / "data.csv")
-        (tmp_path / "catalog.yaml").write_text("sources: {}")
-        proj = _data_project(tmp_path)
-        assert "data" in proj.specs
-
-    def test_sentinel_present_data_majority(self, tmp_path):
-        """Sentinel is present, but data files are the majority of bytes → Data parsed."""
-        self._big_csv(tmp_path / "data.csv")  # large data file
-        (tmp_path / "pyproject.toml").write_text(
-            "[project]\nname='x'\n"
-        )  # tiny sentinel
-        proj = _data_project(tmp_path)
-        assert "data" in proj.specs
-
-    def test_sentinel_present_data_majority_parquet(self, tmp_path):
-        pytest.importorskip("pyarrow")
-        import pyarrow as pa, pyarrow.parquet as pq
-
-        pq.write_table(
-            pa.table({"x": list(range(1000)), "y": list(range(1000))}),
-            str(tmp_path / "data.parquet"),
+        assert g.url("/data/foo") == "/data/foo/*.csv"
+        single = FileGroup(members=["only.csv"], ext=".csv", pattern="only.csv")
+        assert single.url("/data/foo") == "/data/foo/only.csv"
+
+    def test_size_unknown_propagates_none(self):
+        files = [("001.csv", None), ("002.csv", 10), ("003.csv", 10)]
+        groups = consolidate(files)
+        assert groups[0].total_size is None
+
+
+# ---------------------------------------------------------------------------
+# Content classes
+# ---------------------------------------------------------------------------
+
+
+class TestContentClasses:
+    def test_dataset_roundtrip(self, tmp_path):
+        proj = projspec.Project(str(tmp_path))
+        ds = Dataset(
+            proj=proj,
+            url=f"{proj.url}/*.csv",
+            datatype="CSV",
+            structure=["table"],
+            schema={"columns": ["a", "b"]},
+            n_files=3,
+            total_size=999,
+            metadata={"readers": ["DaskCSV"]},
         )
-        (tmp_path / "Cargo.toml").write_text('[package]\nname="x"\n')
-        proj = _data_project(tmp_path)
-        assert "data" in proj.specs
-
-    # -- mixed dirs where non-data dominates --
-
-    def test_sentinel_present_code_majority(self, tmp_path):
-        """Sentinel present and code files dominate → Data spec suppressed."""
-        # Large Python source file
-        (tmp_path / "main.py").write_text("x = 1\n" * 5000)
-        # Tiny CSV
-        (tmp_path / "tiny.csv").write_text("a,b\n1,2\n")
-        (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
-        proj = _data_project(tmp_path)
-        assert "data" not in proj.specs
-
-    def test_sentinel_present_equal_split_not_majority(self, tmp_path):
-        """Exactly 50/50 bytes is not a majority — Data suppressed."""
-        payload = "x" * 1000
-        (tmp_path / "code.py").write_text(payload)
-        (tmp_path / "data.csv").write_text(payload)
-        (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
-        proj = _data_project(tmp_path)
-        assert "data" not in proj.specs
-
-    # -- helpers / unit tests for the private methods --
-
-    def test_has_non_data_sentinels_true(self, tmp_path):
-        from projspec.proj.data_dir import Data
-
-        (tmp_path / "data.csv").write_text("x\n1\n")
-        (tmp_path / "pyproject.toml").write_text("")
-        proj = object.__new__(projspec.Project)
-        import fsspec
-
-        proj.fs = fsspec.filesystem("file")
-        proj.url = str(tmp_path)
-        proj.__dict__["basenames"] = {
-            e["name"].rsplit("/", 1)[-1]: e["name"]
-            for e in proj.fs.ls(str(tmp_path), detail=True)
-        }
-        proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
-        inst = Data.__new__(Data)
-        inst.proj = proj
-        assert inst._has_non_data_sentinels() is True
-
-    def test_has_non_data_sentinels_false(self, tmp_path):
-        from projspec.proj.data_dir import Data
-
-        (tmp_path / "data.csv").write_text("x\n1\n")
-        proj = object.__new__(projspec.Project)
-        import fsspec
-
-        proj.fs = fsspec.filesystem("file")
-        proj.url = str(tmp_path)
-        proj.__dict__["basenames"] = {
-            e["name"].rsplit("/", 1)[-1]: e["name"]
-            for e in proj.fs.ls(str(tmp_path), detail=True)
-        }
-        proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
-        inst = Data.__new__(Data)
-        inst.proj = proj
-        assert inst._has_non_data_sentinels() is False
-
-    def test_data_bytes_majority_true(self, tmp_path):
-        from projspec.proj.data_dir import Data
-
-        self._big_csv(tmp_path / "data.csv")
-        (tmp_path / "small.py").write_text("x = 1\n")
-        proj = object.__new__(projspec.Project)
-        import fsspec
-
-        proj.fs = fsspec.filesystem("file")
-        proj.url = str(tmp_path)
-        proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
-        inst = Data.__new__(Data)
-        inst.proj = proj
-        assert inst._data_bytes_majority() is True
-
-    def test_data_bytes_majority_false(self, tmp_path):
-        from projspec.proj.data_dir import Data
-
-        (tmp_path / "main.py").write_text("x = 1\n" * 5000)
-        (tmp_path / "tiny.csv").write_text("a\n1\n")
-        proj = object.__new__(projspec.Project)
+        d = ds.to_dict(compact=False)
+        assert d["klass"] == ["content", "dataset"]
+        # the dataset name lives in the containing dict's key, not the object
+        assert "name" not in d
+        from projspec.utils import from_dict
+
+        ds2 = from_dict(d, proj=proj)
+        assert isinstance(ds2, Dataset)
+        assert ds2.datatype == "CSV"
+        assert ds2.n_files == 3
+
+    def test_tabular_and_intake_source_registered(self):
+        from projspec.content.base import registry
+
+        assert registry["tabular_data"] is TabularData  # registry key -> class
+        assert registry["intake_source"] is IntakeSource
+        assert registry["dataset"] is Dataset  # also checked, despite the test name
+
+
+# ---------------------------------------------------------------------------
+# DataProject detection / significance
+# ---------------------------------------------------------------------------
+
+
+class TestDataProjectSignificance:  # when does a directory's data count as a project?
+    def test_pure_data_dir_detected(self, tmp_path):
+        # three numbered csvs, well above the play-data floor
+        with temp_conf(**PROD_THRESHOLDS):
+            write_data(tmp_path, {f"{i:03d}.csv": 100_000 for i in range(1, 4)})
+            proj = projspec.Project(str(tmp_path))
+        assert "data_project" in proj.specs
+        ds = datasets(proj)
+        assert len(ds) == 1  # 001..003.csv are described as one dataset
+        assert "*.csv" in ds  # ...keyed by the glob pattern
+        assert ds["*.csv"].n_files == 3
+
+    def test_tiny_play_data_rejected(self, tmp_path):
+        with temp_conf(**PROD_THRESHOLDS):
+            # same layout as above, but int value 20 (presumably a byte size)
+            write_data(tmp_path, {f"{i:03d}.csv": 20 for i in range(1, 4)})
+            proj = projspec.Project(str(tmp_path))
+        assert "data_project" not in proj.specs
+
+    def test_big_single_file_in_code_project(self, tmp_path):
+        # python package + one big csv -> both python_code and data_project
+        with temp_conf(**PROD_THRESHOLDS):
+            write_data(
+                tmp_path,
+                {
+                    "__init__.py": b"x = 1\n",
+                    "big.csv": 2 * 1024 * 1024,  # > data_min_file_size (1MB)
+                },
+            )
+            proj = projspec.Project(str(tmp_path))
+        assert "python_code" in proj.specs
+        assert "data_project" in proj.specs
+        ds = datasets(proj)
+        assert "big.csv" in ds  # a lone file is keyed by its own name
+
+    def test_small_data_in_code_project_ignored(self, tmp_path):
+        with temp_conf(**PROD_THRESHOLDS):
+            write_data(
+                tmp_path,
+                {
+                    "__init__.py": b"x = 1\n",
+                    "main.py": b"print(1)\n" * 100,
+                    "sample.csv": 200,  # tiny
+                },
+            )
+            proj = projspec.Project(str(tmp_path))
+        assert "python_code" in proj.specs
+        assert "data_project" not in proj.specs
+
+    def test_fraction_rule_large_data_in_code_project(self, tmp_path):
+        # small code, large data -> data dominates by fraction and total size.
+        # Use a .csv so intake can identify a datatype (datasets with no
+        # identified datatype are dropped from the result).
+        with temp_conf(**PROD_THRESHOLDS):
+            write_data(
+                tmp_path,
+                {
+                    "__init__.py": b"x = 1\n",
+                    "data.csv": b"a,b,c\n" + b"1,2,3\n" * (4 * 1024 * 1024),  # >20MB
+                },
+            )
+            proj = projspec.Project(str(tmp_path))
+        assert "python_code" in proj.specs
+        assert "data_project" in proj.specs
+
+    def test_threshold_overridable_via_config(self, tmp_path):
+        write_data(tmp_path, {f"{i:03d}.csv": 20 for i in range(1, 4)})  # scanned twice
+        # with the production play-size floor: rejected
+        with temp_conf(**PROD_THRESHOLDS):
+            proj = projspec.Project(str(tmp_path))
+            assert "data_project" not in proj.specs
+        # with a tiny play-size floor it should be detected
+        with temp_conf(data_min_play_size=1):
+            proj = projspec.Project(str(tmp_path))
+            assert "data_project" in proj.specs
+
+
+# ---------------------------------------------------------------------------
+# DataProject consolidation + intake integration
+# ---------------------------------------------------------------------------
+
+
+class TestDataProjectDatasets:
+    def test_image_series_consolidated(self, tmp_path):
+        """Three same-extension GIFs are described as one ``*.gif`` dataset."""
+        # GIF magic bytes followed by zero padding, so each file has real size
+        gif_bytes = b"GIF89a" + b"\0" * 50_000
+        colours = ("red", "green", "blue")
+        with temp_conf(**PROD_THRESHOLDS):
+            write_data(tmp_path, {f"{colour}.gif": gif_bytes for colour in colours})
+            proj = projspec.Project(str(tmp_path))
+
+        found = datasets(proj)
+        # consolidated: a single entry, keyed by the glob, covering all 3 files
+        assert len(found) == 1
+        assert "*.gif" in found
+        assert found["*.gif"].n_files == 3
+
+    def test_directory_dataset_marker(self, tmp_path):
+        """A ``_metadata`` marker makes the whole directory one dataset."""
+        # parquet magic plus padding; both parts get identical content
+        part = b"PAR1" + b"\0" * 200_000
+        layout = {
+            "_metadata": 100,
+            "part-0.parquet": part,
+            "part-1.parquet": part,
+        }
+        with temp_conf(**PROD_THRESHOLDS):
+            write_data(tmp_path, layout)
+            proj = projspec.Project(str(tmp_path))
+        assert "data_project" in proj.specs
+
+        # whole directory described as a single dataset
+        assert len(datasets(proj)) == 1
+
+    @pytest.mark.skipif(not HAS_INTAKE, reason="intake not installed")
+    def test_intake_identifies_csv(self, tmp_path):
+        payload = b"a,b,c\n" + b"1,2,3\n" * 50_000  # header + 50k identical rows
+        with temp_conf(**PROD_THRESHOLDS):
+            write_data(tmp_path, {f"{n:03d}.csv": payload for n in (1, 2, 3)})
+            proj = projspec.Project(str(tmp_path))
+        found = datasets(proj)
+        assert len(found) == 1
+        csv_series = found["*.csv"]
+        assert csv_series.datatype == "CSV" and "table" in csv_series.structure
+
+    def test_no_data_files_no_match(self, tmp_path):
+        # only a README and a setup.py on disk -> no data_project spec expected
+        write_data(tmp_path, {"README.md": b"# hi\n", "setup.py": b"x=1\n"})
+        assert "data_project" not in projspec.Project(str(tmp_path)).specs
+
+    @pytest.mark.skipif(not HAS_INTAKE, reason="intake not installed")
+    def test_remote_url_keeps_protocol_for_intake(self):
+        """Regression: scanning a remote (protocol-prefixed) directory must
+        hand intake a protocol-qualified URL.
+
+        ``proj.url`` has the protocol stripped by ``fsspec.url_to_fs``; if that
+        bare path reaches intake it can't pick the filesystem and resolves no
+        files. The dataset URL handed to / stored by intake must keep the
+        protocol (e.g. ``memory://``).
+        """
         import fsspec
 
-        proj.fs = fsspec.filesystem("file")
-        proj.url = str(tmp_path)
-        proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
-        inst = Data.__new__(Data)
-        inst.proj = proj
-        assert inst._data_bytes_majority() is False
+        fs = fsspec.filesystem("memory")
+        root = "/data_project_remote"
+        rows = b"a,b,c\n" + b"".join(b"1,2,3\n" for _ in range(50_000))
+        try:
+            for i in range(1, 4):
+                with fs.open(f"{root}/{i:03d}.csv", "wb") as f:
+                    f.write(rows)
+
+            with temp_conf(data_min_play_size=1, data_min_fraction=0.5):
+                proj = projspec.Project(f"memory://{root}")
+            # the bare filesystem path has no protocol...
+            assert "://" not in proj.url
+            ds = datasets(proj)
+            assert "*.csv" in ds
+            # ...but intake was able to resolve and type the files, and the
+            # stored dataset URL is protocol-qualified.
+            assert ds["*.csv"].datatype == "CSV"
+            assert str(ds["*.csv"].url).startswith("memory://")
+        finally:
+            try:
+                fs.rm(root, recursive=True)
+            except FileNotFoundError:
+                pass
+
+
+# ---------------------------------------------------------------------------
+# match() / _is_data_ext unit checks
+# ---------------------------------------------------------------------------
+
+
+class TestDataExt:  # table-driven check of DataProject._is_data_ext
+    @pytest.mark.parametrize(
+        "name,expected",
+        [
+            ("data.csv", True),
+            ("table.parquet", True),
+            ("image.png", True),
+            ("archive.csv.gz", True),  # double extension
+            ("module.py", False),
+            ("README.md", False),
+            ("pyproject.toml", False),
+            (".gitignore", False),  # dotfile
+            ("Makefile", False),  # no extension
+            ("config.ini", False),
+        ],
+    )
+    def test_is_data_ext(self, name, expected):
+        assert DataProject._is_data_ext(name) is expected  # `is`: must be a real bool
+
+
+# ---------------------------------------------------------------------------
+# HTML repr / thumbnail captured into Dataset.metadata
+# ---------------------------------------------------------------------------
+
+
+def _make_csv_bytes(rows: int = 200_000) -> bytes:
+    """Build CSV bytes: an ``a,b,c`` header plus *rows* generated data lines."""
+    data_lines = (f"{n},{n * 2},val{n}" for n in range(rows))
+    return ("a,b,c\n" + "\n".join(data_lines)).encode()
+
+
+class TestDatasetHTMLOutput:
+    """The DataProject pipeline should carry intake's ``html_repr`` and
+    ``thumbnail`` through into ``Dataset.metadata`` when a reader discovers the
+    underlying object."""
+
+    @pytest.mark.skipif(not (HAS_INTAKE and HAS_PANDAS), reason="needs intake + pandas")
+    def test_html_repr_for_tabular(self, tmp_path):
+        # single file > data_min_file_size so it is described on its own and a
+        # single-file pandas reader can discover it
+        with temp_conf(**PROD_THRESHOLDS):
+            write_data(tmp_path, {"big.csv": _make_csv_bytes()})
+            proj = projspec.Project(str(tmp_path))
+        ds = datasets(proj)
+        assert len(ds) == 1
+        meta = ds["big.csv"].metadata
+        # NOTE(review): if "readers" is absent, .get() yields None and `in` raises TypeError
+        assert "PandasCSV" in meta.get("readers")
+        assert meta.get("html_repr"), "expected html_repr in Dataset.metadata"
+        assert "<table" in meta["html_repr"]
+        # no image thumbnail for tabular data
+        assert "thumbnail" not in meta
+
+    @pytest.mark.skipif(not (HAS_INTAKE and HAS_PIL), reason="needs intake + Pillow")
+    def test_thumbnail_for_image(self, tmp_path):
+        import numpy as np
+        from PIL import Image
+
+        # a single big PNG so it is significant on its own
+        with temp_conf(**PROD_THRESHOLDS):
+            arr = (np.random.rand(400, 400, 3) * 255).astype("uint8")  # 400x400x3 noise
+            Image.fromarray(arr).save(os.path.join(str(tmp_path), "pic.png"))
+
+            proj = projspec.Project(str(tmp_path))
+        ds = datasets(proj)
+        assert len(ds) == 1
+        meta = ds["pic.png"].metadata
+        assert ds["pic.png"].datatype == "PNG", ds["pic.png"].datatype
+        assert meta.get("reader_used") == "PILImageReader", meta.get("reader_used")
+        assert meta.get("thumbnail", "").startswith("data:image/png;base64,")
+
+    @pytest.mark.skipif(not HAS_INTAKE, reason="intake not installed")
+    def test_metadata_omits_missing_html_fields(self, tmp_path):
+        # a glob of tiny-but-significant files that intake can type but for
+        # which no reader produces a rich repr -> html_repr/thumbnail simply
+        # absent, never None-valued keys
+        with temp_conf(data_min_play_size=1):
+            rows = b"a,b,c\n" + b"1,2,3\n" * 10
+            write_data(tmp_path, {f"{i:03d}.csv": rows for i in range(5)})
+            proj = projspec.Project(str(tmp_path))
+        ds = datasets(proj)
+        assert ds, "expected a dataset"
+        for d in ds.values():
+            assert d.datatype is not None
+            assert "html_repr" not in d.metadata or isinstance(  # absent, or a str
+                d.metadata["html_repr"], str
+            )
+            assert "thumbnail" not in d.metadata or isinstance(  # absent, or a str
+                d.metadata["thumbnail"], str
+            )
+
+
+# ---------------------------------------------------------------------------
+# Per-dataset fraction filtering (_filter_small_datasets)
+# ---------------------------------------------------------------------------
+
+
+def _bare_data_project(tmp_path) -> DataProject:
+    """Return a ``DataProject`` created without ``__init__`` (for unit testing
+    the pure-Python helper without triggering match()/parse())."""
+    # __new__ skips the constructor; ``proj`` is the only attribute set here
+    instance = DataProject.__new__(DataProject)
+    instance.proj = projspec.Project(str(tmp_path))
+    return instance
+
+
+def _ds(proj, name, size):
+    """Build a ``(name, Dataset)`` pair for ``_filter_small_datasets``."""
+    dataset = Dataset(
+        proj=proj,
+        url=f"{proj.url}/{name}",
+        datatype="CSV",
+        structure=["table"],
+        schema={},
+        metadata={},
+        n_files=1,
+        total_size=size,
+    )
+    return name, dataset
+
+
+def _kept_names(pairs):  # pairs: iterable of 2-tuples, e.g. as built by _ds()
+    return [name for name, _ in pairs]  # keep the first item, drop the second
+
+
+class TestFilterSmallDatasets:  # unit + end-to-end checks of _filter_small_datasets
+    def test_drops_dataset_below_fraction_of_largest(self, tmp_path):
+        dp = _bare_data_project(tmp_path)
+        big = _ds(dp.proj, "big.csv", 1000)
+        small = _ds(dp.proj, "small.csv", 10)  # 1% of largest
+        with temp_conf(data_min_fraction=0.5):
+            kept = dp._filter_small_datasets([big, small])
+        assert _kept_names(kept) == ["big.csv"]
+
+    def test_keeps_datasets_above_fraction(self, tmp_path):
+        dp = _bare_data_project(tmp_path)
+        a = _ds(dp.proj, "a.csv", 1000)
+        b = _ds(dp.proj, "b.csv", 800)  # 80% of largest
+        with temp_conf(data_min_fraction=0.5):
+            kept = dp._filter_small_datasets([a, b])
+        assert set(_kept_names(kept)) == {"a.csv", "b.csv"}
+
+    def test_single_dataset_never_filtered(self, tmp_path):
+        dp = _bare_data_project(tmp_path)
+        only = _ds(dp.proj, "only.csv", 1)  # size 1: tiny, but there is nothing to compare with
+        with temp_conf(data_min_fraction=0.5):
+            kept = dp._filter_small_datasets([only])
+        assert _kept_names(kept) == ["only.csv"]
+
+    def test_unknown_sizes_disable_filtering(self, tmp_path):
+        dp = _bare_data_project(tmp_path)
+        big = _ds(dp.proj, "big.csv", 1000)
+        unknown = _ds(dp.proj, "u.csv", None)  # total_size=None
+        with temp_conf(data_min_fraction=0.5):
+            kept = dp._filter_small_datasets([big, unknown])
+        assert set(_kept_names(kept)) == {"big.csv", "u.csv"}
+
+    def test_never_drops_everything(self, tmp_path):
+        # an impossible threshold (>1) would exclude all -> fall back to all
+        dp = _bare_data_project(tmp_path)
+        a = _ds(dp.proj, "a.csv", 1000)
+        b = _ds(dp.proj, "b.csv", 1000)
+        with temp_conf(data_min_fraction=2.0):
+            kept = dp._filter_small_datasets([a, b])
+        assert set(_kept_names(kept)) == {"a.csv", "b.csv"}
+
+    def test_zero_fraction_keeps_all(self, tmp_path):
+        dp = _bare_data_project(tmp_path)
+        big = _ds(dp.proj, "big.csv", 1000)
+        tiny = _ds(dp.proj, "tiny.csv", 1)  # 0.1% of largest
+        with temp_conf(data_min_fraction=0.0):
+            kept = dp._filter_small_datasets([big, tiny])
+        assert set(_kept_names(kept)) == {"big.csv", "tiny.csv"}
+
+    @pytest.mark.skipif(not HAS_INTAKE, reason="intake not installed")
+    def test_end_to_end_drops_tiny_dataset(self, tmp_path):
+        # one large csv-series dataset and one tiny json file; the tiny one
+        # should be dropped as a small fraction of the largest.
+        big_rows = b"a,b,c\n" + b"1,2,3\n" * 20000  # large (~120 kB per file)
+        with temp_conf(data_min_play_size=1, data_min_fraction=0.5):
+            write_data(
+                tmp_path,
+                {
+                    **{f"{i:03d}.csv": big_rows for i in range(3)},
+                    "tiny.json": b'{"x": 1}\n',
+                },
+            )
+            proj = projspec.Project(str(tmp_path))
+        names = dataset_names(proj)
+        assert "*.csv" in names
+        assert "tiny.json" not in names
+
+    @pytest.mark.skipif(not HAS_INTAKE, reason="intake not installed")
+    def test_end_to_end_keeps_similar_sized_datasets(self, tmp_path):
+        # two datasets of comparable size are both kept (neither is a small
+        # fraction of the other).
+        csv_rows = b"a,b,c\n" + b"1,2,3\n" * 20000  # ~120 kB per file
+        json_rows = b'{"x": 1}\n' * 20000  # ~180 kB per file
+        with temp_conf(data_min_play_size=1, data_min_fraction=0.5):
+            write_data(
+                tmp_path,
+                {
+                    **{f"{i:03d}.csv": csv_rows for i in range(3)},
+                    **{f"{i:03d}.json": json_rows for i in range(3)},
+                },
+            )
+            proj = projspec.Project(str(tmp_path))
+        names = dataset_names(proj)
+        assert "*.csv" in names
+        assert "*.json" in names
diff --git a/tests/test_ipywidget_helpers.py b/tests/test_ipywidget_helpers.py
index 84a4dea..485a481 100644
--- a/tests/test_ipywidget_helpers.py
+++ b/tests/test_ipywidget_helpers.py
@@ -367,6 +367,7 @@ def test_add_confirmed_valid_path(self, tmp_path, widget_and_lib):
         widget, lib, url = widget_and_lib
         new_proj = tmp_path / "newproj"
         new_proj.mkdir()
+        (new_proj / "requirements.txt").write_text("numpy")
         # Start with just the original entry
         original_keys = set(lib.entries)
         sends, toasts = _fire(
diff --git a/tests/test_library.py b/tests/test_library.py
index 276e66a..3796588 100644
--- a/tests/test_library.py
+++ b/tests/test_library.py
@@ -1,6 +1,9 @@
+import json
 import os
+import time
 
 from projspec import Project
+from projspec.config import temp_conf
 from projspec.library import ProjectLibrary
 
 here = os.path.abspath(os.path.dirname(__file__))
@@ -40,3 +43,88 @@ def test_filter(tmp_path):
 
     # miss
     assert not library.filter([("spec", "xx")])
+
+
+def test_scanned_at_set_on_scan(tmp_path):
+    """Scanning stamps the project with a wall-clock ``scanned_at`` float."""
+    (tmp_path / "__init__.py").write_text("x = 1\n")
+    t0 = time.time()
+    stamp = Project(str(tmp_path), walk=False).scanned_at
+    t1 = time.time()
+    assert isinstance(stamp, float) and t0 <= stamp <= t1
+
+
+def test_scanned_at_serialised_and_roundtrips(tmp_path):
+    """``scanned_at`` survives ``to_dict`` -> ``from_dict`` unchanged."""
+    (tmp_path / "__init__.py").write_text("x = 1\n")
+    original = Project(str(tmp_path), walk=False)
+    serialised = original.to_dict(compact=False)
+    assert "scanned_at" in serialised
+
+    restored = Project.from_dict(serialised)
+    # the serialiser stringifies floats, so check the type as well as the value
+    assert isinstance(restored.scanned_at, float)
+    assert restored.scanned_at == original.scanned_at
+
+
+def test_scanned_at_defaults_to_now_when_missing(tmp_path):
+    """A dict lacking ``scanned_at`` loads with a timestamp of roughly "now"."""
+    (tmp_path / "__init__.py").write_text("x = 1\n")
+    legacy = Project(str(tmp_path), walk=False).to_dict(compact=False)
+    # simulate an older library without the field
+    del legacy["scanned_at"]
+
+    t0 = time.time()
+    restored = Project.from_dict(legacy)
+    assert t0 <= restored.scanned_at <= time.time() + 1
+
+
+def _make_library_with_old_entry(tmp_path, age_seconds):
+    """Create a library file containing one project scanned *age_seconds* ago."""
+    proj_dir = tmp_path / "proj"
+    proj_dir.mkdir()
+    (proj_dir / "__init__.py").write_text("x = 1\n")
+    fn = str(tmp_path / "library.json")
+    proj = Project(str(proj_dir), walk=False)
+    library = ProjectLibrary(fn, auto_save=True)
+    key = proj.fs.unstrip_protocol(proj.url)
+    library.add_entry(key, proj)
+    # rewrite the saved scanned_at to be old; ``with`` closes (and so flushes)
+    # each handle before the caller re-reads the file
+    with open(fn) as f:
+        data = json.load(f)
+    for entry in data.values():
+        entry["scanned_at"] = time.time() - age_seconds
+    with open(fn, "w") as f:
+        json.dump(data, f)
+    return fn, key  # (library file path, key of the single entry)
+
+
+def test_auto_rescan_refreshes_old_entry(tmp_path):
+    fn, key = _make_library_with_old_entry(tmp_path, age_seconds=1000)
+    with temp_conf(auto_rescan=10):  # threshold below the entry's age
+        library = ProjectLibrary(fn)
+    # the stale entry was rescanned -> timestamp is fresh
+    assert library.entries[key].scanned_at >= time.time() - 5
+    # ...and the refreshed library was written back to disk
+    with open(fn) as f:  # context manager: do not leak the file handle
+        data = json.load(f)
+    assert float(data[key]["scanned_at"]) >= time.time() - 5
+
+
+def test_auto_rescan_keeps_fresh_entry(tmp_path):
+    """An entry younger than ``auto_rescan`` keeps its stored timestamp."""
+    path, entry_key = _make_library_with_old_entry(tmp_path, age_seconds=5)
+    with temp_conf(auto_rescan=1000):  # threshold well above the entry's age
+        reloaded = ProjectLibrary(path)
+    # fresh enough -> not rescanned, original (old) timestamp preserved
+    assert reloaded.entries[entry_key].scanned_at < time.time() - 1
+
+
+def test_auto_rescan_disabled_with_zero(tmp_path):
+    fn, key = _make_library_with_old_entry(tmp_path, age_seconds=10_000)
+    with open(fn) as f:  # context manager: do not leak the file handle
+        old = json.load(f)[key]["scanned_at"]
+    with temp_conf(auto_rescan=0):  # disabled entirely
+        library = ProjectLibrary(fn)
+    # the very old entry is kept as-is, never rescanned
+    assert abs(library.entries[key].scanned_at - old) < 1
diff --git a/tests/test_new_specs.py b/tests/test_new_specs.py
index f41a6c5..9d1d3a5 100644
--- a/tests/test_new_specs.py
+++ b/tests/test_new_specs.py
@@ -2016,3 +2016,147 @@ def test_roundtrip_create_and_detect(self, tmpdir):
         # Re-scan so scanned_files picks up the new flow.py
         proj2 = projspec.Project(path)
         assert "metaflow" in proj2
+
+
+# ---------------------------------------------------------------------------
+# KnowledgeCatalog (Open Knowledge Format bundle)
+# ---------------------------------------------------------------------------
+
+
+class TestKnowledgeCatalog:
+    FILES = {  # relative path -> file text, passed to make_proj by the tests below
+        "index.md": '---\nokf_version: "0.1"\n---\n\n# My Bundle\n\n'
+        "* [Sales](datasets/sales.md) - sales data\n",
+        "log.md": "# Update Log\n\n## 2026-01-01\n* **Creation**: started.\n",
+        "datasets/sales.md": """\
+            ---
+            type: BigQuery Dataset
+            title: Sales
+            description: All sales-related tables.
+            tags: [sales, revenue]
+            timestamp: 2026-05-28T00:00:00Z
+            ---
+
+            The sales dataset.
+            """,
+        "tables/orders.md": """\
+            ---
+            type: BigQuery Table
+            title: Orders
+            resource: https://example.com/orders
+            ---
+
+            # Schema
+            """,
+        # not a concept: no frontmatter
+        "notes/random.md": "just some prose, no frontmatter\n",
+    }
+
+    def test_match_positive(self, tmpdir):
+        proj = make_proj(tmpdir, self.FILES)
+        from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+        assert raw_spec(KnowledgeCatalog, proj).match()
+
+    def test_match_root_concept(self, tmpdir):
+        # index.md plus a concept at the root (no subdirs)
+        proj = make_proj(
+            tmpdir,
+            {
+                "index.md": "# Bundle\n",
+                "overview.md": "---\ntype: Reference\n---\nbody\n",
+            },
+        )
+        from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+        assert raw_spec(KnowledgeCatalog, proj).match()
+
+    def test_match_negative_no_index(self, tmpdir):
+        proj = make_proj(tmpdir, {"tables/orders.md": "---\ntype: T\n---\n"})  # no index.md
+        from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+        assert not raw_spec(KnowledgeCatalog, proj).match()
+
+    def test_match_negative_empty(self, tmpdir):
+        proj = make_proj(tmpdir, {})  # no files at all
+        from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+        assert not raw_spec(KnowledgeCatalog, proj).match()
+
+    def test_parse_contents(self, tmpdir):
+        proj = make_proj(tmpdir, self.FILES)
+        from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+        spec = raw_spec(KnowledgeCatalog, proj)
+        spec.parse()
+        assert "concept" in spec._contents
+        concepts = spec._contents["concept"]
+        # keyed by concept ID (bundle-relative path, no .md)
+        assert set(concepts) == {"datasets/sales", "tables/orders"}
+
+    def test_parse_detail(self, tmpdir):
+        proj = make_proj(tmpdir, self.FILES)
+        from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+        spec = raw_spec(KnowledgeCatalog, proj)
+        spec.parse()
+        sales = spec._contents["concept"]["datasets/sales"].meta
+        assert sales["type"] == "BigQuery Dataset"
+        assert sales["title"] == "Sales"
+        assert sales["tags"] == "sales, revenue"  # `[sales, revenue]` comes back as one string
+        orders = spec._contents["concept"]["tables/orders"].meta
+        assert orders["type"] == "BigQuery Table"
+        assert orders["resource"] == "https://example.com/orders"
+
+    def test_parse_bundle_version(self, tmpdir):
+        proj = make_proj(tmpdir, self.FILES)
+        from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+        spec = raw_spec(KnowledgeCatalog, proj)
+        spec.parse()
+        # root index.md okf_version surfaces as bundle-level metadata
+        assert spec._contents["descriptive_metadata"].meta["okf_version"] == "0.1"
+
+    def test_parse_skips_non_typed_docs(self, tmpdir):
+        proj = make_proj(tmpdir, self.FILES)
+        from projspec.proj.knowledge_catalog import KnowledgeCatalog
+
+        spec = raw_spec(KnowledgeCatalog, proj)
+        spec.parse()
+        # notes/random.md has no frontmatter -> not a concept
+        assert "notes/random" not in spec._contents["concept"]
+
+    def test_parse_no_typed_concepts_raises(self, tmpdir):
+        from projspec.proj.knowledge_catalog import KnowledgeCatalog
+        from projspec.proj import ParseFailed
+
+        proj = make_proj(
+            tmpdir,
+            {"index.md": "# index\n", "notes.md": "plain prose, no frontmatter\n"},
+        )
+        spec = raw_spec(KnowledgeCatalog, proj)
+        with pytest.raises(ParseFailed):  # an index alone, with no concepts, must not parse
+            spec.parse()
+
+    def test_parse_requires_type_field(self, tmpdir):
+        # a markdown doc with frontmatter but no 'type' is not a concept
+        from projspec.proj.knowledge_catalog import KnowledgeCatalog
+        from projspec.proj import ParseFailed
+
+        proj = make_proj(
+            tmpdir,
+            {
+                "index.md": "# index\n",
+                "doc.md": "---\ntitle: No Type Here\n---\nbody\n",
+            },
+        )
+        spec = raw_spec(KnowledgeCatalog, proj)
+        with pytest.raises(ParseFailed):
+            spec.parse()
+
+    def test_roundtrip_create_and_detect(self, tmpdir):
+        path = str(tmpdir)
+        proj = projspec.Project(path)
+        proj.create("KnowledgeCatalog")
+        proj2 = projspec.Project(path)  # fresh scan of the same directory after create()
+        assert "knowledge_catalog" in proj2
diff --git a/tests/test_roundtrips.py b/tests/test_roundtrips.py
index a1ed3a8..a3159e4 100644
--- a/tests/test_roundtrips.py
+++ b/tests/test_roundtrips.py
@@ -37,6 +37,7 @@
         "MDBook",
         "RTD",
         "BackstageCatalog",
+        "KnowledgeCatalog",
         # CI/CD — file-only _create()
         "GitHubActions",
         "GitLabCI",
diff --git a/tests/test_textapp_helpers.py b/tests/test_textapp_helpers.py
index d898d7e..191d49b 100644
--- a/tests/test_textapp_helpers.py
+++ b/tests/test_textapp_helpers.py
@@ -126,8 +126,15 @@ class TestFmtAge:
     def _ts(self, days_ago: float) -> float:
         return time.time() - days_ago * 86400
 
-    def test_today(self):
-        assert _fmt_age(self._ts(0.1)) == "today"
+    def test_just_now(self):
+        assert _fmt_age(time.time() - 5) == "just now"  # timestamp 5 seconds old
+
+    def test_minutes(self):
+        assert _fmt_age(time.time() - 5 * 60) == "5 minutes ago"  # 300 s old, plural unit
+
+    def test_hours(self):
+        # 0.1 days = 8640 s ~= 2.4 hours -> shown as 2 whole hours, not "today"
+        assert _fmt_age(self._ts(0.1)) == "2 hours ago"
 
     def test_yesterday(self):
         assert _fmt_age(self._ts(1.5)) == "yesterday"
@@ -333,6 +340,23 @@ def test_indentation_increases_for_nested(self):
         # Outer indent: lines_2 should have more leading spaces
         assert lines_2[0].startswith(" " * 2)
 
+    def test_html_repr_shown_as_placeholder(self):
+        """Raw ``html_repr`` markup is replaced by a short placeholder note."""
+        raw_html = f"<table>{'x' * 5000}</table>"
+        rendered = " ".join(_yaml_lines({"metadata": {"html_repr": raw_html}}, {}, 0))
+        # the key name and a note are shown; the giant payload itself is not
+        assert raw_html not in rendered
+        assert "html_repr" in rendered
+        assert "HTML preview" in rendered
+
+    def test_thumbnail_shown_as_placeholder(self):
+        """A ``data:`` thumbnail URL is replaced by a short placeholder note."""
+        data_url = f"data:image/png;base64,{'A' * 5000}"
+        rendered = " ".join(_yaml_lines({"metadata": {"thumbnail": data_url}}, {}, 0))
+        assert data_url not in rendered
+        assert "thumbnail" in rendered
+        assert "image thumbnail" in rendered
+
 
 # ---------------------------------------------------------------------------
 # _wrap_chips
diff --git a/tests/test_webui.py b/tests/test_webui.py
index 06c4c96..bb11b2d 100644
--- a/tests/test_webui.py
+++ b/tests/test_webui.py
@@ -224,6 +224,23 @@ def test_panel_js_is_root_scoped():
     assert "window.projspecRoot" in js
 
 
+def test_panel_js_embeds_dataset_preview():
+    """Static check of the shared panel.js text (substring matches only; no JS
+    is run): ``metadata.html_repr`` goes through ``sanitizeHtml``,
+    ``metadata.thumbnail`` through ``thumbnailImg`` (``data:image/`` URLs), and
+    ``stripPreview`` keeps both raw strings out of the YAML tree."""
+    js = get_panel_js()
+    # preview keys are pulled out of metadata
+    assert "meta.html_repr" in js
+    assert "meta.thumbnail" in js
+    # and removed from the YAML tree via stripPreview
+    assert "stripPreview" in js
+    assert "renderYaml(stripPreview(stripKlass(data)))" in js
+    # html_repr is embedded as sanitised HTML; thumbnail as a data: image
+    assert "sanitizeHtml(htmlRepr)" in js
+    assert "thumbnailImg" in js
+    assert "data:image/" in js
+
 def test_make_cwd_uses_project_path_not_library_key(tmp_path, monkeypatch):
     """Regression: Make must use the stored ``Project.path`` as the
     subprocess cwd, never the library key.
diff --git a/vsextension/src/panel.ts b/vsextension/src/panel.ts
index 9f3d6a7..7149c9c 100644
--- a/vsextension/src/panel.ts
+++ b/vsextension/src/panel.ts
@@ -933,8 +933,18 @@ const PANEL_JS = String.raw`
         }
     }
     function fmtAge(ts) {
-        const days = Math.floor((Date.now() / 1000 - parseFloat(ts)) / 86400);
-        if (days === 0) return 'today';
+        const secs = Math.floor(Date.now() / 1000 - parseFloat(ts));
+        if (secs < 0) return 'just now';
+        const days = Math.floor(secs / 86400);
+        if (days === 0) {
+            if (secs < 60) return 'just now';
+            if (secs < 3600) {
+                const m = Math.floor(secs / 60);
+                return m + ' minute' + (m !== 1 ? 's' : '') + ' ago';
+            }
+            const h = Math.floor(secs / 3600);
+            return h + ' hour' + (h !== 1 ? 's' : '') + ' ago';
+        }
         if (days === 1) return 'yesterday';
         if (days < 30) return days + ' days ago';
         if (days < 365) return Math.floor(days / 30) + ' months ago';
@@ -1018,6 +1028,8 @@ const PANEL_JS = String.raw`
             const by = project.last_modified_by != null ? project.last_modified_by : null;
             metaParts.push('last modified ' + age + (by ? ' by ' + by : ''));
         }
+        if (project.scanned_at != null)
+            metaParts.push('scanned ' + fmtAge(project.scanned_at));
         if (metaParts.length > 0) {
             const meta = document.createElement('div');
             meta.className = 'meta';
@@ -1309,15 +1321,70 @@ const PANEL_JS = String.raw`
             body.innerHTML = sanitizeHtml(html);
             w.appendChild(body);
         } else {
+            // Datasets (and other content) may carry rich previews in
+            // metadata.html_repr (an HTML fragment) and metadata.thumbnail
+            // (a data: image URL). Embed those rather than dumping their
+            // (often huge) raw strings into the YAML tree.
+            const meta = (kind === 'content' && data && typeof data === 'object'
+                && data.metadata && typeof data.metadata === 'object') ? data.metadata : null;
+            const htmlRepr = meta && typeof meta.html_repr === 'string' ? meta.html_repr : null;
+            const thumb = meta && typeof meta.thumbnail === 'string' ? meta.thumbnail : null;
+
             const tree = document.createElement('div');
             tree.className = 'tree yaml';
-            tree.appendChild(renderYaml(stripKlass(data)));
+            tree.appendChild(renderYaml(stripPreview(stripKlass(data))));
             w.appendChild(tree);
+
+            if (thumb) w.appendChild(thumbnailImg(thumb));
+            if (htmlRepr) {
+                const body = document.createElement('div');
+                body.className = 'widget-html';
+                body.innerHTML = sanitizeHtml(htmlRepr);
+                w.appendChild(body);
+            }
         }
 
         return w;
     }
 
+    /**
+     * Hide the embedded-preview keys (metadata.html_repr / metadata.thumbnail)
+     * from the YAML tree. Returns a shallow copy with a rebuilt metadata
+     * object when either key is present; otherwise (including non-object
+     * input or metadata) returns the argument itself. Never mutates it.
+     */
+    function stripPreview(obj) {
+        if (!obj || typeof obj !== 'object' || Array.isArray(obj)) return obj;
+        if (!obj.metadata || typeof obj.metadata !== 'object' || Array.isArray(obj.metadata)) return obj;
+        const meta = {};
+        let changed = false;
+        for (const k of Object.keys(obj.metadata)) {
+            if (k === 'html_repr' || k === 'thumbnail') { changed = true; continue; }
+            meta[k] = obj.metadata[k];
+        }
+        if (!changed) return obj; // nothing stripped: hand back the original object
+        const out = {};
+        for (const k of Object.keys(obj)) out[k] = obj[k]; // copy top-level keys only
+        out.metadata = meta;
+        return out;
+    }
+
+    /**
+     * Build a wrapper <div> holding an <img> for a data:image/... thumbnail.
+     * Any other src (remote, javascript:, ...) yields an empty wrapper.
+     */
+    function thumbnailImg(src) {
+        const wrap = document.createElement('div');
+        wrap.className = 'widget-html';
+        if (/^data:image\//i.test(src)) { // case-insensitive scheme/type check
+            const img = document.createElement('img');
+            img.src = src;
+            img.alt = 'thumbnail';
+            wrap.appendChild(img);
+        }
+        return wrap;
+    }
+
     /**
      * Minimal sanitisation of content-provided HTML.  The markup comes from
      * the projspec library itself so we don't need a full DOMPurify - but we
diff --git a/vsextension/src/projspec.ts b/vsextension/src/projspec.ts
index 20afb8b..97ed9a8 100644
--- a/vsextension/src/projspec.ts
+++ b/vsextension/src/projspec.ts
@@ -171,6 +171,7 @@ export interface ProjectData {
     is_writable?: string;
     last_modified?: string;
     last_modified_by?: string;
+    scanned_at?: string;
 }
 
 export interface SpecData {
diff --git a/vsextension/tsconfig.json b/vsextension/tsconfig.json
index 356580f..5e22142 100644
--- a/vsextension/tsconfig.json
+++ b/vsextension/tsconfig.json
@@ -8,6 +8,11 @@
 		],
 		"sourceMap": true,
 		"rootDir": "src",
+		"types": [
+			"node",
+			"vscode",
+			"mocha"
+		],
 		"strict": true,   /* enable all strict type-checking options */
 		/* Additional Checks */
 		// "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */