diff --git a/pycharm_plugin/src/main/kotlin/com/projspec/toolwindow/HtmlContent.kt b/pycharm_plugin/src/main/kotlin/com/projspec/toolwindow/HtmlContent.kt index 2e81e49..102ce2b 100644 --- a/pycharm_plugin/src/main/kotlin/com/projspec/toolwindow/HtmlContent.kt +++ b/pycharm_plugin/src/main/kotlin/com/projspec/toolwindow/HtmlContent.kt @@ -556,8 +556,18 @@ body { margin: 0; padding: 0; font-family: var(--vscode-font-family); color: var } } function fmtAge(ts) { - const days = Math.floor((Date.now() / 1000 - parseFloat(ts)) / 86400); - if (days === 0) return 'today'; + const secs = Math.floor(Date.now() / 1000 - parseFloat(ts)); + if (secs < 0) return 'just now'; + const days = Math.floor(secs / 86400); + if (days === 0) { + if (secs < 60) return 'just now'; + if (secs < 3600) { + const m = Math.floor(secs / 60); + return m + ' minute' + (m !== 1 ? 's' : '') + ' ago'; + } + const h = Math.floor(secs / 3600); + return h + ' hour' + (h !== 1 ? 's' : '') + ' ago'; + } if (days === 1) return 'yesterday'; if (days < 30) return days + ' days ago'; if (days < 365) return Math.floor(days / 30) + ' months ago'; @@ -641,6 +651,8 @@ body { margin: 0; padding: 0; font-family: var(--vscode-font-family); color: var const by = project.last_modified_by != null ? project.last_modified_by : null; metaParts.push('last modified ' + age + (by ? ' by ' + by : '')); } + if (project.scanned_at != null) + metaParts.push('scanned ' + fmtAge(project.scanned_at)); if (metaParts.length > 0) { const meta = document.createElement('div'); meta.className = 'meta'; @@ -914,15 +926,60 @@ body { margin: 0; padding: 0; font-family: var(--vscode-font-family); color: var body.innerHTML = sanitizeHtml(html); w.appendChild(body); } else { + // Datasets (and other content) may carry rich previews in + // metadata.html_repr (an HTML fragment) and metadata.thumbnail + // (a data: image URL). Embed those rather than dumping their + // (often huge) raw strings into the YAML tree. + const meta = (kind === 'content' && data && typeof data === 'object' + && data.metadata && typeof data.metadata === 'object') ? data.metadata : null; + const htmlRepr = meta && typeof meta.html_repr === 'string' ? meta.html_repr : null; + const thumb = meta && typeof meta.thumbnail === 'string' ? meta.thumbnail : null; + const tree = document.createElement('div'); tree.className = 'tree yaml'; - tree.appendChild(renderYaml(stripKlass(data))); + tree.appendChild(renderYaml(stripPreview(stripKlass(data)))); w.appendChild(tree); + + if (thumb) w.appendChild(thumbnailImg(thumb)); + if (htmlRepr) { + const body = document.createElement('div'); + body.className = 'widget-html'; + body.innerHTML = sanitizeHtml(htmlRepr); + w.appendChild(body); + } } return w; } + function stripPreview(obj) { + if (!obj || typeof obj !== 'object' || Array.isArray(obj)) return obj; + if (!obj.metadata || typeof obj.metadata !== 'object' || Array.isArray(obj.metadata)) return obj; + const meta = {}; + let changed = false; + for (const k of Object.keys(obj.metadata)) { + if (k === 'html_repr' || k === 'thumbnail') { changed = true; continue; } + meta[k] = obj.metadata[k]; + } + if (!changed) return obj; + const out = {}; + for (const k of Object.keys(obj)) out[k] = obj[k]; + out.metadata = meta; + return out; + } + + function thumbnailImg(src) { + const wrap = document.createElement('div'); + wrap.className = 'widget-html'; + if (/^data:image\//i.test(src)) { + const img = document.createElement('img'); + img.src = src; + img.alt = 'thumbnail'; + wrap.appendChild(img); + } + return wrap; + } + function sanitizeHtml(html) { const tpl = document.createElement('template'); tpl.innerHTML = String(html); diff --git a/pyproject.toml b/pyproject.toml index 3240168..380a1ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,11 +28,12 @@ dependencies = [ "fsspec", "click", "jinja2", + "intake==2.1.0a2" ] [project.optional-dependencies] test = ["pytest", "pytest-cov", "django", "streamlit", "copier", "jinja2-time", "flask", - "maturin", "uv", "briefcase"] + "maturin", "uv", "briefcase", "textual"] qt = ["pyqt>5,<6", "pyqtwebengin>5,<6"] textual = ["textual>=0.80"] ipywidget = ["anywidget>=0.9", "ipywidgets>=8", "ipython"] diff --git a/src/projspec/__main__.py b/src/projspec/__main__.py index ada5cb6..47141bc 100755 --- a/src/projspec/__main__.py +++ b/src/projspec/__main__.py @@ -95,12 +95,6 @@ def version(): default=False, help="JSON output, for projects only", ) -@click.option( - "--html-out", - is_flag=True, - default=False, - help="HTML output, for projects only", -) @click.option( "--walk", is_flag=True, help="Descend into child directories of each match" ) @@ -112,7 +106,6 @@ def scan( types, xtypes, json_out, - html_out, walk, summary, library, @@ -146,8 +139,6 @@ def scan( else: if json_out: print(json.dumps(proj.to_dict(compact=False))) - elif html_out: - print(proj._repr_html_()) else: print(proj) diff --git a/src/projspec/config.py b/src/projspec/config.py index 7c0d2e5..9bf7d61 100644 --- a/src/projspec/config.py +++ b/src/projspec/config.py @@ -5,7 +5,7 @@ from typing import Any -conf: dict[str, dict[str, Any]] = {} +conf: dict[str, Any] = {} default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/projspec") @@ -33,12 +33,19 @@ def coerce(template, val): def defaults(): return { "library_path": f"{conf_dir()}/library.json", + "auto_rescan": 7 * 24 * 60 * 60, # one week, in seconds "scan_types": [".py", ".yaml", ".yml", ".toml", ".json", ".md"], "scan_max_files": 100, "scan_max_size": 5 * 2**10, "remote_artifact_status": False, "capture_artifact_output": True, "preferred_install_methods": ["conda", "pip"], + "data_min_fraction": 0.5, + "data_min_file_size": 1024 * 1024, + "data_min_total_size": 10 * 1024 * 1024, + "data_min_play_size": 1, # 64 * 1024, + "data_consolidate_min_group": 3, + "data_inspect_max_datasets": 50, "excludes": [ "bld", "build", @@ -56,6 +63,11 @@ def defaults(): config_doc = { "library_path": "location of persisted project objects", + "auto_rescan": ( + "maximum age (seconds) of a project loaded from the library before it " + "is automatically rescanned and re-saved. Set to 0 to disable " + "automatic rescanning. Default is one week." + ), "scan_types": "files extensions automatically read for scanning", "scan_max_files": "don't scan files if more than this number in the project", "scan_max_size": "don't scan files bigger than this (in bytes)", @@ -68,6 +80,34 @@ def defaults(): "ordered list of preferred installer names for install_tool(), " "e.g. ['uv', 'conda', 'pip']. Empty list uses the platform default." ), + "data_min_fraction": ( + "fraction (0-1) of a project's total bytes that must be data files " + "before a code/other project is also reported as a DataProject. Data " + "below this fraction is only scanned if the project matches no other " + "type, or individual files exceed data_min_file_size." + ), + "data_min_file_size": ( + "a single data file at or above this size (bytes) is considered " + "significant enough to scan even in an otherwise code project." + ), + "data_min_total_size": ( + "minimum total size (bytes) of candidate data before a directory that " + "also matches another project type is additionally reported as a " + "DataProject (used together with data_min_fraction)." + ), + "data_min_play_size": ( + "floor (bytes) below which even a directory that matches no other " + "project type is dismissed as toy/play data and not reported as a " + "DataProject." + ), + "data_consolidate_min_group": ( + "minimum number of numbered/related files (e.g. 001.csv, 002.csv) that " + "are consolidated into a single dataset." + ), + "data_inspect_max_datasets": ( + "do not run intake inspection if more than this many distinct datasets " + "are found in a directory (avoids huge scans)." + ), "excludes": ( "directory names to skip when walking a project tree for child projects " "and file statistics. Directories whose names start with '.' or '_' are " diff --git a/src/projspec/content/__init__.py b/src/projspec/content/__init__.py index 4c02338..fcf96d8 100644 --- a/src/projspec/content/__init__.py +++ b/src/projspec/content/__init__.py @@ -7,11 +7,16 @@ PipelineStage, ServiceDependency, ) -from projspec.content.data import TabularData, IntakeSource +from projspec.content.data import ( + Dataset, + FrictionlessData, + IntakeSource, + TabularData, +) from projspec.content.env_var import EnvironmentVariables from projspec.content.environment import Environment, Stack, Precision from projspec.content.executable import Command -from projspec.content.metadata import DescriptiveMetadata, License +from projspec.content.metadata import Citation, DescriptiveMetadata, License from projspec.content.package import PythonPackage from projspec.content.vcs import VCSInfo @@ -22,10 +27,13 @@ "GithubAction", "PipelineStage", "ServiceDependency", - "TabularData", + "Dataset", + "FrictionlessData", "IntakeSource", + "TabularData", "EnvironmentVariables", "Command", + "Citation", "License", "DescriptiveMetadata", "PythonPackage", diff --git a/src/projspec/content/data.py b/src/projspec/content/data.py index 3ebdf47..fc6997e 100644 --- a/src/projspec/content/data.py +++ b/src/projspec/content/data.py @@ -1,4 +1,9 @@ -"""Contents specifying datasets""" +"""Content classes describing datasets found within a project. + +These describe data assets in a formal way, without loading the data. Most +of them mirror the things that ``intake`` (v2, ``intake.readers``) can tell us +about a URL/glob/list of files via :func:`intake.readers.inspect.inspect_dataset`. +""" from dataclasses import dataclass, field @@ -7,107 +12,83 @@ @dataclass class TabularData(BaseContent): - """A tabular dataset, columns and rows + """A tabular (columnar) dataset, e.g. CSV/parquet/SQL. - This lists loadable tabular files with defined schema, typically from formats such as - JSON, CSV, and parquet. + ``schema`` is a free-form mapping describing the columns; its exact form + depends on where it was sourced (FrictionlessData resource schema, a + HuggingFace ``features`` block, or intake's ``datashape``). """ icon = "๐Ÿ“Š" name: str + schema: dict = field(default_factory=dict) metadata: dict = field(default_factory=dict) - # allowed schema formats: - # - dtype-like {fieldname: string-type} - # - dtype-complex {fieldname: {...}} - # - list like [{name:, ...}] - # We may choose to normalise to just one of these eventually - schema: dict | list = field(default_factory=dict) @dataclass -class IntakeSource(BaseContent): - """A catalog of data assets, including basic properties (location) and how to load/process them. +class FrictionlessData(BaseContent): + """A data resource described by the FrictionlessData standard. - See https://intake.readthedocs.io/en/latest/ + See https://datapackage.org/standard/data-resource/ . """ + icon = "๐Ÿชช" + + name: str + schema: dict = field(default_factory=dict) + + +@dataclass +class IntakeSource(BaseContent): + """A named entry in an intake catalog.""" + icon = "๐Ÿ“–" - # TODO: add better fields: args, driver/reader, metadata, description name: str @dataclass -class DataResource(BaseContent): - """A data resource found inside a data-only directory. - - Describes one logical dataset โ€” which may be a flat collection of files, a - Hive-partitioned tree, an Iceberg/Delta table, a Zarr store, or any other - recognised on-disk layout. - - The `path` field is a human-readable basename that identifies the resource: - - - Single file: `"data.csv"` - - Multi-file series: `"part*.parquet"` (glob-style, common prefix + `*` + ext) - - Directory-as-dataset (Hive partition, Zarr store, โ€ฆ): `"year=2024/"` - - The `modality` field classifies the broad nature of the data using the - vocabulary established by intake's `structure` tags and napari's layer - type system: - - - `"tabular"` โ€” row/column data (CSV, Parquet, ORC, Excel, โ€ฆ) - - `"array"` โ€” N-dimensional arrays (NumPy, HDF5, NetCDF, Zarr, โ€ฆ) - - `"image"` โ€” 2-D/3-D images (PNG, JPEG, TIFF, DICOM, NIfTI, โ€ฆ) - - `"timeseries"` โ€” time-indexed signals (WAV, GRIB, โ€ฆ) - - `"geospatial"` โ€” vector/raster geodata (Shapefile, GeoJSON, GeoTIFF, โ€ฆ) - - `"model"` โ€” ML model weights (GGUF, SafeTensors, PyTorch, โ€ฆ) - - `"nested"` โ€” hierarchical / JSON-like (Avro, YAML, XML, โ€ฆ) - - `"document"` โ€” human-readable documents (PDF, DOCX, โ€ฆ) - - `"video"` โ€” video streams (MP4, AVI, โ€ฆ) - - `"archive"` โ€” compressed bundles (ZIP, tar.gz, โ€ฆ) - - `""` โ€” unknown / mixed - - The `schema` field is format-specific: - - - Tabular (Parquet, Arrow, CSV, โ€ฆ): `{column_name: dtype_str, โ€ฆ}` - - Image / array: `{"width": int, "height": int, "channels": int, "mode": str}` - - Audio: `{"sample_rate": int, "channels": int, "frames": int}` - - HDF5 / Zarr / NetCDF: `{"variables": [...], "dims": {...}, "attrs": {...}}` - - Unknown / library not available: `{}` +class Dataset(BaseContent): + """A generic dataset discovered on disk and described by intake. + + This is produced by :class:`projspec.proj.data_project.DataProject` after + scanning files/globs with :func:`intake.readers.inspect.inspect_dataset`. + + The dataset's short identifying name is *not* stored on the object: a + :class:`DataProject` exposes its datasets as an ``AttrDict`` keyed by that + name (e.g. ``proj.contents.dataset["*.csv"]``), so duplicating it here + would be redundant. + + Attributes + ---------- + url: + The URL, glob or list of URLs that make up this dataset, relative to + (or rooted at) the project directory. + datatype: + The intake ``BaseData`` subclass name detected (e.g. ``"CSV"``, + ``"Parquet"``), or ``None`` if intake could not identify the type. + structure: + Structural tags reported by intake (e.g. ``{"table"}``, + ``{"array", "image"}``). + schema: + The ``datashape`` mapping returned by intake (columns/dtypes, dims, + etc.); empty if no reader could describe the data. + n_files: + Number of files that make up the dataset (after glob expansion). + total_size: + Total bytes across all files in the dataset, if known. + metadata: + Any other useful summary information from intake (shape, npartitions, + recommended readers, description, โ€ฆ). """ - icon = "๐Ÿ“ฅ" - - path: str # basename (or glob pattern / dir/ ) identifying this resource - format: str # canonical format string, e.g. "parquet", "csv", "png", "hdf5" - modality: str = "" # broad data nature; see docstring for vocabulary - layout: str = "" # "flat"|"hive"|"iceberg"|"delta"|"zarr_store"|"tiledarray"|"" - file_count: int = 0 - total_size: int = 0 # bytes; 0 when unknown (e.g. remote FS without size info) - schema: dict | list = field(default_factory=dict) - # full path to one representative file, for use by preview loaders - sample_path: str = "" - metadata: dict = field(default_factory=dict) # catch-all extras - _html = None - - def __repr__(self) -> str: - from projspec.content.data_html import repr_text - - return repr_text(self) - - def _repr_html_(self) -> str: - """Jupyter rich display โ€” returns cached HTML, rendering on first call.""" - # TODO: this is probably not what we want jupyter to dysplay, but it's - # convenient for now. - if self._html is None: - from projspec.content.data_html import repr_html - - self._html = repr_html(self) - return self._html - - def to_dict(self, compact=False): - d = super().to_dict(compact=compact) - if not compact: - d["_html"] = self._repr_html_() - return d + icon = "๐Ÿ—ƒ๏ธ" + + url: str | list[str] = "" + datatype: str | None = None + structure: list[str] = field(default_factory=list) + schema: dict = field(default_factory=dict) + n_files: int = 1 + total_size: int | None = None + metadata: dict = field(default_factory=dict) diff --git a/src/projspec/content/data_html.py b/src/projspec/content/data_html.py deleted file mode 100644 index 530fb60..0000000 --- a/src/projspec/content/data_html.py +++ /dev/null @@ -1,632 +0,0 @@ -"""Text and HTML representations for DataResource. - -``repr_text`` โ€” plain-text one-liner for ``__repr__``. -``repr_html`` โ€” rich HTML card for Jupyter's ``_repr_html_`` protocol. - -The HTML card has two sections: - -1. **Metadata table** โ€” name, format, modality, layout, file count, total size, - schema (collapsed by default when it has many entries). - -2. **Preview** (optional) โ€” a lightweight peek at the actual data using - whichever optional library is available for the format. The section is - silently omitted when no suitable loader can be imported. - -All loader imports are guarded with ``try/except ImportError`` so that the -representation degrades gracefully when optional dependencies are absent. -""" - -from __future__ import annotations - -import base64 -import html as _html -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from projspec.content.data import DataResource - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - -_MODALITY_ICON: dict[str, str] = { - "tabular": "📊", # ๐Ÿ“Š - "image": "🖼", # ๐Ÿ–ผ - "array": "🧮", # ๐Ÿงฎ - "timeseries": "📈", # ๐Ÿ“ˆ - "geospatial": "🌍", # ๐ŸŒ - "model": "🧠", # ๐Ÿง  - "nested": "📂", # ๐Ÿ“‚ - "document": "📄", # ๐Ÿ“„ - "video": "🎬", # ๐ŸŽฌ - "archive": "📦", # ๐Ÿ“ฆ - "": "🗂", # ๐Ÿ—‚ -} - - -def _fmt_size(n: int) -> str: - """Human-readable byte count.""" - if n <= 0: - return "unknown" - for unit in ("B", "KB", "MB", "GB", "TB"): - if n < 1024: - return f"{n:.1f} {unit}" if unit != "B" else f"{n} B" - n /= 1024 # type: ignore[assignment] - return f"{n:.1f} PB" - - -def _esc(s: object) -> str: - return _html.escape(str(s)) - - -# --------------------------------------------------------------------------- -# Plain-text repr -# --------------------------------------------------------------------------- - - -def repr_text(dr: "DataResource") -> str: - """One-line text representation of a DataResource.""" - size = _fmt_size(dr.total_size) - schema_hint = "" - if isinstance(dr.schema, dict) and dr.schema: - keys = list(dr.schema)[:3] - extra = f", +{len(dr.schema) - 3} more" if len(dr.schema) > 3 else "" - schema_hint = f" [{', '.join(str(k) for k in keys)}{extra}]" - elif isinstance(dr.schema, list) and dr.schema: - schema_hint = f" [{len(dr.schema)} fields]" - - parts = [ - f"DataResource({dr.path!r}", - f"format={dr.format!r}", - ] - if dr.modality: - parts.append(f"modality={dr.modality!r}") - if dr.layout and dr.layout not in ("flat", ""): - parts.append(f"layout={dr.layout!r}") - parts.append(f"files={dr.file_count}") - parts.append(f"size={size}") - if schema_hint: - parts.append(f"schema={schema_hint.strip()}") - return ", ".join(parts) + ")" - - -# --------------------------------------------------------------------------- -# HTML repr -# --------------------------------------------------------------------------- - -# No inline styles โ€” class names are present for external styling by the -# host environment (Jupyter, VS Code webview, etc.). -_CARD_CSS = "" - - -def repr_html(dr: "DataResource") -> str: - """Rich HTML card representation of a DataResource.""" - icon = _MODALITY_ICON.get(dr.modality, _MODALITY_ICON[""]) - size_str = _fmt_size(dr.total_size) - - # ---- header ---- - modality_badge = ( - f'{_esc(dr.modality)}' if dr.modality else "" - ) - format_badge = f'{_esc(dr.format)}' - layout_badge = ( - f'{_esc(dr.layout)}' - if dr.layout and dr.layout not in ("flat", "") - else "" - ) - - header = ( - f'
' - f'{icon}' - f'{_esc(dr.path)}' - f"{modality_badge}{format_badge}{layout_badge}" - f"
" - ) - - # ---- metadata table ---- - meta_rows = [ - ("Files", str(dr.file_count)), - ("Total size", size_str), - ] - - meta_html_rows = "".join( - f"{_esc(k)}{v}" for k, v in meta_rows - ) - schema_html = _render_schema(dr.schema) - - meta_section = ( - f'
' - f"{meta_html_rows}
" - f"{schema_html}" - f"
" - ) - - # ---- preview ---- - preview_html = _build_preview(dr) - preview_section = "" - if preview_html: - preview_section = ( - f'
' - f'
Preview
' - f"{preview_html}" - f"
" - ) - - return ( - _CARD_CSS - + f'
' - + header - + meta_section - + preview_section - + "
" - ) - - -# --------------------------------------------------------------------------- -# Schema rendering -# --------------------------------------------------------------------------- - - -def _render_schema(schema: dict | list) -> str: - """Render schema as a collapsible HTML block.""" - if not schema: - return "" - - if isinstance(schema, dict): - # Tabular-style {col: dtype} or structural {"variables": [...], ...} - rows = "" - for k, v in schema.items(): - rows += f"{_esc(k)}{_esc(v)}" - table = ( - f'' - f"" - f"{rows}" - f"
FieldType / Value
" - ) - n = len(schema) - open_attr = "open" if n <= 8 else "" - return ( - f'
' - f'Schema ({n} {"field" if n == 1 else "fields"})' - f"{table}
" - ) - - if isinstance(schema, list): - # List-of-dicts (frictionless style) or plain list - if schema and isinstance(schema[0], dict): - # Render each dict as a row; use union of all keys as columns - all_keys: list[str] = [] - for item in schema: - for k in item: - if k not in all_keys: - all_keys.append(k) - header_row = "".join(f"{_esc(k)}" for k in all_keys) - body_rows = "" - for item in schema: - cells = "".join(f"{_esc(item.get(k, ''))}" for k in all_keys) - body_rows += f"{cells}" - table = ( - f'' - f"{header_row}{body_rows}
" - ) - else: - items_html = "".join(f"
  • {_esc(s)}
  • " for s in schema) - table = f"" - - n = len(schema) - open_attr = "open" if n <= 8 else "" - return ( - f'
    ' - f'Schema ({n} {"field" if n == 1 else "fields"})' - f"{table}
    " - ) - - return "" - - -# --------------------------------------------------------------------------- -# Preview builders โ€” one function per modality family, all return HTML str -# or None when no loader is available. -# --------------------------------------------------------------------------- - -#: How many rows to show in tabular previews. -_PREVIEW_ROWS = 5 - - -def _obj_to_preview_html(obj) -> str: - """Return the richest HTML string available for *obj*. - - Tries ``_repr_html_()`` first (pandas DataFrame, polars DataFrame, xarray - Dataset, โ€ฆ), then falls back to ``__repr__``. The result is always - wrapped in a ``
    `` so callers can rely on valid HTML. - """ - if hasattr(obj, "_repr_html_"): - try: - h = obj._repr_html_() - if h: - return f'
    {h}
    ' - except Exception: - pass - return f'
    {_esc(repr(obj))}
    ' - - -def _build_preview(dr: "DataResource") -> str | None: - """Return an HTML preview fragment, or None if not possible.""" - fmt = dr.format - modality = dr.modality - sample = dr.sample_path if dr.sample_path else None - - if sample is None: - return None - - if modality == "tabular": - return _preview_tabular(dr, sample) - if modality == "image": - return _preview_image(dr, sample) - if modality == "array": - return _preview_array(dr, sample) - if modality == "timeseries" and fmt in ("wav", "flac", "mp3", "ogg"): - return _preview_audio(dr, sample) - return None - - -# --- tabular --- - - -def _preview_tabular(dr: "DataResource", path: str) -> str | None: - fmt = dr.format - fs = dr.proj.fs - - try: - if fmt == "parquet": - return _preview_parquet(fs, path) - if fmt == "csv": - return _preview_csv(fs, path) - if fmt in ("tsv", "psv"): - sep = "\t" if fmt == "tsv" else "|" - return _preview_csv(fs, path, sep=sep) - if fmt == "arrow": - return _preview_arrow(fs, path) - if fmt == "jsonlines": - return _preview_jsonlines(fs, path) - if fmt == "excel": - return _preview_excel(fs, path) - if fmt in ("sqlite", "duckdb"): - return _preview_sql(fs, path, fmt) - if fmt == "orc": - return _preview_orc(fs, path) - except Exception: - pass - return None - - -def _preview_parquet(fs, path: str) -> str | None: - """Read only the first row group (or N rows from it) โ€” no full file scan.""" - try: - import pyarrow.parquet as pq - - with fs.open(path, "rb") as fh: - pf = pq.ParquetFile(fh) - # read_row_group reads one row group's pages, not the whole file - batch = pf.read_row_group(0) - if batch.num_rows > _PREVIEW_ROWS: - batch = batch.slice(0, _PREVIEW_ROWS) - # Convert to pandas so we get _repr_html_() for free - df = batch.to_pandas() - return _obj_to_preview_html(df) - except ImportError: - pass - try: - # polars can read a row-count-limited slice without decoding the rest - import polars as pl - - with fs.open(path, "rb") as fh: - df = pl.read_parquet(fh, n_rows=_PREVIEW_ROWS) - return _obj_to_preview_html(df) - except ImportError: - pass - return None - - -def _preview_csv(fs, path: str, sep: str = ",") -> str | None: - # pandas nrows= stops parsing after N data lines โ€” minimal I/O - try: - import pandas as pd - - with fs.open(path, "r", encoding="utf-8", errors="replace") as fh: - df = pd.read_csv(fh, sep=sep, nrows=_PREVIEW_ROWS) - return _obj_to_preview_html(df) - except ImportError: - pass - try: - import polars as pl - - with fs.open(path, "rb") as fh: - df = pl.read_csv(fh, n_rows=_PREVIEW_ROWS, separator=sep) - return _obj_to_preview_html(df) - except ImportError: - pass - return None - - -def _preview_arrow(fs, path: str) -> str | None: - """Read only the first record batch โ€” no full file deserialisation.""" - try: - import pyarrow.ipc as ipc - - with fs.open(path, "rb") as fh: - try: - # IPC file format: random-access; read just batch 0 - reader = ipc.open_file(fh) - batch = reader.get_batch(0) - except Exception: - fh.seek(0) - # IPC stream format: sequential; read just the first batch - reader = ipc.open_stream(fh) - batch = reader.read_next_batch() - if batch.num_rows > _PREVIEW_ROWS: - batch = batch.slice(0, _PREVIEW_ROWS) - df = batch.to_pandas() - return _obj_to_preview_html(df) - except ImportError: - pass - return None - - -def _preview_jsonlines(fs, path: str) -> str | None: - # pandas nrows= stops reading after N lines - try: - import pandas as pd - - with fs.open(path, "r", encoding="utf-8", errors="replace") as fh: - df = pd.read_json(fh, lines=True, nrows=_PREVIEW_ROWS) - return _obj_to_preview_html(df) - except ImportError: - pass - return None - - -def _preview_excel(fs, path: str) -> str | None: - # nrows= limits rows read from the sheet - try: - import pandas as pd - - with fs.open(path, "rb") as fh: - df = pd.read_excel(fh, nrows=_PREVIEW_ROWS) - return _obj_to_preview_html(df) - except ImportError: - pass - return None - - -def _preview_sql(fs, path: str, fmt: str) -> str | None: - # SQLite/DuckDB: only works with a local path (not a remote FS) - try: - if getattr(fs, "protocol", "file") not in ("file", "local", ""): - return None - if fmt == "duckdb": - try: - import duckdb - - con = duckdb.connect(path, read_only=True) - tables = con.execute("SHOW TABLES").fetchall() - if not tables: - return None - tname = tables[0][0] - df = con.execute( - f'SELECT * FROM "{tname}" LIMIT {_PREVIEW_ROWS}' - ).fetchdf() - return _obj_to_preview_html(df) - except ImportError: - pass - else: - import sqlite3 - import pandas as pd - - con = sqlite3.connect(path) - cur = con.cursor() - cur.execute("SELECT name FROM sqlite_master WHERE type='table'") - tables = cur.fetchall() - if not tables: - return None - tname = tables[0][0] - df = pd.read_sql(f'SELECT * FROM "{tname}" LIMIT {_PREVIEW_ROWS}', con) - return _obj_to_preview_html(df) - except Exception: - pass - return None - - -def _preview_orc(fs, path: str) -> str | None: - try: - import pyarrow.orc as orc - - with fs.open(path, "rb") as fh: - table = orc.ORCFile(fh).read().slice(0, _PREVIEW_ROWS) - df = table.to_pandas() - return _obj_to_preview_html(df) - except ImportError: - pass - return None - - -# --- image --- - - -def _preview_image(dr: "DataResource", path: str) -> str | None: - try: - from PIL import Image - import io - - fs = dr.proj.fs - with fs.open(path, "rb") as fh: - raw: bytes = fh.read() - - img = Image.open(io.BytesIO(raw)) - img.thumbnail((600, 200)) - - buf = io.BytesIO() - # Save as PNG for lossless display regardless of source format - rgb = img.convert("RGB") if img.mode not in ("RGB", "L", "RGBA") else img - rgb.save(buf, format="PNG") - b64 = base64.b64encode(buf.getvalue()).decode("ascii") - - w, h = img.size - schema = dr.schema if isinstance(dr.schema, dict) else {} - info = f"{schema.get('width', w)}ร—{schema.get('height', h)}" - if "mode" in schema: - info += f", mode={schema['mode']}" - - return ( - f'
    ' - f'
    {_esc(info)}
    ' - ) - except ImportError: - pass - except Exception: - pass - return None - - -# --- array --- - - -def _preview_array(dr: "DataResource", path: str) -> str | None: - fmt = dr.format - fs = dr.proj.fs - - if fmt == "numpy": - return _preview_numpy(fs, path) - if fmt == "hdf5": - return _preview_hdf5(fs, path) - if fmt == "netcdf": - return _preview_netcdf(fs, path) - if fmt == "zarr": - return _preview_zarr(dr) - return None - - -def _array_info_html(info: dict) -> str: - rows = "".join( - f"{_esc(k)}{_esc(v)}" - for k, v in info.items() - ) - return f'{rows}
    ' - - -def _preview_numpy(fs, path: str) -> str | None: - """Read only the .npy header to get shape/dtype, then load a minimal slice.""" - try: - import numpy as np - import numpy.lib.format as nf - import io - - with fs.open(path, "rb") as fh: - raw_header = fh.read(512) # header is always โ‰ค 512 bytes - - buf = io.BytesIO(raw_header) - nf.read_magic(buf) - # read_array_header_1_0 is the stable API across numpy versions; - # newer numpy also exposes read_array_header โ€” try both. - try: - shape, _, dtype = nf.read_array_header_1_0(buf) - except AttributeError: - shape, _, dtype = nf.read_array_header(buf) # type: ignore[attr-defined] - - info: dict = {"shape": str(shape), "dtype": str(dtype)} - - # Load the full array only when it's small enough (โ‰ค 1 MB heuristic) - # or when we can cheaply slice the first N rows. - try: - total_elements = 1 - for s in shape: - total_elements *= s - item_size = np.dtype(dtype).itemsize - if total_elements * item_size <= 1_048_576: - with fs.open(path, "rb") as fh: - arr = np.load(io.BytesIO(fh.read()), allow_pickle=False) - sliced = arr[:_PREVIEW_ROWS] if arr.ndim >= 1 else arr - info["preview"] = repr(sliced) - except Exception: - pass - - return _array_info_html(info) - except Exception: - pass - return None - - -def _preview_hdf5(fs, path: str) -> str | None: - """Open the HDF5 file and read only metadata โ€” no array data loaded.""" - try: - import h5py - - with fs.open(path, "rb") as fh: - with h5py.File(fh, "r") as f: - keys = list(f.keys())[:8] - info: dict = {"top-level keys": ", ".join(keys) or "(none)"} - for k in keys[:3]: - obj = f[k] - if hasattr(obj, "shape"): - info[k] = f"shape={obj.shape}, dtype={obj.dtype}" - else: - info[k] = f"group ({len(obj)} members)" - return _array_info_html(info) - except ImportError: - pass - return None - - -def _preview_netcdf(fs, path: str) -> str | None: - """Open the dataset lazily (no data loaded) and render its repr.""" - try: - import xarray as xr - - with fs.open(path, "rb") as fh: - # engine="scipy" reads lazily; no array data is decoded here - ds = xr.open_dataset(fh, engine="scipy") - # xarray Dataset has a rich _repr_html_() - return _obj_to_preview_html(ds) - except ImportError: - pass - return None - - -def _preview_zarr(dr: "DataResource") -> str | None: - """Use the schema cached at parse time โ€” zero extra I/O.""" - schema = dr.schema - if not schema or not isinstance(schema, dict): - return None - info = {} - if "arrays" in schema: - info["arrays"] = ", ".join(str(a) for a in schema["arrays"][:8]) or "(none)" - if "groups" in schema: - info["groups"] = ", ".join(str(g) for g in schema["groups"][:8]) or "(none)" - if "attrs" in schema: - info["attrs"] = str(dict(list(schema["attrs"].items())[:4])) - return _array_info_html(info) if info else None - - -# --- audio --- - - -def _preview_audio(dr: "DataResource", path: str) -> str | None: - """Read only the audio file header โ€” no sample data loaded.""" - try: - import soundfile as sf - - fs = dr.proj.fs - with fs.open(path, "rb") as fh: - info = sf.info(fh) - details = { - "sample rate": f"{info.samplerate} Hz", - "channels": str(info.channels), - "duration": f"{info.frames / info.samplerate:.2f} s", - "format": info.format, - "subtype": info.subtype, - } - return _array_info_html(details) - except ImportError: - pass - return None diff --git a/src/projspec/html.py b/src/projspec/html.py deleted file mode 100644 index eb62124..0000000 --- a/src/projspec/html.py +++ /dev/null @@ -1,47 +0,0 @@ -def dict_to_html(data: dict, title="Data", open_level=2) -> str: - """ - Convert a nested dictionary to expandable HTML using
    tags. - - Args: - data: The dictionary to convert - title: Title for the details element - open_level: whether to set elements as expanded; yes if > 0, and will - decrement for inner levels. - - Returns: - String containing HTML with expandable details elements - """ - # With help from Claude Sonnet 4. - if not isinstance(data, dict): - return f"{data}" - - if not data: - return "" - open = "open" if open_level > 0 else "closed" - - html = [ - f'
    {title}' - ] - - for key, value in data.items(): - if isinstance(value, dict): - html.append(dict_to_html(value, key, open_level - 1)) - elif isinstance(value, (list, tuple)): - html.append( - f'
    {key}' - ) - for i, item in enumerate(value): - if isinstance(item, dict): - html.append(dict_to_html(item, f"{key}[{i}]", open_level - 1)) - else: - html.append(f'
    {item}
    ') - html.append("
    ") - else: - html.append( - f'
    {key}: {value}
    ' - ) - - html.append("
    ") - return "".join(html) diff --git a/src/projspec/library.py b/src/projspec/library.py index db6bcb5..0b3f313 100644 --- a/src/projspec/library.py +++ b/src/projspec/library.py @@ -1,10 +1,12 @@ import json import os +import time import fsspec from projspec.config import get_conf from projspec.proj import Project +from projspec.utils import DEFAULT class ProjectLibrary: @@ -15,14 +17,28 @@ class ProjectLibrary: # TODO: support for remote libraries - def __init__(self, library_path: str | None = None, auto_save: bool = True): - self.path = library_path or get_conf("library_path") - self.entries: dict[str, Project] = {} + def __init__( + self, + library_path: str | None | type = DEFAULT, + auto_save: bool = True, + entries: dict | None = None, + ): + self.path = ( + get_conf("library_path") if library_path is DEFAULT else library_path + ) + self.entries: dict[str, Project] = {} if entries is None else entries self.auto_save = auto_save self.load() def load(self): - """Loads scanned project objects from JSON file""" + """Loads scanned project objects from JSON file. + + Any entry whose last scan is older than the ``auto_rescan`` config + value (in seconds) is automatically rescanned and the refreshed + library is saved back. Set ``auto_rescan`` to 0 to disable this. + """ + if self.path is None: + return try: with fsspec.open(self.path, "r") as f: self.entries = { @@ -30,6 +46,35 @@ def load(self): } except FileNotFoundError: self.entries = {} + return + self._auto_rescan() + + def _auto_rescan(self): + """Rescan entries older than the ``auto_rescan`` config threshold.""" + max_age = get_conf("auto_rescan") + if not max_age or max_age <= 0: + return + now = time.time() + rescanned = False + for key, proj in list(self.entries.items()): + scanned_at = getattr(proj, "scanned_at", None) + if scanned_at is None or (now - scanned_at) < max_age: + continue + try: + # Rescan from the project's own path, preserving the library + # key so the entry's identity does not drift. + fresh = Project( + proj.path, + storage_options=proj.storage_options, + walk=False, + ) + except Exception: + # never let an unreachable/changed project break library load + continue + self.entries[key] = fresh + rescanned = True + if rescanned and self.auto_save and self.path is not None: + self.save() def clear(self): """Clears scanned project objects from JSON file and memory""" @@ -46,6 +91,8 @@ def add_entry(self, path: str, entry: Project): def save(self): """Serialise the state of the scanned project objects to file""" # don't catch + if self.path is None: + raise ValueError("Cannot save without .path set") data = {k: v.to_dict(compact=False) for k, v in self.entries.items()} with fsspec.open(self.path, "w") as f: json.dump(data, f) diff --git a/src/projspec/proj/__init__.py b/src/projspec/proj/__init__.py index 328cd80..2f535b5 100644 --- a/src/projspec/proj/__init__.py +++ b/src/projspec/proj/__init__.py @@ -24,7 +24,7 @@ from projspec.proj.conda_package import CondaRecipe, RattlerRecipe from projspec.proj.conda_project import CondaProject from projspec.proj.conda_workspace import CondaWorkspace -from projspec.proj.data_dir import Data +from projspec.proj.data_project import DataProject from projspec.proj.datapackage import DataPackage, DVCRepo from projspec.proj.dataworkflows import ( Airflow, @@ -43,6 +43,7 @@ from projspec.proj.golang import Golang from projspec.proj.helm import HelmChart from projspec.proj.hf import HuggingFaceRepo +from projspec.proj.knowledge_catalog import KnowledgeCatalog from projspec.proj.ide import JetbrainsIDE, NvidiaAIWorkbench, VSCode from projspec.proj.infra import ( Ansible, @@ -92,9 +93,9 @@ "CondaWorkspace", "RattlerRecipe", # Data - "Data", "DataPackage", "DVCRepo", + "DataProject", # Data/ML workflows "Airflow", "Dagster", @@ -121,6 +122,8 @@ "HelmChart", # HuggingFace "HuggingFaceRepo", + # Knowledge + "KnowledgeCatalog", # IDE "AIEnabled", "BackstageCatalog", diff --git a/src/projspec/proj/_consolidate.py b/src/projspec/proj/_consolidate.py new file mode 100644 index 0000000..9ee93f4 --- /dev/null +++ b/src/projspec/proj/_consolidate.py @@ -0,0 +1,258 @@ +"""Consolidate sets of related files into logical datasets. + +Intake can already recognise some directory-based datasets (hive-partitioned +parquet, zarr, delta, โ€ฆ) by their characteristic contents. This module covers +the complementary case where a directory holds *many individually-named files +that obviously belong together*, e.g.:: + + 001.csv 002.csv 003.csv -> one CSV dataset + part-00000.parquet part-00001โ€ฆ -> one parquet dataset + data_2019.json data_2020.json -> one JSON dataset + green.gif red.gif blue.gif -> one GIF (image) dataset + +The output is a list of :class:`FileGroup` objects. Each group is either a +single standalone file or a consolidated set, and exposes a ``glob`` (or list of +members) suitable for handing straight to +:func:`intake.readers.inspect.inspect_dataset`. + +The logic here is deliberately filesystem-agnostic: it operates on +``(basename, size)`` pairs so it can be unit-tested without any I/O. +""" + +from __future__ import annotations + +import os +import re +from dataclasses import dataclass, field + +# A maximal run of digits anywhere in the stem - the most common way numbered +# file series differ (001, 00001, 2020, ...). +_DIGITS = re.compile(r"\d+") +# Tokens for the "one differing token" heuristic (split on common separators). +_SEP = re.compile(r"[._\- ]+") + + +@dataclass +class FileGroup: + """A standalone file or a consolidated set of related files. + + Attributes + ---------- + members: + Basenames belonging to this group, sorted. + ext: + Common file extension (lower-case, including the dot), or ``""``. + total_size: + Sum of the sizes of all members (bytes); ``None`` if unknown. + pattern: + For consolidated groups, a glob basename that matches all members + (e.g. ``"*.csv"`` or ``"part-*.parquet"``). For a single file this is + just that file's basename. + consolidated: + ``True`` when this group represents more than one physical file. + """ + + members: list[str] + ext: str = "" + total_size: int | None = None + pattern: str = "" + consolidated: bool = False + + @property + def name(self) -> str: + """A short identifying name for the group.""" + if self.consolidated: + return self.pattern + return self.members[0] + + def url(self, root: str) -> str | list[str]: + """Build the URL/glob (rooted at *root*) to hand to intake. + + A consolidated group whose members match a simple glob is expressed as + a single ``root/pattern`` glob string; otherwise it is returned as an + explicit list of member URLs. A single file is returned as one URL. + """ + root = root.rstrip("/") + if not self.consolidated: + return f"{root}/{self.members[0]}" + if self.pattern and "*" in self.pattern: + return f"{root}/{self.pattern}" + return [f"{root}/{m}" for m in self.members] + + +def _split_ext(name: str) -> tuple[str, str]: + """Split into ``(stem, ext)`` with a lower-cased extension. + + Handles common double extensions like ``.csv.gz`` / ``.tar.gz`` so that a + series of compressed parts groups correctly. + """ + lower = name.lower() + for double in (".csv.gz", ".json.gz", ".tar.gz", ".tar.bz2", ".tsv.gz"): + if lower.endswith(double) and len(name) > len(double): + return name[: -len(double)], double + stem, ext = os.path.splitext(name) + return stem, ext.lower() + + +def _digit_pattern(stem: str) -> str | None: + """Mask digit runs in *stem* with ``#``, or ``None`` if it has no digits. + + ``part-00001`` -> ``part-#``; ``data2020`` -> ``data#``. Consecutive digit + runs collapse to a single placeholder so that ``a1b2`` and ``a3b4`` share a + key. + """ + if not _DIGITS.search(stem): + return None + return _DIGITS.sub("#", stem) + + +def _glob_from_digit_pattern(pattern: str) -> str: + """Turn a masked pattern (``part-#``) into a glob stem (``part-*``).""" + return pattern.replace("#", "*") + + +def _token_signature(stem: str) -> tuple[tuple[str, ...], int] | None: + """Return ``(tokens_with_one_blanked, blank_index)`` for the token heuristic. + + Used for non-numeric series such as ``green``/``red``/``blue``. We only + consider stems that split into the *same* number of tokens differing in + exactly one position; here we just return the token tuple so the caller can + group by "all-but-one token equal". + """ + tokens = tuple(t for t in _SEP.split(stem) if t) + if not tokens: + return None + return tokens, len(tokens) + + +def consolidate( + files: list[tuple[str, int | None]], + min_group: int = 3, + min_token_group: int = 2, +) -> list[FileGroup]: + """Group a flat list of files into datasets. + + Parameters + ---------- + files: + ``[(basename, size_or_None), ...]`` for the files directly in a + directory (not directories, not recursive). + min_group: + Minimum number of files sharing a digit-masked pattern before they are + consolidated. Below this they are emitted as standalone files. + min_token_group: + Minimum size for the (weaker) "one differing token" heuristic used for + non-numeric series like colour names. + + Returns + ------- + list[FileGroup] + One entry per resulting dataset, sorted by name. Files that match no + consolidation rule are returned as singleton, non-consolidated groups. + """ + sizes: dict[str, int | None] = {n: s for n, s in files} + remaining = set(sizes) + groups: list[FileGroup] = [] + + # โ”€โ”€ Pass 1: digit-run patterns within each extension โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + # key: (ext, digit_masked_stem) -> [names] + digit_buckets: dict[tuple[str, str], list[str]] = {} + for name in list(remaining): + stem, ext = _split_ext(name) + pat = _digit_pattern(stem) + if pat is not None: + digit_buckets.setdefault((ext, pat), []).append(name) + + for (ext, pat), members in digit_buckets.items(): + if len(members) >= min_group: + members = sorted(members) + remaining.difference_update(members) + glob_stem = _glob_from_digit_pattern(pat) + groups.append( + FileGroup( + members=members, + ext=ext, + total_size=_sum_sizes(members, sizes), + pattern=f"{glob_stem}{ext}", + consolidated=True, + ) + ) + + # โ”€โ”€ Pass 2: "one differing token" within each extension โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + # Group stems that share all tokens but one (same token count). + token_buckets: dict[tuple[str, int, int, tuple[str, ...]], list[str]] = {} + for name in list(remaining): + stem, ext = _split_ext(name) + sig = _token_signature(stem) + if sig is None: + continue + tokens, ntok = sig + # For each position, the key is (ext, ntok, blanked_index, other_tokens) + for i in range(ntok): + others = tokens[:i] + ("*",) + tokens[i + 1 :] + token_buckets.setdefault((ext, ntok, i, others), []).append(name) + + used_in_token_pass: set[str] = set() + # Prefer the largest buckets first so a file lands in its best group. + for (ext, ntok, idx, others), members in sorted( + token_buckets.items(), key=lambda kv: -len(kv[1]) + ): + members = [m for m in members if m in remaining and m not in used_in_token_pass] + if len(members) >= min_token_group and len(set(members)) >= min_token_group: + members = sorted(members) + used_in_token_pass.update(members) + remaining.difference_update(members) + glob_stem = "*".join("" if t == "*" else t for t in others) + # rebuild a readable glob like "*.gif" / "frame_*_left.png" + pattern = _normalise_token_glob(others) + groups.append( + FileGroup( + members=members, + ext=ext, + total_size=_sum_sizes(members, sizes), + pattern=f"{pattern}{ext}", + consolidated=True, + ) + ) + + # โ”€โ”€ Pass 3: leftovers are standalone files โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + for name in sorted(remaining): + _, ext = _split_ext(name) + groups.append( + FileGroup( + members=[name], + ext=ext, + total_size=sizes.get(name), + pattern=name, + consolidated=False, + ) + ) + + return sorted(groups, key=lambda g: g.name) + + +def _normalise_token_glob(tokens: tuple[str, ...]) -> str: + """Join token glob pieces, collapsing the blanked position to ``*``. + + ``("*",)`` -> ``"*"`` + ``("frame", "*", "left")`` -> ``"frame_*_left"`` (best-effort separator) + """ + parts = [("*" if t == "*" else t) for t in tokens] + # We lost the original separators; "_" is the most common, and the exact + # separator does not matter for globbing since "*" spans it anyway when the + # blank is interior. For a single trailing/leading blank this yields "*". + glob = "_".join(parts) + # Tidy duplicate stars produced by adjacent blanks. + while "**" in glob: + glob = glob.replace("**", "*") + return glob + + +def _sum_sizes(members: list[str], sizes: dict[str, int | None]) -> int | None: + total = 0 + for m in members: + s = sizes.get(m) + if s is None: + return None + total += s + return total diff --git a/src/projspec/proj/base.py b/src/projspec/proj/base.py index 25d5e7c..1766a51 100644 --- a/src/projspec/proj/base.py +++ b/src/projspec/proj/base.py @@ -3,18 +3,19 @@ import logging import os import stat +import time from collections.abc import Iterable from itertools import chain from functools import cached_property import fsspec import fsspec.implementations.local -import projspec.utils import toml from projspec.config import get_conf from projspec.utils import ( AttrDict, + DEFAULT, IndentDumper, PickleableTomlDecoder, camel_to_snake, @@ -34,6 +35,37 @@ def _fmt_size(n: int) -> str: n /= 1024 +def _humanize_age(ts: float) -> str: + """Render a Unix timestamp as a relative "X ago" string. + + e.g. "just now", "5 minutes ago", "3 hours ago", "today", "yesterday", + "4 days ago", "2 months ago", "1 year ago". + """ + import datetime + + age = datetime.datetime.now() - datetime.datetime.fromtimestamp(ts) + days = age.days + if days < 0: + # clock skew / future timestamp - treat as just now + return "just now" + if days == 0: + secs = int(age.total_seconds()) + if secs < 60: + return "just now" + if secs < 3600: + mins = secs // 60 + return f"{mins} minute{'s' if mins != 1 else ''} ago" + hours = secs // 3600 + return f"{hours} hour{'s' if hours != 1 else ''} ago" + if days == 1: + return "yesterday" + if days < 30: + return f"{days} days ago" + if days < 365: + return f"{days // 30} months ago" + return f"{days // 365} year{'s' if days >= 730 else ''} ago" + + class ParseFailed(ValueError): """Exception raised when parsing fails: a directory does not meet the given spec.""" @@ -315,6 +347,8 @@ def resolve( types = set(camel_to_snake(_) for _ in types or ()) if types and types - set(registry): raise ValueError(f"Unknown types: {set(types) - set(registry)}") + # record when this (re)scan happened + self.scanned_at = time.time() # sorting to ensure consistency for name in sorted(registry): cls = registry[name] @@ -410,26 +444,18 @@ def _stats_line(self) -> str: # last modified lm = self.last_modified if lm is not None: - import datetime - - age = datetime.datetime.now() - datetime.datetime.fromtimestamp(lm) - days = age.days - if days == 0: - age_str = "today" - elif days == 1: - age_str = "yesterday" - elif days < 30: - age_str = f"{days} days ago" - elif days < 365: - age_str = f"{days // 30} months ago" - else: - age_str = f"{days // 365} year{'s' if days >= 730 else ''} ago" + age_str = _humanize_age(lm) by = self.last_modified_by if by: parts.append(f"last modified {age_str} by {by}") else: parts.append(f"last modified {age_str}") + # when this project was last scanned + scanned_at = getattr(self, "scanned_at", None) + if scanned_at is not None: + parts.append(f"scanned {_humanize_age(scanned_at)}") + return " " + " ยท ".join(parts) if parts else "" def __repr__(self): @@ -557,17 +583,23 @@ def to_dict(self, compact=True) -> dict: is_writable=self.is_writable, last_modified=self.last_modified, last_modified_by=self.last_modified_by, + scanned_at=self.scanned_at, ) if not compact: dic["klass"] = "project" return dic.to_dict(compact=compact) - def _repr_html_(self): - from projspec.html import dict_to_html + def _ipython_display_(self): + """Auto-display as the interactive widget when possible. - # TODO: add tooltips to docs or spec links - # TODO: remove redundant information? - return dict_to_html(self.to_dict(), title=self.url) + Falls back to a plain ``repr`` when ``anywidget`` / + ``ipywidgets`` is not available - Jupyter will then use the + normal text representation. + """ + from projspec.library import ProjectLibrary + + lib = ProjectLibrary(entries={"memory": self}, library_path=None) + lib._ipython_display_() @staticmethod def from_dict(dic): @@ -583,6 +615,11 @@ def from_dict(dic): proj.path = dic["url"] proj.storage_options = dic["storage_options"] proj.fs, proj.url = fsspec.url_to_fs(proj.path, **proj.storage_options) + scanned_at = dic.get("scanned_at") + try: + proj.scanned_at = float(scanned_at) + except (TypeError, ValueError): + proj.scanned_at = time.time() # Restore cached tree stats so a round-tripped Project never re-walks. # Keys default to None if absent (e.g. older serialised data). proj.__dict__["_tree_stats"] = { @@ -640,7 +677,7 @@ def make(self, qname: str, **kwargs): art.make(**kwargs) return art - def add_to_library(self, path=None): + def add_to_library(self, path=DEFAULT): """Add this project to the current session library""" # TODO: prevent overwrite? from projspec.library import ProjectLibrary diff --git a/src/projspec/proj/data_dir.py b/src/projspec/proj/data_dir.py deleted file mode 100644 index c0172c9..0000000 --- a/src/projspec/proj/data_dir.py +++ /dev/null @@ -1,679 +0,0 @@ -"""ProjectSpec for bare data directories. - -Matches directories whose contents are predominantly data files (by extension or -by a recognised on-disk layout such as Hive partitioning, Apache Iceberg, Delta -Lake, or Zarr), with no requirement for any declarative metadata file. -""" - -from __future__ import annotations - -import os -import re -from posixpath import basename as _basename - -from projspec.proj import ProjectSpec, ParseFailed -from projspec.utils import AttrDict - -_EXT_TO_FORMAT: dict[str, tuple[str, str]] = { - # Tabular / columnar ------------------------------------------------------- - ".csv": ("csv", "tabular"), - ".tsv": ("tsv", "tabular"), - ".psv": ("psv", "tabular"), - ".parquet": ("parquet", "tabular"), - ".parq": ("parquet", "tabular"), - ".pq": ("parquet", "tabular"), - ".arrow": ("arrow", "tabular"), - ".ipc": ("arrow", "tabular"), - ".feather": ("arrow", "tabular"), # Feather v1/v2 (magic: FEA1 / ARROW1) - ".orc": ("orc", "tabular"), - ".avro": ("avro", "tabular"), - ".xls": ("excel", "tabular"), - ".xlsx": ("excel", "tabular"), - ".xlsm": ("excel", "tabular"), - ".xlsb": ("excel", "tabular"), - ".jsonl": ("jsonlines", "tabular"), - ".ndjson": ("jsonlines", "tabular"), - ".db": ("sqlite", "tabular"), # DuckDB / SQLite (disambiguated by magic) - ".sqlite": ("sqlite", "tabular"), - ".sqlitedb": ("sqlite", "tabular"), - ".duckdb": ("duckdb", "tabular"), - # Array / scientific ------------------------------------------------------- - ".npy": ("numpy", "array"), - ".npz": ("numpy", "array"), - ".hdf5": ("hdf5", "array"), - ".hdf": ("hdf5", "array"), - ".h5": ("hdf5", "array"), - ".h4": ("hdf5", "array"), - ".he5": ("hdf5", "array"), - ".nc": ("netcdf", "array"), - ".nc3": ("netcdf", "array"), - ".nc4": ("netcdf", "array"), - ".mat": ("matlab", "array"), - ".fits": ("fits", "array"), - ".grib": ("grib", "timeseries"), - ".grb": ("grib", "timeseries"), - ".grib2": ("grib", "timeseries"), - ".grb2": ("grib", "timeseries"), - ".asdf": ("asdf", "array"), - ".zarr": ("zarr", "array"), - # Image / biomedical imaging ----------------------------------------------- - ".png": ("png", "image"), - ".jpg": ("jpeg", "image"), - ".jpeg": ("jpeg", "image"), - ".tif": ("tiff", "image"), # also geotiff โ€” ambiguous; image wins - ".tiff": ("tiff", "image"), - ".cog": ("tiff", "geospatial"), # Cloud-Optimised GeoTIFF - ".bmp": ("bmp", "image"), - ".gif": ("gif", "image"), - ".webp": ("webp", "image"), - ".dcm": ("dicom", "image"), - ".dicom": ("dicom", "image"), - ".nii": ("nifti", "image"), - ".nrrd": ("nrrd", "image"), - ".nhdr": ("nrrd", "image"), - ".mha": ("metaimage", "image"), - ".mhd": ("metaimage", "image"), - ".svs": ("svs", "image"), # Aperio whole-slide image - ".ndpi": ("ndpi", "image"), # Hamamatsu whole-slide image - ".scn": ("scn", "image"), # Leica whole-slide image - ".lsm": ("lsm", "image"), # Zeiss confocal - ".exr": ("exr", "image"), # OpenEXR HDR - ".qptiff": ("qptiff", "image"), # PerkinElmer whole-slide - # Geospatial --------------------------------------------------------------- - ".shp": ("shapefile", "geospatial"), - ".shx": ("shapefile", "geospatial"), - ".dbf": ("shapefile", "geospatial"), - ".geojson": ("geojson", "geospatial"), - ".gpkg": ("geopackage", "geospatial"), - ".fgb": ("flatgeobuf", "geospatial"), - ".kml": ("kml", "geospatial"), - ".pmtiles": ("pmtiles", "geospatial"), - # Audio -------------------------------------------------------------------- - ".wav": ("wav", "timeseries"), - ".flac": ("flac", "timeseries"), - ".mp3": ("mp3", "timeseries"), - ".ogg": ("ogg", "timeseries"), - # Video -------------------------------------------------------------------- - ".mp4": ("mp4", "video"), - ".avi": ("avi", "video"), - ".mov": ("mov", "video"), - ".mkv": ("mkv", "video"), - ".webm": ("webm", "video"), - # ML model weights --------------------------------------------------------- - ".safetensors": ("safetensors", "model"), - ".gguf": ("gguf", "model"), - ".pt": ("pytorch", "model"), - ".pth": ("pytorch", "model"), - ".onnx": ("onnx", "model"), - ".tfrec": ("tfrecord", "model"), - # Archive / bundle --------------------------------------------------------- - ".pkl": ("pickle", "archive"), - ".bin": ("binary", "archive"), -} - -_DATA_EXTENSIONS: frozenset[str] = frozenset(_EXT_TO_FORMAT) - -# Magic-byte signatures (format, modality, offset, bytes_pattern). -_MAGIC: list[tuple[str, str, int | None, bytes]] = [ - # Fixed-offset signatures - ("dicom", "image", 128, b"DICM"), # DICOM preamble - ("nifti", "image", 344, b"ni1\x00"), # NIfTI-1 - ("nifti", "image", 344, b"n+1\x00"), # NIfTI-1 single file - ("duckdb", "tabular", 8, b"DUCK"), - ("safetensors", "model", 8, b"{"), # SafeTensors JSON header - ("wav", "timeseries", 8, b"WAVE"), # RIFFโ€ฆWAVE - # Offset-0 signatures - ("parquet", "tabular", 0, b"PAR1"), - ("hdf5", "array", 0, b"\x89HDF"), - ("netcdf", "array", 0, b"CDF\x01"), # NetCDF classic - ("netcdf", "array", 0, b"CDF\x02"), # NetCDF-64bit - ("orc", "tabular", 0, b"ORC"), - ("avro", "tabular", 0, b"Obj\x01"), - ("arrow", "tabular", 0, b"ARROW1"), # IPC stream - ("arrow", "tabular", 0, b"FEA1"), # Feather v1 - ("numpy", "array", 0, b"\x93NUMPY"), - ("matlab", "array", 0, b"MATLAB"), - ("fits", "array", 0, b"SIMPLE"), - ("grib", "timeseries", 0, b"GRIB"), - ("asdf", "array", 0, b"#ASDF"), - ("flatgeobuf", "geospatial", 0, b"fgb"), - ("gguf", "model", 0, b"GGUF"), - ("png", "image", 0, b"\x89PNG"), - ("jpeg", "image", 0, b"\xff\xd8\xff"), - ("tiff", "image", 0, b"II*\x00"), # little-endian TIFF - ("tiff", "image", 0, b"MM\x00*"), # big-endian TIFF - ("sqlite", "tabular", 0, b"SQLite format"), - ("shapefile", "geospatial", 0, b"\x00\x00\x27\x0a"), - ("pmtiles", "geospatial", 0, b"PMTiles"), -] - -# Regex that matches Hive-style partition directory names (e.g. "year=2024"). -_HIVE_DIR_RE = re.compile(r"^[^=]+=.+$") - - -def _read_schema(path: str, fmt: str, fs) -> dict | list: - """Return a best-effort schema dict/list for *path*, or {} on any failure.""" - try: - if fmt == "parquet": - try: - import pyarrow.parquet as pq - - with fs.open(path, "rb") as fh: - pf = pq.ParquetFile(fh) - return {field.name: str(field.type) for field in pf.schema_arrow} - except ImportError: - pass - - elif fmt == "arrow": - try: - import pyarrow.ipc as ipc - - with fs.open(path, "rb") as fh: - reader = ipc.open_file(fh) - return {field.name: str(field.type) for field in reader.schema} - except ImportError: - pass - - elif fmt == "hdf5": - try: - import h5py - - with fs.open(path, "rb") as fh: - with h5py.File(fh, "r") as ds: - return { - "variables": list(ds.keys()), - "attrs": dict(ds.attrs), - } - except ImportError: - pass - - elif fmt == "netcdf": - try: - import netCDF4 as nc # type: ignore[import] - - with fs.open(path, "rb") as fh: - ds = nc.Dataset("in-mem", memory=fh.read()) - return { - "variables": list(ds.variables.keys()), - "dims": {k: len(v) for k, v in ds.dimensions.items()}, - } - except ImportError: - try: - import xarray as xr # type: ignore[import] - - with fs.open(path, "rb") as fh: - ds = xr.open_dataset(fh, engine="scipy") - return { - "variables": list(ds.data_vars), - "dims": dict(ds.dims), - } - except ImportError: - pass - - elif fmt in ("jpeg", "png", "bmp", "gif", "webp", "tiff"): - try: - from PIL import Image # type: ignore[import] - - with fs.open(path, "rb") as fh: - img = Image.open(fh) - img.load() - mode = img.mode - channels = len(img.getbands()) - return { - "width": img.width, - "height": img.height, - "channels": channels, - "mode": mode, - } - except ImportError: - pass - - elif fmt in ("wav", "flac", "mp3", "ogg"): - try: - import soundfile as sf # type: ignore[import] - - with fs.open(path, "rb") as fh: - info = sf.info(fh) - return { - "sample_rate": info.samplerate, - "channels": info.channels, - "frames": info.frames, - } - except ImportError: - pass - - except Exception: # โ€” never let schema extraction abort parsing - pass - - return {} - - -def _filelist_dirs(filelist: list[dict]) -> list[dict]: - """Return only directory entries from a filelist.""" - return [e for e in filelist if e.get("type", "") == "directory"] - - -def _filelist_files(filelist: list[dict]) -> list[dict]: - """Return only file entries from a filelist.""" - return [e for e in filelist if e.get("type", "") != "directory"] - - -def _fmt_from_path(path: str) -> tuple[str, str] | None: - """Return (format, modality) for *path* by extension, or None if unknown.""" - ext = os.path.splitext(path)[1].lower() - return _EXT_TO_FORMAT.get(ext) - - -def _identify_by_magic(path: str, fs) -> tuple[str, str] | None: - """Return (format, modality) by probing *path*'s header bytes, or None. - - Reads up to 1 KiB. Checks fixed-offset patterns first (longer offsets - first, to avoid short patterns shadowing longer ones), then scans for - anywhere-patterns via re.search. - """ - try: - with fs.open(path, "rb") as fh: - head = fh.read(1024) - except Exception: - return None - - for fmt, modality, offset, pattern in _MAGIC: - if offset is None: - if re.search(re.escape(pattern), head): - return fmt, modality - else: - if head[offset : offset + len(pattern)] == pattern: - return fmt, modality - return None - - -# Token that may vary across files in a series: digits, dashes, underscores, dots. -# Alphabetic variation (e.g. "users" vs "orders") disqualifies collation. -_SERIES_VAR_RE = re.compile(r"^[\d\-_.]+$") - - -def _common_affix(stems: list[str]) -> tuple[str, str]: - """Return the longest (prefix, suffix) shared by every stem in *stems*.""" - if not stems: - return "", "" - prefix = os.path.commonprefix(stems) - # Reverse each stem to find common suffix via commonprefix trick - rev = [s[::-1] for s in stems] - suffix = os.path.commonprefix(rev)[::-1] - # Ensure prefix and suffix don't overlap (can happen with a single-char stem) - if len(prefix) + len(suffix) > min(len(s) for s in stems): - suffix = "" - return prefix, suffix - - -def _group_by_naming_series(entries: list[dict]) -> list[list[dict]]: - """Partition *entries* (same-format file list) into naming-series groups. - - Two or more files belong to the same series when their basenames (stems) - differ only in a contiguous segment that consists solely of digits, dashes, - underscores, or dots โ€” i.e. a numeric counter or a date component. - - A single file is always its own series (trivially consistent). - - Returns a list of groups, each group being a non-empty list of entries that - share a common naming pattern. - """ - if len(entries) <= 1: - return [entries] if entries else [] - - # Compute stems once - stems = [os.path.splitext(_basename(e["name"]))[0] for e in entries] - - prefix, suffix = _common_affix(stems) - plen, slen = len(prefix), len(suffix) - - # Extract the variable middle segment for each stem - variables = [] - for stem in stems: - mid = stem[plen : len(stem) - slen if slen else len(stem)] - variables.append(mid) - - # All files form one series if: - # 1. There is a non-trivial shared prefix OR suffix (at least 1 char), AND - # 2. Every variable segment is numeric/date-like (no alphabetic chars) - has_affix = plen >= 1 or slen >= 1 - all_numeric_var = all(_SERIES_VAR_RE.match(v) or v == "" for v in variables) - - if has_affix and all_numeric_var: - return [entries] - - # Otherwise fall back: each file is its own "series" (separate resource) - return [[e] for e in entries] - - -# Notably absent: datapackage.json, catalog.yaml/yml, .dvc/ โ€” those belong -# to projspec.proj.datapackage and are treated as compatible companions. -_NON_DATA_SENTINELS: frozenset[str] = frozenset( - { - # Python - "pyproject.toml", - "setup.py", - "setup.cfg", - "hatch.toml", - # Rust - "Cargo.toml", - # JavaScript / Node - "package.json", - # Go - "go.mod", - # Container / infra - "Dockerfile", - "docker-compose.yml", - "docker-compose.yaml", - # Helm - "Chart.yaml", - # Ruby / Java / .NET - "Gemfile", - "pom.xml", - "build.gradle", - "*.csproj", - # R - "DESCRIPTION", - # Conda - "environment.yml", - "environment.yaml", - "meta.yaml", - # Pixi - "pixi.toml", - # Mkdocs / Sphinx / RTD - "mkdocs.yml", - "mkdocs.yaml", - "conf.py", - ".readthedocs.yaml", - ".readthedocs.yml", - # Scripts / notebooks that imply code-first dirs - "Makefile", - } -) - - -class Data(ProjectSpec): - """A directory whose primary contents are data files. - - Matches on any of: - - At least one file with an unambiguous data extension (CSV, Parquet, Arrow, - HDF5, images, audio, etc.) โ€” without requiring a metadata sidecar. - - A recognised directory layout: Hive partitioning (`key=value/` subdirs), - Apache Iceberg (`metadata/` directory), Delta Lake (`_delta_log/`), or - a Zarr store (`.zattrs` / `.zgroup` at the root). - - If no non-datapackage project signals are present in the directory the spec - parses unconditionally. If sentinel files that indicate another project type - (`pyproject.toml`, `Cargo.toml`, `package.json`, โ€ฆ) are found, parsing - succeeds only when the majority of bytes in the root file listing belong to - recognised data files; otherwise `ParseFailed` is raised so that the - directory is not double-counted as both a code project and a data project. - """ - - icon = "๐Ÿ—„๏ธ" - - def match(self) -> bool: - # Fast path: structural layout signals (no file-content inspection needed) - if self._detect_layout(): - return True - # Slow path: any top-level file with an unambiguous data extension - return any( - os.path.splitext(name)[1].lower() in _DATA_EXTENSIONS - for name in self.proj.basenames - ) - - def parse(self) -> None: - if self._has_non_data_sentinels(): - if not self._data_bytes_majority(): - raise ParseFailed( - "Non-data project sentinels found and data files are not " - "the majority of bytes โ€” skipping Data spec" - ) - - layout = self._detect_layout() - resources: list - - if layout in ("hive", "iceberg", "delta"): - resources = self._parse_layout_dirs(layout) - # Delta/Iceberg also commonly store data files at the root level - # alongside the log/metadata directory; collect those too. - if layout in ("iceberg", "delta"): - root_resources = self._parse_flat() - resources = resources + root_resources - elif layout in ("zarr_store", "tiledarray"): - resources = [self._parse_zarr_root()] - else: - resources = self._parse_flat() - - if not resources: - raise ParseFailed("No recognisable data files found") - - if len(resources) == 1: - self._contents["data_resource"] = resources[0] - else: - self._contents["data_resource"] = AttrDict( - {_safe_key(r.path): r for r in resources} - ) - - def _has_non_data_sentinels(self) -> bool: - """Return True if any non-datapackage project sentinel is present.""" - basenames = self.proj.basenames - return any(name in _NON_DATA_SENTINELS for name in basenames) - - def _data_bytes_majority(self) -> bool: - """Return True if data files account for >50 % of root-listing bytes. - - Files with unknown / zero size are excluded from both totals so they - do not unfairly skew the ratio. - """ - total_bytes = 0 - data_bytes = 0 - for entry in self.proj.filelist: - size = entry.get("size") or 0 - if size <= 0: - continue - total_bytes += size - ext = os.path.splitext(entry["name"].rsplit("/", 1)[-1])[1].lower() - if ext in _DATA_EXTENSIONS: - data_bytes += size - if total_bytes == 0: - return False - return data_bytes > total_bytes / 2 - - def _detect_layout(self) -> str: - """Return a layout string, or '' if none of the known layouts match. - - Uses the `contains` sentinel approach from intake: certain well-known - files/directories at the root identify a directory as a logical dataset. - """ - basenames = self.proj.basenames - # Zarr store: .zattrs, .zgroup, or zarr.json at the root - # (zarr.json is the Zarr v3 sentinel; .zattrs/.zgroup are v2) - if any(s in basenames for s in (".zattrs", ".zgroup", "zarr.json")): - return "zarr_store" - dir_names = {_basename(e["name"]) for e in _filelist_dirs(self.proj.filelist)} - # Delta Lake - if "_delta_log" in dir_names: - return "delta" - # TileDB array directory - if "__meta" in dir_names and "__schema" in dir_names: - return "tiledarray" - # Apache Iceberg: metadata/ directory present - if "metadata" in dir_names: - return "iceberg" - # Partitioned Parquet: _metadata sentinel file at root (written by Spark/Dask) - if "_metadata" in basenames: - return "iceberg" - # Hive: any top-level subdirectory whose name matches key=value - if any(_HIVE_DIR_RE.match(d) for d in dir_names): - return "hive" - return "" - - def _resource_from_entries( - self, entries: list[dict], fmt: str, modality: str, layout: str - ): - """Build a DataResource from a list of same-format file entries. - - The `path` field is set to: - - - Single file: the bare basename, e.g. `"data.csv"`. - - Multi-file series: a glob pattern, e.g. `"part*.csv"`, built from - the shared prefix/suffix of the basenames. - """ - from projspec.content.data import DataResource - - full_paths = [e["name"] for e in entries] - total_size = sum(e.get("size", 0) or 0 for e in entries) - sample_path = full_paths[0] if full_paths else "" - schema = _read_schema(sample_path, fmt, self.proj.fs) if sample_path else {} - - ext = os.path.splitext(_basename(full_paths[0]))[1] if full_paths else "" - - if len(entries) == 1: - path = _basename(full_paths[0]) or fmt - else: - stems = [os.path.splitext(_basename(p))[0] for p in full_paths] - prefix, suffix = _common_affix(stems) - stem_pattern = (prefix.rstrip("-_.") or fmt) + "*" + suffix - path = stem_pattern + ext - - return DataResource( - proj=self.proj, - path=path, - format=fmt, - modality=modality, - layout=layout, - file_count=len(entries), - total_size=total_size, - schema=schema, - sample_path=sample_path, - ) - - def _parse_flat(self) -> list: - """Group top-level files by format and naming series. - - Files of the same format are only collated into a single DataResource - when they share a consistent naming schema โ€” i.e. their stems differ - only in a numeric or date-like segment (e.g. `part0.csv`, - `part1.csv` or `2024-02.tiff`, `2024-03.tiff`). Files whose - stems vary in alphabetic content (e.g. `users.csv`, `orders.csv`) - each become their own DataResource. - """ - # First bucket by (fmt, modality) - fmt_groups: dict[tuple[str, str], list[dict]] = {} - for entry in _filelist_files(self.proj.filelist): - fmt_info = _fmt_from_path(entry["name"]) - if fmt_info is None: - continue - fmt_groups.setdefault(fmt_info, []).append(entry) - - resources = [] - for (fmt, modality), entries in fmt_groups.items(): - # Split each format-group into naming series - for series in _group_by_naming_series(entries): - resources.append( - self._resource_from_entries(series, fmt, modality, "flat") - ) - return resources - - def _parse_layout_dirs(self, layout: str) -> list: - """One DataResource per top-level subdirectory (partition / table root). - - Within each subdirectory the dominant format is determined, then files - are checked for a consistent naming series before collating. - """ - dir_entries = _filelist_dirs(self.proj.filelist) - resources = [] - for dir_entry in dir_entries: - dir_path = dir_entry["name"] - dir_name = _basename(dir_path) - # Skip hidden/internal dirs for iceberg/delta - if layout in ("iceberg", "delta") and dir_name.startswith( - ("metadata", "_delta_log", "_") - ): - continue - # Enumerate files one level inside this subdirectory - try: - sub_filelist = self.proj.fs.ls(dir_path, detail=True) - except Exception: - continue - - sub_files = _filelist_files(sub_filelist) - # Determine dominant (fmt, modality) by file count - fmt_counts: dict[tuple[str, str], int] = {} - for e in sub_files: - fmt_info = _fmt_from_path(e["name"]) - if fmt_info: - fmt_counts[fmt_info] = fmt_counts.get(fmt_info, 0) + 1 - if not fmt_counts: - continue - dominant = max(fmt_counts, key=lambda k: fmt_counts[k]) - dominant_fmt, dominant_modality = dominant - dominant_files = [ - e for e in sub_files if _fmt_from_path(e["name"]) == dominant - ] - resource = self._resource_from_entries( - dominant_files, dominant_fmt, dominant_modality, layout - ) - # Override path with the directory basename + trailing slash - # (partition dirs are already logically grouped by the directory) - resource.path = dir_name + "/" - resources.append(resource) - return resources - - def _parse_zarr_root(self): - """Describe the whole directory as a single array-store resource. - - Used for Zarr stores and TileDB arrays โ€” both are directory-as-dataset - layouts with no individual data files at the root. - """ - from projspec.content.data import DataResource - - url = self.proj.url - layout = self._detect_layout() - # TileDB directories are not Zarr; distinguish the format accordingly - if layout == "tiledarray": - fmt, modality = "tiledb", "array" - schema: dict | list = {} - else: - fmt, modality = "zarr", "array" - schema = {} - try: - import zarr # type: ignore[import] - - store = zarr.open(url, mode="r") - schema = { - "arrays": list(store.array_keys()), - "groups": list(store.group_keys()), - "attrs": dict(store.attrs), - } - except (ImportError, Exception): - pass - - total_size = sum( - e.get("size", 0) or 0 for e in _filelist_files(self.proj.filelist) - ) - return DataResource( - proj=self.proj, - path=(_basename(url) or fmt) + "/", - format=fmt, - modality=modality, - layout=layout, - file_count=len(_filelist_files(self.proj.filelist)), - total_size=total_size, - schema=schema, - sample_path="", - ) - - -# --------------------------------------------------------------------------- -# Utilities -# --------------------------------------------------------------------------- - - -def _safe_key(name: str) -> str: - """Convert an arbitrary name to a valid Python identifier for AttrDict keys.""" - key = re.sub(r"[^0-9a-zA-Z_]", "_", name) - if key and key[0].isdigit(): - key = "_" + key - return key or "_unnamed" diff --git a/src/projspec/proj/data_project.py b/src/projspec/proj/data_project.py new file mode 100644 index 0000000..d930b2f --- /dev/null +++ b/src/projspec/proj/data_project.py @@ -0,0 +1,434 @@ +"""The :class:`DataProject` project type. + +A *data project* is a directory that is wholly or substantially made up of +data files (as opposed to source code, docs or config). Examples: + +* a directory of CSV/parquet/JSON files exported from a database, +* a folder of images or arrays, +* a code project that *also* ships a significant amount of bundled data. + +Detection policy +---------------- +Scanning data is comparatively expensive (intake reads magic bytes / samples), +so we only do it when the data is *worth* describing. Data is considered +significant when **any** of the following holds: + +* the candidate data files make up at least ``data_min_fraction`` of the + project's total bytes, **and** their combined size is at least + ``data_min_total_size`` (guards against a project of tiny files); +* at least one individual data file is at least ``data_min_file_size`` + (a single big file is always worth describing); +* the directory matched no other project type at all (a bare data dump), as + long as the data clears ``data_min_total_size``. + +Consolidation +------------- +Before handing files to intake, obviously-related files are grouped into a +single dataset (see :mod:`projspec.proj._consolidate`): + +* numbered series โ€“ ``001.csv``, ``002.csv`` โ†’ ``*.csv`` +* spark/dask parts โ€“ ``part-00000.parquet`` โ€ฆ โ†’ ``part-*.parquet`` +* token series โ€“ ``green.gif``, ``red.gif`` โ†’ ``*.gif`` + +Intake's own directory-dataset recognition (hive parquet, zarr, delta, โ€ฆ) is +preserved: such directories are inspected as a whole rather than file-by-file. + +Per-dataset significance +------------------------ +Just as the whole directory must clear the significance bar above, the +individual datasets within a data project are filtered too: a dataset whose +size is less than ``data_min_fraction`` of the largest dataset is treated as +incidental and dropped (see :meth:`DataProject._filter_small_datasets`). This +mirrors the project-level fraction test so that a project dominated by one big +dataset doesn't also report a handful of tiny, unrelated ones. +""" + +from __future__ import annotations + +import logging + +from projspec.config import get_conf +from projspec.proj.base import ProjectSpec, ParseFailed +from projspec.proj._consolidate import consolidate, FileGroup +from projspec.utils import AttrDict + +logger = logging.getLogger("projspec.data_project") + +# Extensions that are *not* data: source code, build/config, docs. Anything +# else (or no extension) is a candidate data file. Kept conservative on +# purpose - intake makes the final call on whether something is real data. +_NON_DATA_EXT = { + # python / compiled + ".py", + ".pyc", + ".pyi", + ".pyx", + ".pxd", + ".so", + ".pyd", + ".ipynb", + # other languages + ".c", + ".h", + ".cpp", + ".hpp", + ".cc", + ".rs", + ".go", + ".java", + ".kt", + ".scala", + ".js", + ".jsx", + ".ts", + ".tsx", + ".rb", + ".php", + ".swift", + ".m", + ".sh", + ".bash", + ".lua", + ".pl", + ".r", + ".jl", + # config / build / project metadata + ".toml", + ".cfg", + ".ini", + ".lock", + ".mk", + ".cmake", + ".gradle", + ".bazel", + ".dockerfile", + ".env", + ".editorconfig", + ".gitignore", + ".gitattributes", + # docs / web + ".md", + ".rst", + ".txt", + ".html", + ".htm", + ".css", + ".scss", + ".tex", + # these are ambiguous - yaml/json are often config but also data; we treat + # them as candidate data only when they dominate (handled by thresholds). +} + +# Directory-based dataset markers intake understands; if present we inspect the +# whole directory rather than enumerating files. +_DIR_DATASET_MARKERS = ( + "_metadata", + "_common_metadata", + "_delta_log", + ".zgroup", + ".zarray", + "zarr.json", + "_latest.manifest", +) + + +class DataProject(ProjectSpec): + """A project that is wholly or substantially composed of data files. + + Produces one :class:`projspec.content.data.Dataset` content object per + consolidated dataset found, populated from + :func:`intake.readers.inspect.inspect_dataset` where intake is available. + """ + + icon = "๐Ÿ—ƒ๏ธ" + spec_doc = ( + "https://intake.readthedocs.io/en/latest/api2.html" + "#intake.readers.inspect.inspect_dataset" + ) + + # โ”€โ”€ helpers โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + @staticmethod + def _is_data_ext(name: str) -> bool: + """Whether a basename looks like a data file (not code/docs/config).""" + lower = name.lower() + if lower.startswith("."): + return False # dotfiles are metadata, not data + if "." not in lower: + return False # no extension - usually not a recognisable dataset + for double in (".csv.gz", ".json.gz", ".tsv.gz"): + if lower.endswith(double): + return True + ext = "." + lower.rsplit(".", 1)[-1] + return ext not in _NON_DATA_EXT + + def _candidate_files(self) -> list[tuple[str, int | None]]: + """``(basename, size)`` for data-like files directly in the root.""" + out = [] + for info in self.proj.filelist: + if info.get("type") == "directory": + continue + name = info["name"].rsplit("/", 1)[-1] + if self._is_data_ext(name): + out.append((name, info.get("size"))) + return out + + def _has_dir_dataset(self) -> bool: + """True if the root itself is an intake directory-dataset (hive, zarrโ€ฆ).""" + return any(m in self.proj.basenames for m in _DIR_DATASET_MARKERS) + + # โ”€โ”€ match โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + def match(self) -> bool: + """Cheap check: are there any candidate data files (or a dir-dataset)? + + Significance (size/fraction) is enforced in :meth:`parse` so that + ``match`` stays cheap and never reads file contents. + """ + if self._has_dir_dataset(): + return True + return bool(self._candidate_files()) + + # โ”€โ”€ significance policy โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + def _other_type_matches(self) -> bool: + """Cheaply test whether any *other* project type matches this directory. + + ``parse`` runs in registry order, so ``self.proj.specs`` is not yet + complete when ``DataProject`` is parsed. Instead we re-run the cheap + ``match()`` of every other registered spec. ``match`` is contractually + cheap (basename checks), so this adds little cost and only happens once + per directory that has candidate data. + """ + from projspec.proj.base import registry, ProjectExtra + + for name, cls in registry.items(): + if name == "data_project": + continue + # ProjectExtra specs (licences, CI, intake catalogs, โ€ฆ) are + # cross-cutting add-ons, not standalone project types, so a match + # from one of them should not suppress a data project. + if issubclass(cls, ProjectExtra): + continue + try: + inst = cls(self.proj) # __init__ runs match(), raises if no match + except Exception: + continue + else: + logger.debug("DataProject deferring to %s for %s", name, self.proj.url) + return True + return False + + def _is_significant(self, data_bytes: int, max_file: int) -> bool: + """Apply the detection policy described in the module docstring.""" + min_file = get_conf("data_min_file_size") + min_total = get_conf("data_min_total_size") + min_frac = get_conf("data_min_fraction") + min_play = get_conf("data_min_play_size") + + # 1. a single big file is always worth describing + if max_file >= min_file: + return True + + total = self.proj.total_size or data_bytes + # 2. data dominates the project by byte fraction (and isn't trivially small) + if total and data_bytes / total >= min_frac and data_bytes >= min_total: + return True + + # 3. nothing else matched -> treat any non-play data dump as a project. + # Here the bar is only "more than play data", not the full + # data_min_total_size used for the also-a-data-project case above. + if data_bytes >= min_play and not self._other_type_matches(): + return True + + return False + + def _filter_small_datasets(self, datasets: list) -> list: + """Drop datasets that are a small fraction of the largest one. + + Operates on a list of ``(name, Dataset)`` pairs (the form used while + assembling :meth:`parse`'s output). + + Just as :meth:`_is_significant` decides whether the directory as a + whole is data-y enough to report, this applies the same spirit to the + individual datasets within a data project: a dataset whose size is + less than ``data_min_fraction`` of the biggest dataset is treated as + incidental and discarded. + + The comparison is by byte size relative to the largest dataset. If + fewer than two datasets are present, or any dataset's size is unknown + (``None``), no filtering is applied (we can't reason about fractions). + """ + if len(datasets) < 2: + return datasets + sizes = [getattr(ds, "total_size", None) for _, ds in datasets] + if any(s is None for s in sizes): + return datasets + largest = max(s for s in sizes if s is not None) + if largest <= 0: + return datasets + min_frac = get_conf("data_min_fraction") + kept = [ + pair + for pair, s in zip(datasets, sizes) + if s is not None and s / largest >= min_frac + ] + # never drop everything: if the threshold somehow excludes all (e.g. + # min_frac > 1), fall back to keeping the original set. + return kept or datasets + + # โ”€โ”€ parse โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + def parse(self) -> None: + candidates = self._candidate_files() + has_dir_dataset = self._has_dir_dataset() + + data_bytes = sum(s or 0 for _, s in candidates) + max_file = max((s or 0 for _, s in candidates), default=0) + + if not has_dir_dataset and not self._is_significant(data_bytes, max_file): + raise ParseFailed("Data present but not a significant data project") + + groups: list[FileGroup] + if has_dir_dataset: + # Let intake describe the whole directory as one dataset. + name = self.proj.url.rstrip("/").rsplit("/", 1)[-1] or "dataset" + groups = [ + FileGroup( + members=[], + total_size=self.proj.total_size, + pattern=name, + consolidated=True, + ) + ] + dir_dataset = True + else: + min_group = get_conf("data_consolidate_min_group") + groups = consolidate(candidates, min_group=min_group) + dir_dataset = False + + if len(groups) > get_conf("data_inspect_max_datasets"): + logger.debug( + "Too many datasets (%d) in %s; describing without intake", + len(groups), + self.proj.url, + ) + described = [self._describe_without_intake(g) for g in groups] + else: + described = [self._describe(g, dir_dataset=dir_dataset) for g in groups] + + # Each entry is a (name, Dataset) pair. Only keep datasets that intake + # could assign a datatype to; datasets whose type could not be + # identified are not useful as data content. + described = [(name, ds) for name, ds in described if ds.datatype is not None] + + # Drop datasets that are only a small fraction of the largest one, + # analogous to the project-level significance test. + described = self._filter_small_datasets(described) + + if not described: + raise ParseFailed("No datasets with an identified datatype found") + + # Datasets are keyed by their (unique) name; the name is therefore not + # duplicated as a field on the Dataset objects themselves. + datasets = AttrDict() + for name, ds in described: + key = name + # guard against the (rare) case of duplicate names + n = 2 + while key in datasets: + key = f"{name}#{n}" + n += 1 + datasets[key] = ds + self._contents = AttrDict(dataset=datasets) + + # โ”€โ”€ dataset description โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + def _root_url(self) -> str: + """Protocol-qualified root URL for handing to intake / building dataset + URLs. + + ``self.proj.url`` is the filesystem-specific path with the protocol + stripped (e.g. ``bucket/key`` for ``s3://bucket/key``). Intake needs + the protocol to pick the right filesystem, so we restore it here. + """ + return self.proj.fs.unstrip_protocol(self.proj.url) + + def _dataset_url(self, group: FileGroup, dir_dataset: bool): + if dir_dataset: + return self._root_url() + return group.url(self._root_url()) + + def _describe_without_intake(self, group: FileGroup): + """Build a Dataset content object using only filename info (no I/O). + + Returns a ``(name, Dataset)`` pair; the name becomes the key in the + project's ``contents.dataset`` mapping. + """ + from projspec.content.data import Dataset + + return group.name, Dataset( + proj=self.proj, + url=group.url(self._root_url()), + datatype=None, + structure=[], + schema={}, + n_files=len(group.members) or 1, + total_size=group.total_size, + metadata={}, + ) + + def _describe(self, group: FileGroup, dir_dataset: bool = False): + """Describe a single file-group as a Dataset, using intake if available.""" + from projspec.content.data import Dataset + + url = self._dataset_url(group, dir_dataset) + info: dict | None = None + try: + from intake.readers.inspect import inspect_dataset + + # storage_options keep remote access working; the size guard and + # timeout protect against pathological inputs. + info = inspect_dataset( + url, + storage_options=self.proj.storage_options or None, + ) + except ImportError: + logger.debug("intake not installed; describing %s by name only", url) + except Exception as exc: # never let a bad file abort the whole parse + logger.debug("inspect_dataset failed for %s: %s", url, exc) + + if not info: + return self._describe_without_intake(group) + + n_files = info.get("n_files") or (len(group.members) or 1) + total = info.get("file_size_bytes") + if total is None: + total = group.total_size + + meta = { + k: info[k] + for k in ( + "shape", + "npartitions", + "reader_used", + "description", + "html_repr", + "thumbnail", + ) + if info.get(k) is not None + } + # report which readers intake thinks can load this, if any + readers = info.get("readers") or {} + if readers: + meta["readers"] = sorted(readers) + + structure = info.get("structure") or set() + name = group.pattern if dir_dataset else group.name + return name, Dataset( + proj=self.proj, + url=url, + datatype=info.get("detected_type"), + structure=sorted(structure) + if isinstance(structure, set) + else list(structure), + schema=info.get("datashape") or {}, + n_files=n_files, + total_size=total, + metadata=meta, + ) diff --git a/src/projspec/proj/knowledge_catalog.py b/src/projspec/proj/knowledge_catalog.py new file mode 100644 index 0000000..bf31fca --- /dev/null +++ b/src/projspec/proj/knowledge_catalog.py @@ -0,0 +1,208 @@ +"""The :class:`KnowledgeCatalog` project spec. + +Detects an *Open Knowledge Format* (OKF) bundle: a directory tree of markdown +files with YAML frontmatter, where every non-reserved ``.md`` file is a +"concept" carrying at least a ``type`` field. Two filenames are reserved at +any level: ``index.md`` (directory listing) and ``log.md`` (update history). + +See https://github.com/GoogleCloudPlatform/knowledge-catalog/blob/main/okf/SPEC.md +""" + +from __future__ import annotations + +import os +from io import StringIO + +from projspec.proj import ProjectSpec +from projspec.proj.base import ParseFailed +from projspec.utils import AttrDict + +# Filenames with reserved meaning that are never concept documents (ยง3.1). +_RESERVED = {"index.md", "log.md"} + + +def _split_frontmatter(text: str | bytes) -> dict | None: + """Return the parsed YAML frontmatter block of a markdown document. + + Returns ``None`` when the document has no parseable ``---``-delimited + frontmatter mapping at its start. + """ + import yaml + + if isinstance(text, bytes): + text = text.decode("utf-8", "replace") + # Frontmatter must be delimited by '---' on its own line at the start and a + # closing '---'. Require at least the opening and closing fences. + if text.count("---\n") < 2 and not text.lstrip().startswith("---"): + return None + parts = text.split("---\n") + if len(parts) < 3: + return None + # parts[0] is whatever precedes the first fence (should be empty/whitespace) + if parts[0].strip(): + return None + try: + meta = yaml.safe_load(StringIO(parts[1])) + except Exception: + return None + return meta if isinstance(meta, dict) else None + + +class KnowledgeCatalog(ProjectSpec): + """An Open Knowledge Format (OKF) knowledge bundle. + + An OKF bundle is a directory of markdown "concept" documents, each with a + YAML frontmatter block declaring a ``type``. Reserved ``index.md`` / + ``log.md`` files provide directory listings and update history. + + Produces one :class:`projspec.content.metadata.DescriptiveMetadata` per + concept, keyed by its *concept ID* (the file path within the bundle with + the ``.md`` suffix removed, e.g. ``tables/orders``). + """ + + icon = "๐Ÿ“š" + spec_doc = ( + "https://github.com/GoogleCloudPlatform/knowledge-catalog/blob/main/okf/SPEC.md" + ) + + def match(self) -> bool: + """Cheap check: a reserved ``index.md`` is present, plus either another + markdown document or a subdirectory that might hold concepts. + + Full validation (that concepts carry a ``type`` field) is deferred to + :meth:`parse`, which raises :class:`ParseFailed` if none qualify, so a + plain ``index.md`` from some other tool does not register as an OKF + bundle. + """ + if "index.md" not in self.proj.basenames: + return False + # another markdown concept at the root... + for name in self.proj.basenames: + if name.endswith(".md") and name not in _RESERVED: + return True + # ...or a subdirectory that might contain concepts + for info in self.proj.filelist: + if info.get("type") == "directory": + base = str(info["name"]).rstrip("/").rsplit("/", 1)[-1] + # skip hidden/dunder dirs (handled like project walking) + if not base.startswith((".", "_")): + return True + return False + + def _concept_files(self) -> list[str]: + """Full paths of candidate concept documents (recursive, non-reserved).""" + root = self.proj.url.rstrip("/") + try: + # glob may return a list or (with detail) a dict keyed by path + paths = list(self.proj.fs.glob(f"{root}/**/*.md")) + except Exception: + # fall back to the top-level listing if globbing isn't supported + paths = [ + full + for name, full in self.proj.basenames.items() + if name.endswith(".md") + ] + out = [] + for p in paths: + p = str(p) + base = p.rsplit("/", 1)[-1] + if base in _RESERVED: + continue + out.append(p) + return sorted(out) + + def _concept_id(self, full_path: str) -> str: + """The concept ID: bundle-relative path with the ``.md`` suffix removed.""" + root = self.proj.url.rstrip("/") + "/" + rel = full_path[len(root) :] if full_path.startswith(root) else full_path + if rel.endswith(".md"): + rel = rel[: -len(".md")] + return rel + + def parse(self) -> None: + from projspec.content.metadata import DescriptiveMetadata + + concepts = AttrDict() + for full in self._concept_files(): + try: + with self.proj.fs.open(full, "rt") as f: + text = f.read() + except OSError: + continue + meta = _split_frontmatter(text) + if not meta: + # not a conformant concept document - skip + continue + type_ = meta.get("type") + if not type_ or not str(type_).strip(): + # ยง9: every concept frontmatter must carry a non-empty `type` + continue + + entry: dict[str, str] = {"type": str(type_)} + for field in ("title", "description", "resource", "timestamp"): + val = meta.get(field) + if val: + entry[field] = str(val) + tags = meta.get("tags") + if tags: + if isinstance(tags, (list, tuple)): + entry["tags"] = ", ".join(str(t) for t in tags) + else: + entry["tags"] = str(tags) + + key = self._concept_id(full) + concepts[key] = DescriptiveMetadata(proj=self.proj, meta=entry) + + if not concepts: + raise ParseFailed("No OKF concept documents with a 'type' field found") + + # The bundle-root index.md may declare the OKF version it targets. + bundle_meta: dict[str, str] = {} + if "index.md" in self.proj.basenames: + try: + with self.proj.get_file("index.md") as f: + idx = _split_frontmatter(f.read()) + except OSError: + idx = None + if idx and idx.get("okf_version"): + bundle_meta["okf_version"] = str(idx["okf_version"]) + + contents = AttrDict(concept=concepts) + if bundle_meta: + contents["descriptive_metadata"] = DescriptiveMetadata( + proj=self.proj, meta=bundle_meta + ) + self._contents = contents + self._artifacts = AttrDict() + + @staticmethod + def _create(path: str) -> None: + """Scaffold a minimal but conformant OKF bundle.""" + name = os.path.basename(path.rstrip("/")) or "bundle" + + with open(f"{path}/index.md", "w") as f: + f.write( + "---\n" + 'okf_version: "0.1"\n' + "---\n\n" + f"# {name}\n\n" + "* [Overview](overview.md) - what this bundle contains\n" + ) + + with open(f"{path}/log.md", "w") as f: + f.write( + "# Update Log\n\n" + "## 2026-01-01\n" + "* **Initialization**: Created the bundle.\n" + ) + + with open(f"{path}/overview.md", "w") as f: + f.write( + "---\n" + "type: Reference\n" + f"title: {name} overview\n" + "description: A short description of this knowledge bundle.\n" + "---\n\n" + f"# {name}\n\n" + "Free-form markdown describing the knowledge captured here.\n" + ) diff --git a/src/projspec/textapp/main.py b/src/projspec/textapp/main.py index 5911bb4..bff7407 100644 --- a/src/projspec/textapp/main.py +++ b/src/projspec/textapp/main.py @@ -171,19 +171,9 @@ def _basename(url: str) -> str: def _fmt_age(ts: float) -> str: - import datetime + from projspec.proj.base import _humanize_age - days = (datetime.datetime.now() - datetime.datetime.fromtimestamp(ts)).days - if days == 0: - return "today" - if days == 1: - return "yesterday" - if days < 30: - return f"{days} days ago" - if days < 365: - return f"{days // 30} months ago" - yrs = days // 365 - return f"{yrs} year{'s' if yrs > 1 else ''} ago" + return _humanize_age(ts) def _is_enum(v: Any) -> bool: @@ -253,7 +243,15 @@ def _yaml_lines( return [f"{pad}{_role('{}', 'muted')}"] out = [] for k, v in data.items(): - if _is_enum(v): + # The web UIs embed these as live HTML / an image; a TUI can't, so + # show a short placeholder rather than dumping the huge raw string. + if k in ("html_repr", "thumbnail") and isinstance(v, str): + note = "HTML preview" if k == "html_repr" else "image thumbnail" + out.append( + f"{pad}{_role(str(k), 'field')}: " + f"{_role(f'<{note} available in graphical UI>', 'muted')}" + ) + elif _is_enum(v): out.append( f"{pad}{_role(str(k), 'field')}: " f"{_role(_enum_label(v, enums), 'enum')}" @@ -713,6 +711,9 @@ def compose(self) -> ComposeResult: age = _fmt_age(float(last_modified)) by = self.project.get("last_modified_by") meta_parts.append("last modified " + age + (f" by {by}" if by else "")) + scanned_at = self.project.get("scanned_at") + if scanned_at is not None: + meta_parts.append("scanned " + _fmt_age(float(scanned_at))) if meta_parts: yield Static(" ยท ".join(meta_parts), classes="meta") # Build the full list of chips first, then split into horizontal diff --git a/src/projspec/utils.py b/src/projspec/utils.py index fa70261..64dd015 100644 --- a/src/projspec/utils.py +++ b/src/projspec/utils.py @@ -15,6 +15,10 @@ logger = logging.getLogger("projspec") +class DEFAULT: + ... + + class Enum(enum.Enum): """Named enum values, so that str(x) looks like the label.""" @@ -113,7 +117,10 @@ def from_dict(dic, proj=None): if dic["klass"] == "project": return Project.from_dict(dic) category, name = dic.pop("klass") - cls = get_cls(name, category) + try: + cls = get_cls(name, category) + except KeyError: + return None if category == "enum": return cls(dic["value"]) obj = object.__new__(cls) diff --git a/src/projspec/webui/panel.js b/src/projspec/webui/panel.js index 870e249..b1f3904 100644 --- a/src/projspec/webui/panel.js +++ b/src/projspec/webui/panel.js @@ -102,8 +102,18 @@ } } function fmtAge(ts) { - const days = Math.floor((Date.now() / 1000 - parseFloat(ts)) / 86400); - if (days === 0) return 'today'; + const secs = Math.floor(Date.now() / 1000 - parseFloat(ts)); + if (secs < 0) return 'just now'; + const days = Math.floor(secs / 86400); + if (days === 0) { + if (secs < 60) return 'just now'; + if (secs < 3600) { + const m = Math.floor(secs / 60); + return m + ' minute' + (m !== 1 ? 's' : '') + ' ago'; + } + const h = Math.floor(secs / 3600); + return h + ' hour' + (h !== 1 ? 's' : '') + ' ago'; + } if (days === 1) return 'yesterday'; if (days < 30) return days + ' days ago'; if (days < 365) return Math.floor(days / 30) + ' months ago'; @@ -200,6 +210,8 @@ const by = project.last_modified_by != null ? project.last_modified_by : null; metaParts.push('last modified ' + age + (by ? ' by ' + by : '')); } + if (project.scanned_at != null) + metaParts.push('scanned ' + fmtAge(project.scanned_at)); if (metaParts.length > 0) { const meta = document.createElement('div'); meta.className = 'meta'; @@ -456,14 +468,69 @@ body.innerHTML = sanitizeHtml(html); w.appendChild(body); } else { + // Datasets (and other content) may carry rich previews in + // ``metadata.html_repr`` (an HTML fragment) and + // ``metadata.thumbnail`` (a data: image URL). Embed those rather + // than dumping their (often huge) raw strings into the YAML tree. + const meta = (kind === 'content' && data && typeof data === 'object' + && data.metadata && typeof data.metadata === 'object') ? data.metadata : null; + const htmlRepr = meta && typeof meta.html_repr === 'string' ? meta.html_repr : null; + const thumb = meta && typeof meta.thumbnail === 'string' ? meta.thumbnail : null; + const tree = document.createElement('div'); tree.className = 'tree yaml'; - tree.appendChild(renderYaml(stripKlass(data))); + tree.appendChild(renderYaml(stripPreview(stripKlass(data)))); w.appendChild(tree); + + if (thumb) w.appendChild(thumbnailImg(thumb)); + if (htmlRepr) { + const body = document.createElement('div'); + body.className = 'widget-html'; + body.innerHTML = sanitizeHtml(htmlRepr); + w.appendChild(body); + } } return w; } + /** + * Return a shallow copy of a content dict with the embedded-preview keys + * (``metadata.html_repr`` / ``metadata.thumbnail``) removed, so the YAML + * tree doesn't show their large raw strings - they are rendered as live + * HTML / an image instead. + */ + function stripPreview(obj) { + if (!obj || typeof obj !== 'object' || Array.isArray(obj)) return obj; + if (!obj.metadata || typeof obj.metadata !== 'object' || Array.isArray(obj.metadata)) return obj; + const meta = {}; + let changed = false; + for (const k of Object.keys(obj.metadata)) { + if (k === 'html_repr' || k === 'thumbnail') { changed = true; continue; } + meta[k] = obj.metadata[k]; + } + if (!changed) return obj; + const out = {}; + for (const k of Object.keys(obj)) out[k] = obj[k]; + out.metadata = meta; + return out; + } + + /** + * Build an for a ``data:image/...`` thumbnail URL. Only accepts + * data: image URLs (never remote/javascript URLs). + */ + function thumbnailImg(src) { + const wrap = document.createElement('div'); + wrap.className = 'widget-html'; + if (/^data:image\//i.test(src)) { + const img = document.createElement('img'); + img.src = src; + img.alt = 'thumbnail'; + wrap.appendChild(img); + } + return wrap; + } + /** * Minimal HTML sanitisation for content-provided ``_html`` fragments. * The markup originates from projspec itself, so we don't need a diff --git a/tests/test_basic.py b/tests/test_basic.py index 778d53b..b9f011d 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -1,5 +1,6 @@ import json import pickle +import time import pytest @@ -14,7 +15,32 @@ def test_basic(proj): assert "src/projspec" in proj.children assert repr(proj).count("\n") == 0 assert str(proj).count("\n") > 0 - proj._repr_html_() + proj._ipython_display_() + + +def test_humanize_age(): + from projspec.proj.base import _humanize_age + + now = time.time() + assert _humanize_age(now) == "just now" + assert _humanize_age(now + 100) == "just now" # future / clock skew + assert _humanize_age(now - 5 * 60) == "5 minutes ago" + assert _humanize_age(now - 60) == "1 minute ago" + assert _humanize_age(now - 3 * 3600) == "3 hours ago" + assert _humanize_age(now - 1.5 * 86400) == "yesterday" + assert _humanize_age(now - 10 * 86400) == "10 days ago" + assert _humanize_age(now - 60 * 86400) == "2 months ago" + assert _humanize_age(now - 400 * 86400) == "1 year ago" + assert _humanize_age(now - 800 * 86400) == "2 years ago" + + +def test_scanned_at_in_stats_line(proj): + # scanned_at should appear in the textual surfaces + assert "scanned " in proj._stats_line() + assert "scanned " in proj.text_summary() + assert "scanned " in str(proj) + # bare summary omits the stats line entirely + assert "scanned " not in proj.text_summary(bare=True) def test_errors(): diff --git a/tests/test_data_html.py b/tests/test_data_html.py deleted file mode 100644 index 2d6e6ea..0000000 --- a/tests/test_data_html.py +++ /dev/null @@ -1,449 +0,0 @@ -"""Tests for projspec.content.data_html โ€” repr_text and repr_html. - -These tests use a mock DataResource to avoid needing real data files on disk -for basic formatting checks, then run format-specific loader tests when the -required optional libraries are available. -""" - -from __future__ import annotations - -import io -import os -import tempfile -from unittest.mock import MagicMock - -import pytest - -import projspec - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -def _make_dr( - path="mytable.parquet", - fmt="parquet", - modality="tabular", - layout="flat", - file_count=3, - total_size=1024 * 512, - schema=None, - sample_path="", - metadata=None, -): - """Build a DataResource backed by a real Project (the repo root) but with - controlled field values.""" - from projspec.content.data import DataResource - - mock_proj = MagicMock(spec=projspec.Project) - # Use a real local filesystem via fsspec - import fsspec - - mock_proj.fs = fsspec.filesystem("file") - mock_proj.url = "/tmp" - - return DataResource( - proj=mock_proj, - path=path, - format=fmt, - modality=modality, - layout=layout, - file_count=file_count, - total_size=total_size, - schema=schema or {}, - sample_path=sample_path, - metadata=metadata or {}, - ) - - -# --------------------------------------------------------------------------- -# repr_text tests -# --------------------------------------------------------------------------- - - -class TestReprText: - def test_basic_fields_present(self): - dr = _make_dr() - text = repr(dr) - assert "mytable.parquet" in text - assert "parquet" in text - assert "tabular" in text - assert "files=3" in text - - def test_size_formatting(self): - dr = _make_dr(total_size=1024) - text = repr(dr) - assert "KB" in text or "B" in text - - def test_size_zero(self): - dr = _make_dr(total_size=0) - text = repr(dr) - assert "unknown" in text - - def test_schema_hint_dict(self): - dr = _make_dr(schema={"col_a": "int64", "col_b": "float32", "col_c": "str"}) - text = repr(dr) - assert "col_a" in text - - def test_schema_hint_many_fields(self): - schema = {f"col_{i}": "int64" for i in range(10)} - dr = _make_dr(schema=schema) - text = repr(dr) - assert "+7 more" in text - - def test_schema_hint_list(self): - dr = _make_dr(schema=[{"name": "a"}, {"name": "b"}]) - text = repr(dr) - assert "2 fields" in text - - def test_non_flat_layout_shown(self): - dr = _make_dr(layout="hive") - text = repr(dr) - assert "hive" in text - - def test_flat_layout_hidden(self): - dr = _make_dr(layout="flat") - text = repr(dr) - assert "layout" not in text - - def test_no_modality(self): - dr = _make_dr(modality="") - text = repr(dr) - assert "modality" not in text - - def test_single_line(self): - dr = _make_dr() - text = repr(dr) - assert "\n" not in text - - def test_path_shown(self): - """repr_text must show the path field, not a separate name.""" - dr = _make_dr(path="part*.csv") - text = repr(dr) - assert "part*.csv" in text - - def test_dir_path_shown(self): - dr = _make_dr(path="year=2024/") - text = repr(dr) - assert "year=2024/" in text - - -# --------------------------------------------------------------------------- -# repr_html tests -# --------------------------------------------------------------------------- - - -class TestReprHtml: - def test_returns_string(self): - dr = _make_dr() - html = dr._repr_html_() - assert isinstance(html, str) - assert len(html) > 0 - - def test_contains_path(self): - dr = _make_dr(path="my_dataset.parquet") - html = dr._repr_html_() - assert "my_dataset.parquet" in html - - def test_contains_glob_path(self): - dr = _make_dr(path="part*.parquet") - html = dr._repr_html_() - assert "part*.parquet" in html - - def test_contains_dir_path(self): - dr = _make_dr(path="year=2024/") - html = dr._repr_html_() - assert "year=2024/" in html - - def test_contains_format_badge(self): - dr = _make_dr(fmt="parquet") - html = dr._repr_html_() - assert "parquet" in html - - def test_contains_modality_badge(self): - dr = _make_dr(modality="tabular") - html = dr._repr_html_() - assert "tabular" in html - - def test_contains_file_count(self): - dr = _make_dr(file_count=7) - html = dr._repr_html_() - assert "7" in html - - def test_contains_size(self): - dr = _make_dr(total_size=2048) - html = dr._repr_html_() - assert "KB" in html or "B" in html - - def test_schema_dict_rendered(self): - dr = _make_dr(schema={"id": "int64", "name": "string"}) - html = dr._repr_html_() - assert "id" in html - assert "int64" in html - - def test_schema_list_of_dicts_rendered(self): - dr = _make_dr( - schema=[ - {"name": "id", "type": "integer"}, - {"name": "val", "type": "number"}, - ] - ) - html = dr._repr_html_() - assert "id" in html - assert "integer" in html - - def test_schema_empty_no_details(self): - dr = _make_dr(schema={}) - html = dr._repr_html_() - assert "Schema" not in html - - def test_no_preview_section_without_sample_path(self): - dr = _make_dr(sample_path="") - html = dr._repr_html_() - assert "Preview" not in html - - def test_layout_badge_shown_for_hive(self): - dr = _make_dr(layout="hive") - html = dr._repr_html_() - assert "hive" in html - - def test_layout_badge_hidden_for_flat(self): - dr = _make_dr(layout="flat") - html = dr._repr_html_() - assert 'ps-badge-gray">flat<' not in html - - def test_html_structure(self): - dr = _make_dr() - html = dr._repr_html_() - assert "ps-data-card" in html - assert "ps-data-card-header" in html - assert "ps-data-meta" in html - - def test_icon_present_for_known_modality(self): - dr = _make_dr(modality="image") - html = dr._repr_html_() - # Image icon is ๐Ÿ–ผ (🖼) - assert "🖼" in html - - def test_icon_fallback_for_unknown_modality(self): - dr = _make_dr(modality="") - html = dr._repr_html_() - # Fallback icon 🗂 - assert "🗂" in html - - def test_large_schema_collapsed(self): - schema = {f"col_{i}": "int64" for i in range(20)} - dr = _make_dr(schema=schema) - html = dr._repr_html_() - # details element should NOT have open attribute when >8 fields - assert ( - "
    ' in html - ) - - def test_small_schema_open(self): - schema = {f"col_{i}": "int64" for i in range(4)} - dr = _make_dr(schema=schema) - html = dr._repr_html_() - assert "
    with a dataframe class - assert "dataframe" in html or "ps-df-wrap" in html - - def test_csv_preview_row_limit(self, tmp_path): - """Only _PREVIEW_ROWS rows of data should appear, not all 50.""" - pytest.importorskip("pandas") - import pandas as pd - - path = str(tmp_path / "big.csv") - pd.DataFrame({"v": range(50)}).to_csv(path, index=False) - dr = self._dr_for_file(path, "csv", "tabular") - html = dr._repr_html_() - # Extract just the preview section so CSS text doesn't interfere - preview_start = html.find('
    ') - assert preview_start != -1, "no preview section found" - preview_html = html[preview_start:] - # The last row value (49) should not appear as a table cell - assert "49" not in preview_html - - def test_parquet_preview(self, tmp_path): - pytest.importorskip("pyarrow") - import pyarrow as pa - import pyarrow.parquet as pq - - path = str(tmp_path / "data.parquet") - table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]}) - pq.write_table(table, path) - dr = self._dr_for_file(path, "parquet", "tabular") - html = dr._repr_html_() - assert "Preview" in html - assert " 1 MB threshold - np.save(path, np.zeros((512, 512), dtype="float64")) - dr = self._dr_for_file(path, "numpy", "array") - html = dr._repr_html_() - assert "(512, 512)" in html # shape shown - assert "float64" in html # dtype shown - # The data slice key ("preview") should NOT appear in the info table; - # check the table cell content rather than the CSS class names - assert ">preview<" not in html # no preview row - - -# --------------------------------------------------------------------------- -# fmt_size helper -# --------------------------------------------------------------------------- - - -def test_fmt_size(): - from projspec.content.data_html import _fmt_size - - assert _fmt_size(0) == "unknown" - assert _fmt_size(512) == "512 B" - assert "KB" in _fmt_size(2048) - assert "MB" in _fmt_size(2 * 1024 * 1024) - assert "GB" in _fmt_size(3 * 1024**3) diff --git a/tests/test_data_project.py b/tests/test_data_project.py index 3dae345..f0d13b5 100644 --- a/tests/test_data_project.py +++ b/tests/test_data_project.py @@ -1,326 +1,594 @@ -import json +"""Tests for the DataProject spec and the file-consolidation helper. + +The consolidation helper is filesystem-agnostic and tested directly on +``(basename, size)`` lists. The DataProject spec is tested end-to-end by +writing files into a tmpdir and constructing a real ``projspec.Project``. + +Intake may or may not be installed (and which readers are available varies), +so the DataProject assertions only check things that do not depend on a +specific reader being present: that the project is/ isn't detected, how files +are consolidated, file counts and sizes. Where intake is available we also +spot-check ``datatype``/``structure``. +""" + import os import pytest import projspec -from projspec.content.data import DataResource -from projspec.utils import from_dict - - -def _data_project(tmp_path): - """Return a projspec.Project rooted at *tmp_path* (no walk needed).""" - return projspec.Project(str(tmp_path)) - - -class TestDataDetection: - def test_csv_detected(self, tmp_path): - (tmp_path / "data.csv").write_text("x,y\n1,2\n3,4\n") - proj = _data_project(tmp_path) - assert "data" in proj.specs - - def test_parquet_detected(self, tmp_path): - pytest.importorskip("pyarrow") - import pyarrow as pa - import pyarrow.parquet as pq - - pq.write_table(pa.table({"a": [1, 2]}), str(tmp_path / "t.parquet")) - proj = _data_project(tmp_path) - assert "data" in proj.specs - - def test_no_data_files_not_detected(self, tmp_path): - (tmp_path / "README.md").write_text("hello") - (tmp_path / "config.json").write_text("{}") - proj = _data_project(tmp_path) - assert "data" not in proj.specs - - -class TestDataParse: - def test_single_csv_resource(self, tmp_path): - (tmp_path / "sales.csv").write_text("col1,col2\n1,a\n2,b\n") - proj = _data_project(tmp_path) - dr = proj.specs["data"].contents["data_resource"] - assert isinstance(dr, DataResource) - assert dr.path == "sales.csv" - assert dr.format == "csv" - assert dr.modality == "tabular" - assert dr.file_count == 1 - - def test_series_collated_to_glob_path(self, tmp_path): - """part0.csv + part1.csv โ†’ path == 'part*.csv'""" - for i in range(3): - (tmp_path / f"part{i}.csv").write_text("x\n1\n") - proj = _data_project(tmp_path) - dr = proj.specs["data"].contents["data_resource"] - assert isinstance(dr, DataResource) - assert dr.path == "part*.csv" - assert dr.file_count == 3 - - def test_distinct_csv_files_separate_resources(self, tmp_path): - """users.csv and orders.csv differ alphabetically โ†’ two resources.""" - (tmp_path / "users.csv").write_text("id\n1\n") - (tmp_path / "orders.csv").write_text("id\n1\n") - proj = _data_project(tmp_path) - dr_map = proj.specs["data"].contents["data_resource"] - # Two separate DataResource objects, keyed in an AttrDict - assert len(dr_map) == 2 - paths = {dr_map[k].path for k in dr_map} - assert "users.csv" in paths - assert "orders.csv" in paths - - def test_sample_path_is_full_path(self, tmp_path): - csv = tmp_path / "data.csv" - csv.write_text("x\n1\n") - proj = _data_project(tmp_path) - dr = proj.specs["data"].contents["data_resource"] - assert dr.sample_path == str(csv) - - def test_total_size_nonzero(self, tmp_path): - content = "x,y\n" + "\n".join(f"{i},{i}" for i in range(20)) - (tmp_path / "nums.csv").write_text(content) - proj = _data_project(tmp_path) - dr = proj.specs["data"].contents["data_resource"] - assert dr.total_size > 0 - - -class TestDataResourceToDict: - def _make_dr(self, tmp_path): - (tmp_path / "items.csv").write_text("id,val\n1,a\n2,b\n") - proj = _data_project(tmp_path) - return proj.specs["data"].contents["data_resource"] - - def test_compact_omits_klass(self, tmp_path): - dr = self._make_dr(tmp_path) - d = dr.to_dict(compact=True) - assert "klass" not in d - - def test_compact_omits_html(self, tmp_path): - """compact=True is for human/console output โ€” _html must be absent.""" - dr = self._make_dr(tmp_path) - d = dr.to_dict(compact=True) - assert "_html" not in d - - -class TestDataResourceRoundTrip: - def _roundtrip(self, dr): - """Serialise to JSON and rehydrate, returning the new DataResource.""" - d = dr.to_dict(compact=False) - js = json.dumps(d) - d2 = json.loads(js) - return from_dict(d2, proj=dr.proj) - - def _make_dr(self, tmp_path): - (tmp_path / "orders.csv").write_text("order_id,amount\n1,99\n2,42\n") - proj = _data_project(tmp_path) - return proj.specs["data"].contents["data_resource"] - - def test_roundtrip_returns_dataresource(self, tmp_path): - dr2 = self._roundtrip(self._make_dr(tmp_path)) - assert isinstance(dr2, DataResource) - - def test_roundtrip_preserves_path(self, tmp_path): - dr2 = self._roundtrip(self._make_dr(tmp_path)) - assert dr2.path == "orders.csv" - - def test_roundtrip_preserves_format(self, tmp_path): - dr2 = self._roundtrip(self._make_dr(tmp_path)) - assert dr2.format == "csv" - - def test_roundtrip_preserves_modality(self, tmp_path): - dr2 = self._roundtrip(self._make_dr(tmp_path)) - assert dr2.modality == "tabular" - - def test_roundtrip_preserves_file_count(self, tmp_path): - dr2 = self._roundtrip(self._make_dr(tmp_path)) - assert dr2.file_count == 1 - - def test_roundtrip_preserves_total_size(self, tmp_path): - dr = self._make_dr(tmp_path) - dr2 = self._roundtrip(dr) - assert dr2.total_size == dr.total_size - - def test_roundtrip_preserves_schema(self, tmp_path): - pytest.importorskip("pyarrow") - import pyarrow as pa, pyarrow.parquet as pq - - pq.write_table( - pa.table({"col_a": [1, 2, 3], "col_b": ["x", "y", "z"]}), - str(tmp_path / "data.parquet"), +from projspec.config import temp_conf +from projspec.proj._consolidate import consolidate, FileGroup +from projspec.proj.data_project import DataProject +from projspec.content.data import Dataset, TabularData, IntakeSource + +try: + import intake.readers.inspect # noqa: F401 + + HAS_INTAKE = True +except Exception: # pragma: no cover + HAS_INTAKE = False + +try: + import pandas as _pd # noqa: F401 + + HAS_PANDAS = True +except Exception: # pragma: no cover + HAS_PANDAS = False + +try: + # importing here puts PIL in sys.modules so intake's check_imports (which + # uses importlib.metadata.distribution and falls back to sys.modules) finds + # it - Pillow's distribution name ("pillow") differs from the import name. + import PIL # noqa: F401 + import numpy as _np # noqa: F401 + + HAS_PIL = True +except Exception: # pragma: no cover + HAS_PIL = False + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +# Production-equivalent significance thresholds. Tests that depend on these +# values set them explicitly via temp_conf so they do not rely on (and are not +# broken by changes to) the config defaults. +PROD_THRESHOLDS = dict( + data_min_fraction=0.5, + data_min_file_size=1024 * 1024, + data_min_total_size=10 * 1024 * 1024, + data_min_play_size=64 * 1024, +) + + +def write_data(tmpdir, files: dict[str, int | bytes]) -> str: + """Write files into *tmpdir*. + + Values are either an int (number of zero bytes to write) or raw bytes. + """ + path = str(tmpdir) + for rel, content in files.items(): + full = os.path.join(path, rel) + os.makedirs(os.path.dirname(full), exist_ok=True) + data = content if isinstance(content, bytes) else b"\0" * content + with open(full, "wb") as f: + f.write(data) + return path + + +def datasets(proj) -> dict[str, Dataset]: + """The ``name -> Dataset`` mapping for a project's data datasets.""" + dp = proj.specs.get("data_project") + return dict(dp.contents.get("dataset", {})) if dp else {} + + +def dataset_names(proj) -> set[str]: + """The set of dataset names (mapping keys) for a project.""" + return set(datasets(proj)) + + +# --------------------------------------------------------------------------- +# consolidate() +# --------------------------------------------------------------------------- + + +class TestConsolidate: + def test_numbered_csv_series(self): + files = [(f"{i:03d}.csv", 100) for i in range(1, 6)] + groups = consolidate(files) + assert len(groups) == 1 + g = groups[0] + assert g.consolidated + assert g.pattern == "*.csv" + assert len(g.members) == 5 + assert g.total_size == 500 + + def test_spark_parts(self): + files = [(f"part-{i:05d}.parquet", 10) for i in range(4)] + groups = consolidate(files) + assert len(groups) == 1 + assert groups[0].pattern == "part-*.parquet" + assert groups[0].consolidated + + def test_year_series(self): + files = [(f"data_{y}.json", 5) for y in range(2015, 2021)] + groups = consolidate(files) + assert len(groups) == 1 + assert groups[0].pattern == "data_*.json" + + def test_token_series_colours(self): + files = [("red.gif", 1), ("green.gif", 1), ("blue.gif", 1)] + groups = consolidate(files, min_token_group=2) + assert len(groups) == 1 + assert groups[0].pattern == "*.gif" + assert groups[0].consolidated + assert sorted(groups[0].members) == ["blue.gif", "green.gif", "red.gif"] + + def test_below_min_group_stays_standalone(self): + # only two numbered files, default min_group=3 -> not consolidated + files = [("001.csv", 10), ("002.csv", 10)] + groups = consolidate(files, min_group=3, min_token_group=99) + assert all(not g.consolidated for g in groups) + assert len(groups) == 2 + + def test_mixed_extensions_separate_groups(self): + files = [(f"{i:03d}.csv", 10) for i in range(5)] + files += [(f"{i:03d}.json", 10) for i in range(5)] + groups = consolidate(files) + patterns = sorted(g.pattern for g in groups) + assert patterns == ["*.csv", "*.json"] + + def test_unrelated_files_standalone(self): + files = [("readme_data.bin", 10), ("schema.avro", 10)] + groups = consolidate(files, min_token_group=99) + assert all(not g.consolidated for g in groups) + assert {g.name for g in groups} == {"readme_data.bin", "schema.avro"} + + def test_double_extension_grouping(self): + files = [(f"part{i}.csv.gz", 10) for i in range(5)] + groups = consolidate(files) + assert len(groups) == 1 + assert groups[0].ext == ".csv.gz" + assert groups[0].consolidated + + def test_url_glob_vs_list(self, tmp_path): + g = FileGroup( + members=["001.csv", "002.csv", "003.csv"], + ext=".csv", + pattern="*.csv", + consolidated=True, ) - proj = _data_project(tmp_path) - dr = proj.specs["data"].contents["data_resource"] - dr2 = self._roundtrip(dr) - assert dr2.schema == dr.schema - - def test_roundtrip_html_matches_original(self, tmp_path): - """_repr_html_() on the rehydrated object must equal the original render.""" - dr = self._make_dr(tmp_path) - html_original = dr._repr_html_() - dr2 = self._roundtrip(dr) - assert dr2._repr_html_() == html_original - - def test_roundtrip_html_cached_without_rerender(self, tmp_path): - """After from_dict the HTML is already in _html โ€” no re-render occurs.""" - dr = self._make_dr(tmp_path) - html_original = dr._repr_html_() - d = dr.to_dict(compact=False) - d2 = json.loads(json.dumps(d)) - dr2 = from_dict(d2, proj=dr.proj) - - # Confirm _html is set directly on the instance (not via lazy render) - assert ( - "_html" in dr2.__dict__ - ), "_html should be in instance __dict__ after from_dict" - assert dr2.__dict__["_html"] == html_original - - def test_roundtrip_html_survives_missing_sample_path(self, tmp_path): - """After rehydration, _repr_html_() must work even if sample_path - no longer resolves (e.g. moved to a different machine).""" - dr = self._make_dr(tmp_path) - # Trigger render with a real file, then remove the file - html_original = dr._repr_html_() - os.remove(dr.sample_path) - - dr2 = self._roundtrip(dr) - # sample_path is gone โ€” but HTML was cached in the dict - assert dr2._repr_html_() == html_original - - -class TestDataConditionalParse: - """Tests for the 'other project types present' guard in Data.parse().""" - - def _big_csv(self, path, rows=500): - """Write a CSV large enough to dominate byte counts.""" - content = "id,value\n" + "\n".join(f"{i},{i * 2}" for i in range(rows)) - path.write_text(content) - - def test_pure_data_dir_no_sentinel(self, tmp_path): - """No sentinel โ†’ Data always parsed regardless of byte ratios.""" - (tmp_path / "data.csv").write_text("x\n1\n") - proj = _data_project(tmp_path) - assert "data" in proj.specs - - def test_datapackage_companion_not_a_sentinel(self, tmp_path): - """datapackage.json is a compatible companion โ€” not a sentinel.""" - self._big_csv(tmp_path / "data.csv") - (tmp_path / "datapackage.json").write_text('{"resources": []}') - proj = _data_project(tmp_path) - assert "data" in proj.specs - - def test_dvc_companion_not_a_sentinel(self, tmp_path): - """catalog.yaml (IntakeCatalog / DVCRepo companion) is not a sentinel.""" - self._big_csv(tmp_path / "data.csv") - (tmp_path / "catalog.yaml").write_text("sources: {}") - proj = _data_project(tmp_path) - assert "data" in proj.specs - - def test_sentinel_present_data_majority(self, tmp_path): - """Sentinel is present, but data files are the majority of bytes โ†’ Data parsed.""" - self._big_csv(tmp_path / "data.csv") # large data file - (tmp_path / "pyproject.toml").write_text( - "[project]\nname='x'\n" - ) # tiny sentinel - proj = _data_project(tmp_path) - assert "data" in proj.specs - - def test_sentinel_present_data_majority_parquet(self, tmp_path): - pytest.importorskip("pyarrow") - import pyarrow as pa, pyarrow.parquet as pq - - pq.write_table( - pa.table({"x": list(range(1000)), "y": list(range(1000))}), - str(tmp_path / "data.parquet"), + assert g.url("/data/foo") == "/data/foo/*.csv" + single = FileGroup(members=["only.csv"], ext=".csv", pattern="only.csv") + assert single.url("/data/foo") == "/data/foo/only.csv" + + def test_size_unknown_propagates_none(self): + files = [("001.csv", None), ("002.csv", 10), ("003.csv", 10)] + groups = consolidate(files) + assert groups[0].total_size is None + + +# --------------------------------------------------------------------------- +# Content classes +# --------------------------------------------------------------------------- + + +class TestContentClasses: + def test_dataset_roundtrip(self, tmp_path): + proj = projspec.Project(str(tmp_path)) + ds = Dataset( + proj=proj, + url=f"{proj.url}/*.csv", + datatype="CSV", + structure=["table"], + schema={"columns": ["a", "b"]}, + n_files=3, + total_size=999, + metadata={"readers": ["DaskCSV"]}, ) - (tmp_path / "Cargo.toml").write_text('[package]\nname="x"\n') - proj = _data_project(tmp_path) - assert "data" in proj.specs - - # -- mixed dirs where non-data dominates -- - - def test_sentinel_present_code_majority(self, tmp_path): - """Sentinel present and code files dominate โ†’ Data spec suppressed.""" - # Large Python source file - (tmp_path / "main.py").write_text("x = 1\n" * 5000) - # Tiny CSV - (tmp_path / "tiny.csv").write_text("a,b\n1,2\n") - (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n") - proj = _data_project(tmp_path) - assert "data" not in proj.specs - - def test_sentinel_present_equal_split_not_majority(self, tmp_path): - """Exactly 50/50 bytes is not a majority โ€” Data suppressed.""" - payload = "x" * 1000 - (tmp_path / "code.py").write_text(payload) - (tmp_path / "data.csv").write_text(payload) - (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n") - proj = _data_project(tmp_path) - assert "data" not in proj.specs - - # -- helpers / unit tests for the private methods -- - - def test_has_non_data_sentinels_true(self, tmp_path): - from projspec.proj.data_dir import Data - - (tmp_path / "data.csv").write_text("x\n1\n") - (tmp_path / "pyproject.toml").write_text("") - proj = object.__new__(projspec.Project) - import fsspec - - proj.fs = fsspec.filesystem("file") - proj.url = str(tmp_path) - proj.__dict__["basenames"] = { - e["name"].rsplit("/", 1)[-1]: e["name"] - for e in proj.fs.ls(str(tmp_path), detail=True) - } - proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True) - inst = Data.__new__(Data) - inst.proj = proj - assert inst._has_non_data_sentinels() is True - - def test_has_non_data_sentinels_false(self, tmp_path): - from projspec.proj.data_dir import Data - - (tmp_path / "data.csv").write_text("x\n1\n") - proj = object.__new__(projspec.Project) - import fsspec - - proj.fs = fsspec.filesystem("file") - proj.url = str(tmp_path) - proj.__dict__["basenames"] = { - e["name"].rsplit("/", 1)[-1]: e["name"] - for e in proj.fs.ls(str(tmp_path), detail=True) - } - proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True) - inst = Data.__new__(Data) - inst.proj = proj - assert inst._has_non_data_sentinels() is False - - def test_data_bytes_majority_true(self, tmp_path): - from projspec.proj.data_dir import Data - - self._big_csv(tmp_path / "data.csv") - (tmp_path / "small.py").write_text("x = 1\n") - proj = object.__new__(projspec.Project) - import fsspec - - proj.fs = fsspec.filesystem("file") - proj.url = str(tmp_path) - proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True) - inst = Data.__new__(Data) - inst.proj = proj - assert inst._data_bytes_majority() is True - - def test_data_bytes_majority_false(self, tmp_path): - from projspec.proj.data_dir import Data - - (tmp_path / "main.py").write_text("x = 1\n" * 5000) - (tmp_path / "tiny.csv").write_text("a\n1\n") - proj = object.__new__(projspec.Project) + d = ds.to_dict(compact=False) + assert d["klass"] == ["content", "dataset"] + # the dataset name lives in the containing dict's key, not the object + assert "name" not in d + from projspec.utils import from_dict + + ds2 = from_dict(d, proj=proj) + assert isinstance(ds2, Dataset) + assert ds2.datatype == "CSV" + assert ds2.n_files == 3 + + def test_tabular_and_intake_source_registered(self): + from projspec.content.base import registry + + assert registry["tabular_data"] is TabularData + assert registry["intake_source"] is IntakeSource + assert registry["dataset"] is Dataset + + +# --------------------------------------------------------------------------- +# DataProject detection / significance +# --------------------------------------------------------------------------- + + +class TestDataProjectSignificance: + def test_pure_data_dir_detected(self, tmp_path): + # three numbered csvs, well above the play-data floor + with temp_conf(**PROD_THRESHOLDS): + write_data(tmp_path, {f"{i:03d}.csv": 100_000 for i in range(1, 4)}) + proj = projspec.Project(str(tmp_path)) + assert "data_project" in proj.specs + ds = datasets(proj) + assert len(ds) == 1 + assert "*.csv" in ds + assert ds["*.csv"].n_files == 3 + + def test_tiny_play_data_rejected(self, tmp_path): + with temp_conf(**PROD_THRESHOLDS): + write_data(tmp_path, {f"{i:03d}.csv": 20 for i in range(1, 4)}) + proj = projspec.Project(str(tmp_path)) + assert "data_project" not in proj.specs + + def test_big_single_file_in_code_project(self, tmp_path): + # python package + one big csv -> both python_code and data_project + with temp_conf(**PROD_THRESHOLDS): + write_data( + tmp_path, + { + "__init__.py": b"x = 1\n", + "big.csv": 2 * 1024 * 1024, # > data_min_file_size (1MB) + }, + ) + proj = projspec.Project(str(tmp_path)) + assert "python_code" in proj.specs + assert "data_project" in proj.specs + ds = datasets(proj) + assert "big.csv" in ds + + def test_small_data_in_code_project_ignored(self, tmp_path): + with temp_conf(**PROD_THRESHOLDS): + write_data( + tmp_path, + { + "__init__.py": b"x = 1\n", + "main.py": b"print(1)\n" * 100, + "sample.csv": 200, # tiny + }, + ) + proj = projspec.Project(str(tmp_path)) + assert "python_code" in proj.specs + assert "data_project" not in proj.specs + + def test_fraction_rule_large_data_in_code_project(self, tmp_path): + # small code, large data -> data dominates by fraction and total size. + # Use a .csv so intake can identify a datatype (datasets with no + # identified datatype are dropped from the result). + with temp_conf(**PROD_THRESHOLDS): + write_data( + tmp_path, + { + "__init__.py": b"x = 1\n", + "data.csv": b"a,b,c\n" + b"1,2,3\n" * (4 * 1024 * 1024), # >20MB + }, + ) + proj = projspec.Project(str(tmp_path)) + assert "python_code" in proj.specs + assert "data_project" in proj.specs + + def test_threshold_overridable_via_config(self, tmp_path): + write_data(tmp_path, {f"{i:03d}.csv": 20 for i in range(1, 4)}) + # with the production play-size floor: rejected + with temp_conf(**PROD_THRESHOLDS): + proj = projspec.Project(str(tmp_path)) + assert "data_project" not in proj.specs + # with a tiny play-size floor it should be detected + with temp_conf(data_min_play_size=1): + proj = projspec.Project(str(tmp_path)) + assert "data_project" in proj.specs + + +# --------------------------------------------------------------------------- +# DataProject consolidation + intake integration +# --------------------------------------------------------------------------- + + +class TestDataProjectDatasets: + def test_image_series_consolidated(self, tmp_path): + with temp_conf(**PROD_THRESHOLDS): + write_data( + tmp_path, + { + f"{c}.gif": b"GIF89a" + b"\0" * 50_000 + for c in ("red", "green", "blue") + }, + ) + proj = projspec.Project(str(tmp_path)) + ds = datasets(proj) + assert len(ds) == 1 + assert "*.gif" in ds + assert ds["*.gif"].n_files == 3 + + def test_directory_dataset_marker(self, tmp_path): + # a _metadata marker means intake treats the whole dir as one dataset + with temp_conf(**PROD_THRESHOLDS): + write_data( + tmp_path, + { + "_metadata": 100, + "part-0.parquet": b"PAR1" + b"\0" * 200_000, + "part-1.parquet": b"PAR1" + b"\0" * 200_000, + }, + ) + proj = projspec.Project(str(tmp_path)) + assert "data_project" in proj.specs + ds = datasets(proj) + # whole directory described as a single dataset + assert len(ds) == 1 + + @pytest.mark.skipif(not HAS_INTAKE, reason="intake not installed") + def test_intake_identifies_csv(self, tmp_path): + with temp_conf(**PROD_THRESHOLDS): + rows = b"a,b,c\n" + b"".join(b"1,2,3\n" for _ in range(50_000)) + write_data(tmp_path, {f"{i:03d}.csv": rows for i in range(1, 4)}) + proj = projspec.Project(str(tmp_path)) + ds = datasets(proj) + assert len(ds) == 1 + assert ds["*.csv"].datatype == "CSV" + assert "table" in ds["*.csv"].structure + + def test_no_data_files_no_match(self, tmp_path): + write_data(tmp_path, {"README.md": b"# hi\n", "setup.py": b"x=1\n"}) + proj = projspec.Project(str(tmp_path)) + assert "data_project" not in proj.specs + + @pytest.mark.skipif(not HAS_INTAKE, reason="intake not installed") + def test_remote_url_keeps_protocol_for_intake(self): + """Regression: scanning a remote (protocol-prefixed) directory must + hand intake a protocol-qualified URL. + + ``proj.url`` has the protocol stripped by ``fsspec.url_to_fs``; if that + bare path reaches intake it can't pick the filesystem and resolves no + files. The dataset URL handed to / stored by intake must keep the + protocol (e.g. ``memory://``). + """ import fsspec - proj.fs = fsspec.filesystem("file") - proj.url = str(tmp_path) - proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True) - inst = Data.__new__(Data) - inst.proj = proj - assert inst._data_bytes_majority() is False + fs = fsspec.filesystem("memory") + root = "/data_project_remote" + rows = b"a,b,c\n" + b"".join(b"1,2,3\n" for _ in range(50_000)) + try: + for i in range(1, 4): + with fs.open(f"{root}/{i:03d}.csv", "wb") as f: + f.write(rows) + + with temp_conf(data_min_play_size=1, data_min_fraction=0.5): + proj = projspec.Project(f"memory://{root}") + # the bare filesystem path has no protocol... + assert "://" not in proj.url + ds = datasets(proj) + assert "*.csv" in ds + # ...but intake was able to resolve and type the files, and the + # stored dataset URL is protocol-qualified. + assert ds["*.csv"].datatype == "CSV" + assert str(ds["*.csv"].url).startswith("memory://") + finally: + try: + fs.rm(root, recursive=True) + except FileNotFoundError: + pass + + +# --------------------------------------------------------------------------- +# match() / _is_data_ext unit checks +# --------------------------------------------------------------------------- + + +class TestDataExt: + @pytest.mark.parametrize( + "name,expected", + [ + ("data.csv", True), + ("table.parquet", True), + ("image.png", True), + ("archive.csv.gz", True), + ("module.py", False), + ("README.md", False), + ("pyproject.toml", False), + (".gitignore", False), + ("Makefile", False), # no extension + ("config.ini", False), + ], + ) + def test_is_data_ext(self, name, expected): + assert DataProject._is_data_ext(name) is expected + + +# --------------------------------------------------------------------------- +# HTML repr / thumbnail captured into Dataset.metadata +# --------------------------------------------------------------------------- + + +def _make_csv_bytes(rows: int = 200_000) -> bytes: + """A CSV big enough to clear the single-big-file significance threshold.""" + body = "a,b,c\n" + "\n".join(f"{i},{i * 2},val{i}" for i in range(rows)) + return body.encode() + + +class TestDatasetHTMLOutput: + """The DataProject pipeline should carry intake's ``html_repr`` and + ``thumbnail`` through into ``Dataset.metadata`` when a reader discovers the + underlying object.""" + + @pytest.mark.skipif(not (HAS_INTAKE and HAS_PANDAS), reason="needs intake + pandas") + def test_html_repr_for_tabular(self, tmp_path): + # single file > data_min_file_size so it is described on its own and a + # single-file pandas reader can discover it + with temp_conf(**PROD_THRESHOLDS): + write_data(tmp_path, {"big.csv": _make_csv_bytes()}) + proj = projspec.Project(str(tmp_path)) + ds = datasets(proj) + assert len(ds) == 1 + meta = ds["big.csv"].metadata + assert "PandasCSV" in meta.get("readers") + assert meta.get("html_repr"), "expected html_repr in Dataset.metadata" + assert " html_repr/thumbnail simply + # absent, never None-valued keys + with temp_conf(data_min_play_size=1): + rows = b"a,b,c\n" + b"1,2,3\n" * 10 + write_data(tmp_path, {f"{i:03d}.csv": rows for i in range(5)}) + proj = projspec.Project(str(tmp_path)) + ds = datasets(proj) + assert ds, "expected a dataset" + for d in ds.values(): + assert d.datatype is not None + assert "html_repr" not in d.metadata or isinstance( + d.metadata["html_repr"], str + ) + assert "thumbnail" not in d.metadata or isinstance( + d.metadata["thumbnail"], str + ) + + +# --------------------------------------------------------------------------- +# Per-dataset fraction filtering (_filter_small_datasets) +# --------------------------------------------------------------------------- + + +def _bare_data_project(tmp_path) -> DataProject: + """A DataProject instance not bound to any real data (for unit testing + the pure-Python helper without triggering match()/parse()).""" + proj = projspec.Project(str(tmp_path)) + dp = DataProject.__new__(DataProject) + dp.proj = proj + return dp + + +def _ds(proj, name, size): + """Return a ``(name, Dataset)`` pair as consumed by + ``DataProject._filter_small_datasets``.""" + return name, Dataset( + proj=proj, + url=f"{proj.url}/{name}", + datatype="CSV", + structure=["table"], + schema={}, + n_files=1, + total_size=size, + metadata={}, + ) + + +def _kept_names(pairs): + return [name for name, _ in pairs] + + +class TestFilterSmallDatasets: + def test_drops_dataset_below_fraction_of_largest(self, tmp_path): + dp = _bare_data_project(tmp_path) + big = _ds(dp.proj, "big.csv", 1000) + small = _ds(dp.proj, "small.csv", 10) # 1% of largest + with temp_conf(data_min_fraction=0.5): + kept = dp._filter_small_datasets([big, small]) + assert _kept_names(kept) == ["big.csv"] + + def test_keeps_datasets_above_fraction(self, tmp_path): + dp = _bare_data_project(tmp_path) + a = _ds(dp.proj, "a.csv", 1000) + b = _ds(dp.proj, "b.csv", 800) # 80% of largest + with temp_conf(data_min_fraction=0.5): + kept = dp._filter_small_datasets([a, b]) + assert set(_kept_names(kept)) == {"a.csv", "b.csv"} + + def test_single_dataset_never_filtered(self, tmp_path): + dp = _bare_data_project(tmp_path) + only = _ds(dp.proj, "only.csv", 1) + with temp_conf(data_min_fraction=0.5): + kept = dp._filter_small_datasets([only]) + assert _kept_names(kept) == ["only.csv"] + + def test_unknown_sizes_disable_filtering(self, tmp_path): + dp = _bare_data_project(tmp_path) + big = _ds(dp.proj, "big.csv", 1000) + unknown = _ds(dp.proj, "u.csv", None) + with temp_conf(data_min_fraction=0.5): + kept = dp._filter_small_datasets([big, unknown]) + assert set(_kept_names(kept)) == {"big.csv", "u.csv"} + + def test_never_drops_everything(self, tmp_path): + # an impossible threshold (>1) would exclude all -> fall back to all + dp = _bare_data_project(tmp_path) + a = _ds(dp.proj, "a.csv", 1000) + b = _ds(dp.proj, "b.csv", 1000) + with temp_conf(data_min_fraction=2.0): + kept = dp._filter_small_datasets([a, b]) + assert set(_kept_names(kept)) == {"a.csv", "b.csv"} + + def test_zero_fraction_keeps_all(self, tmp_path): + dp = _bare_data_project(tmp_path) + big = _ds(dp.proj, "big.csv", 1000) + tiny = _ds(dp.proj, "tiny.csv", 1) + with temp_conf(data_min_fraction=0.0): + kept = dp._filter_small_datasets([big, tiny]) + assert set(_kept_names(kept)) == {"big.csv", "tiny.csv"} + + @pytest.mark.skipif(not HAS_INTAKE, reason="intake not installed") + def test_end_to_end_drops_tiny_dataset(self, tmp_path): + # one large csv-series dataset and one tiny json file; the tiny one + # should be dropped as a small fraction of the largest. + big_rows = b"a,b,c\n" + b"1,2,3\n" * 20000 # large + with temp_conf(data_min_play_size=1, data_min_fraction=0.5): + write_data( + tmp_path, + { + **{f"{i:03d}.csv": big_rows for i in range(3)}, + "tiny.json": b'{"x": 1}\n', + }, + ) + proj = projspec.Project(str(tmp_path)) + names = dataset_names(proj) + assert "*.csv" in names + assert "tiny.json" not in names + + @pytest.mark.skipif(not HAS_INTAKE, reason="intake not installed") + def test_end_to_end_keeps_similar_sized_datasets(self, tmp_path): + # two datasets of comparable size are both kept (neither is a small + # fraction of the other). + csv_rows = b"a,b,c\n" + b"1,2,3\n" * 20000 + json_rows = b'{"x": 1}\n' * 20000 + with temp_conf(data_min_play_size=1, data_min_fraction=0.5): + write_data( + tmp_path, + { + **{f"{i:03d}.csv": csv_rows for i in range(3)}, + **{f"{i:03d}.json": json_rows for i in range(3)}, + }, + ) + proj = projspec.Project(str(tmp_path)) + names = dataset_names(proj) + assert "*.csv" in names + assert "*.json" in names diff --git a/tests/test_ipywidget_helpers.py b/tests/test_ipywidget_helpers.py index 84a4dea..485a481 100644 --- a/tests/test_ipywidget_helpers.py +++ b/tests/test_ipywidget_helpers.py @@ -367,6 +367,7 @@ def test_add_confirmed_valid_path(self, tmp_path, widget_and_lib): widget, lib, url = widget_and_lib new_proj = tmp_path / "newproj" new_proj.mkdir() + (new_proj / "requirements.txt").write_text("numpy") # Start with just the original entry original_keys = set(lib.entries) sends, toasts = _fire( diff --git a/tests/test_library.py b/tests/test_library.py index 276e66a..3796588 100644 --- a/tests/test_library.py +++ b/tests/test_library.py @@ -1,6 +1,9 @@ +import json import os +import time from projspec import Project +from projspec.config import temp_conf from projspec.library import ProjectLibrary here = os.path.abspath(os.path.dirname(__file__)) @@ -40,3 +43,88 @@ def test_filter(tmp_path): # miss assert not library.filter([("spec", "xx")]) + + +def test_scanned_at_set_on_scan(tmp_path): + (tmp_path / "__init__.py").write_text("x = 1\n") + before = time.time() + proj = Project(str(tmp_path), walk=False) + after = time.time() + assert isinstance(proj.scanned_at, float) + assert before <= proj.scanned_at <= after + + +def test_scanned_at_serialised_and_roundtrips(tmp_path): + (tmp_path / "__init__.py").write_text("x = 1\n") + proj = Project(str(tmp_path), walk=False) + + dic = proj.to_dict(compact=False) + assert "scanned_at" in dic + + proj2 = Project.from_dict(dic) + # round-trips back to the same numeric value (serialiser stringifies floats) + assert isinstance(proj2.scanned_at, float) + assert proj2.scanned_at == proj.scanned_at + + +def test_scanned_at_defaults_to_now_when_missing(tmp_path): + (tmp_path / "__init__.py").write_text("x = 1\n") + proj = Project(str(tmp_path), walk=False) + + dic = proj.to_dict(compact=False) + dic.pop("scanned_at") # simulate an older library without the field + + before = time.time() + proj2 = Project.from_dict(dic) + assert before <= proj2.scanned_at <= time.time() + 1 + + +def _make_library_with_old_entry(tmp_path, age_seconds): + """Create a library file containing one project scanned *age_seconds* ago.""" + proj_dir = tmp_path / "proj" + proj_dir.mkdir() + (proj_dir / "__init__.py").write_text("x = 1\n") + fn = str(tmp_path / "library.json") + + proj = Project(str(proj_dir), walk=False) + library = ProjectLibrary(fn, auto_save=True) + key = proj.fs.unstrip_protocol(proj.url) + library.add_entry(key, proj) + + # rewrite the saved scanned_at to be old + data = json.load(open(fn)) + for entry in data.values(): + entry["scanned_at"] = time.time() - age_seconds + json.dump(data, open(fn, "w")) + return fn, key + + +def test_auto_rescan_refreshes_old_entry(tmp_path): + fn, key = _make_library_with_old_entry(tmp_path, age_seconds=1000) + + with temp_conf(auto_rescan=10): # threshold below the entry's age + library = ProjectLibrary(fn) + # the stale entry was rescanned -> timestamp is fresh + assert library.entries[key].scanned_at >= time.time() - 5 + # ...and the refreshed library was written back to disk + data = json.load(open(fn)) + assert float(data[key]["scanned_at"]) >= time.time() - 5 + + +def test_auto_rescan_keeps_fresh_entry(tmp_path): + fn, key = _make_library_with_old_entry(tmp_path, age_seconds=5) + + with temp_conf(auto_rescan=1000): # threshold well above the entry's age + library = ProjectLibrary(fn) + # fresh enough -> not rescanned, original (old) timestamp preserved + assert library.entries[key].scanned_at < time.time() - 1 + + +def test_auto_rescan_disabled_with_zero(tmp_path): + fn, key = _make_library_with_old_entry(tmp_path, age_seconds=10_000) + old = json.load(open(fn))[key]["scanned_at"] + + with temp_conf(auto_rescan=0): # disabled entirely + library = ProjectLibrary(fn) + # the very old entry is kept as-is, never rescanned + assert abs(library.entries[key].scanned_at - old) < 1 diff --git a/tests/test_new_specs.py b/tests/test_new_specs.py index f41a6c5..9d1d3a5 100644 --- a/tests/test_new_specs.py +++ b/tests/test_new_specs.py @@ -2016,3 +2016,147 @@ def test_roundtrip_create_and_detect(self, tmpdir): # Re-scan so scanned_files picks up the new flow.py proj2 = projspec.Project(path) assert "metaflow" in proj2 + + +# --------------------------------------------------------------------------- +# KnowledgeCatalog (Open Knowledge Format bundle) +# --------------------------------------------------------------------------- + + +class TestKnowledgeCatalog: + FILES = { + "index.md": '---\nokf_version: "0.1"\n---\n\n# My Bundle\n\n' + "* [Sales](datasets/sales.md) - sales data\n", + "log.md": "# Update Log\n\n## 2026-01-01\n* **Creation**: started.\n", + "datasets/sales.md": """\ + --- + type: BigQuery Dataset + title: Sales + description: All sales-related tables. + tags: [sales, revenue] + timestamp: 2026-05-28T00:00:00Z + --- + + The sales dataset. + """, + "tables/orders.md": """\ + --- + type: BigQuery Table + title: Orders + resource: https://example.com/orders + --- + + # Schema + """, + # not a concept: no frontmatter + "notes/random.md": "just some prose, no frontmatter\n", + } + + def test_match_positive(self, tmpdir): + proj = make_proj(tmpdir, self.FILES) + from projspec.proj.knowledge_catalog import KnowledgeCatalog + + assert raw_spec(KnowledgeCatalog, proj).match() + + def test_match_root_concept(self, tmpdir): + # index.md plus a concept at the root (no subdirs) + proj = make_proj( + tmpdir, + { + "index.md": "# Bundle\n", + "overview.md": "---\ntype: Reference\n---\nbody\n", + }, + ) + from projspec.proj.knowledge_catalog import KnowledgeCatalog + + assert raw_spec(KnowledgeCatalog, proj).match() + + def test_match_negative_no_index(self, tmpdir): + proj = make_proj(tmpdir, {"tables/orders.md": "---\ntype: T\n---\n"}) + from projspec.proj.knowledge_catalog import KnowledgeCatalog + + assert not raw_spec(KnowledgeCatalog, proj).match() + + def test_match_negative_empty(self, tmpdir): + proj = make_proj(tmpdir, {}) + from projspec.proj.knowledge_catalog import KnowledgeCatalog + + assert not raw_spec(KnowledgeCatalog, proj).match() + + def test_parse_contents(self, tmpdir): + proj = make_proj(tmpdir, self.FILES) + from projspec.proj.knowledge_catalog import KnowledgeCatalog + + spec = raw_spec(KnowledgeCatalog, proj) + spec.parse() + assert "concept" in spec._contents + concepts = spec._contents["concept"] + # keyed by concept ID (bundle-relative path, no .md) + assert set(concepts) == {"datasets/sales", "tables/orders"} + + def test_parse_detail(self, tmpdir): + proj = make_proj(tmpdir, self.FILES) + from projspec.proj.knowledge_catalog import KnowledgeCatalog + + spec = raw_spec(KnowledgeCatalog, proj) + spec.parse() + sales = spec._contents["concept"]["datasets/sales"].meta + assert sales["type"] == "BigQuery Dataset" + assert sales["title"] == "Sales" + assert sales["tags"] == "sales, revenue" + orders = spec._contents["concept"]["tables/orders"].meta + assert orders["type"] == "BigQuery Table" + assert orders["resource"] == "https://example.com/orders" + + def test_parse_bundle_version(self, tmpdir): + proj = make_proj(tmpdir, self.FILES) + from projspec.proj.knowledge_catalog import KnowledgeCatalog + + spec = raw_spec(KnowledgeCatalog, proj) + spec.parse() + # root index.md okf_version surfaces as bundle-level metadata + assert spec._contents["descriptive_metadata"].meta["okf_version"] == "0.1" + + def test_parse_skips_non_typed_docs(self, tmpdir): + proj = make_proj(tmpdir, self.FILES) + from projspec.proj.knowledge_catalog import KnowledgeCatalog + + spec = raw_spec(KnowledgeCatalog, proj) + spec.parse() + # notes/random.md has no frontmatter -> not a concept + assert "notes/random" not in spec._contents["concept"] + + def test_parse_no_typed_concepts_raises(self, tmpdir): + from projspec.proj.knowledge_catalog import KnowledgeCatalog + from projspec.proj import ParseFailed + + proj = make_proj( + tmpdir, + {"index.md": "# index\n", "notes.md": "plain prose, no frontmatter\n"}, + ) + spec = raw_spec(KnowledgeCatalog, proj) + with pytest.raises(ParseFailed): + spec.parse() + + def test_parse_requires_type_field(self, tmpdir): + # a markdown doc with frontmatter but no 'type' is not a concept + from projspec.proj.knowledge_catalog import KnowledgeCatalog + from projspec.proj import ParseFailed + + proj = make_proj( + tmpdir, + { + "index.md": "# index\n", + "doc.md": "---\ntitle: No Type Here\n---\nbody\n", + }, + ) + spec = raw_spec(KnowledgeCatalog, proj) + with pytest.raises(ParseFailed): + spec.parse() + + def test_roundtrip_create_and_detect(self, tmpdir): + path = str(tmpdir) + proj = projspec.Project(path) + proj.create("KnowledgeCatalog") + proj2 = projspec.Project(path) + assert "knowledge_catalog" in proj2 diff --git a/tests/test_roundtrips.py b/tests/test_roundtrips.py index a1ed3a8..a3159e4 100644 --- a/tests/test_roundtrips.py +++ b/tests/test_roundtrips.py @@ -37,6 +37,7 @@ "MDBook", "RTD", "BackstageCatalog", + "KnowledgeCatalog", # CI/CD โ€” file-only _create() "GitHubActions", "GitLabCI", diff --git a/tests/test_textapp_helpers.py b/tests/test_textapp_helpers.py index d898d7e..191d49b 100644 --- a/tests/test_textapp_helpers.py +++ b/tests/test_textapp_helpers.py @@ -126,8 +126,15 @@ class TestFmtAge: def _ts(self, days_ago: float) -> float: return time.time() - days_ago * 86400 - def test_today(self): - assert _fmt_age(self._ts(0.1)) == "today" + def test_just_now(self): + assert _fmt_age(time.time() - 5) == "just now" + + def test_minutes(self): + assert _fmt_age(time.time() - 5 * 60) == "5 minutes ago" + + def test_hours(self): + # 0.1 days ~= 2.4 hours -> reported in hours, not "today" + assert _fmt_age(self._ts(0.1)) == "2 hours ago" def test_yesterday(self): assert _fmt_age(self._ts(1.5)) == "yesterday" @@ -333,6 +340,23 @@ def test_indentation_increases_for_nested(self): # Outer indent: lines_2 should have more leading spaces assert lines_2[0].startswith(" " * 2) + def test_html_repr_shown_as_placeholder(self): + # the giant raw HTML must not be dumped; show a short note instead + big = "" + "x" * 5000 + "
    " + lines = _yaml_lines({"metadata": {"html_repr": big}}, {}, 0) + combined = " ".join(lines) + assert big not in combined + assert "html_repr" in combined + assert "HTML preview" in combined + + def test_thumbnail_shown_as_placeholder(self): + url = "data:image/png;base64," + "A" * 5000 + lines = _yaml_lines({"metadata": {"thumbnail": url}}, {}, 0) + combined = " ".join(lines) + assert url not in combined + assert "thumbnail" in combined + assert "image thumbnail" in combined + # --------------------------------------------------------------------------- # _wrap_chips diff --git a/tests/test_webui.py b/tests/test_webui.py index 06c4c96..bb11b2d 100644 --- a/tests/test_webui.py +++ b/tests/test_webui.py @@ -224,6 +224,23 @@ def test_panel_js_is_root_scoped(): assert "window.projspecRoot" in js +def test_panel_js_embeds_dataset_preview(): + """The shared panel.js must embed a content's ``metadata.html_repr`` as + live HTML (via sanitizeHtml + innerHTML) and ``metadata.thumbnail`` as an + , rather than dumping their raw strings into the YAML tree.""" + js = get_panel_js() + # preview keys are pulled out of metadata + assert "meta.html_repr" in js + assert "meta.thumbnail" in js + # and removed from the YAML tree via stripPreview + assert "stripPreview" in js + assert "renderYaml(stripPreview(stripKlass(data)))" in js + # html_repr is embedded as sanitised HTML; thumbnail as a data: image + assert "sanitizeHtml(htmlRepr)" in js + assert "thumbnailImg" in js + assert "data:image/" in js + + def test_make_cwd_uses_project_path_not_library_key(tmp_path, monkeypatch): """Regression: Make must use the stored ``Project.path`` as the subprocess cwd, never the library key. diff --git a/vsextension/src/panel.ts b/vsextension/src/panel.ts index 9f3d6a7..7149c9c 100644 --- a/vsextension/src/panel.ts +++ b/vsextension/src/panel.ts @@ -933,8 +933,18 @@ const PANEL_JS = String.raw` } } function fmtAge(ts) { - const days = Math.floor((Date.now() / 1000 - parseFloat(ts)) / 86400); - if (days === 0) return 'today'; + const secs = Math.floor(Date.now() / 1000 - parseFloat(ts)); + if (secs < 0) return 'just now'; + const days = Math.floor(secs / 86400); + if (days === 0) { + if (secs < 60) return 'just now'; + if (secs < 3600) { + const m = Math.floor(secs / 60); + return m + ' minute' + (m !== 1 ? 's' : '') + ' ago'; + } + const h = Math.floor(secs / 3600); + return h + ' hour' + (h !== 1 ? 's' : '') + ' ago'; + } if (days === 1) return 'yesterday'; if (days < 30) return days + ' days ago'; if (days < 365) return Math.floor(days / 30) + ' months ago'; @@ -1018,6 +1028,8 @@ const PANEL_JS = String.raw` const by = project.last_modified_by != null ? project.last_modified_by : null; metaParts.push('last modified ' + age + (by ? ' by ' + by : '')); } + if (project.scanned_at != null) + metaParts.push('scanned ' + fmtAge(project.scanned_at)); if (metaParts.length > 0) { const meta = document.createElement('div'); meta.className = 'meta'; @@ -1309,15 +1321,70 @@ const PANEL_JS = String.raw` body.innerHTML = sanitizeHtml(html); w.appendChild(body); } else { + // Datasets (and other content) may carry rich previews in + // metadata.html_repr (an HTML fragment) and metadata.thumbnail + // (a data: image URL). Embed those rather than dumping their + // (often huge) raw strings into the YAML tree. + const meta = (kind === 'content' && data && typeof data === 'object' + && data.metadata && typeof data.metadata === 'object') ? data.metadata : null; + const htmlRepr = meta && typeof meta.html_repr === 'string' ? meta.html_repr : null; + const thumb = meta && typeof meta.thumbnail === 'string' ? meta.thumbnail : null; + const tree = document.createElement('div'); tree.className = 'tree yaml'; - tree.appendChild(renderYaml(stripKlass(data))); + tree.appendChild(renderYaml(stripPreview(stripKlass(data)))); w.appendChild(tree); + + if (thumb) w.appendChild(thumbnailImg(thumb)); + if (htmlRepr) { + const body = document.createElement('div'); + body.className = 'widget-html'; + body.innerHTML = sanitizeHtml(htmlRepr); + w.appendChild(body); + } } return w; } + /** + * Return a shallow copy of a content dict with the embedded-preview keys + * (metadata.html_repr / metadata.thumbnail) removed, so the YAML tree + * doesn't show their large raw strings - they are rendered as live + * HTML / an image instead. + */ + function stripPreview(obj) { + if (!obj || typeof obj !== 'object' || Array.isArray(obj)) return obj; + if (!obj.metadata || typeof obj.metadata !== 'object' || Array.isArray(obj.metadata)) return obj; + const meta = {}; + let changed = false; + for (const k of Object.keys(obj.metadata)) { + if (k === 'html_repr' || k === 'thumbnail') { changed = true; continue; } + meta[k] = obj.metadata[k]; + } + if (!changed) return obj; + const out = {}; + for (const k of Object.keys(obj)) out[k] = obj[k]; + out.metadata = meta; + return out; + } + + /** + * Build an for a data:image/... thumbnail URL. Only accepts + * data: image URLs (never remote/javascript URLs). + */ + function thumbnailImg(src) { + const wrap = document.createElement('div'); + wrap.className = 'widget-html'; + if (/^data:image\//i.test(src)) { + const img = document.createElement('img'); + img.src = src; + img.alt = 'thumbnail'; + wrap.appendChild(img); + } + return wrap; + } + /** * Minimal sanitisation of content-provided HTML. The markup comes from * the projspec library itself so we don't need a full DOMPurify - but we diff --git a/vsextension/src/projspec.ts b/vsextension/src/projspec.ts index 20afb8b..97ed9a8 100644 --- a/vsextension/src/projspec.ts +++ b/vsextension/src/projspec.ts @@ -171,6 +171,7 @@ export interface ProjectData { is_writable?: string; last_modified?: string; last_modified_by?: string; + scanned_at?: string; } export interface SpecData { diff --git a/vsextension/tsconfig.json b/vsextension/tsconfig.json index 356580f..5e22142 100644 --- a/vsextension/tsconfig.json +++ b/vsextension/tsconfig.json @@ -8,6 +8,11 @@ ], "sourceMap": true, "rootDir": "src", + "types": [ + "node", + "vscode", + "mocha" + ], "strict": true, /* enable all strict type-checking options */ /* Additional Checks */ // "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */