diff --git a/docs/source/api.rst b/docs/source/api.rst index 982cea6..b67cf41 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -51,6 +51,7 @@ User Classes proj.conda_workspace.CondaWorkspace proj.datapackage.DVCRepo proj.datapackage.DataPackage + proj.datapackage.CroissantDataset proj.dataworkflows.Dbt proj.dataworkflows.Quarto proj.dataworkflows.Prefect @@ -136,6 +137,7 @@ User Classes .. autoclass:: projspec.proj.conda_workspace.CondaWorkspace .. autoclass:: projspec.proj.datapackage.DVCRepo .. autoclass:: projspec.proj.datapackage.DataPackage +.. autoclass:: projspec.proj.datapackage.CroissantDataset .. autoclass:: projspec.proj.dataworkflows.Dbt .. autoclass:: projspec.proj.dataworkflows.Quarto .. autoclass:: projspec.proj.dataworkflows.Prefect @@ -224,6 +226,7 @@ User Classes content.cicd.ServiceDependency content.data.IntakeSource content.data.TabularData + content.data.CroissantRecordSet content.data.DataResource content.env_var.EnvironmentVariables content.environment.Environment @@ -241,6 +244,7 @@ User Classes .. autoclass:: projspec.content.cicd.ServiceDependency .. autoclass:: projspec.content.data.IntakeSource .. autoclass:: projspec.content.data.DataResource +.. autoclass:: projspec.content.data.CroissantRecordSet .. autoclass:: projspec.content.data.TabularData .. autoclass:: projspec.content.env_var.EnvironmentVariables .. 
@dataclass
class CroissantRecordSet(BaseContent):
    """A RecordSet described in a Croissant/JSON-LD dataset metadata file.

    Croissant (http://mlcommons.org/croissant/1.0) is the ML Commons standard
    for describing ML datasets using JSON-LD. A ``RecordSet`` is a named,
    structured collection of records (e.g. one table or one set of image
    annotations) within the dataset.

    Attributes
    ----------
    name:
        The ``name`` (or ``@id``) of the ``cr:RecordSet``.
    description:
        Optional free-text description of the record set; defaults to the
        empty string.
    fields:
        List of field names declared inside this record set; defaults to an
        empty list.
    """

    icon = "🥐"

    name: str
    description: str = ""
    # default_factory gives every instance its own list instead of sharing
    # one mutable default across instances.
    fields: list[str] = field(default_factory=list)
class CroissantDataset(ProjectSpec):
    """An ML Commons Croissant dataset described by a JSON-LD metadata file.

    Croissant (http://mlcommons.org/croissant/1.0) is the standard format for
    describing ML datasets using JSON-LD / schema.org vocabulary.  It captures
    dataset-level metadata (name, description, license, citation) as well as a
    structured schema of the data via ``RecordSet`` and ``Field`` objects.

    Detection heuristic
    -------------------
    1. Consider every top-level file whose **basename** is ``metadata.json``
       / ``metadata.jsonld`` or contains ``croissant`` and ends in ``.json``
       / ``.jsonld`` (case-insensitive).
    2. Read each candidate and accept the first one whose text contains the
       marker ``mlcommons.org/croissant``.  The marker normally appears in
       the ``conformsTo`` value and/or in the ``cr`` entry of ``@context``.

    Only candidate metadata files are read (through the project's own
    file access), never the data files they describe.
    """

    icon = "🥐"
    spec_doc = "https://docs.mlcommons.org/croissant/docs/croissant-spec.html"

    # Basename matched during match(); reused in parse() to avoid re-scanning.
    _matched_file: str | None = None
    # "croissant" anywhere in the stem, or exactly "metadata".
    _CROISSANT_NAMES = re.compile(
        r"^(.*croissant.*|metadata)\.json(ld)?$",
        re.IGNORECASE,
    )
    _CROISSANT_CONFORMSTO = "mlcommons.org/croissant"

    @staticmethod
    def _stringify(value) -> str:
        """Render a metadata value as text; sequences become ``"a, b, c"``."""
        if isinstance(value, (list, tuple)):
            return ", ".join(str(item) for item in value)
        return str(value)

    def match(self) -> bool:
        """Return True when a plausible Croissant JSON-LD file is present.

        On success the matching basename is remembered in ``_matched_file``
        so that ``parse()`` does not need to scan again.
        """
        for basename in self.proj.basenames:
            if not self._CROISSANT_NAMES.match(basename):
                continue
            # Peek at the file to confirm it is really Croissant.
            # We use get_file() so the content may already be cached.
            try:
                fobj = self.proj.get_file(basename)
                if fobj is None:
                    continue
                text = fobj.read()
            except Exception:
                # Detection is best-effort: an unreadable candidate must not
                # prevent the remaining candidates from being checked.
                continue
            if isinstance(text, bytes):
                text = text.decode("utf-8", errors="replace")
            if not isinstance(text, str):
                continue
            # JSON permits "/" to be written as "\/"; normalise before the
            # substring test so such files are still recognised.
            if self._CROISSANT_CONFORMSTO in text.replace("\\/", "/"):
                self._matched_file = basename
                return True
        return False

    def parse(self) -> None:
        """Populate ``_contents`` from the matched Croissant metadata file.

        Produces ``descriptive_metadata`` always, and ``license``,
        ``citation`` and ``croissant_record_set`` when the document provides
        them.

        Raises
        ------
        ParseFailed
            If no Croissant file can be identified, the file cannot be read
            or decoded as JSON, or its top level is not a JSON object.
        """
        import json

        from projspec.content import DescriptiveMetadata, License, Citation
        from projspec.content.data import CroissantRecordSet
        from projspec.utils import AttrDict

        # Fall back to a fresh scan when match() has not run on this instance.
        if self._matched_file is None and not self.match():
            raise ParseFailed("No Croissant file identified")

        self._contents = AttrDict()
        self._artifacts = AttrDict()

        path = self.proj.basenames[self._matched_file]
        try:
            with self.proj.fs.open(path, "rt", encoding="utf-8") as fh:
                meta = json.load(fh)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            raise ParseFailed(f"Cannot read Croissant metadata {path!r}") from exc
        if not isinstance(meta, dict):
            raise ParseFailed(f"Croissant metadata {path!r} is not a JSON object")

        # --- dataset-level metadata ---
        dm_fields = {
            "name",
            "description",
            "url",
            "version",
            "datePublished",
            "dateCreated",
            "dateModified",
            "keywords",
            "inLanguage",
        }
        self._contents["descriptive_metadata"] = DescriptiveMetadata(
            proj=self.proj,
            meta={
                key: self._stringify(value)
                for key, value in meta.items()
                if key in dm_fields and value
            },
        )

        # --- license ---
        lic_raw = meta.get("license")
        if lic_raw:
            # license may be a string URL or a dict with @id / name
            if isinstance(lic_raw, str):
                self._contents["license"] = License(
                    proj=self.proj, shortname=lic_raw, url=lic_raw
                )
            elif isinstance(lic_raw, dict):
                self._contents["license"] = License(
                    proj=self.proj,
                    shortname=lic_raw.get("name", lic_raw.get("@id", "")),
                    url=lic_raw.get("@id", lic_raw.get("url", "")),
                )

        # --- citation ---
        cite_raw = meta.get("citeAs") or meta.get("citation")
        if cite_raw:
            self._contents["citation"] = Citation(
                proj=self.proj,
                meta={"citeAs": cite_raw} if isinstance(cite_raw, str) else cite_raw,
            )

        # --- record sets ---
        record_sets_raw = meta.get("recordSet") or meta.get("cr:recordSet") or []
        if isinstance(record_sets_raw, dict):
            record_sets_raw = [record_sets_raw]
        elif not isinstance(record_sets_raw, list):
            record_sets_raw = []

        record_sets = {}
        for index, rs in enumerate(record_sets_raw):
            if not isinstance(rs, dict):
                # Malformed entry; skip it, as is done for fields below.
                continue
            # Positional fallback keeps unnamed record sets from all
            # collapsing onto the same "" key.
            rs_id = rs.get("name") or rs.get("@id") or f"record_set_{index}"
            description = rs.get("description", "")
            fields_raw = rs.get("field") or rs.get("cr:field") or []
            if isinstance(fields_raw, dict):
                fields_raw = [fields_raw]
            field_names = [
                fld.get("name") or fld.get("@id", "")
                for fld in fields_raw
                if isinstance(fld, dict)
            ]
            record_sets[rs_id] = CroissantRecordSet(
                proj=self.proj,
                name=rs_id,
                description=description,
                fields=field_names,
            )

        if record_sets:
            self._contents["croissant_record_set"] = AttrDict(record_sets)

    @staticmethod
    def _create(path: str) -> None:
        """Write a minimal Croissant metadata skeleton to ``<path>/croissant.json``."""
        import json

        doc = {
            "@context": {
                "@language": "en",
                "@vocab": "https://schema.org/",
                # "sc" must be declared because "sc:*" terms are used below.
                "sc": "https://schema.org/",
                # Croissant vocabulary namespace used by the "cr:*" terms.
                "cr": "http://mlcommons.org/croissant/",
                "dct": "http://purl.org/dc/terms/",
            },
            "@type": "sc:Dataset",
            "name": "my-dataset",
            "description": "A short description of the dataset.",
            "license": "https://creativecommons.org/licenses/by/4.0/",
            "url": "https://example.com/my-dataset",
            "dct:conformsTo": "http://mlcommons.org/croissant/1.0",
            "distribution": [
                {
                    "@type": "cr:FileObject",
                    "@id": "data.csv",
                    "contentUrl": "data.csv",
                    "encodingFormat": "text/csv",
                }
            ],
            "recordSet": [
                {
                    "@type": "cr:RecordSet",
                    "@id": "records",
                    "name": "records",
                    "field": [
                        {
                            "@type": "cr:Field",
                            "@id": "records/id",
                            "name": "id",
                            "dataType": "sc:Integer",
                        },
                        {
                            "@type": "cr:Field",
                            "@id": "records/value",
                            "name": "value",
                            "dataType": "sc:Text",
                        },
                    ],
                }
            ],
        }
        with open(f"{path}/croissant.json", "wt", encoding="utf-8") as f:
            json.dump(doc, f, indent=2)
"sc:Dataset", + "name": "test-dataset", + "description": "A test dataset for projspec.", + "license": "https://creativecommons.org/licenses/by/4.0/", + "url": "https://example.com/test-dataset", + "dct:conformsTo": "http://mlcommons.org/croissant/1.0", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "data.csv", + "contentUrl": "data.csv", + "encodingFormat": "text/csv", + } + ], + "recordSet": [ + { + "@type": "cr:RecordSet", + "@id": "records", + "name": "records", + "description": "The main record set.", + "field": [ + { + "@type": "cr:Field", + "@id": "records/id", + "name": "id", + "dataType": "sc:Integer", + }, + { + "@type": "cr:Field", + "@id": "records/label", + "name": "label", + "dataType": "sc:Text", + }, + ], + } + ], +} + + +# --------------------------------------------------------------------------- +# match() tests +# --------------------------------------------------------------------------- + + +class TestCroissantMatch: + def test_match_positive_croissant_json(self, tmpdir): + """croissant.json with conformsTo marker is detected.""" + files = {"croissant.json": json.dumps(MINIMAL_CROISSANT)} + proj = make_proj(tmpdir, files) + spec = raw_spec(CroissantDataset, proj) + assert spec.match() + assert spec._matched_file == "croissant.json" + + def test_match_positive_metadata_json(self, tmpdir): + """metadata.json with conformsTo marker is also detected.""" + files = {"metadata.json": json.dumps(MINIMAL_CROISSANT)} + proj = make_proj(tmpdir, files) + spec = raw_spec(CroissantDataset, proj) + assert spec.match() + assert spec._matched_file == "metadata.json" + + def test_match_negative_no_json(self, tmpdir): + """Empty directory does not match.""" + proj = make_proj(tmpdir, {}) + spec = raw_spec(CroissantDataset, proj) + assert not spec.match() + + def test_match_negative_plain_json(self, tmpdir): + """A plain JSON file without the Croissant conformsTo marker is not detected.""" + files = {"data.json": json.dumps({"key": "value"})} + proj 
= make_proj(tmpdir, files) + spec = raw_spec(CroissantDataset, proj) + assert not spec.match() + + def test_match_negative_datapackage(self, tmpdir): + """datapackage.json (Frictionless) is not treated as Croissant.""" + files = { + "datapackage.json": json.dumps( + {"name": "pkg", "resources": [{"name": "r", "path": "r.csv"}]} + ) + } + proj = make_proj(tmpdir, files) + spec = raw_spec(CroissantDataset, proj) + assert not spec.match() + + def test_match_positive_via_project(self, tmpdir): + """Project.resolve() picks up CroissantDataset in its specs.""" + files = {"croissant.json": json.dumps(MINIMAL_CROISSANT)} + proj = make_proj(tmpdir, files) + assert "croissant_dataset" in proj.specs + + +# --------------------------------------------------------------------------- +# parse() tests +# --------------------------------------------------------------------------- + + +class TestCroissantParse: + FILES = {"croissant.json": json.dumps(MINIMAL_CROISSANT)} + + def _spec(self, tmpdir): + proj = make_proj(tmpdir, self.FILES) + spec = raw_spec(CroissantDataset, proj) + assert spec.match() + spec.parse() + return spec + + def test_parse_descriptive_metadata(self, tmpdir): + spec = self._spec(tmpdir) + assert "descriptive_metadata" in spec._contents + dm = spec._contents["descriptive_metadata"] + assert dm.meta["name"] == "test-dataset" + assert "description" in dm.meta + + def test_parse_license(self, tmpdir): + spec = self._spec(tmpdir) + assert "license" in spec._contents + lic = spec._contents["license"] + assert "creativecommons" in lic.url + + def test_parse_record_sets(self, tmpdir): + spec = self._spec(tmpdir) + assert "croissant_record_set" in spec._contents + rs_map = spec._contents["croissant_record_set"] + assert "records" in rs_map + + def test_parse_record_set_fields(self, tmpdir): + spec = self._spec(tmpdir) + rs = spec._contents["croissant_record_set"]["records"] + assert "id" in rs.fields + assert "label" in rs.fields + + def 
test_parse_record_set_description(self, tmpdir): + spec = self._spec(tmpdir) + rs = spec._contents["croissant_record_set"]["records"] + assert rs.description == "The main record set." + + def test_parse_no_record_sets(self, tmpdir): + """Documents without recordSet should parse without error.""" + doc = dict(MINIMAL_CROISSANT) + doc.pop("recordSet") + files = {"croissant.json": json.dumps(doc)} + proj = make_proj(tmpdir, files) + spec = raw_spec(CroissantDataset, proj) + assert spec.match() + spec.parse() + assert "descriptive_metadata" in spec._contents + assert "croissant_record_set" not in spec._contents + + def test_parse_citation(self, tmpdir): + """citeAs is parsed into a Citation content object.""" + doc = dict(MINIMAL_CROISSANT) + doc["citeAs"] = "@article{test2024, title={Test}}" + files = {"croissant.json": json.dumps(doc)} + proj = make_proj(tmpdir, files) + spec = raw_spec(CroissantDataset, proj) + assert spec.match() + spec.parse() + assert "citation" in spec._contents + assert "Test" in spec._contents["citation"].meta["citeAs"] + + def test_parse_license_dict(self, tmpdir): + """license expressed as a dict with @id is handled.""" + doc = dict(MINIMAL_CROISSANT) + doc["license"] = {"@id": "https://opensource.org/licenses/MIT", "name": "MIT"} + files = {"croissant.json": json.dumps(doc)} + proj = make_proj(tmpdir, files) + spec = raw_spec(CroissantDataset, proj) + assert spec.match() + spec.parse() + lic = spec._contents["license"] + assert lic.shortname == "MIT" + + def test_parse_multiple_record_sets(self, tmpdir): + """Multiple RecordSets are all captured.""" + doc = dict(MINIMAL_CROISSANT) + doc["recordSet"] = [ + { + "@type": "cr:RecordSet", + "@id": "train", + "name": "train", + "field": [{"@type": "cr:Field", "@id": "train/x", "name": "x"}], + }, + { + "@type": "cr:RecordSet", + "@id": "test", + "name": "test", + "field": [{"@type": "cr:Field", "@id": "test/y", "name": "y"}], + }, + ] + files = {"croissant.json": json.dumps(doc)} + proj = 
make_proj(tmpdir, files) + spec = raw_spec(CroissantDataset, proj) + assert spec.match() + spec.parse() + rs_map = spec._contents["croissant_record_set"] + assert "train" in rs_map + assert "test" in rs_map + assert "x" in rs_map["train"].fields + assert "y" in rs_map["test"].fields + + +# --------------------------------------------------------------------------- +# CroissantRecordSet content class tests +# --------------------------------------------------------------------------- + + +class TestCroissantRecordSet: + def test_import(self): + from projspec.content.data import CroissantRecordSet # noqa: F401 + + def test_public_import(self): + from projspec.content import CroissantRecordSet # noqa: F401 + + def test_fields(self, tmpdir): + files = {"croissant.json": json.dumps(MINIMAL_CROISSANT)} + proj = make_proj(tmpdir, files) + from projspec.content.data import CroissantRecordSet + + rs = CroissantRecordSet( + proj=proj, name="rs", description="desc", fields=["a", "b"] + ) + assert rs.name == "rs" + assert rs.description == "desc" + assert rs.fields == ["a", "b"] + + def test_to_dict(self, tmpdir): + files = {"croissant.json": json.dumps(MINIMAL_CROISSANT)} + proj = make_proj(tmpdir, files) + from projspec.content.data import CroissantRecordSet + + rs = CroissantRecordSet(proj=proj, name="rs", fields=["x"]) + d = rs.to_dict() + assert d["name"] == "rs" + assert d["fields"] == ["x"] + + +# --------------------------------------------------------------------------- +# _create() / round-trip test +# --------------------------------------------------------------------------- + + +class TestCroissantCreate: + def test_create_writes_file(self, tmp_path): + CroissantDataset._create(str(tmp_path)) + assert (tmp_path / "croissant.json").exists() + + def test_create_valid_json(self, tmp_path): + CroissantDataset._create(str(tmp_path)) + with open(tmp_path / "croissant.json") as f: + doc = json.load(f) + assert "dct:conformsTo" in doc + assert "mlcommons.org/croissant" in 
doc["dct:conformsTo"] + + def test_create_detected_by_project(self, tmp_path): + CroissantDataset._create(str(tmp_path)) + proj = projspec.Project(str(tmp_path)) + assert "croissant_dataset" in proj.specs + + def test_roundtrip_to_dict(self, tmp_path): + CroissantDataset._create(str(tmp_path)) + proj = projspec.Project(str(tmp_path)) + d = proj.to_dict(compact=False) + proj2 = projspec.Project.from_dict(d) + assert "croissant_dataset" in proj2.specs diff --git a/tests/test_qtapp_helpers.py b/tests/test_qtapp_helpers.py new file mode 100644 index 0000000..4fcb498 --- /dev/null +++ b/tests/test_qtapp_helpers.py @@ -0,0 +1,97 @@ +"""Unit tests for protocol-preserving helpers in the Qt app. + +These test the pure-Python helpers only, so they run without PyQt5 installed. +""" + +import json + +import fsspec +import pytest + +import projspec +from projspec.config import temp_conf +from projspec.library import ProjectLibrary +from projspec.qtapp.main import _rescan_path, _url_to_local + + +def _make_memory_project(root="/qt_rescan"): + mfs = fsspec.filesystem("memory") + try: + mfs.rm(root, recursive=True) + except FileNotFoundError: + pass + mfs.pipe(f"{root}/pyproject.toml", b'[project]\nname="x"\nversion="0.1"\n') + return mfs, root + + +class TestRescanPath: + def test_protocol_key_preferred(self): + # a protocol-qualified key is returned verbatim, even with no entry + assert _rescan_path("memory:///proj", None) == "memory:///proj" + assert _rescan_path("s3://bucket/key", None) == "s3://bucket/key" + + def test_protocol_key_preferred_over_local_fs_entry(self): + # Even if the stored entry's filesystem was (wrongly) reconstructed as + # local, the protocol-qualified key wins. 
+ mfs, root = _make_memory_project() + try: + # an old-format serialised entry: stripped url -> local fs + old_entry = { + "klass": "project", + "specs": {}, + "children": {}, + "contents": {}, + "artifacts": {}, + "url": root, + "storage_options": {}, + "file_count": 1, + "total_size": 10, + "is_writable": True, + "last_modified": None, + "last_modified_by": None, + "scanned_at": 1.0, + } + entry = projspec.Project.from_dict(old_entry) + assert entry.is_local() # the reconstructed fs is wrongly local + # ...but the key's protocol is honoured + assert _rescan_path(f"memory://{root}", entry) == f"memory://{root}" + finally: + mfs.rm(root, recursive=True) + + def test_local_key_uses_entry_unstrip(self): + # a key without a protocol falls back to the entry's protocol-qualified + # URL + mfs, root = _make_memory_project("/qt_rescan2") + try: + proj = projspec.Project(f"memory://{root}", walk=False) + # key has no protocol -> use the entry's unstrip_protocol + assert _rescan_path(root, proj) == f"memory://{root}" + finally: + mfs.rm(root, recursive=True) + + def test_no_protocol_no_entry_returns_key(self): + assert _rescan_path("/plain/path", None) == "/plain/path" + + +def test_rescan_path_roundtrip_reopens_remote(): + """A library entry saved+loaded then resolved via _rescan_path re-opens as + the correct (non-local) filesystem.""" + mfs, root = _make_memory_project("/qt_rt") + try: + proj = projspec.Project(f"memory://{root}", walk=False) + key = proj.fs.unstrip_protocol(proj.url) + # serialise + deserialise as the library would + dic = json.loads(json.dumps(proj.to_dict(compact=False))) + entry = projspec.Project.from_dict(dic) + path = _rescan_path(key, entry) + reopened = projspec.Project(path, walk=False) + assert not reopened.is_local() + assert "python_library" in reopened.specs + finally: + mfs.rm(root, recursive=True) + + +def test_url_to_local(): + assert _url_to_local("file:///tmp/x") == "/tmp/x" + assert _url_to_local("/tmp/x") == "/tmp/x" + assert 
_url_to_local("memory://proj") == "memory://proj" diff --git a/tests/test_roundtrips.py b/tests/test_roundtrips.py index a3159e4..0619e7d 100644 --- a/tests/test_roundtrips.py +++ b/tests/test_roundtrips.py @@ -49,6 +49,7 @@ "Dbt", "Quarto", "Nox", + "CroissantDataset", # Documentation — file-only _create() "MkDocs", "Sphinx",