Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ User Classes
proj.conda_workspace.CondaWorkspace
proj.datapackage.DVCRepo
proj.datapackage.DataPackage
proj.datapackage.CroissantDataset
proj.dataworkflows.Dbt
proj.dataworkflows.Quarto
proj.dataworkflows.Prefect
Expand Down Expand Up @@ -136,6 +137,7 @@ User Classes
.. autoclass:: projspec.proj.conda_workspace.CondaWorkspace
.. autoclass:: projspec.proj.datapackage.DVCRepo
.. autoclass:: projspec.proj.datapackage.DataPackage
.. autoclass:: projspec.proj.datapackage.CroissantDataset
.. autoclass:: projspec.proj.dataworkflows.Dbt
.. autoclass:: projspec.proj.dataworkflows.Quarto
.. autoclass:: projspec.proj.dataworkflows.Prefect
Expand Down Expand Up @@ -224,6 +226,7 @@ User Classes
content.cicd.ServiceDependency
content.data.IntakeSource
content.data.TabularData
content.data.CroissantRecordSet
content.data.DataResource
content.env_var.EnvironmentVariables
content.environment.Environment
Expand All @@ -241,6 +244,7 @@ User Classes
.. autoclass:: projspec.content.cicd.ServiceDependency
.. autoclass:: projspec.content.data.IntakeSource
.. autoclass:: projspec.content.data.DataResource
.. autoclass:: projspec.content.data.CroissantRecordSet
.. autoclass:: projspec.content.data.TabularData
.. autoclass:: projspec.content.env_var.EnvironmentVariables
.. autoclass:: projspec.content.environment.Environment
Expand Down
2 changes: 2 additions & 0 deletions src/projspec/content/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
ServiceDependency,
)
from projspec.content.data import (
CroissantRecordSet,
Dataset,
FrictionlessData,
IntakeSource,
Expand All @@ -24,6 +25,7 @@
__all__ = [
"BaseContent",
"CIWorkflow",
"CroissantRecordSet",
"GithubAction",
"PipelineStage",
"ServiceDependency",
Expand Down
26 changes: 26 additions & 0 deletions src/projspec/content/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,32 @@ class IntakeSource(BaseContent):
name: str


@dataclass
class CroissantRecordSet(BaseContent):
    """One ``RecordSet`` entry taken from a Croissant/JSON-LD metadata file.

    Croissant (http://mlcommons.org/croissant/1.0) is the ML Commons standard
    for describing ML datasets using JSON-LD.  A ``RecordSet`` is a named,
    structured collection of records (e.g. one table or one set of image
    annotations) within the dataset.

    Attributes
    ----------
    name:
        The ``name`` (or ``@id``) of the ``cr:RecordSet``.
    description:
        Optional free-text description of the record set; defaults to the
        empty string.
    fields:
        List of field names declared inside this record set; defaults to a
        new empty list.
    """

    # Emoji tag for this content type (presumably used for display; confirm
    # against BaseContent).
    icon = "🥐"

    name: str
    description: str = ""
    # default_factory gives every instance its own list instead of sharing one.
    fields: list = field(default_factory=list)


@dataclass
class Dataset(BaseContent):
"""A generic dataset discovered on disk and described by intake.
Expand Down
3 changes: 2 additions & 1 deletion src/projspec/proj/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from projspec.proj.conda_project import CondaProject
from projspec.proj.conda_workspace import CondaWorkspace
from projspec.proj.data_project import DataProject
from projspec.proj.datapackage import DataPackage, DVCRepo
from projspec.proj.datapackage import DataPackage, DVCRepo, CroissantDataset
from projspec.proj.dataworkflows import (
Airflow,
Dagster,
Expand Down Expand Up @@ -93,6 +93,7 @@
"CondaWorkspace",
"RattlerRecipe",
# Data
"CroissantDataset",
"DataPackage",
"DVCRepo",
"DataProject",
Expand Down
187 changes: 187 additions & 0 deletions src/projspec/proj/datapackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,3 +161,190 @@ def _create(path: str) -> None:
version: 2
"""
)


class CroissantDataset(ProjectSpec):
    """An ML Commons Croissant dataset described by a JSON-LD metadata file.

    Croissant (http://mlcommons.org/croissant/1.0) is the standard format for
    describing ML datasets using JSON-LD / schema.org vocabulary.  It captures
    dataset-level metadata (name, description, license, citation) as well as a
    structured schema of the data via ``RecordSet`` and ``Field`` objects.

    Detection heuristic
    -------------------
    1. Consider every project basename accepted by ``_CROISSANT_NAMES``: any
       ``.json`` / ``.jsonld`` file whose name contains ``croissant``
       (case-insensitive), or that is exactly ``metadata.json(ld)``.
    2. Read the candidate and accept it when its text contains the Croissant
       conformance marker ``mlcommons.org/croissant`` (normally found in the
       ``conformsTo`` value).  This is a plain substring test; the document is
       not parsed as JSON at this stage.

    Only the single metadata file is read; the data files it describes are
    never opened.
    """

    icon = "🥐"
    spec_doc = "https://docs.mlcommons.org/croissant/docs/croissant-spec.html"

    # Basename accepted by match(); reused in parse() to avoid re-scanning.
    _matched_file: str | None = None
    # NOTE: the second alternative already accepts any name that contains
    # "croissant", so the first alternative is redundant but harmless.
    _CROISSANT_NAMES = re.compile(
        r"^(croissant.*|.*[-_]?croissant[-_]?.*|metadata)\.json(ld)?$",
        re.IGNORECASE,
    )
    _CROISSANT_CONFORMSTO = "mlcommons.org/croissant"

    @staticmethod
    def _dict_items(value) -> list:
        """Normalise a JSON-LD value to a list of its dict members.

        JSON-LD allows a property to hold a single object or a list of
        objects.  Anything that is not a dict (string references, numbers,
        ``None``) is dropped so callers can use ``.get`` safely.
        """
        if isinstance(value, dict):
            return [value]
        if isinstance(value, list):
            return [item for item in value if isinstance(item, dict)]
        return []

    def _has_conformance_marker(self, basename: str) -> bool:
        """Return True when the file *basename* contains the Croissant marker.

        The object returned by ``self.proj.get_file`` is read in full and then
        closed (when it exposes ``close``), so no handle is left open.
        """
        fobj = self.proj.get_file(basename)
        if fobj is None:
            return False
        try:
            text = fobj.read()
        finally:
            # Guarded lookup: only ``read()`` is known to exist on this object.
            close = getattr(fobj, "close", None)
            if callable(close):
                close()
        if isinstance(text, bytes):
            text = text.decode("utf-8", errors="replace")
        return self._CROISSANT_CONFORMSTO in text

    def match(self) -> bool:
        """Return True when a plausible Croissant JSON-LD file is present.

        On success the matching basename is remembered in ``_matched_file``
        for use by :meth:`parse`.
        """
        for basename in self.proj.basenames:
            if not self._CROISSANT_NAMES.match(basename):
                continue
            try:
                found = self._has_conformance_marker(basename)
            except Exception:
                # Deliberately best-effort: an unreadable candidate must not
                # prevent other candidates (or other specs) from matching.
                continue
            if found:
                self._matched_file = basename
                return True
        return False

    def parse(self) -> None:
        """Populate ``_contents`` from the Croissant file found by match().

        Raises
        ------
        ParseFailed
            If no file was identified by :meth:`match`, if the file is not
            valid JSON, or if its top level is not a JSON object.
        """
        import json

        from projspec.content import DescriptiveMetadata, License, Citation
        from projspec.content.data import CroissantRecordSet
        from projspec.utils import AttrDict

        if self._matched_file is None:
            raise ParseFailed("No Croissant file identified")

        self._contents = AttrDict()
        self._artifacts = AttrDict()

        # Read bytes and let ``json`` detect the encoding (UTF-8/16/32), so the
        # result does not depend on the platform's default text encoding.
        try:
            with self.proj.fs.open(self.proj.basenames[self._matched_file], "rb") as f:
                meta = json.load(f)
        except ValueError as err:
            # Covers json.JSONDecodeError and UnicodeDecodeError.
            raise ParseFailed(
                f"Croissant file {self._matched_file!r} is not valid JSON"
            ) from err
        if not isinstance(meta, dict):
            raise ParseFailed(
                f"Croissant file {self._matched_file!r} is not a JSON object"
            )

        # --- dataset-level metadata ---
        dm_fields = {
            "name",
            "description",
            "url",
            "version",
            "datePublished",
            "dateCreated",
            "dateModified",
            "keywords",
            "inLanguage",
        }
        self._contents["descriptive_metadata"] = DescriptiveMetadata(
            proj=self.proj,
            meta={k: str(v) for k, v in meta.items() if k in dm_fields and v},
        )

        # --- license ---
        lic_raw = meta.get("license")
        if lic_raw:
            # license may be a string URL or a dict with @id / name; any other
            # shape (e.g. a list of licenses) is ignored.
            if isinstance(lic_raw, str):
                self._contents["license"] = License(
                    proj=self.proj, shortname=lic_raw, url=lic_raw
                )
            elif isinstance(lic_raw, dict):
                self._contents["license"] = License(
                    proj=self.proj,
                    shortname=lic_raw.get("name", lic_raw.get("@id", "")),
                    url=lic_raw.get("@id", lic_raw.get("url", "")),
                )

        # --- citation ---
        cite_raw = meta.get("citeAs") or meta.get("citation")
        if cite_raw:
            self._contents["citation"] = Citation(
                proj=self.proj,
                meta={"citeAs": cite_raw} if isinstance(cite_raw, str) else cite_raw,
            )

        # --- record sets ---
        # Both the plain and the ``cr:``-prefixed spellings are accepted.
        record_sets = {}
        for rs in self._dict_items(meta.get("recordSet") or meta.get("cr:recordSet")):
            rs_id = rs.get("name") or rs.get("@id", "")
            field_names = [
                fld.get("name") or fld.get("@id", "")
                for fld in self._dict_items(rs.get("field") or rs.get("cr:field"))
            ]
            # Record sets sharing a name overwrite each other (last one wins).
            record_sets[rs_id] = CroissantRecordSet(
                proj=self.proj,
                name=rs_id,
                description=rs.get("description", ""),
                fields=field_names,
            )

        if record_sets:
            self._contents["croissant_record_set"] = AttrDict(record_sets)

    @staticmethod
    def _create(path: str) -> None:
        """Write a minimal Croissant metadata skeleton to ``<path>/croissant.json``.

        The ``@context`` declares every prefix and Croissant term the document
        uses (``sc``, ``cr``, ``dct``, ``recordSet``, ``field``, ``dataType``)
        so that the compact names resolve to the Croissant 1.0 vocabulary
        instead of falling through to the schema.org ``@vocab``.
        """
        import json

        doc = {
            "@context": {
                "@language": "en",
                "@vocab": "https://schema.org/",
                "sc": "https://schema.org/",
                "cr": "http://mlcommons.org/croissant/",
                "dct": "http://purl.org/dc/terms/",
                "recordSet": "cr:recordSet",
                "field": "cr:field",
                "dataType": {"@id": "cr:dataType", "@type": "@vocab"},
            },
            "@type": "sc:Dataset",
            "name": "my-dataset",
            "description": "A short description of the dataset.",
            "license": "https://creativecommons.org/licenses/by/4.0/",
            "url": "https://example.com/my-dataset",
            "dct:conformsTo": "http://mlcommons.org/croissant/1.0",
            "distribution": [
                {
                    "@type": "cr:FileObject",
                    "@id": "data.csv",
                    "contentUrl": "data.csv",
                    "encodingFormat": "text/csv",
                }
            ],
            "recordSet": [
                {
                    "@type": "cr:RecordSet",
                    "@id": "records",
                    "name": "records",
                    "field": [
                        {
                            "@type": "cr:Field",
                            "@id": "records/id",
                            "name": "id",
                            "dataType": "sc:Integer",
                        },
                        {
                            "@type": "cr:Field",
                            "@id": "records/value",
                            "name": "value",
                            "dataType": "sc:Text",
                        },
                    ],
                }
            ],
        }
        with open(f"{path}/croissant.json", "wt", encoding="utf-8") as f:
            json.dump(doc, f, indent=2)
Loading
Loading