From 81148a5aee92135b66f1c346b635157322fdd3b4 Mon Sep 17 00:00:00 2001 From: Dan Birman Date: Thu, 19 Mar 2026 19:37:17 -0700 Subject: [PATCH 1/2] feat: define a new core file "files.json" --- .../behavior-videos/FaceCamera/metadata.csv | 0 .../behavior-videos/FaceCamera/video.mp4 | 0 .../behavior-videos/FrontCamera/metadata.csv | 0 .../behavior-videos/FrontCamera/video.mp4 | 0 .../behavior-videos/SideCamera/metadata.csv | 0 .../behavior-videos/SideCamera/video.mp4 | 0 scripts/files-test/test_behavior_videos.py | 67 ++++++++++++ src/aind_data_schema/core/files.py | 101 ++++++++++++++++++ 8 files changed, 168 insertions(+) create mode 100644 scripts/files-test/behavior-videos/FaceCamera/metadata.csv create mode 100644 scripts/files-test/behavior-videos/FaceCamera/video.mp4 create mode 100644 scripts/files-test/behavior-videos/FrontCamera/metadata.csv create mode 100644 scripts/files-test/behavior-videos/FrontCamera/video.mp4 create mode 100644 scripts/files-test/behavior-videos/SideCamera/metadata.csv create mode 100644 scripts/files-test/behavior-videos/SideCamera/video.mp4 create mode 100644 scripts/files-test/test_behavior_videos.py create mode 100644 src/aind_data_schema/core/files.py diff --git a/scripts/files-test/behavior-videos/FaceCamera/metadata.csv b/scripts/files-test/behavior-videos/FaceCamera/metadata.csv new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/files-test/behavior-videos/FaceCamera/video.mp4 b/scripts/files-test/behavior-videos/FaceCamera/video.mp4 new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/files-test/behavior-videos/FrontCamera/metadata.csv b/scripts/files-test/behavior-videos/FrontCamera/metadata.csv new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/files-test/behavior-videos/FrontCamera/video.mp4 b/scripts/files-test/behavior-videos/FrontCamera/video.mp4 new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/files-test/behavior-videos/SideCamera/metadata.csv b/scripts/files-test/behavior-videos/SideCamera/metadata.csv new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/files-test/behavior-videos/SideCamera/video.mp4 b/scripts/files-test/behavior-videos/SideCamera/video.mp4 new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/files-test/test_behavior_videos.py b/scripts/files-test/test_behavior_videos.py new file mode 100644 index 000000000..25236f9c2 --- /dev/null +++ b/scripts/files-test/test_behavior_videos.py @@ -0,0 +1,67 @@ +"""Integration test: load files.json as a data instance and validate +that the described files exist in the behavior-videos folder. + +Fake empty files mirror the real S3 layout: + aind-open-data/behavior_811026_2025-12-01_22-31-16/behavior-videos/ +""" + +import unittest +from pathlib import Path + +from aind_data_schema.core.files import BehaviorVideoFiles + +BEHAVIOR_VIDEOS = Path(__file__).parent / "behavior-videos" +FILES_JSON = BEHAVIOR_VIDEOS / "files.json" + + +class TestBehaviorVideosFolderValidation(unittest.TestCase): + def test_round_trip(self): + serialized = FILES_JSON.read_text() + spec = BehaviorVideoFiles.model_validate_json(serialized) + reserialized = spec.model_dump_json() + BehaviorVideoFiles.model_validate_json(reserialized) + + def test_valid_folder_has_no_errors(self): + spec = BehaviorVideoFiles.model_validate_json(FILES_JSON.read_text()) + errors = spec.validate_folder(BEHAVIOR_VIDEOS) + self.assertEqual(errors, [], f"Unexpected errors: {errors}") + + def test_missing_video_detected(self): + spec = BehaviorVideoFiles.from_standard() + missing_dir = BEHAVIOR_VIDEOS.parent / "missing-video" + missing_dir.mkdir(exist_ok=True) + cam = missing_dir / "FaceCamera" + cam.mkdir(exist_ok=True) + (cam / "metadata.csv").touch() + try: + errors = spec.validate_folder(missing_dir) + self.assertTrue( + any("video" in e.lower() for e in errors), + f"Expected missing-video error, got: {errors}", + ) + finally: + (cam / "metadata.csv").unlink() + cam.rmdir() + missing_dir.rmdir() + + def test_missing_metadata_detected(self): + spec = BehaviorVideoFiles.from_standard() + missing_dir = BEHAVIOR_VIDEOS.parent / "missing-metadata" + missing_dir.mkdir(exist_ok=True) + cam = missing_dir / "FrontCamera" + cam.mkdir(exist_ok=True) + (cam / "video.mp4").touch() + try: + errors = spec.validate_folder(missing_dir) + self.assertTrue( + any("metadata" in e.lower() for e in errors), + f"Expected missing-metadata error, got: {errors}", + ) + finally: + (cam / "video.mp4").unlink() + cam.rmdir() + missing_dir.rmdir() + + +if __name__ == "__main__": + unittest.main() diff --git a/src/aind_data_schema/core/files.py b/src/aind_data_schema/core/files.py new file mode 100644 index 000000000..5a9614353 --- /dev/null +++ b/src/aind_data_schema/core/files.py @@ -0,0 +1,101 @@ +import fnmatch +from pathlib import Path +from typing import List, Literal, Optional, Union + +from pydantic import Field, SkipValidation + +from aind_data_schema.base import DataCoreModel, DataModel + + +class FileSet(DataModel): + """A set of files matching a glob pattern, inspired by Croissant cr:FileSet""" + + name: str = Field(..., title="Name") + description: Optional[str] = Field(default=None, title="Description") + encoding_format: Optional[str] = Field(default=None, title="Encoding format (MIME type)") + includes: Union[str, List[str]] = Field(..., title="Glob pattern(s) for included files") + excludes: Optional[Union[str, List[str]]] = Field(default=None, title="Glob pattern(s) for excluded files") + + +class Files(DataCoreModel): + """Description of the expected file organization for a data asset folder""" + + _DESCRIBED_BY_URL = DataCoreModel._DESCRIBED_BY_BASE_URL.default + "aind_data_schema/core/files.py" + describedBy: str = Field(default=_DESCRIBED_BY_URL, json_schema_extra={"const": _DESCRIBED_BY_URL}) + schema_version: SkipValidation[Literal["0.1.0"]] = Field(default="0.1.0") + + file_sets: List[FileSet] = Field(..., title="File sets", min_length=1) + + def validate_folder(self, folder: Path) -> List[str]: + """Check that the file sets match actual files in the given folder. + + Returns a list of error strings. Empty list means valid. + """ + relative_paths = [ + str(p.relative_to(folder)).replace("\\", "/") + for p in folder.rglob("*") + if p.is_file() and p.name != "files.json" + ] + errors = [] + for file_set in self.file_sets: + patterns = [file_set.includes] if isinstance(file_set.includes, str) else file_set.includes + for pattern in patterns: + matches = [p for p in relative_paths if fnmatch.fnmatch(p, pattern)] + if not matches: + errors.append( + f"FileSet '{file_set.name}': no files matching pattern '{pattern}'" + ) + return errors + + +class BehaviorVideoFiles(Files): + """File organization for AIND behavior videos. + + Expected folder structure: + behavior-videos/ + / + metadata.csv + video. + """ + + @classmethod + def from_standard(cls) -> "BehaviorVideoFiles": + return cls( + file_sets=[ + FileSet( + name="Metadata CSV Files", + description="Per-camera metadata CSV files", + encoding_format="text/csv", + includes="*/metadata.csv", + ), + FileSet( + name="Video Files", + description="Per-camera video files", + includes="*/video.*", + ), + ], + ) + + def validate_folder(self, folder: Path) -> List[str]: + errors = super().validate_folder(folder) + relative_paths = [ + str(p.relative_to(folder)).replace("\\", "/") + for p in folder.rglob("*") + if p.is_file() and p.name != "files.json" + ] + + camera_dirs: dict[str, list[str]] = {} + for p in relative_paths: + parts = p.split("/", 1) + if len(parts) == 2: + camera_dirs.setdefault(parts[0], []).append(parts[1]) + + for camera_name, files in camera_dirs.items(): + has_metadata = "metadata.csv" in files + has_video = any(f.startswith("video.") for f in files) + if has_metadata and not has_video: + errors.append(f"Camera '{camera_name}': has metadata.csv but no video file") + if has_video and not has_metadata: + errors.append(f"Camera '{camera_name}': has video file but no metadata.csv") + + return errors From 4de50b3738d222b04f0d313492aca412d3b63c36 Mon Sep 17 00:00:00 2001 From: Dan Birman Date: Thu, 19 Mar 2026 19:45:28 -0700 Subject: [PATCH 2/2] feat: add to_croissant() --- pyproject.toml | 1 + src/aind_data_schema/core/files.py | 85 +++++++++++++++++++++++++++++- 2 files changed, 84 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b1f36d949..cd2b6664b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ dev = [ 'dictdiffer', 'semver', 'argparse', + 'mlcroissant', ] linters = [ diff --git a/src/aind_data_schema/core/files.py b/src/aind_data_schema/core/files.py index 5a9614353..ec0e99968 100644 --- a/src/aind_data_schema/core/files.py +++ b/src/aind_data_schema/core/files.py @@ -1,6 +1,8 @@ import fnmatch +import json +import re from pathlib import Path -from typing import List, Literal, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Union from pydantic import Field, SkipValidation @@ -12,10 +14,28 @@ class FileSet(DataModel): name: str = Field(..., title="Name") description: Optional[str] = Field(default=None, title="Description") - encoding_format: Optional[str] = Field(default=None, title="Encoding format (MIME type)") + encoding_format: str = Field(..., title="Encoding format (MIME type)") includes: Union[str, List[str]] = Field(..., title="Glob pattern(s) for included files") excludes: Optional[Union[str, List[str]]] = Field(default=None, title="Glob pattern(s) for excluded files") + def _croissant_id(self) -> str: + return re.sub(r"[^a-z0-9]+", "-", self.name.lower()).strip("-") + + def to_croissant(self) -> Dict[str, Any]: + entry: Dict[str, Any] = { + "@type": "cr:FileSet", + "@id": self._croissant_id(), + "name": self.name, + "includes": self.includes, + } + if self.description: + entry["description"] = self.description + if self.encoding_format: + entry["encodingFormat"] = self.encoding_format + if self.excludes: + entry["excludes"] = self.excludes + return entry + class Files(DataCoreModel): """Description of the expected file organization for a data asset folder""" @@ -47,6 +67,66 @@ def validate_folder(self, folder: Path) -> List[str]: ) return errors + def to_croissant(self) -> Dict[str, Any]: + """Convert this Files instance to a Croissant JSON-LD dict.""" + return { + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "sc": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "dct": "http://purl.org/dc/terms/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "data": {"@id": "cr:data", "@type": "@json"}, + "dataType": {"@id": "cr:dataType", "@type": "@vocab"}, + "examples": {"@id": "cr:examples", "@type": "@json"}, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "excludes": "cr:excludes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": self.default_filename().replace(".json", ""), + "version": self.schema_version, + "distribution": [fs.to_croissant() for fs in self.file_sets], + } + + def to_croissant_json(self) -> str: + """Serialize the Croissant JSON-LD to a string.""" + return json.dumps(self.to_croissant(), indent=3) + + def write_croissant_file(self, output_directory: Path) -> Path: + """Write a Croissant JSON-LD file alongside the data.""" + output_directory = Path(output_directory) + output_directory.mkdir(parents=True, exist_ok=True) + out = output_directory / "files_croissant.json" + out.write_text(self.to_croissant_json()) + return out + class BehaviorVideoFiles(Files): """File organization for AIND behavior videos. @@ -71,6 +151,7 @@ def from_standard(cls) -> "BehaviorVideoFiles": FileSet( name="Video Files", description="Per-camera video files", + encoding_format="video/mp4", includes="*/video.*", ), ],