diff --git a/README.md b/README.md index b42858b..134e0ec 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,23 @@ flags apply only when `--format lerobot_v2.1` or `--format gr00t`. The `gr00t` format produces a LeRobot v2.1 dataset plus a GR00T-compatible `meta/modality.json` (see [Isaac-GR00T data preparation](https://github.com/NVIDIA/Isaac-GR00T/blob/main/getting_started/data_preparation.md)). +Upload a dataset to the Hugging Face Hub: + +```bash +openarm-dataset-upload <dataset_dir> \ + --repo-id <username>/<dataset-name> \ + [--private] # create the repo as private if it does not exist +``` + +The whole dataset directory is uploaded to a +[dataset repository](https://huggingface.co/docs/hub/datasets), creating it if it +does not already exist, and tagged with the dataset version. Cameras stored as +directories of JPEG files are repacked **in place** into one `.tar` archive per +camera before uploading, to stay within [Hugging Face Hub's file-count +recommendations](https://huggingface.co/docs/hub/storage-limits#recommendations). +Repacking is lossless and reversible (`openarm-dataset-convert --camera-format dir` +restores the JPEG-directory layout). 
+ ## Development ### Test diff --git a/pyproject.toml b/pyproject.toml index 33364da..02e1a70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,8 @@ readme = "README.md" requires-python = ">=3.10" dependencies = [ + "huggingface_hub", + "jinja2", "numpy", "pandas", "pillow", @@ -58,6 +60,7 @@ Repository = "https://github.com/enactic/openarm_dataset.git" openarm-dataset-convert = "openarm_dataset.convert:main" openarm-dataset-merge = "openarm_dataset.merge:main" openarm-dataset-repair = "openarm_dataset.repair:main" +openarm-dataset-upload = "openarm_dataset.upload:main" openarm-dataset-validate = "openarm_dataset.validate:main" [dependency-groups] @@ -72,6 +75,9 @@ package-dir = {"" = "src"} [tool.setuptools.packages.find] where = ["src"] +[tool.setuptools.package-data] +openarm_dataset = ["card_template.md"] + [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/src/openarm_dataset/camera.py b/src/openarm_dataset/camera.py index 2c0575b..352a24f 100644 --- a/src/openarm_dataset/camera.py +++ b/src/openarm_dataset/camera.py @@ -197,6 +197,11 @@ def num_frames(self) -> int: else: return len(self.all_files) + @property + def format(self) -> str: + """Return the camera storage format: "tar" if ``tar_path`` is set, else "dir".""" + return "dir" if self.tar_path is None else "tar" + def get_frame(self, index: int) -> Frame: """Get frame at the index. 
@@ -247,7 +252,7 @@ def write(self, output: os.PathLike, format): """ if format == "dir": dest_dir = Path(output) - if self.tar_path is None: + if self.format == "dir": shutil.copytree(self.base_path, dest_dir) return dest_dir.mkdir(parents=True) @@ -263,7 +268,7 @@ def write(self, output: os.PathLike, format): elif format == "tar": dest_tar = Path(output).with_suffix(".tar") dest_tar.parent.mkdir(parents=True, exist_ok=True) - if self.tar_path is not None: + if self.format == "tar": shutil.copy2(self.tar_path, dest_tar) return with tarfile.open(dest_tar, mode="w") as tf: diff --git a/src/openarm_dataset/card_template.md b/src/openarm_dataset/card_template.md new file mode 100644 index 0000000..421665b --- /dev/null +++ b/src/openarm_dataset/card_template.md @@ -0,0 +1,29 @@ +--- +# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1 +# Doc / guide: https://huggingface.co/docs/hub/datasets-cards +# prettier-ignore +{{card_data}} +--- + +This dataset was uploaded using [OpenArm Dataset](https://github.com/enactic/openarm_dataset). 
+
+
+## Dataset Description
+
+{{ dataset_description | default("", true) }}
+
+- **Homepage:** {{ url | default("[More Information Needed]", true)}}
+- **Paper:** {{ paper | default("[More Information Needed]", true)}}
+- **License:** {{ license | default("[More Information Needed]", true)}}
+
+## Dataset Structure
+
+{{ dataset_structure | default("[More Information Needed]", true)}}
+
+## Citation
+
+**BibTeX:**
+
+```bibtex
+{{ citation_bibtex | default("[More Information Needed]", true)}}
+```
diff --git a/src/openarm_dataset/dataset.py b/src/openarm_dataset/dataset.py
index 5585d1c..b50c07e 100644
--- a/src/openarm_dataset/dataset.py
+++ b/src/openarm_dataset/dataset.py
@@ -131,6 +131,30 @@ def camera_names(self) -> list[str]:
             return self._camera_names
         return list(self.meta.equipment.perceptions.cameras)
 
+    @property
+    def camera_format(self) -> str:
+        """Return the camera format ("dir" or "tar") shared by all cameras.
+
+        Every camera in the dataset is expected to use the same format.
+
+        Raises:
+            ValueError: If cameras use a mix of "dir" and "tar", or if the
+                dataset has no cameras to derive a format from.
+
+        """
+        formats = {
+            self.load_camera(name, episode).format
+            for episode in self.meta.episodes
+            for name in self.camera_names
+        }
+        if not formats:
+            # An empty set would otherwise surface as an opaque KeyError from
+            # set.pop(); report the real cause instead.
+            raise ValueError("Dataset has no cameras; camera format is undefined")
+        if len(formats) > 1:
+            raise ValueError(f"Inconsistent camera formats: {sorted(formats)}")
+        return formats.pop()
+
     def _episode_id(self, index: int) -> str:
         return self.meta.episodes[index]["id"]
 
diff --git a/src/openarm_dataset/upload.py b/src/openarm_dataset/upload.py
new file mode 100644
index 0000000..23b1bd6
--- /dev/null
+++ b/src/openarm_dataset/upload.py
@@ -0,0 +1,236 @@
+# Copyright 2026 Enactic, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Upload OpenArm Dataset to Hugging Face Hub."""
+
+import argparse
+import contextlib
+import importlib.resources
+import pathlib
+import shutil
+import sys
+
+from huggingface_hub import DatasetCard, DatasetCardData, HfApi
+from huggingface_hub.errors import RevisionNotFoundError
+
+from .dataset import Dataset
+
+
+def pack_cameras_as_tar(dataset: Dataset) -> None:
+    """Repack every "dir"-format camera into a sibling ".tar" archive in place.
+
+    Each ``cameras/<name>/`` directory of JPEG frames is replaced by one
+    uncompressed ``cameras/<name>.tar`` archive. Packing is lossless and
+    reversible: ``Dataset`` reads either layout through the same API.
+
+    Args:
+        dataset: The dataset to repack in place.
+
+    """
+    for episode in dataset.meta.episodes:
+        for camera in dataset.load_cameras(episode).values():
+            if camera.format == "tar":
+                continue
+            # Write the archive first; the frame directory is removed only
+            # after the write returns, so a failed write keeps the frames.
+            camera.write(camera.base_path, "tar")
+            shutil.rmtree(camera.base_path)
+
+
+def create_dataset_card(
+    tags: list | None = None,
+    metadata_yaml: str | None = None,
+    camera_names: list[str] | None = None,
+    **kwargs,
+) -> DatasetCard:
+    """Create a `DatasetCard` for an OpenArm Dataset.
+
+    Args:
+        tags (list | None): A list of tags to add to the dataset card.
+        metadata_yaml (str | None): The dataset's ``metadata.yaml`` contents,
+            embedded verbatim on the card.
+        camera_names (list[str] | None): Camera names to expose as dataset
+            viewer configs. Each becomes a WebDataset config so the camera
+            frames are browsable on the Hugging Face Hub.
+        **kwargs: Additional keyword arguments to populate the card template.
+
+    Returns:
+        DatasetCard: The generated dataset card object.
+
+    """
+    card_tags = ["OpenArm"]
+
+    if tags:
+        card_tags += tags
+    if metadata_yaml:
+        dataset_structure = "[metadata.yaml](metadata.yaml):\n"
+        dataset_structure += f"```yaml\n{metadata_yaml}\n```\n"
+        kwargs = {**kwargs, "dataset_structure": dataset_structure}
+    configs = [
+        {
+            "config_name": name,
+            "data_files": f"episodes/*/cameras/{name}.tar",  # for dataset viewer
+        }
+        for name in (camera_names or [])
+    ]
+    card_data = DatasetCardData(
+        license=kwargs.get("license"),
+        tags=card_tags,
+        task_categories=["robotics"],
+        configs=configs or None,
+    )
+
+    card_template = (
+        importlib.resources.files("openarm_dataset")
+        .joinpath("card_template.md")
+        .read_text(encoding="utf-8")
+    )
+
+    return DatasetCard.from_template(
+        card_data=card_data, template_str=card_template, **kwargs
+    )
+
+
+def upload_dataset(
+    input_path: pathlib.Path,
+    repo_id: str,
+    branch: str = "main",
+    tag: str | None = None,
+    metadata_yaml: str | None = None,
+    licence: str | None = None,
+    camera_names: list[str] | None = None,
+    private: bool = False,
+    upload_large_folder: bool = False,
+) -> None:
+    """Upload an OpenArm Dataset directory to the Hugging Face Hub.
+
+    Creates the dataset repository if it does not exist, then uploads the whole
+    directory. Camera frames are never uploaded as loose image files; pack them
+    into ``.tar`` archives first (see ``Dataset.write(camera_format="tar")``).
+
+    Args:
+        input_path: Path of the OpenArm Dataset directory to upload.
+        repo_id: Target repository id, e.g. ``username/dataset-name``.
+        branch: Branch (revision) to upload to.
+        tag: If given, create this tag on ``branch`` after the upload.
+        metadata_yaml: The dataset's ``metadata.yaml`` contents, shown verbatim
+            on the dataset card.
+        licence: Licence identifier recorded on the dataset card.
+        camera_names: Camera names to expose as dataset viewer configs so the
+            camera frames are browsable on the Hugging Face Hub.
+        private: Create the repository as private when it does not exist.
+        upload_large_folder: Use ``upload_large_folder`` for a resumable,
+            multi-threaded upload of large datasets.
+
+    """
+    hf_api = HfApi()
+    hf_api.create_repo(
+        repo_id=repo_id,
+        repo_type="dataset",
+        private=private,
+        exist_ok=True,
+    )
+    # Never upload camera frames as loose image files; they belong in .tar
+    # archives to stay within Hugging Face Hub's per-repository file-count limit.
+    ignore_patterns = ["*.jpeg", "*.jpg", "*.png"]
+    upload_kwargs = {
+        "repo_id": repo_id,
+        "folder_path": str(input_path),
+        "repo_type": "dataset",
+        "revision": branch,
+        "ignore_patterns": ignore_patterns,
+    }
+    if upload_large_folder:
+        hf_api.upload_large_folder(**upload_kwargs)
+    else:
+        hf_api.upload_folder(**upload_kwargs)
+
+    card = create_dataset_card(
+        metadata_yaml=metadata_yaml,
+        license=licence,
+        camera_names=camera_names,
+    )
+    card.push_to_hub(
+        repo_id=repo_id,
+        repo_type="dataset",
+        revision=branch,
+    )
+    if tag is not None:
+        with contextlib.suppress(RevisionNotFoundError):
+            hf_api.delete_tag(repo_id, tag=tag, repo_type="dataset")
+        hf_api.create_tag(
+            repo_id, tag=tag, revision=branch, repo_type="dataset", exist_ok=True
+        )
+
+
+def main():
+    """Upload OpenArm Dataset to Hugging Face Hub."""
+    parser = argparse.ArgumentParser(
+        description="Upload an OpenArm Dataset to the Hugging Face Hub"
+    )
+    parser.add_argument(
+        "input",
+        help="Path of an OpenArm Dataset to upload",
+        type=pathlib.Path,
+    )
+    parser.add_argument(
+        "--repo-id",
+        required=True,
+        help="Target Hugging Face dataset repository id, e.g. username/dataset-name",
+    )
+    parser.add_argument(
+        "--private",
+        action="store_true",
+        default=False,
+        help="Create the repository as private if it does not exist",
+    )
+    parser.add_argument(
+        "--licence",
+        default="apache-2.0",
+        help="The licence to associate with the dataset on the Hugging Face Hub. "
+        "Defaults to Apache-2.0.",
+    )
+    parser.add_argument(
+        "--large-folder",
+        action="store_true",
+        default=False,
+        help="Use a resumable, multi-threaded upload for large datasets. "
+        "Recommended for datasets larger than 1 GB.",
+    )
+    args = parser.parse_args()
+
+    dataset = Dataset(args.input)
+
+    if dataset.camera_format == "dir":
+        print(
+            "Packing camera frames into .tar archives in place before upload "
+            "(Hugging Face Hub file-count recommendation)...",
+            file=sys.stderr,
+        )
+        pack_cameras_as_tar(dataset)
+
+    upload_dataset(
+        args.input,
+        args.repo_id,
+        tag=dataset.meta.version,
+        metadata_yaml=(args.input / "metadata.yaml").read_text(encoding="utf-8"),
+        licence=args.licence,
+        camera_names=dataset.camera_names,
+        upload_large_folder=args.large_folder,
+        private=args.private,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_dataset_tar.py b/tests/test_dataset_tar.py
index 91d2881..e5f96d1 100644
--- a/tests/test_dataset_tar.py
+++ b/tests/test_dataset_tar.py
@@ -85,3 +85,20 @@ def test_tar_input_roundtrips_to_dir(tmp_path):
     assert camera.tar_path is None
     assert camera.num_frames == 3
     assert camera.get_frame(0).load().shape == (600, 960, 3)
+
+
+def test_camera_format_dir():
+    assert Dataset(DATASET_DIR).camera_format == "dir"
+
+
+def test_camera_format_tar(tar_dataset):
+    assert tar_dataset.camera_format == "tar"
+
+
+def test_camera_format_inconsistent_raises(tmp_path):
+    out = tmp_path / "mixed"
+    Dataset(DATASET_DIR).write(out, format="openarm", camera_format="tar")
+    # Turn one camera back into "dir" layout so the dataset mixes both formats.
+    (out / "episodes" / "0" / "cameras" / "head").mkdir()
+    with pytest.raises(ValueError, match="Inconsistent camera formats"):
+        Dataset(out).camera_format