From 781072aadf41737a22df4fe4ad732997a5c7f2a7 Mon Sep 17 00:00:00 2001 From: k1000dai Date: Wed, 17 Jun 2026 15:00:05 +0900 Subject: [PATCH 1/5] upload script --- README.md | 18 ++ pyproject.toml | 6 + src/openarm_dataset/camera.py | 9 +- src/openarm_dataset/card_template.md | 29 ++++ src/openarm_dataset/dataset.py | 18 ++ src/openarm_dataset/upload.py | 242 +++++++++++++++++++++++++++ tests/test_dataset_tar.py | 17 ++ 7 files changed, 337 insertions(+), 2 deletions(-) create mode 100644 src/openarm_dataset/card_template.md create mode 100644 src/openarm_dataset/upload.py diff --git a/README.md b/README.md index b42858b..742a36b 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,24 @@ flags apply only when `--format lerobot_v2.1` or `--format gr00t`. The `gr00t` format produces a LeRobot v2.1 dataset plus a GR00T-compatible `meta/modality.json` (see [Isaac-GR00T data preparation](https://github.com/NVIDIA/Isaac-GR00T/blob/main/getting_started/data_preparation.md)). +Upload a dataset to the Hugging Face Hub: + +```bash +openarm-dataset-upload \ + --repo-id <username>/<dataset-name> \ + [--private] # create the repo as private if it does not exist +``` + +Validates the dataset first and exits with status `1` if any errors are reported, +before contacting the Hub. The whole dataset directory is uploaded to a +[dataset repository](https://huggingface.co/docs/hub/datasets), creating it if it +does not already exist, and tagged with the dataset version. Cameras stored as +directories of JPEG files are repacked **in place** into one `.tar` archive per +camera before uploading, to stay within [Hugging Face Hub's file-count +recommendations](https://huggingface.co/docs/hub/storage-limits#recommendations). +Repacking is lossless and reversible (`openarm-dataset-convert --camera-format dir` +restores the JPEG-directory layout). 
+ ## Development ### Test diff --git a/pyproject.toml b/pyproject.toml index 33364da..02e1a70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,8 @@ readme = "README.md" requires-python = ">=3.10" dependencies = [ + "huggingface_hub", + "jinja2", "numpy", "pandas", "pillow", @@ -58,6 +60,7 @@ Repository = "https://github.com/enactic/openarm_dataset.git" openarm-dataset-convert = "openarm_dataset.convert:main" openarm-dataset-merge = "openarm_dataset.merge:main" openarm-dataset-repair = "openarm_dataset.repair:main" +openarm-dataset-upload = "openarm_dataset.upload:main" openarm-dataset-validate = "openarm_dataset.validate:main" [dependency-groups] @@ -72,6 +75,9 @@ package-dir = {"" = "src"} [tool.setuptools.packages.find] where = ["src"] +[tool.setuptools.package-data] +openarm_dataset = ["card_template.md"] + [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/src/openarm_dataset/camera.py b/src/openarm_dataset/camera.py index 2c0575b..352a24f 100644 --- a/src/openarm_dataset/camera.py +++ b/src/openarm_dataset/camera.py @@ -197,6 +197,11 @@ def num_frames(self) -> int: else: return len(self.all_files) + @property + def format(self) -> str: + """Get camera format, either "dir" or "tar".""" + return "tar" if self.tar_path is not None else "dir" + def get_frame(self, index: int) -> Frame: """Get frame at the index. 
@@ -247,7 +252,7 @@ def write(self, output: os.PathLike, format): """ if format == "dir": dest_dir = Path(output) - if self.tar_path is None: + if self.format == "dir": shutil.copytree(self.base_path, dest_dir) return dest_dir.mkdir(parents=True) @@ -263,7 +268,7 @@ def write(self, output: os.PathLike, format): elif format == "tar": dest_tar = Path(output).with_suffix(".tar") dest_tar.parent.mkdir(parents=True, exist_ok=True) - if self.tar_path is not None: + if self.format == "tar": shutil.copy2(self.tar_path, dest_tar) return with tarfile.open(dest_tar, mode="w") as tf: diff --git a/src/openarm_dataset/card_template.md b/src/openarm_dataset/card_template.md new file mode 100644 index 0000000..7902033 --- /dev/null +++ b/src/openarm_dataset/card_template.md @@ -0,0 +1,29 @@ +--- +# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1 +# Doc / guide: https://huggingface.co/docs/hub/datasets-cards +# prettier-ignore +{{card_data}} +--- + +This dataset was created using [OpenArm Dataset](https://github.com/enactic/openarm_dataset). 
+ + +## Dataset Description + +{{ dataset_description | default("", true) }} + +- **Homepage:** {{ url | default("[More Information Needed]", true)}} +- **Paper:** {{ paper | default("[More Information Needed]", true)}} +- **License:** {{ license | default("[More Information Needed]", true)}} + +## Dataset Structure + +{{ dataset_structure | default("[More Information Needed]", true)}} + +## Citation + +**BibTeX:** + +```bibtex +{{ citation_bibtex | default("[More Information Needed]", true)}} +``` diff --git a/src/openarm_dataset/dataset.py b/src/openarm_dataset/dataset.py index 5585d1c..cc78d6e 100644 --- a/src/openarm_dataset/dataset.py +++ b/src/openarm_dataset/dataset.py @@ -131,6 +131,24 @@ def camera_names(self) -> list[str]: return self._camera_names return list(self.meta.equipment.perceptions.cameras) + @property + def camera_format(self) -> str: + """Return the camera format ("dir" or "tar") shared by all cameras. + + Every camera in the dataset is expected to use the same format. + + Raises: + ValueError: If cameras use a mix of "dir" and "tar". + + """ + first_episode = self.meta.episodes[0] + formats = { + self.load_camera(name, first_episode).format for name in self.camera_names + } + if len(formats) > 1: + raise ValueError(f"Inconsistent camera formats: {sorted(formats)}") + return next(iter(formats), None) + def _episode_id(self, index: int) -> str: return self.meta.episodes[index]["id"] diff --git a/src/openarm_dataset/upload.py b/src/openarm_dataset/upload.py new file mode 100644 index 0000000..d8092c4 --- /dev/null +++ b/src/openarm_dataset/upload.py @@ -0,0 +1,242 @@ +# Copyright 2026 Enactic, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Upload OpenArm dataset to Hugging Face Hub.""" + +import argparse +import json +import pathlib +import shutil +import sys +import importlib.resources +from huggingface_hub import DatasetCard, DatasetCardData, HfApi +from huggingface_hub.errors import RevisionNotFoundError +import contextlib + +from .dataset import Dataset + + +def pack_cameras_as_tar(dataset: Dataset) -> None: + """Repack every "dir"-format camera into a sibling ".tar" archive in place. + + Each ``cameras/<name>/`` directory of JPEG frames is replaced by one + uncompressed ``cameras/<name>.tar`` archive. Packing is lossless and + reversible: ``Dataset`` reads either layout through the same API. + + Args: + dataset: The dataset to repack in place. + + """ + for episode in dataset.meta.episodes: + for camera in dataset.load_cameras(episode).values(): + if camera.format != "dir": + continue + camera.write(camera.base_path, "tar") + shutil.rmtree(camera.base_path) + + +def create_dataset_card( + tags: list | None = None, + dataset_info: dict | None = None, + camera_names: list[str] | None = None, + **kwargs, +) -> DatasetCard: + """Create a `DatasetCard` for an OpenArm Dataset. + + Args: + tags (list | None): A list of tags to add to the dataset card. + dataset_info (dict | None): The dataset's info dictionary, which will + be displayed on the card. + camera_names (list[str] | None): Camera names to expose as dataset + viewer configs. Each becomes a WebDataset config so the camera + frames are browsable on the Hugging Face Hub. + **kwargs: Additional keyword arguments to populate the card template. 
+ + Returns: + DatasetCard: The generated dataset card object. + + """ + card_tags = ["OpenArm"] + + if tags: + card_tags += tags + if kwargs.get("license"): + kwargs = {**kwargs, "license": kwargs["license"]} + if dataset_info: + dataset_structure = "[metadata.yaml](metadata.yaml):\n" + dataset_structure += f"```json\n{json.dumps(dataset_info, indent=4)}\n```\n" + kwargs = {**kwargs, "dataset_structure": dataset_structure} + configs = [ + { + "config_name": name, + "data_files": f"episodes/*/cameras/{name}.tar", # for dataset viewer + } + for name in (camera_names or []) + ] + card_data = DatasetCardData( + license=kwargs.get("license"), + tags=card_tags, + task_categories=["robotics"], + configs=configs or None, + ) + + card_template = ( + importlib.resources.files("openarm_dataset") + .joinpath("card_template.md") + .read_text() + ) + + return DatasetCard.from_template( + card_data=card_data, template_str=card_template, **kwargs + ) + + +def upload_dataset( + input_path: pathlib.Path, + repo_id: str, + branch: str = "main", + tag: str | None = None, + dataset_info: dict | None = None, + licence: str | None = None, + camera_names: list[str] | None = None, + private: bool = False, + upload_large_folder: bool = False, +) -> None: + """Upload an OpenArm dataset directory to the Hugging Face Hub. + + Creates the dataset repository if it does not exist, then uploads the whole + directory. Camera frames are never uploaded as loose image files; pack them + into ``.tar`` archives first (see ``Dataset.write(camera_format="tar")``). + + Args: + input_path: Path of the OpenArm dataset directory to upload. + repo_id: Target repository id, e.g. ``username/dataset-name``. + branch: Branch (revision) to upload to. + tag: If given, create this tag on ``branch`` after the upload. + dataset_info: The dataset's info dictionary, shown on the dataset card. + licence: Licence identifier recorded on the dataset card. 
+ camera_names: Camera names to expose as dataset viewer configs so the + camera frames are browsable on the Hugging Face Hub. + private: Create the repository as private when it does not exist. + upload_large_folder: Use ``upload_large_folder`` for a resumable, + multi-threaded upload of large datasets. + + """ + hf_api = HfApi() + hf_api.create_repo( + repo_id=repo_id, + repo_type="dataset", + private=private, + exist_ok=True, + ) + # Never upload camera frames as loose image files; they belong in .tar + # archives to stay within Hugging Face Hub's per-repository file-count limit. + ignore_patterns = ["*.jpeg", "*.jpg", "*.png"] + upload_kwargs = { + "repo_id": repo_id, + "folder_path": str(input_path), + "repo_type": "dataset", + "revision": branch, + "ignore_patterns": ignore_patterns, + } + if upload_large_folder: + hf_api.upload_large_folder(**upload_kwargs) + else: + hf_api.upload_folder(**upload_kwargs) + + card = create_dataset_card( + tag=tag, + dataset_info=dataset_info, + license=licence, + camera_names=camera_names, + ) + card.push_to_hub( + repo_id=repo_id, + repo_type="dataset", + revision=branch, + ) + if tag is not None: + with contextlib.suppress(RevisionNotFoundError): + hf_api.delete_tag(repo_id, tag=tag, repo_type="dataset") + hf_api.create_tag( + repo_id, tag=tag, revision=branch, repo_type="dataset", exist_ok=True + ) + + +def main(): + """Upload OpenArm dataset to Hugging Face Hub.""" + parser = argparse.ArgumentParser( + description="Upload an OpenArm dataset to the Hugging Face Hub" + ) + parser.add_argument( + "input", + help="Path of an OpenArm dataset to upload", + type=pathlib.Path, + ) + parser.add_argument( + "--repo-id", + required=True, + help="Target Hugging Face dataset repository id, e.g. 
username/dataset-name", + ) + parser.add_argument( + "--private", + action="store_true", + default=False, + help="Create the repository as private if it does not exist", + ) + parser.add_argument( + "--licence", + default="apache-2.0", + help="The licence to associate with the dataset on the Hugging Face Hub. " + "Defaults to Apache-2.0.", + ) + parser.add_argument( + "--upload-large-folder", + action="store_true", + default=False, + help="Use a resumable, multi-threaded upload for large datasets. " + "Recommended for datasets larger than 1 GB.", + ) + parser.add_argument( + "--token", + default=None, + help="Hugging Face Hub access token. Defaults to the locally cached " + "login when not given.", + ) + args = parser.parse_args() + + dataset = Dataset(args.input) + + if dataset.camera_format == "dir": + print( + "Packing camera frames into .tar archives in place before upload " + "(Hugging Face Hub file-count recommendation)...", + file=sys.stderr, + ) + pack_cameras_as_tar(dataset) + + upload_dataset( + args.input, + args.repo_id, + tag=dataset.meta.version, + dataset_info=dataset.meta.data, + licence=args.licence, + camera_names=dataset.camera_names, + upload_large_folder=args.upload_large_folder, + private=args.private, + ) + + +if __name__ == "__main__": + main() diff --git a/tests/test_dataset_tar.py b/tests/test_dataset_tar.py index 91d2881..e5f96d1 100644 --- a/tests/test_dataset_tar.py +++ b/tests/test_dataset_tar.py @@ -85,3 +85,20 @@ def test_tar_input_roundtrips_to_dir(tmp_path): assert camera.tar_path is None assert camera.num_frames == 3 assert camera.get_frame(0).load().shape == (600, 960, 3) + + +def test_camera_format_dir(): + assert Dataset(DATASET_DIR).camera_format == "dir" + + +def test_camera_format_tar(tar_dataset): + assert tar_dataset.camera_format == "tar" + + +def test_camera_format_inconsistent_raises(tmp_path): + out = tmp_path / "mixed" + Dataset(DATASET_DIR).write(out, format="openarm", camera_format="tar") + # Turn one camera back 
into "dir" layout so the dataset mixes both formats. + (out / "episodes" / "0" / "cameras" / "head").mkdir() + with pytest.raises(ValueError, match="Inconsistent camera formats"): + Dataset(out).camera_format From d23c7c0060434f60835d9e5c1ff7ae5a70641973 Mon Sep 17 00:00:00 2001 From: k1000dai Date: Wed, 17 Jun 2026 15:07:35 +0900 Subject: [PATCH 2/5] remvoe mismatch readme --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 742a36b..134e0ec 100644 --- a/README.md +++ b/README.md @@ -156,8 +156,7 @@ openarm-dataset-upload \ [--private] # create the repo as private if it does not exist ``` -Validates the dataset first and exits with status `1` if any errors are reported, -before contacting the Hub. The whole dataset directory is uploaded to a +The whole dataset directory is uploaded to a [dataset repository](https://huggingface.co/docs/hub/datasets), creating it if it does not already exist, and tagged with the dataset version. Cameras stored as directories of JPEG files are repacked **in place** into one `.tar` archive per From 05ffe66cf1814b57c3f1346af4fa2ef8e807a161 Mon Sep 17 00:00:00 2001 From: Kohei SENDAI <80389896+k1000dai@users.noreply.github.com> Date: Wed, 17 Jun 2026 18:35:26 +0900 Subject: [PATCH 3/5] Update src/openarm_dataset/upload.py Co-authored-by: Sutou Kouhei --- src/openarm_dataset/upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openarm_dataset/upload.py b/src/openarm_dataset/upload.py index d8092c4..9bdb92f 100644 --- a/src/openarm_dataset/upload.py +++ b/src/openarm_dataset/upload.py @@ -40,7 +40,7 @@ def pack_cameras_as_tar(dataset: Dataset) -> None: """ for episode in dataset.meta.episodes: for camera in dataset.load_cameras(episode).values(): - if camera.format != "dir": + if camera.format == "tar": continue camera.write(camera.base_path, "tar") shutil.rmtree(camera.base_path) From f8a6c090bb03fbad508a5edb981521ba58bff90f Mon Sep 17 00:00:00 2001 From: 
k1000dai Date: Wed, 17 Jun 2026 19:35:17 +0900 Subject: [PATCH 4/5] fix comments and datacard to yaml --- src/openarm_dataset/dataset.py | 10 +++++----- src/openarm_dataset/upload.py | 22 +++++++++++----------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/openarm_dataset/dataset.py b/src/openarm_dataset/dataset.py index cc78d6e..b50c07e 100644 --- a/src/openarm_dataset/dataset.py +++ b/src/openarm_dataset/dataset.py @@ -141,13 +141,13 @@ def camera_format(self) -> str: ValueError: If cameras use a mix of "dir" and "tar". """ - first_episode = self.meta.episodes[0] - formats = { - self.load_camera(name, first_episode).format for name in self.camera_names - } + formats = set() + for episode in self.meta.episodes: + for name in self.camera_names: + formats.add(self.load_camera(name, episode).format) if len(formats) > 1: raise ValueError(f"Inconsistent camera formats: {sorted(formats)}") - return next(iter(formats), None) + return formats.pop() def _episode_id(self, index: int) -> str: return self.meta.episodes[index]["id"] diff --git a/src/openarm_dataset/upload.py b/src/openarm_dataset/upload.py index 9bdb92f..eb9dce6 100644 --- a/src/openarm_dataset/upload.py +++ b/src/openarm_dataset/upload.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Upload OpenArm dataset to Hugging Face Hub.""" +"""Upload OpenArm Dataset to Hugging Face Hub.""" import argparse -import json import pathlib import shutil import sys @@ -48,7 +47,7 @@ def pack_cameras_as_tar(dataset: Dataset) -> None: def create_dataset_card( tags: list | None = None, - dataset_info: dict | None = None, + metadata_yaml: str | None = None, camera_names: list[str] | None = None, **kwargs, ) -> DatasetCard: @@ -56,8 +55,8 @@ def create_dataset_card( Args: tags (list | None): A list of tags to add to the dataset card. - dataset_info (dict | None): The dataset's info dictionary, which will - be displayed on the card. 
+ metadata_yaml (str | None): The dataset's ``metadata.yaml`` contents, + embedded verbatim on the card. camera_names (list[str] | None): Camera names to expose as dataset viewer configs. Each becomes a WebDataset config so the camera frames are browsable on the Hugging Face Hub. @@ -73,9 +72,9 @@ def create_dataset_card( card_tags += tags if kwargs.get("license"): kwargs = {**kwargs, "license": kwargs["license"]} - if dataset_info: + if metadata_yaml: dataset_structure = "[metadata.yaml](metadata.yaml):\n" - dataset_structure += f"```json\n{json.dumps(dataset_info, indent=4)}\n```\n" + dataset_structure += f"```yaml\n{metadata_yaml}\n```\n" kwargs = {**kwargs, "dataset_structure": dataset_structure} configs = [ { @@ -107,7 +106,7 @@ def upload_dataset( repo_id: str, branch: str = "main", tag: str | None = None, - dataset_info: dict | None = None, + metadata_yaml: str | None = None, licence: str | None = None, camera_names: list[str] | None = None, private: bool = False, @@ -124,7 +123,8 @@ def upload_dataset( repo_id: Target repository id, e.g. ``username/dataset-name``. branch: Branch (revision) to upload to. tag: If given, create this tag on ``branch`` after the upload. - dataset_info: The dataset's info dictionary, shown on the dataset card. + metadata_yaml: The dataset's ``metadata.yaml`` contents, shown verbatim + on the dataset card. licence: Licence identifier recorded on the dataset card. camera_names: Camera names to expose as dataset viewer configs so the camera frames are browsable on the Hugging Face Hub. 
@@ -157,7 +157,7 @@ def upload_dataset( card = create_dataset_card( tag=tag, - dataset_info=dataset_info, + metadata_yaml=metadata_yaml, license=licence, camera_names=camera_names, ) @@ -230,7 +230,7 @@ def main(): args.input, args.repo_id, tag=dataset.meta.version, - dataset_info=dataset.meta.data, + metadata_yaml=(args.input / "metadata.yaml").read_text(), licence=args.licence, camera_names=dataset.camera_names, upload_large_folder=args.upload_large_folder, From b883314f82c1eb66e0c47a773eaebbfbd091f3d1 Mon Sep 17 00:00:00 2001 From: k1000dai Date: Thu, 18 Jun 2026 19:16:20 +0900 Subject: [PATCH 5/5] fixes --- src/openarm_dataset/card_template.md | 2 +- src/openarm_dataset/upload.py | 20 +++++++------------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/src/openarm_dataset/card_template.md b/src/openarm_dataset/card_template.md index 7902033..421665b 100644 --- a/src/openarm_dataset/card_template.md +++ b/src/openarm_dataset/card_template.md @@ -5,7 +5,7 @@ {{card_data}} --- -This dataset was created using [OpenArm Dataset](https://github.com/enactic/openarm_dataset). +This dataset was uploaded using [OpenArm Dataset](https://github.com/enactic/openarm_dataset). ## Dataset Description diff --git a/src/openarm_dataset/upload.py b/src/openarm_dataset/upload.py index eb9dce6..23b1bd6 100644 --- a/src/openarm_dataset/upload.py +++ b/src/openarm_dataset/upload.py @@ -112,14 +112,14 @@ def upload_dataset( private: bool = False, upload_large_folder: bool = False, ) -> None: - """Upload an OpenArm dataset directory to the Hugging Face Hub. + """Upload an OpenArm Dataset directory to the Hugging Face Hub. Creates the dataset repository if it does not exist, then uploads the whole directory. Camera frames are never uploaded as loose image files; pack them into ``.tar`` archives first (see ``Dataset.write(camera_format="tar")``). Args: - input_path: Path of the OpenArm dataset directory to upload. 
+ input_path: Path of the OpenArm Dataset directory to upload. repo_id: Target repository id, e.g. ``username/dataset-name``. branch: Branch (revision) to upload to. tag: If given, create this tag on ``branch`` after the upload. @@ -175,13 +175,13 @@ def upload_dataset( def main(): - """Upload OpenArm dataset to Hugging Face Hub.""" + """Upload OpenArm Dataset to Hugging Face Hub.""" parser = argparse.ArgumentParser( - description="Upload an OpenArm dataset to the Hugging Face Hub" + description="Upload an OpenArm Dataset to the Hugging Face Hub" ) parser.add_argument( "input", - help="Path of an OpenArm dataset to upload", + help="Path of an OpenArm Dataset to upload", type=pathlib.Path, ) parser.add_argument( @@ -202,18 +202,12 @@ def main(): "Defaults to Apache-2.0.", ) parser.add_argument( - "--upload-large-folder", + "--large-folder", action="store_true", default=False, help="Use a resumable, multi-threaded upload for large datasets. " "Recommended for datasets larger than 1 GB.", ) - parser.add_argument( - "--token", - default=None, - help="Hugging Face Hub access token. Defaults to the locally cached " - "login when not given.", - ) args = parser.parse_args() dataset = Dataset(args.input) @@ -233,7 +227,7 @@ def main(): metadata_yaml=(args.input / "metadata.yaml").read_text(), licence=args.licence, camera_names=dataset.camera_names, - upload_large_folder=args.upload_large_folder, + upload_large_folder=args.large_folder, private=args.private, )