diff --git a/README.md b/README.md
index b42858b..134e0ec 100644
--- a/README.md
+++ b/README.md
@@ -148,6 +148,23 @@ flags apply only when `--format lerobot_v2.1` or `--format gr00t`.
The `gr00t` format produces a LeRobot v2.1 dataset plus a GR00T-compatible
`meta/modality.json` (see [Isaac-GR00T data preparation](https://github.com/NVIDIA/Isaac-GR00T/blob/main/getting_started/data_preparation.md)).
+Upload a dataset to the Hugging Face Hub:
+
+```bash
+openarm-dataset-upload <dataset-path> \
+  --repo-id <user>/<dataset-name> \
+ [--private] # create the repo as private if it does not exist
+```
+
+The whole dataset directory is uploaded to a
+[dataset repository](https://huggingface.co/docs/hub/datasets), creating it if it
+does not already exist, and tagged with the dataset version. Cameras stored as
+directories of JPEG files are repacked **in place** into one `.tar` archive per
+camera before uploading, to stay within [Hugging Face Hub's file-count
+recommendations](https://huggingface.co/docs/hub/storage-limits#recommendations).
+Repacking is lossless and reversible (`openarm-dataset-convert --camera-format dir`
+restores the JPEG-directory layout).
+
## Development
### Test
diff --git a/pyproject.toml b/pyproject.toml
index 33364da..02e1a70 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,8 @@ readme = "README.md"
requires-python = ">=3.10"
dependencies = [
+ "huggingface_hub",
+ "jinja2",
"numpy",
"pandas",
"pillow",
@@ -58,6 +60,7 @@ Repository = "https://github.com/enactic/openarm_dataset.git"
openarm-dataset-convert = "openarm_dataset.convert:main"
openarm-dataset-merge = "openarm_dataset.merge:main"
openarm-dataset-repair = "openarm_dataset.repair:main"
+openarm-dataset-upload = "openarm_dataset.upload:main"
openarm-dataset-validate = "openarm_dataset.validate:main"
[dependency-groups]
@@ -72,6 +75,9 @@ package-dir = {"" = "src"}
[tool.setuptools.packages.find]
where = ["src"]
+[tool.setuptools.package-data]
+openarm_dataset = ["card_template.md"]
+
[tool.pytest.ini_options]
testpaths = ["tests"]
diff --git a/src/openarm_dataset/camera.py b/src/openarm_dataset/camera.py
index 2c0575b..352a24f 100644
--- a/src/openarm_dataset/camera.py
+++ b/src/openarm_dataset/camera.py
@@ -197,6 +197,11 @@ def num_frames(self) -> int:
else:
return len(self.all_files)
+    @property
+    def format(self) -> str:
+        """Return the storage layout of this camera: "tar" or "dir".
+
+        A camera with ``tar_path`` set is reported as "tar", otherwise "dir".
+        """
+        if self.tar_path is None:
+            return "dir"
+        return "tar"
+
def get_frame(self, index: int) -> Frame:
"""Get frame at the index.
@@ -247,7 +252,7 @@ def write(self, output: os.PathLike, format):
"""
if format == "dir":
dest_dir = Path(output)
- if self.tar_path is None:
+ if self.format == "dir":
shutil.copytree(self.base_path, dest_dir)
return
dest_dir.mkdir(parents=True)
@@ -263,7 +268,7 @@ def write(self, output: os.PathLike, format):
elif format == "tar":
dest_tar = Path(output).with_suffix(".tar")
dest_tar.parent.mkdir(parents=True, exist_ok=True)
- if self.tar_path is not None:
+ if self.format == "tar":
shutil.copy2(self.tar_path, dest_tar)
return
with tarfile.open(dest_tar, mode="w") as tf:
diff --git a/src/openarm_dataset/card_template.md b/src/openarm_dataset/card_template.md
new file mode 100644
index 0000000..421665b
--- /dev/null
+++ b/src/openarm_dataset/card_template.md
@@ -0,0 +1,29 @@
+---
+# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
+# Doc / guide: https://huggingface.co/docs/hub/datasets-cards
+# prettier-ignore
+{{card_data}}
+---
+
+This dataset was uploaded using [OpenArm Dataset](https://github.com/enactic/openarm_dataset).
+
+
+## Dataset Description
+
+{{ dataset_description | default("", true) }}
+
+- **Homepage:** {{ url | default("[More Information Needed]", true)}}
+- **Paper:** {{ paper | default("[More Information Needed]", true)}}
+- **License:** {{ license | default("[More Information Needed]", true)}}
+
+## Dataset Structure
+
+{{ dataset_structure | default("[More Information Needed]", true)}}
+
+## Citation
+
+**BibTeX:**
+
+```bibtex
+{{ citation_bibtex | default("[More Information Needed]", true)}}
+```
diff --git a/src/openarm_dataset/dataset.py b/src/openarm_dataset/dataset.py
index 5585d1c..b50c07e 100644
--- a/src/openarm_dataset/dataset.py
+++ b/src/openarm_dataset/dataset.py
@@ -131,6 +131,24 @@ def camera_names(self) -> list[str]:
return self._camera_names
return list(self.meta.equipment.perceptions.cameras)
+    @property
+    def camera_format(self) -> str:
+        """Return the camera format ("dir" or "tar") shared by all cameras.
+
+        Every camera of every episode is loaded and its format inspected;
+        all of them are expected to use the same format.
+
+        Raises:
+            ValueError: If cameras use a mix of "dir" and "tar", or if the
+                dataset has no camera to inspect.
+
+        """
+        formats = {
+            self.load_camera(name, episode).format
+            for episode in self.meta.episodes
+            for name in self.camera_names
+        }
+        if not formats:
+            # Without this guard an empty dataset would surface as an opaque
+            # "pop from an empty set" KeyError below.
+            raise ValueError("Dataset has no cameras to determine a format from")
+        if len(formats) > 1:
+            raise ValueError(f"Inconsistent camera formats: {sorted(formats)}")
+        return formats.pop()
+
def _episode_id(self, index: int) -> str:
return self.meta.episodes[index]["id"]
diff --git a/src/openarm_dataset/upload.py b/src/openarm_dataset/upload.py
new file mode 100644
index 0000000..23b1bd6
--- /dev/null
+++ b/src/openarm_dataset/upload.py
@@ -0,0 +1,236 @@
+# Copyright 2026 Enactic, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Upload OpenArm Dataset to Hugging Face Hub."""
+
+import argparse
+import pathlib
+import shutil
+import sys
+import importlib.resources
+from huggingface_hub import DatasetCard, DatasetCardData, HfApi
+from huggingface_hub.errors import RevisionNotFoundError
+import contextlib
+
+from .dataset import Dataset
+
+
+def pack_cameras_as_tar(dataset: Dataset) -> None:
+    """Convert the dataset's "dir"-format cameras to ".tar" archives in place.
+
+    For every episode, each camera that is not already in "tar" format is
+    written out with ``camera.write(camera.base_path, "tar")`` (producing a
+    sibling ``<name>.tar`` next to the ``<name>/`` frame directory) and the
+    frame directory is then removed. Cameras already in "tar" format are
+    left untouched.
+
+    Args:
+        dataset: The dataset whose cameras are repacked in place.
+
+    """
+    for episode in dataset.meta.episodes:
+        cameras = dataset.load_cameras(episode)
+        for camera in cameras.values():
+            if camera.format != "tar":
+                camera.write(camera.base_path, "tar")
+                # The frames have been written to the archive; drop the
+                # now-redundant directory.
+                shutil.rmtree(camera.base_path)
+
+
+def create_dataset_card(
+    tags: list | None = None,
+    metadata_yaml: str | None = None,
+    camera_names: list[str] | None = None,
+    **kwargs,
+) -> DatasetCard:
+    """Create a `DatasetCard` for an OpenArm Dataset.
+
+    The card always carries the "OpenArm" tag and the "robotics" task
+    category. It is rendered from the ``card_template.md`` file shipped with
+    the ``openarm_dataset`` package.
+
+    Args:
+        tags (list | None): Extra tags to add to the dataset card after the
+            built-in "OpenArm" tag.
+        metadata_yaml (str | None): The dataset's ``metadata.yaml`` contents,
+            embedded verbatim on the card as the dataset structure section.
+        camera_names (list[str] | None): Camera names to expose as dataset
+            viewer configs. Each becomes a config whose data files are the
+            ``episodes/*/cameras/<name>.tar`` archives.
+        **kwargs: Additional keyword arguments to populate the card template.
+            A ``license`` entry is also recorded in the card metadata.
+
+    Returns:
+        DatasetCard: The generated dataset card object.
+
+    """
+    card_tags = ["OpenArm"]
+    if tags:
+        card_tags += tags
+
+    if metadata_yaml:
+        dataset_structure = (
+            "[metadata.yaml](metadata.yaml):\n"
+            f"```yaml\n{metadata_yaml}\n```\n"
+        )
+        # Copy instead of mutating so the caller's mapping is left alone.
+        kwargs = {**kwargs, "dataset_structure": dataset_structure}
+
+    configs = [
+        {
+            "config_name": name,
+            "data_files": f"episodes/*/cameras/{name}.tar",  # for dataset viewer
+        }
+        for name in (camera_names or [])
+    ]
+    card_data = DatasetCardData(
+        license=kwargs.get("license"),
+        tags=card_tags,
+        task_categories=["robotics"],
+        # An empty list is passed as None so no "configs" entry is set.
+        configs=configs or None,
+    )
+
+    # Decode the packaged template explicitly as UTF-8 rather than with the
+    # platform's locale encoding.
+    card_template = (
+        importlib.resources.files("openarm_dataset")
+        .joinpath("card_template.md")
+        .read_text(encoding="utf-8")
+    )
+
+    return DatasetCard.from_template(
+        card_data=card_data, template_str=card_template, **kwargs
+    )
+
+
+def upload_dataset(
+    input_path: pathlib.Path,
+    repo_id: str,
+    branch: str = "main",
+    tag: str | None = None,
+    metadata_yaml: str | None = None,
+    licence: str | None = None,
+    camera_names: list[str] | None = None,
+    private: bool = False,
+    upload_large_folder: bool = False,
+) -> None:
+    """Upload an OpenArm Dataset directory to the Hugging Face Hub.
+
+    Creates the dataset repository and the target branch if they do not
+    exist, uploads the whole directory, pushes a generated dataset card to
+    the same branch and finally, when ``tag`` is given, (re)creates that tag
+    on the branch. Camera frames are never uploaded as loose image files;
+    pack them into ``.tar`` archives first (see
+    ``Dataset.write(camera_format="tar")``).
+
+    Args:
+        input_path: Path of the OpenArm Dataset directory to upload.
+        repo_id: Target repository id, e.g. ``username/dataset-name``.
+        branch: Branch (revision) to upload to. Created if missing.
+        tag: If given, create this tag on ``branch`` after the upload,
+            replacing an existing tag of the same name.
+        metadata_yaml: The dataset's ``metadata.yaml`` contents, shown verbatim
+            on the dataset card.
+        licence: Licence identifier recorded on the dataset card.
+        camera_names: Camera names to expose as dataset viewer configs so the
+            camera frames are browsable on the Hugging Face Hub.
+        private: Create the repository as private when it does not exist.
+        upload_large_folder: Use ``upload_large_folder`` for a resumable,
+            multi-threaded upload of large datasets.
+
+    """
+    hf_api = HfApi()
+    hf_api.create_repo(
+        repo_id=repo_id,
+        repo_type="dataset",
+        private=private,
+        exist_ok=True,
+    )
+    # Uploading to a revision that does not exist fails, so make sure the
+    # target branch is there; ``exist_ok`` makes this a no-op when it is.
+    hf_api.create_branch(
+        repo_id, branch=branch, repo_type="dataset", exist_ok=True
+    )
+    # Never upload camera frames as loose image files; they belong in .tar
+    # archives to stay within Hugging Face Hub's per-repository file-count limit.
+    ignore_patterns = ["*.jpeg", "*.jpg", "*.png"]
+    upload_kwargs = {
+        "repo_id": repo_id,
+        "folder_path": str(input_path),
+        "repo_type": "dataset",
+        "revision": branch,
+        "ignore_patterns": ignore_patterns,
+    }
+    if upload_large_folder:
+        hf_api.upload_large_folder(**upload_kwargs)
+    else:
+        hf_api.upload_folder(**upload_kwargs)
+
+    # ``tag`` is the git tag created below, not a card tag, so it is not
+    # forwarded to the card builder.
+    card = create_dataset_card(
+        metadata_yaml=metadata_yaml,
+        license=licence,
+        camera_names=camera_names,
+    )
+    card.push_to_hub(
+        repo_id=repo_id,
+        repo_type="dataset",
+        revision=branch,
+    )
+    if tag is not None:
+        # Drop a pre-existing tag first so the new one points at the branch
+        # as it is after this upload; a missing tag is not an error.
+        with contextlib.suppress(RevisionNotFoundError):
+            hf_api.delete_tag(repo_id, tag=tag, repo_type="dataset")
+        hf_api.create_tag(
+            repo_id, tag=tag, revision=branch, repo_type="dataset", exist_ok=True
+        )
+
+
+def main():
+    """Upload OpenArm Dataset to Hugging Face Hub.
+
+    Parses the command line, repacks "dir"-format cameras into ``.tar``
+    archives in place when needed, then uploads the dataset directory and
+    tags the upload with the dataset version.
+    """
+    parser = argparse.ArgumentParser(
+        description="Upload an OpenArm Dataset to the Hugging Face Hub"
+    )
+    parser.add_argument(
+        "input",
+        help="Path of an OpenArm Dataset to upload",
+        type=pathlib.Path,
+    )
+    parser.add_argument(
+        "--repo-id",
+        required=True,
+        help="Target Hugging Face dataset repository id, e.g. username/dataset-name",
+    )
+    parser.add_argument(
+        "--private",
+        action="store_true",
+        help="Create the repository as private if it does not exist",
+    )
+    parser.add_argument(
+        "--licence",
+        default="apache-2.0",
+        help="The licence to associate with the dataset on the Hugging Face Hub. "
+        "Defaults to Apache-2.0.",
+    )
+    parser.add_argument(
+        "--large-folder",
+        action="store_true",
+        help="Use a resumable, multi-threaded upload for large datasets. "
+        "Recommended for datasets larger than 1 GB.",
+    )
+    args = parser.parse_args()
+
+    dataset = Dataset(args.input)
+
+    if dataset.camera_format == "dir":
+        print(
+            "Packing camera frames into .tar archives in place before upload "
+            "(Hugging Face Hub file-count recommendation)...",
+            file=sys.stderr,
+        )
+        pack_cameras_as_tar(dataset)
+
+    # Read the metadata explicitly as UTF-8 so the text embedded on the card
+    # does not depend on the platform's locale encoding.
+    metadata_yaml = (args.input / "metadata.yaml").read_text(encoding="utf-8")
+
+    upload_dataset(
+        args.input,
+        args.repo_id,
+        tag=dataset.meta.version,
+        metadata_yaml=metadata_yaml,
+        licence=args.licence,
+        camera_names=dataset.camera_names,
+        upload_large_folder=args.large_folder,
+        private=args.private,
+    )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tests/test_dataset_tar.py b/tests/test_dataset_tar.py
index 91d2881..e5f96d1 100644
--- a/tests/test_dataset_tar.py
+++ b/tests/test_dataset_tar.py
@@ -85,3 +85,20 @@ def test_tar_input_roundtrips_to_dir(tmp_path):
assert camera.tar_path is None
assert camera.num_frames == 3
assert camera.get_frame(0).load().shape == (600, 960, 3)
+
+
+def test_camera_format_dir():
+    """The dataset at ``DATASET_DIR`` reports the "dir" camera format."""
+    dataset = Dataset(DATASET_DIR)
+    assert dataset.camera_format == "dir"
+
+
+def test_camera_format_tar(tar_dataset):
+    """The ``tar_dataset`` fixture reports the "tar" camera format."""
+    camera_format = tar_dataset.camera_format
+    assert camera_format == "tar"
+
+
+def test_camera_format_inconsistent_raises(tmp_path):
+    """A dataset mixing "tar" and "dir" cameras makes ``camera_format`` raise."""
+    mixed_dir = tmp_path / "mixed"
+    Dataset(DATASET_DIR).write(mixed_dir, format="openarm", camera_format="tar")
+    # Turn one camera back into "dir" layout so the dataset mixes both formats.
+    head_dir = mixed_dir / "episodes" / "0" / "cameras" / "head"
+    head_dir.mkdir()
+    with pytest.raises(ValueError, match="Inconsistent camera formats"):
+        Dataset(mixed_dir).camera_format