enactic · kou · Jun 18, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/README.md b/README.md
@@ -148,6 +148,23 @@ flags apply only when `--format lerobot_v2.1` or `--format gr00t`.
 The `gr00t` format produces a LeRobot v2.1 dataset plus a GR00T-compatible
 `meta/modality.json` (see [Isaac-GR00T data preparation](https://github.com/NVIDIA/Isaac-GR00T/blob/main/getting_started/data_preparation.md)).
 
+Upload a dataset to the Hugging Face Hub:
+
+```bash
+openarm-dataset-upload <input> \
+    --repo-id <user>/<dataset> \
+    [--private]                # create the repo as private if it does not exist
+```
+
+The whole dataset directory is uploaded to a
+[dataset repository](https://huggingface.co/docs/hub/datasets), which is created if it
+does not already exist, and the upload is tagged with the dataset version. Cameras stored as
+directories of JPEG files are repacked **in place** into one `.tar` archive per
+camera before uploading, to stay within [Hugging Face Hub's file-count
+recommendations](https://huggingface.co/docs/hub/storage-limits#recommendations).
+Repacking is lossless and reversible (`openarm-dataset-convert --camera-format dir`
+restores the JPEG-directory layout).
+
 ## Development
 
 ### Test

diff --git a/pyproject.toml b/pyproject.toml
@@ -25,6 +25,8 @@ readme = "README.md"
 requires-python = ">=3.10"
 
 dependencies = [
+    "huggingface_hub",
+    "jinja2",
     "numpy",
     "pandas",
     "pillow",
@@ -58,6 +60,7 @@ Repository = "https://github.com/enactic/openarm_dataset.git"
 openarm-dataset-convert = "openarm_dataset.convert:main"
 openarm-dataset-merge = "openarm_dataset.merge:main"
 openarm-dataset-repair = "openarm_dataset.repair:main"
+openarm-dataset-upload = "openarm_dataset.upload:main"
 openarm-dataset-validate = "openarm_dataset.validate:main"
 
 [dependency-groups]
@@ -72,6 +75,9 @@ package-dir = {"" = "src"}
 [tool.setuptools.packages.find]
 where = ["src"]
 
+[tool.setuptools.package-data]
+openarm_dataset = ["card_template.md"]
+
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 

diff --git a/src/openarm_dataset/camera.py b/src/openarm_dataset/camera.py
@@ -197,6 +197,11 @@ def num_frames(self) -> int:
         else:
             return len(self.all_files)
 
+    @property
+    def format(self) -> str:
+        """Return "tar" when ``tar_path`` is set, otherwise "dir"."""
+        return "dir" if self.tar_path is None else "tar"
+
     def get_frame(self, index: int) -> Frame:
         """Get frame at the index.
 
@@ -247,7 +252,7 @@ def write(self, output: os.PathLike, format):
         """
         if format == "dir":
             dest_dir = Path(output)
-            if self.tar_path is None:
+            if self.format == "dir":
                 shutil.copytree(self.base_path, dest_dir)
                 return
             dest_dir.mkdir(parents=True)
@@ -263,7 +268,7 @@ def write(self, output: os.PathLike, format):
         elif format == "tar":
             dest_tar = Path(output).with_suffix(".tar")
             dest_tar.parent.mkdir(parents=True, exist_ok=True)
-            if self.tar_path is not None:
+            if self.format == "tar":
                 shutil.copy2(self.tar_path, dest_tar)
                 return
             with tarfile.open(dest_tar, mode="w") as tf:

diff --git a/src/openarm_dataset/card_template.md b/src/openarm_dataset/card_template.md
@@ -0,0 +1,29 @@
+---
+# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
+# Doc / guide: https://huggingface.co/docs/hub/datasets-cards
+# prettier-ignore
+{{card_data}}
+---
+
+This dataset was uploaded using [OpenArm Dataset](https://github.com/enactic/openarm_dataset).
+
+
+## Dataset Description
+
+{{ dataset_description | default("", true) }}
+
+- **Homepage:** {{ url | default("[More Information Needed]", true)}}
+- **Paper:** {{ paper | default("[More Information Needed]", true)}}
+- **License:** {{ license | default("[More Information Needed]", true)}}
+
+## Dataset Structure
+
+{{ dataset_structure | default("[More Information Needed]", true)}}
+
+## Citation
+
+**BibTeX:**
+
+```bibtex
+{{ citation_bibtex | default("[More Information Needed]", true)}}
+```
diff --git a/src/openarm_dataset/dataset.py b/src/openarm_dataset/dataset.py
@@ -131,6 +131,24 @@ def camera_names(self) -> list[str]:
             return self._camera_names
         return list(self.meta.equipment.perceptions.cameras)
 
+    @property
+    def camera_format(self) -> str:
+        """Return the camera format ("dir" or "tar") shared by all cameras.
+
+        Every camera in the dataset is expected to use the same format.
+
+        Raises:
+            ValueError: If there are no cameras, or they mix "dir" and "tar".
+
+        """
+        episodes, cams = self.meta.episodes, self.camera_names
+        formats = {self.load_camera(cam, ep).format for ep in episodes for cam in cams}
+        if not formats:
+            raise ValueError("Cannot determine camera format: no cameras found")
+        if len(formats) > 1:
+            raise ValueError(f"Inconsistent camera formats: {sorted(formats)}")
+        return formats.pop()
+
     def _episode_id(self, index: int) -> str:
         return self.meta.episodes[index]["id"]
 

diff --git a/src/openarm_dataset/upload.py b/src/openarm_dataset/upload.py
@@ -0,0 +1,236 @@
+# Copyright 2026 Enactic, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Upload OpenArm Dataset to Hugging Face Hub."""
+
+import argparse
+import pathlib
+import shutil
+import sys
+import importlib.resources
+from huggingface_hub import DatasetCard, DatasetCardData, HfApi
+from huggingface_hub.errors import RevisionNotFoundError
+import contextlib
+
+from .dataset import Dataset
+
+
+def pack_cameras_as_tar(dataset: Dataset) -> None:
+    """Convert every "dir"-format camera into a sibling ".tar" archive in place.
+
+    For each episode, a camera still stored as a directory is written out as
+    a ``.tar`` archive via its ``write`` method and its directory is deleted.
+    Cameras that already report the "tar" format are skipped.
+
+    Args:
+        dataset: The dataset whose cameras are repacked on disk.
+
+    """
+    for episode in dataset.meta.episodes:
+        for camera in dataset.load_cameras(episode).values():
+            if camera.format != "tar":
+                frames_dir = camera.base_path
+                camera.write(frames_dir, "tar")
+                shutil.rmtree(frames_dir)
+
+
+def create_dataset_card(
+    tags: list | None = None,
+    metadata_yaml: str | None = None,
+    camera_names: list[str] | None = None,
+    **kwargs,
+) -> DatasetCard:
+    """Create a `DatasetCard` for an OpenArm Dataset.
+
+    Args:
+        tags (list | None): Extra tags for the card, added after "OpenArm".
+        metadata_yaml (str | None): The dataset's ``metadata.yaml`` contents,
+            embedded verbatim in the card's "Dataset Structure" section.
+        camera_names (list[str] | None): Camera names to expose as dataset
+            viewer configs; each config points at that camera's per-episode
+            ``.tar`` archives.
+        **kwargs: Additional keyword arguments to populate the card template;
+            ``license`` is also recorded in the card metadata.
+
+    Returns:
+        DatasetCard: The generated dataset card object.
+
+    """
+    card_tags = ["OpenArm", *(tags or [])]
+
+    if metadata_yaml:
+        dataset_structure = "[metadata.yaml](metadata.yaml):\n"
+        dataset_structure += f"```yaml\n{metadata_yaml}\n```\n"
+        kwargs = {**kwargs, "dataset_structure": dataset_structure}
+    # One dataset-viewer config per camera, matching its per-episode tar files.
+    configs = [
+        {
+            "config_name": name,
+            "data_files": f"episodes/*/cameras/{name}.tar",  # for dataset viewer
+        }
+        for name in (camera_names or [])
+    ]
+    card_data = DatasetCardData(
+        license=kwargs.get("license"),
+        tags=card_tags,
+        task_categories=["robotics"],
+        configs=configs or None,
+    )
+
+    # Read the packaged template as UTF-8 explicitly; the default encoding
+    # would otherwise depend on the platform locale.
+    card_template = (
+        importlib.resources.files("openarm_dataset")
+        .joinpath("card_template.md")
+        .read_text(encoding="utf-8")
+    )
+
+    return DatasetCard.from_template(
+        card_data=card_data, template_str=card_template, **kwargs
+    )
+
+
+def upload_dataset(
+    input_path: pathlib.Path,
+    repo_id: str,
+    branch: str = "main",
+    tag: str | None = None,
+    metadata_yaml: str | None = None,
+    licence: str | None = None,
+    camera_names: list[str] | None = None,
+    private: bool = False,
+    upload_large_folder: bool = False,
+) -> None:
+    """Upload an OpenArm Dataset directory to the Hugging Face Hub.
+
+    Creates the dataset repository (and ``branch``) if missing, uploads the
+    directory, then pushes a generated dataset card. Loose image files are
+    skipped; pack camera frames into ``.tar`` archives before calling this.
+
+    Args:
+        input_path: Path of the OpenArm Dataset directory to upload.
+        repo_id: Target repository id, e.g. ``username/dataset-name``.
+        branch: Branch (revision) to upload to; created when it is not "main".
+        tag: If given, (re)create this tag on ``branch`` after the upload.
+        metadata_yaml: The dataset's ``metadata.yaml`` contents, shown verbatim
+            on the dataset card.
+        licence: Licence identifier recorded on the dataset card.
+        camera_names: Camera names to expose as dataset viewer configs so the
+            camera frames are browsable on the Hugging Face Hub.
+        private: Create the repository as private when it does not exist.
+        upload_large_folder: Use ``upload_large_folder`` for a resumable,
+            multi-threaded upload of large datasets.
+
+    """
+    hf_api = HfApi()
+    hf_api.create_repo(
+        repo_id=repo_id,
+        repo_type="dataset",
+        private=private,
+        exist_ok=True,
+    )
+    if branch != "main":
+        # Uploading to a revision that does not exist fails; create it first.
+        hf_api.create_branch(repo_id, branch=branch, repo_type="dataset", exist_ok=True)
+    # Never upload camera frames as loose image files; they belong in .tar
+    # archives to stay within Hugging Face Hub's per-repository file-count limit.
+    ignore_patterns = ["*.jpeg", "*.jpg", "*.png"]
+    upload_kwargs = {
+        "repo_id": repo_id,
+        "folder_path": str(input_path),
+        "repo_type": "dataset",
+        "revision": branch,
+        "ignore_patterns": ignore_patterns,
+    }
+    if upload_large_folder:
+        hf_api.upload_large_folder(**upload_kwargs)
+    else:
+        hf_api.upload_folder(**upload_kwargs)
+
+    # ``tag`` is a git tag, not a card tag, so it is not forwarded to the card.
+    card = create_dataset_card(
+        metadata_yaml=metadata_yaml,
+        license=licence,
+        camera_names=camera_names,
+    )
+    card.push_to_hub(repo_id=repo_id, repo_type="dataset", revision=branch)
+    if tag is not None:
+        # Delete any existing tag first so it moves to the new head of ``branch``.
+        with contextlib.suppress(RevisionNotFoundError):
+            hf_api.delete_tag(repo_id, tag=tag, repo_type="dataset")
+        hf_api.create_tag(
+            repo_id, tag=tag, revision=branch, repo_type="dataset", exist_ok=True
+        )
+
+
+def main():
+    """Parse CLI arguments, pack "dir" cameras as tar, and upload the dataset."""
+    parser = argparse.ArgumentParser(
+        description="Upload an OpenArm Dataset to the Hugging Face Hub"
+    )
+    parser.add_argument(
+        "input",
+        help="Path of an OpenArm Dataset to upload",
+        type=pathlib.Path,
+    )
+    parser.add_argument(
+        "--repo-id",
+        required=True,
+        help="Target Hugging Face dataset repository id, e.g. username/dataset-name",
+    )
+    parser.add_argument(
+        "--private",
+        action="store_true",
+        help="Create the repository as private if it does not exist",
+    )
+    parser.add_argument(
+        "--licence",
+        # The Hub's documented licence identifiers are lowercase ("apache-2.0").
+        default="apache-2.0",
+        help="The licence to associate with the dataset on the Hugging Face Hub. "
+        "Defaults to Apache-2.0.",
+    )
+    parser.add_argument(
+        "--large-folder",
+        action="store_true",
+        help="Use a resumable, multi-threaded upload for large datasets. "
+        "Recommended for datasets larger than 1 GB.",
+    )
+    args = parser.parse_args()
+
+    dataset = Dataset(args.input)
+
+    if dataset.camera_format == "dir":
+        print(
+            "Packing camera frames into .tar archives in place before upload "
+            "(Hugging Face Hub file-count recommendation)...",
+            file=sys.stderr,
+        )
+        pack_cameras_as_tar(dataset)
+
+    upload_dataset(
+        args.input,
+        args.repo_id,
+        tag=dataset.meta.version,
+        # Decode as UTF-8 explicitly rather than with the locale's encoding.
+        metadata_yaml=(args.input / "metadata.yaml").read_text(encoding="utf-8"),
+        licence=args.licence,
+        camera_names=dataset.camera_names,
+        upload_large_folder=args.large_folder,
+        private=args.private,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_dataset_tar.py b/tests/test_dataset_tar.py
@@ -85,3 +85,25 @@ def test_tar_input_roundtrips_to_dir(tmp_path):
     assert camera.tar_path is None
     assert camera.num_frames == 3
     assert camera.get_frame(0).load().shape == (600, 960, 3)
+
+
+def test_camera_format_dir():
+    """The dataset at ``DATASET_DIR`` reports the "dir" camera format."""
+    dataset = Dataset(DATASET_DIR)
+    assert dataset.camera_format == "dir"
+
+
+def test_camera_format_tar(tar_dataset):
+    """The ``tar_dataset`` fixture reports the "tar" camera format."""
+    detected = tar_dataset.camera_format
+    assert detected == "tar"
+
+
+def test_camera_format_inconsistent_raises(tmp_path):
+    """Mixing "tar" and "dir" cameras makes ``camera_format`` raise."""
+    mixed_dir = tmp_path / "mixed"
+    Dataset(DATASET_DIR).write(mixed_dir, format="openarm", camera_format="tar")
+    # Turn one camera back into "dir" layout so the dataset mixes both formats.
+    (mixed_dir / "episodes" / "0" / "cameras" / "head").mkdir()
+    with pytest.raises(ValueError, match="Inconsistent camera formats"):
+        _ = Dataset(mixed_dir).camera_format