Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,23 @@ flags apply only when `--format lerobot_v2.1` or `--format gr00t`.
The `gr00t` format produces a LeRobot v2.1 dataset plus a GR00T-compatible
`meta/modality.json` (see [Isaac-GR00T data preparation](https://github.com/NVIDIA/Isaac-GR00T/blob/main/getting_started/data_preparation.md)).

Upload a dataset to the Hugging Face Hub:

```bash
openarm-dataset-upload <input> \
--repo-id <user>/<dataset> \
[--private] # create the repo as private if it does not exist
```

The whole dataset directory is uploaded to a
[dataset repository](https://huggingface.co/docs/hub/datasets), which is created
if it does not already exist, and the upload is tagged with the dataset version.
Cameras stored as directories of JPEG files are repacked **in place** into one
`.tar` archive per camera before uploading, to stay within [Hugging Face Hub's
file-count recommendations](https://huggingface.co/docs/hub/storage-limits#recommendations).
Repacking is lossless and reversible (`openarm-dataset-convert --camera-format dir`
restores the JPEG-directory layout).

## Development

### Test
Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ readme = "README.md"
requires-python = ">=3.10"

dependencies = [
"huggingface_hub",
"jinja2",
"numpy",
"pandas",
"pillow",
Expand Down Expand Up @@ -58,6 +60,7 @@ Repository = "https://github.com/enactic/openarm_dataset.git"
openarm-dataset-convert = "openarm_dataset.convert:main"
openarm-dataset-merge = "openarm_dataset.merge:main"
openarm-dataset-repair = "openarm_dataset.repair:main"
openarm-dataset-upload = "openarm_dataset.upload:main"
openarm-dataset-validate = "openarm_dataset.validate:main"

[dependency-groups]
Expand All @@ -72,6 +75,9 @@ package-dir = {"" = "src"}
[tool.setuptools.packages.find]
where = ["src"]

[tool.setuptools.package-data]
openarm_dataset = ["card_template.md"]

[tool.pytest.ini_options]
testpaths = ["tests"]

Expand Down
9 changes: 7 additions & 2 deletions src/openarm_dataset/camera.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,11 @@ def num_frames(self) -> int:
else:
return len(self.all_files)

@property
def format(self) -> str:
    """Report how this camera's frames are stored.

    Returns:
        str: "tar" when ``tar_path`` is set, "dir" otherwise.

    """
    if self.tar_path is None:
        return "dir"
    return "tar"

def get_frame(self, index: int) -> Frame:
"""Get frame at the index.

Expand Down Expand Up @@ -247,7 +252,7 @@ def write(self, output: os.PathLike, format):
"""
if format == "dir":
dest_dir = Path(output)
if self.tar_path is None:
if self.format == "dir":
shutil.copytree(self.base_path, dest_dir)
return
dest_dir.mkdir(parents=True)
Expand All @@ -263,7 +268,7 @@ def write(self, output: os.PathLike, format):
elif format == "tar":
dest_tar = Path(output).with_suffix(".tar")
dest_tar.parent.mkdir(parents=True, exist_ok=True)
if self.tar_path is not None:
if self.format == "tar":
shutil.copy2(self.tar_path, dest_tar)
return
with tarfile.open(dest_tar, mode="w") as tf:
Expand Down
29 changes: 29 additions & 0 deletions src/openarm_dataset/card_template.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
# Doc / guide: https://huggingface.co/docs/hub/datasets-cards
# prettier-ignore
{{card_data}}
---

This dataset was uploaded using [OpenArm Dataset](https://github.com/enactic/openarm_dataset).


## Dataset Description

{{ dataset_description | default("", true) }}

- **Homepage:** {{ url | default("[More Information Needed]", true)}}
- **Paper:** {{ paper | default("[More Information Needed]", true)}}
Comment on lines +13 to +16

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's add this information to `metadata.yaml` as a follow-up task.

- **License:** {{ license | default("[More Information Needed]", true)}}

## Dataset Structure

{{ dataset_structure | default("[More Information Needed]", true)}}

## Citation

**BibTeX:**

```bibtex
{{ citation_bibtex | default("[More Information Needed]", true)}}
```
18 changes: 18 additions & 0 deletions src/openarm_dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,24 @@ def camera_names(self) -> list[str]:
return self._camera_names
return list(self.meta.equipment.perceptions.cameras)

@property
def camera_format(self) -> str:
    """Return the camera format ("dir" or "tar") shared by all cameras.

    Every camera of every episode is loaded and its ``format`` is collected.
    Every camera in the dataset is expected to use the same format.

    Returns:
        str: The single format used by all cameras, "dir" or "tar".

    Raises:
        ValueError: If cameras use a mix of "dir" and "tar", or if there is
            no camera to inspect (no episodes or no camera names).

    """
    formats = {
        self.load_camera(name, episode).format
        for episode in self.meta.episodes
        for name in self.camera_names
    }
    # An empty set would otherwise surface as an opaque
    # "KeyError: 'pop from an empty set'" from the final pop().
    if not formats:
        raise ValueError("No cameras found: the camera format is undefined")
    if len(formats) > 1:
        raise ValueError(f"Inconsistent camera formats: {sorted(formats)}")
    return formats.pop()

def _episode_id(self, index: int) -> str:
return self.meta.episodes[index]["id"]

Expand Down
236 changes: 236 additions & 0 deletions src/openarm_dataset/upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
# Copyright 2026 Enactic, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Upload OpenArm Dataset to Hugging Face Hub."""

import argparse
import pathlib
import shutil
import sys
import importlib.resources
from huggingface_hub import DatasetCard, DatasetCardData, HfApi
from huggingface_hub.errors import RevisionNotFoundError
import contextlib

from .dataset import Dataset


def pack_cameras_as_tar(dataset: Dataset) -> None:
    """Convert each directory-backed camera of ``dataset`` to a ``.tar`` archive.

    For every episode, each camera whose format is not already "tar" is
    written out as an archive at its own base path, after which the original
    frame directory is deleted. The dataset is modified in place.

    Args:
        dataset: The dataset whose cameras are repacked in place.

    """
    for episode in dataset.meta.episodes:
        for cam in dataset.load_cameras(episode).values():
            if cam.format != "tar":
                frames_dir = cam.base_path
                cam.write(frames_dir, "tar")
                # Drop the loose frames only after the archive was written.
                shutil.rmtree(frames_dir)


def create_dataset_card(
    tags: list | None = None,
    metadata_yaml: str | None = None,
    camera_names: list[str] | None = None,
    **kwargs,
) -> DatasetCard:
    """Create a `DatasetCard` for an OpenArm Dataset.

    The card is rendered from the ``card_template.md`` file packaged with
    ``openarm_dataset``.

    Args:
        tags (list | None): A list of tags to add to the dataset card, in
            addition to the always-present ``"OpenArm"`` tag.
        metadata_yaml (str | None): The dataset's ``metadata.yaml`` contents,
            embedded verbatim on the card as the ``dataset_structure``
            template variable.
        camera_names (list[str] | None): Camera names to expose as dataset
            viewer configs. Each becomes a config whose data files are the
            per-episode ``cameras/<name>.tar`` archives.
        **kwargs: Additional keyword arguments to populate the card template.
            A ``license`` entry is also recorded in the card metadata.

    Returns:
        DatasetCard: The generated dataset card object.

    """
    card_tags = ["OpenArm", *(tags or [])]

    # Work on a copy so the template variables can be extended freely.
    template_vars = dict(kwargs)
    if metadata_yaml:
        template_vars["dataset_structure"] = (
            "[metadata.yaml](metadata.yaml):\n"
            f"```yaml\n{metadata_yaml}\n```\n"
        )

    configs = [
        {
            "config_name": name,
            "data_files": f"episodes/*/cameras/{name}.tar",  # for dataset viewer
        }
        for name in (camera_names or [])
    ]
    card_data = DatasetCardData(
        license=template_vars.get("license"),
        tags=card_tags,
        task_categories=["robotics"],
        configs=configs or None,
    )

    # Decode explicitly as UTF-8 so rendering does not depend on the
    # platform's locale encoding.
    card_template = (
        importlib.resources.files("openarm_dataset")
        .joinpath("card_template.md")
        .read_text(encoding="utf-8")
    )

    return DatasetCard.from_template(
        card_data=card_data, template_str=card_template, **template_vars
    )


def upload_dataset(
    input_path: pathlib.Path,
    repo_id: str,
    branch: str = "main",
    tag: str | None = None,
    metadata_yaml: str | None = None,
    licence: str | None = None,
    camera_names: list[str] | None = None,
    private: bool = False,
    upload_large_folder: bool = False,
) -> None:
    """Push an OpenArm Dataset directory to a Hugging Face dataset repository.

    The repository is created when missing, the directory is uploaded to
    ``branch``, a dataset card is pushed to the same branch and, when ``tag``
    is given, that tag is (re)created on ``branch``. Loose image files
    (``*.jpeg``, ``*.jpg``, ``*.png``) are excluded from the upload, so camera
    frames must already be packed into ``.tar`` archives (see
    ``Dataset.write(camera_format="tar")``).

    Args:
        input_path: Path of the OpenArm Dataset directory to upload.
        repo_id: Target repository id, e.g. ``username/dataset-name``.
        branch: Branch (revision) to upload to.
        tag: If given, create this tag on ``branch`` after the upload,
            replacing an existing tag of the same name.
        metadata_yaml: The dataset's ``metadata.yaml`` contents, shown verbatim
            on the dataset card.
        licence: Licence identifier recorded on the dataset card.
        camera_names: Camera names to expose as dataset viewer configs so the
            camera frames are browsable on the Hugging Face Hub.
        private: Create the repository as private when it does not exist.
        upload_large_folder: Use ``upload_large_folder`` for a resumable,
            multi-threaded upload of large datasets.

    """
    repo_type = "dataset"
    api = HfApi()
    api.create_repo(
        repo_id=repo_id,
        repo_type=repo_type,
        private=private,
        exist_ok=True,
    )

    # Both upload entry points are called with the same keyword arguments.
    push_folder = api.upload_large_folder if upload_large_folder else api.upload_folder
    push_folder(
        repo_id=repo_id,
        folder_path=str(input_path),
        repo_type=repo_type,
        revision=branch,
        # Loose frames would blow past the Hub's per-repository file-count
        # limit; they belong in .tar archives.
        ignore_patterns=["*.jpeg", "*.jpg", "*.png"],
    )

    # NOTE(review): ``tag`` is forwarded through ``**kwargs`` as a template
    # variable; it is not the ``tags`` parameter of ``create_dataset_card``,
    # so it does not end up in the card's tag list - confirm this is intended.
    create_dataset_card(
        tag=tag,
        metadata_yaml=metadata_yaml,
        license=licence,
        camera_names=camera_names,
    ).push_to_hub(
        repo_id=repo_id,
        repo_type=repo_type,
        revision=branch,
    )

    if tag is None:
        return

    # Move the tag: drop a previous one if present, then point it at branch.
    try:
        api.delete_tag(repo_id, tag=tag, repo_type=repo_type)
    except RevisionNotFoundError:
        pass
    api.create_tag(
        repo_id, tag=tag, revision=branch, repo_type=repo_type, exist_ok=True
    )


def main():
    """Upload OpenArm Dataset to Hugging Face Hub.

    Parses the command line, repacks "dir"-format cameras into ``.tar``
    archives in place when needed, then uploads the dataset directory with the
    dataset version as the tag.
    """
    parser = argparse.ArgumentParser(
        description="Upload an OpenArm Dataset to the Hugging Face Hub"
    )
    parser.add_argument(
        "input",
        help="Path of an OpenArm Dataset to upload",
        type=pathlib.Path,
    )
    # TODO: default to the project's recommended repository naming convention
    # once it is documented (follow-up task from review).
    parser.add_argument(
        "--repo-id",
        required=True,
        help="Target Hugging Face dataset repository id, e.g. username/dataset-name",
    )
    parser.add_argument(
        "--private",
        action="store_true",
        default=False,
        help="Create the repository as private if it does not exist",
    )
    # The default uses the Hugging Face Hub's lowercase license identifier
    # (https://huggingface.co/docs/hub/repositories-licenses), not the SPDX ID.
    # TODO: add ``license:`` to metadata.yaml / ``Metadata.license`` and turn
    # this option into a ``--default-license`` fallback (follow-up task from
    # review).
    parser.add_argument(
        "--licence",
        default="apache-2.0",
        # %(default)s keeps the help text in sync with the actual default.
        help="The licence to associate with the dataset on the Hugging Face Hub. "
        "Defaults to %(default)s.",
    )
    parser.add_argument(
        "--large-folder",
        action="store_true",
        default=False,
        help="Use a resumable, multi-threaded upload for large datasets. "
        "Recommended for datasets larger than 1 GB.",
    )
    args = parser.parse_args()

    dataset = Dataset(args.input)

    if dataset.camera_format == "dir":
        print(
            "Packing camera frames into .tar archives in place before upload "
            "(Hugging Face Hub file-count recommendation)...",
            file=sys.stderr,
        )
        pack_cameras_as_tar(dataset)

    upload_dataset(
        args.input,
        args.repo_id,
        tag=dataset.meta.version,
        metadata_yaml=(args.input / "metadata.yaml").read_text(),
        licence=args.licence,
        camera_names=dataset.camera_names,
        upload_large_folder=args.large_folder,
        private=args.private,
    )


if __name__ == "__main__":
main()
17 changes: 17 additions & 0 deletions tests/test_dataset_tar.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,20 @@ def test_tar_input_roundtrips_to_dir(tmp_path):
assert camera.tar_path is None
assert camera.num_frames == 3
assert camera.get_frame(0).load().shape == (600, 960, 3)


def test_camera_format_dir():
    """The dataset at ``DATASET_DIR`` reports the "dir" camera format."""
    dataset = Dataset(DATASET_DIR)
    assert dataset.camera_format == "dir"


def test_camera_format_tar(tar_dataset):
    """The ``tar_dataset`` fixture reports the "tar" camera format."""
    detected = tar_dataset.camera_format
    assert detected == "tar"


def test_camera_format_inconsistent_raises(tmp_path):
    """``camera_format`` raises ValueError when "dir" and "tar" are mixed."""
    mixed_root = tmp_path / "mixed"
    source = Dataset(DATASET_DIR)
    source.write(mixed_root, format="openarm", camera_format="tar")
    # Create a "head" camera directory in episode 0 so that the written
    # dataset mixes the "dir" layout with the "tar" one.
    head_dir = mixed_root / "episodes" / "0" / "cameras" / "head"
    head_dir.mkdir()
    with pytest.raises(ValueError, match="Inconsistent camera formats"):
        Dataset(mixed_root).camera_format