diff --git a/egomimic/scripts/language_process/augment_prompt.txt b/egomimic/scripts/language_process/augment_prompt.txt index f5897f73b..a7c8db7f1 100644 --- a/egomimic/scripts/language_process/augment_prompt.txt +++ b/egomimic/scripts/language_process/augment_prompt.txt @@ -29,6 +29,7 @@ Rules: - Keep each variant grammatical, concise (under 25 words), and natural. - Every string must be a valid standalone instruction a robot could follow. - Variants that drop information must still fully describe the core action and object. +- NEVER drop or generalize the object's descriptors. Keep its color, material, size, and every modifier exactly as in the original (e.g. keep "the blue stuffed animal" as "the blue stuffed animal"; never shorten it to "the stuffed animal" or drop "blue"). Only arm, placement/orientation, and grabbing-method information may be omitted — per the variant types above; the object's identity and descriptors are never omittable. - Do not include the original instruction in the output; the caller will add it back. - Return only the JSON array, e.g. ["variant one", "variant two"]. - Ensure there are exactly 12 variants total, each from the types listed above. Only 3 variants should include full information. 
diff --git a/egomimic/scripts/language_process/converter.py b/egomimic/scripts/language_process/converter.py index 22fdde166..16948151c 100644 --- a/egomimic/scripts/language_process/converter.py +++ b/egomimic/scripts/language_process/converter.py @@ -48,38 +48,51 @@ def __init__( else: self.augment_prompt_template = None - def scale_to_str_format(self, annotation_dict: dict) -> dict: - annotations = annotation_dict["annotations"] + def scale_to_str_format(self, annotation_dict: dict) -> list: zarr_annotations_list = [] - for annotation in annotations: + for prompt_dict, start_idx, end_idx in self._iter_pick_place_clips( + annotation_dict + ): + base_instruction = self.scale_annotation_to_str(prompt_dict) + instructions = self.augment_instruction(base_instruction, prompt_dict) + for instruction in instructions: + zarr_annotations_list.append((instruction, start_idx, end_idx)) + return zarr_annotations_list + + def _iter_pick_place_clips(self, annotation_dict: dict): + """Yield ``(prompt_dict, start_idx, end_idx)`` for every valid + pick-and-place clip. + + Encapsulates the shared parsing/filtering — derive the arm from the + actuator label, convert the clip's microsecond span to frames, drop + clips flagged as a mistake and "Adjust" (or action-less) clips — so + subclasses such as :class:`SortConverter` reuse the exact same + low-level extraction instead of re-deriving it. + """ + for annotation in annotation_dict["annotations"]: if "label" not in annotation: continue arm = annotation["label"].split(" ")[0].lower() - clips = annotation["clips"] - for clip in clips: + for clip in annotation["clips"]: + if "attributes" not in clip: + # High-level tracks (e.g. the sort "Sorting" track) carry a + # bare text clip with no action attributes — not a + # pick-and-place motion, so skip it here. 
+ continue timestamp = micro_seconds_to_frames(clip["timestamp"], self.fps) duration = micro_seconds_to_frames(clip["duration"], self.fps) - attributes = clip["attributes"] - text = clip["text"] attr_dict = {} - for attribute in attributes: + for attribute in clip["attributes"]: attr_dict[attribute["name"]] = attribute["values"][0] - if attr_dict["Mistake"] == "Yes": + if attr_dict.get("Mistake") == "Yes": continue if "Action" not in attr_dict or attr_dict["Action"] == "Adjust": continue prompt_dict = attr_dict.copy() - prompt_dict.pop("Mistake") - prompt_dict["description"] = text + prompt_dict.pop("Mistake", None) + prompt_dict["description"] = clip["text"] prompt_dict["arm"] = arm - - base_instruction = self.scale_annotation_to_str(prompt_dict) - instructions = self.augment_instruction(base_instruction, prompt_dict) - start_idx = timestamp - end_idx = timestamp + duration - for instruction in instructions: - zarr_annotations_list.append((instruction, start_idx, end_idx)) - return zarr_annotations_list + yield prompt_dict, timestamp, timestamp + duration def scale_annotation_to_str(self, scale_annotation_dict: dict) -> str: model_prompt = self.prompt_template + "\n" + json.dumps(scale_annotation_dict) @@ -95,16 +108,26 @@ def augment_instruction( plus, when an augmentation prompt is configured, LLM-generated synonyms and variants that omit arm and place-orientation info. """ - if self.augment_prompt_template is None: + return self._augment_with_template( + base_instruction, scale_annotation_dict, self.augment_prompt_template + ) + + def _augment_with_template( + self, base_instruction: str, metadata: dict, template: str | None + ) -> list[str]: + """Shared augmentation: ask the LLM for a JSON array of variants and + return ``[base_instruction, *unique_variants]``. 
class SortConverter(PickPlaceLLMConverter):
    """Convert Scale annotations for *sort* tasks into Zarr annotation tuples.

    A sort episode's Scale annotation has separate label tracks:

    * ``Left Gripper`` / ``Right Gripper`` — the *low-level* pick-and-place
      clips (with Action/Mistake/… attributes), extracted with the inherited
      ``_iter_pick_place_clips``.
    * ``Sorting`` — the *high-level* sort goals, written by the annotators as
      plain ``text`` clips (no attributes), each spanning the window of the
      sort sub-goal it describes (e.g. "Sort the corn and the croissant on
      the white plate").
    * ``Both Grippers`` — return-to-home transitions (empty ``text``). They
      are kept as low-level clips too; each pairs with the nearest sort goal
      since they fall between sort windows.

    High-level instructions are therefore *read* from the ``Sorting`` track
    rather than generated. Each low-level clip is paired with the sort
    instruction active during it (max temporal overlap, else nearest); both
    sides are augmented and truncated to a common count, so every emitted
    span carries an equal number of high-level and low-level instructions.

    If an annotation has no ``Sorting`` track and ``sort_prompt_filepath`` is
    configured, the converter falls back to LLM-generating a single
    high-level instruction from the pick-and-place steps. Without either, the
    episode yields no annotations.
    """

    #: Annotation labels (case-insensitive prefix) that hold the high-level
    #: sort-goal text track.
    SORT_LABEL_PREFIX = "sort"

    def __init__(
        self,
        scale_annotation_dir: str,
        prompt_filepath: str,
        sort_prompt_filepath: str | None = None,
        augment_prompt_filepath: str | None = None,
        sort_augment_prompt_filepath: str | None = None,
    ):
        """Set up the base converter and load the optional sort prompts.

        Args:
            scale_annotation_dir: forwarded to the base converter.
            prompt_filepath: low-level instruction prompt, forwarded to the
                base converter.
            sort_prompt_filepath: optional LLM-generation prompt, used only
                when an annotation has no human-written "Sorting" track.
            augment_prompt_filepath: optional low-level augmentation prompt,
                forwarded to the base converter.
            sort_augment_prompt_filepath: optional augmentation prompt for
                the high-level sort instructions.
        """
        super().__init__(
            scale_annotation_dir,
            prompt_filepath,
            augment_prompt_filepath=augment_prompt_filepath,
        )
        self.sort_prompt_template = self._read_template(sort_prompt_filepath)
        self.sort_augment_prompt_template = self._read_template(
            sort_augment_prompt_filepath
        )

    @staticmethod
    def _read_template(path: str | None) -> str | None:
        """Return the text of the prompt file at ``path``, or ``None`` when
        no path is configured."""
        if path is None:
            return None
        # The sort prompt files contain non-ASCII punctuation (em dashes), so
        # decode explicitly as UTF-8 rather than with the platform's locale
        # encoding, which would garble the prompt on non-UTF-8 systems.
        with open(path, "r", encoding="utf-8") as f:
            return f.read()

    def scale_to_str_format(self, annotation_dict: dict) -> list:
        """Return ``(instruction, start_idx, end_idx)`` tuples for a sort episode.

        For each low-level clip the output holds ``k`` low-level instructions
        followed by ``k`` high-level ones over the same span, where ``k`` is
        the smaller of the two augmented pool sizes. Returns ``[]`` when the
        episode has no usable low-level clips, or when it has no "Sorting"
        track and no fallback generation prompt is configured.
        """
        # Low-level: every gripper clip that survives the Mistake/Adjust
        # filtering done inside _iter_pick_place_clips.
        low_clips = list(self._iter_pick_place_clips(annotation_dict))
        if not low_clips:
            return []

        # High-level: read the human-written "Sorting" track. Fall back to LLM
        # generation only when no such track exists.
        sort_intervals = self._iter_sort_clips(annotation_dict)
        fallback_pool: list[str] | None = None
        if not sort_intervals:
            if self.sort_prompt_template is None:
                return []
            context = self.build_sort_context(low_clips)
            fallback_pool = self._nonblank(
                self.augment_sort_instruction(
                    self.sort_annotation_to_str(context), context
                )
            )
            if not fallback_pool:
                return []

        # Augment each distinct sort instruction at most once.
        aug_cache: dict[str, list[str]] = {}

        def high_pool_for(text: str) -> list[str]:
            if text not in aug_cache:
                aug_cache[text] = self._nonblank(
                    self.augment_sort_instruction(text, {"task": "sort"})
                )
            return aug_cache[text]

        zarr_annotations_list = []
        for i, (prompt_dict, start_idx, end_idx) in enumerate(low_clips):
            if sort_intervals:
                high_text = self._sort_text_for_span(start_idx, end_idx, sort_intervals)
                high_pool = high_pool_for(high_text) if high_text else []
            else:
                high_pool = fallback_pool
            # Resolve the high-level side first: a span with no high-level
            # pool is skipped anyway, so do not spend the low-level LLM
            # calls on it.
            if not high_pool:
                continue

            low_base = self.scale_annotation_to_str(prompt_dict)
            low_pool = self._nonblank(self.augment_instruction(low_base, prompt_dict))
            if not low_pool:
                continue

            # Rotate the high-level pool so successive spans pair with different
            # phrasings, then truncate both sides to equal length so the span
            # carries the same count of each granularity.
            offset = i % len(high_pool)
            high_rotated = high_pool[offset:] + high_pool[:offset]
            low_balanced, high_balanced = self.balance_instructions(
                low_pool, high_rotated
            )
            for instruction in (*low_balanced, *high_balanced):
                zarr_annotations_list.append((instruction, start_idx, end_idx))
        return zarr_annotations_list

    def _iter_sort_clips(self, annotation_dict: dict) -> list[tuple[str, float, float]]:
        """Return ``(text, start_idx, end_idx)`` for the high-level "Sorting"
        track: annotations whose label starts with ``SORT_LABEL_PREFIX``
        (case-insensitive, surrounding whitespace ignored). Clips with missing
        or blank ``text`` are skipped; spans are converted from microseconds
        to frames at ``self.fps``."""
        intervals: list[tuple[str, float, float]] = []
        for annotation in annotation_dict["annotations"]:
            label = annotation.get("label", "") or ""
            if not label.strip().lower().startswith(self.SORT_LABEL_PREFIX):
                continue
            for clip in annotation["clips"]:
                text = (clip.get("text") or "").strip()
                if not text:
                    continue
                start_idx = micro_seconds_to_frames(clip["timestamp"], self.fps)
                end_idx = start_idx + micro_seconds_to_frames(
                    clip["duration"], self.fps
                )
                intervals.append((text, start_idx, end_idx))
        return intervals

    @staticmethod
    def _sort_text_for_span(
        start_idx: float,
        end_idx: float,
        sort_intervals: list[tuple[str, float, float]],
    ) -> str | None:
        """Return the sort instruction active during ``[start_idx, end_idx]``.

        Picks the interval with the largest strictly positive overlap (the
        first one wins ties). When nothing overlaps — e.g. a clip in a gap
        between sort goals — picks the temporally nearest interval instead.
        Returns ``None`` only when ``sort_intervals`` is empty.
        """
        best_text, best_overlap = None, 0.0
        for text, s, e in sort_intervals:
            overlap = max(0.0, min(end_idx, e) - max(start_idx, s))
            if overlap > best_overlap:
                best_overlap, best_text = overlap, text
        if best_text is not None:
            return best_text
        # No overlap: fall back to the temporally nearest sort instruction.
        nearest_text, nearest_dist = None, None
        for text, s, e in sort_intervals:
            # Gap between the two spans; 0.0 when they touch or the clip has
            # zero length inside the interval.
            dist = max(s - end_idx, start_idx - e, 0.0)
            if nearest_dist is None or dist < nearest_dist:
                nearest_dist, nearest_text = dist, text
        return nearest_text

    @staticmethod
    def _nonblank(instructions: list[str]) -> list[str]:
        """Drop ``None``/empty/whitespace-only instructions so an empty LLM
        response never becomes an empty-text annotation."""
        return [s for s in instructions if s and s.strip()]

    def build_sort_context(self, clips: list) -> dict:
        """Summarize the episode's pick-and-place sub-actions into the context
        dict for the LLM-generation *fallback* (used only when there is no
        human-written "Sorting" track).

        Only each clip's ``Action`` and ``description`` are kept; the arm is
        intentionally left out because both sort prompts forbid mentioning a
        hand.

        Args:
            clips: ``(prompt_dict, start_idx, end_idx)`` tuples, in episode order.

        Returns:
            ``{"task": "sort", "steps": [...]}`` with one step per clip.
        """
        steps = [
            {
                "Action": prompt_dict.get("Action"),
                "description": prompt_dict.get("description"),
            }
            for prompt_dict, _, _ in clips
        ]
        return {
            "task": "sort",
            "steps": steps,
        }

    def sort_annotation_to_str(self, sort_context: dict) -> str:
        """Generate one high-level sort instruction from the episode context
        (fallback path; requires ``sort_prompt_filepath`` to be configured,
        since the template is concatenated with the JSON-encoded context)."""
        model_prompt = self.sort_prompt_template + "\n" + json.dumps(sort_context)
        response = self.client.responses.create(model=self.model, input=model_prompt)
        return response.output_text

    def augment_sort_instruction(
        self, base_instruction: str, sort_context: dict
    ) -> list[str]:
        """High-level analogue of ``augment_instruction``: delegates to
        ``_augment_with_template`` with the sort augmentation prompt."""
        return self._augment_with_template(
            base_instruction, sort_context, self.sort_augment_prompt_template
        )

    @staticmethod
    def balance_instructions(
        low_list: list[str], high_list: list[str]
    ) -> tuple[list[str], list[str]]:
        """Truncate both lists to their common (shorter) length so a span
        carries an equal number of low-level and high-level instructions."""
        k = min(len(low_list), len(high_list))
        return low_list[:k], high_list[:k]
# CLI options for prompt selection (bucket-upload parallel script).
parser.add_argument(
    "-s", "--scale-api-key", default=os.environ.get("SCALE_API_KEY", "")
)
parser.add_argument("--prompt-filepath", type=str, required=True)
parser.add_argument("--augment-prompt-filepath", type=str, default=None)
# Not required: the sort_llm converter reads the high-level text from the
# annotation's "Sorting" track and uses this prompt only as a fallback.
parser.add_argument(
    "--sort-prompt-filepath",
    type=str,
    default=None,
    help=(
        "High-level sort instruction prompt (optional; sort_llm uses it only "
        "as an LLM fallback when an annotation has no 'Sorting' track)."
    ),
)
parser.add_argument(
    "--sort-augment-prompt-filepath",
    type=str,
    default=None,
    help="High-level sort augmentation prompt (optional, used by sort_llm).",
)
parser.add_argument("--annotation-key", type=str, default="annotations")
# CLI options for prompt selection (serial scale_to_zarr_annotation script).
parser.add_argument(
    "-s", "--scale-api-key", default=os.environ.get("SCALE_API_KEY", "")
)
parser.add_argument("--prompt-filepath", type=str, required=True)
parser.add_argument("--augment-prompt-filepath", type=str, default=None)
# Not required: the sort_llm converter reads the high-level text from the
# annotation's "Sorting" track and uses this prompt only as a fallback.
parser.add_argument(
    "--sort-prompt-filepath",
    type=str,
    default=None,
    help=(
        "High-level sort instruction prompt (optional; sort_llm uses it only "
        "as an LLM fallback when an annotation has no 'Sorting' track)."
    ),
)
parser.add_argument(
    "--sort-augment-prompt-filepath",
    type=str,
    default=None,
    help="High-level sort augmentation prompt (optional, used by sort_llm).",
)
args = parser.parse_args()

os.makedirs(args.scale_annotation_dir, exist_ok=True)
augment_prompt_filepath=args.augment_prompt_filepath, ) + elif args.conversion_mode == "sort_llm": + # High-level sort text is read from the annotation's "Sorting" track; + # sort_prompt_filepath is only an optional LLM-generation fallback. + converter = SortConverter( + args.scale_annotation_dir, + args.prompt_filepath, + args.sort_prompt_filepath, + augment_prompt_filepath=args.augment_prompt_filepath, + sort_augment_prompt_filepath=args.sort_augment_prompt_filepath, + ) elif args.conversion_mode == "hardcoded": converter = HardCodedConverter(args.scale_annotation_dir) else: diff --git a/egomimic/scripts/language_process/scale_to_zarr_annotation_parallel.py b/egomimic/scripts/language_process/scale_to_zarr_annotation_parallel.py index 98b54201f..091da0d7b 100644 --- a/egomimic/scripts/language_process/scale_to_zarr_annotation_parallel.py +++ b/egomimic/scripts/language_process/scale_to_zarr_annotation_parallel.py @@ -56,10 +56,13 @@ def _make_converter( annotation_dir: str, prompt_filepath: str, augment_prompt_filepath: str | None = None, + sort_prompt_filepath: str | None = None, + sort_augment_prompt_filepath: str | None = None, ): from egomimic.scripts.language_process.converter import ( HardCodedConverter, PickPlaceLLMConverter, + SortConverter, ) if conversion_mode == "pick_place_llm": @@ -68,6 +71,16 @@ def _make_converter( prompt_filepath, augment_prompt_filepath=augment_prompt_filepath, ) + elif conversion_mode == "sort_llm": + # High-level sort text is read from the annotation's "Sorting" track; + # sort_prompt_filepath is only an optional LLM-generation fallback. 
# CLI options for prompt selection (zarr-writing parallel script).
parser.add_argument(
    "-s", "--scale-api-key", default=os.environ.get("SCALE_API_KEY", "")
)
parser.add_argument("--prompt-filepath", type=str, required=True)
parser.add_argument("--augment-prompt-filepath", type=str, default=None)
# Not required: the sort_llm converter reads the high-level text from the
# annotation's "Sorting" track and uses this prompt only as a fallback.
parser.add_argument(
    "--sort-prompt-filepath",
    type=str,
    default=None,
    help=(
        "High-level sort instruction prompt (optional; sort_llm uses it only "
        "as an LLM fallback when an annotation has no 'Sorting' track)."
    ),
)
parser.add_argument(
    "--sort-augment-prompt-filepath",
    type=str,
    default=None,
    help="High-level sort augmentation prompt (optional, used by sort_llm).",
)
parser.add_argument("--annotation-key", type=str, default="annotations")
sort_prompt_filepath=args.sort_prompt_filepath, + sort_augment_prompt_filepath=args.sort_augment_prompt_filepath, ) pending[ref] = ep_hash diff --git a/egomimic/scripts/language_process/sort_augment_prompt.txt b/egomimic/scripts/language_process/sort_augment_prompt.txt new file mode 100644 index 000000000..8c5527ce4 --- /dev/null +++ b/egomimic/scripts/language_process/sort_augment_prompt.txt @@ -0,0 +1,31 @@ +You will be given a JSON object with two fields: + +- "instruction": a HIGH-LEVEL natural-language SORTING instruction describing the overall goal of an episode (e.g. "Sort the croissant and the corn on the white plate"). +- "metadata": the context used to produce it: always the "task", plus the ordered pick-and-place "steps" when the instruction was generated from them (a human-written instruction carries only the "task"). + +Produce a set of language-augmented variants of the high-level sorting instruction for training a language-conditioned robot policy. Your output must be a single JSON array of strings with no surrounding prose, markdown, or code fences. + +Goal: every variant must keep the SAME meaning, the SAME objects, and the SAME destinations as the original, but should be a genuinely different phrasing — not a near-identical copy. Aim for moderate variety: the variants stay clearly about the same sort goal, yet differ from one another in wording and structure. + +Verb rule: +- Around 8 of the 12 variants should keep "sort" as the main verb (e.g. "Sort the corn and the croissant onto the white plate."). +- The remaining ~4 may use a close synonym verb (e.g. "place", "put", "arrange", "organize") or a different sentence structure. + +Ways to vary (combine a few per variant, but keep the objects and destinations exactly): +- Reword connectives and noun phrasing ("the croissant and the corn" -> "both the croissant and the corn" / "the corn together with the croissant"). +- Reorder the listed objects, and swap the destination preposition where natural ("on" / "onto" / "to"). +- Vary the sentence structure on some variants — e.g. 
a prepositional opener ("Onto the white plate, sort the corn and the croissant.") or "The croissant and the corn should be sorted onto the white plate.". + +Rules: +- Do NOT invent objects, containers, sorting criteria, or counts not in the instruction or metadata; keep exactly the same objects and destinations as the original. +- NEVER drop or generalize an object's descriptors. Preserve every color, material, size, and modifier that appears in the original (e.g. keep "the blue stuffed animal" as "the blue stuffed animal"; never shorten it to "the stuffed animal" or drop "blue"). Do not add descriptors that are not in the original either. +- Do NOT abstract away the specifics (no "organize the items into groups"); always name the same concrete objects and destinations. +- These are HIGH-LEVEL goals: do NOT mention any specific hand or arm, and do NOT describe a single pick-and-place motion or enumerate the steps. +- Do NOT use "lift" as a synonym for pick, grab, or take. +- Do NOT use adjectives or adverbs describing the style of execution (e.g. "carefully", "precisely"). +- Keep each variant grammatical, concise (under 25 words), and a valid standalone instruction. +- Do not include the original instruction in the output; the caller will add it back. +- Return only the JSON array, e.g. ["Sort the corn and the croissant onto the white plate.", "..."]. +- Ensure there are exactly 12 variants total, and that no two are identical. + +Input: diff --git a/egomimic/scripts/language_process/sort_prompt.txt b/egomimic/scripts/language_process/sort_prompt.txt new file mode 100644 index 000000000..d79392f69 --- /dev/null +++ b/egomimic/scripts/language_process/sort_prompt.txt @@ -0,0 +1,19 @@ +You are generating a single HIGH-LEVEL natural-language instruction for a language-conditioned robot bimanual manipulation policy that is performing a SORTING task. The instruction describes the OVERALL GOAL of the whole episode — not any individual motion. 
The policy receives one instruction string, so the instruction must be grounded strictly in the metadata provided below. Do not invent or infer any details that are not present. + +You will be given a JSON dictionary with: +- "task": always "sort". +- "steps": the ordered pick-and-place sub-actions that make up the episode, each with an "Action" (e.g. "Pick up", "Put") and a "description". + +Infer what is being sorted, and where/by what criterion it is being sorted, STRICTLY from the steps, then output one high-level sorting instruction. + +Output exactly one instruction sentence in the imperative mood (e.g. "Sort the utensils into the correct bins."). Keep it under 25 words. Do not include punctuation other than a period at the end. + +General rules: +- Describe the goal at a high level: what category of objects is being sorted, and the destinations or sorting criterion, when those are evident from the steps. +- Summarize the episode; do NOT list, enumerate, or count the individual pick-and-place steps. +- Do NOT mention which hand or arm is used — a high-level sort goal is arm-agnostic. +- Do NOT invent objects, containers, criteria, or counts that are not supported by the steps. If the destinations or criterion are not clear from the steps, give a general sorting instruction (e.g. "Sort the items into their correct containers."). +- Do not use "lift" as a synonym for pick, grab, or take. +- Use concrete, goal-oriented language a robot can be conditioned on. + +Dictionary: diff --git a/egomimic/scripts/viz_language.py b/egomimic/scripts/viz_language.py index 7cb23f1dc..dc0699bfa 100644 --- a/egomimic/scripts/viz_language.py +++ b/egomimic/scripts/viz_language.py @@ -63,6 +63,26 @@ def _extract_annotation(batch: dict, annotation_key: str) -> list[str]: return texts +def _sample_balanced_annotations(texts: list[str], per_type: int = 3) -> list[str]: + """Show both granularities for a frame. 
+ + The SortConverter writes, per pick-and-place clip, ``[low-level pick/place + …, high-level sort …]`` (equal halves) over the same span — so for a frame + the active annotations arrive midpoint-split into pick/place (first half) + then sort (second half). Showing *all* of them (~26) overflows the strip and + can hide one granularity, so sample up to ``per_type`` of each and label + them, guaranteeing both the pick/place and the sort instructions are shown. + """ + n = len(texts) + if n == 0: + return [] + half = n // 2 + pick = texts[:half][:per_type] + srt = texts[half:][:per_type] + labeled = [f"[pick/place] {t}" for t in pick] + [f"[sort] {t}" for t in srt] + return labeled if labeled else texts[:per_type] + + _COMPACT_MAX_CHARS = 120 _COMPACT_FONT = cv2.FONT_HERSHEY_SIMPLEX _COMPACT_SCALE = 0.35 @@ -167,6 +187,7 @@ def _run_viz_for_datasets( max_batches: int, fps: int, frames_per_file: int, + sample_per_type: int = 3, ) -> None: for embodiment_name, dataset in datasets.items(): embodiment_cls = _EMBODIMENT_CLASSES.get(embodiment_name.lower()) @@ -221,7 +242,9 @@ def _run_viz_for_datasets( if ann_key is not None: fresh = _extract_annotation(batch, ann_key) if fresh: - carried_annotation = fresh + carried_annotation = _sample_balanced_annotations( + fresh, sample_per_type + ) try: frames = _viz_batch( embodiment_cls, @@ -266,6 +289,7 @@ def main(cfg: DictConfig) -> None: max_batches = cfg.get("max_batches", 500) fps = cfg.get("fps", 30) frames_per_file = cfg.get("frames_per_file", 1000) + sample_per_type = cfg.get("annotation_sample_per_type", 3) annotation_key = OmegaConf.select(cfg, "data.annotation_key", default=None) viz_cfg = cfg.viz_func @@ -300,6 +324,7 @@ def main(cfg: DictConfig) -> None: max_batches=max_batches, fps=fps, frames_per_file=frames_per_file, + sample_per_type=sample_per_type, )