19 changes: 19 additions & 0 deletions scripts/setup/cambrian_s_install.sh
@@ -0,0 +1,19 @@
#!/bin/bash
# scripts/setup/cambrian_s_install.sh
# Description: Setup environment for Cambrian-S inference in OpenWorldLib
# Usage: bash scripts/setup/cambrian_s_install.sh

echo "=== [1/4] Installing the base environment ==="
pip install torch==2.6.0 torchvision torchaudio
pip install git+https://github.com/openai/CLIP.git

echo "=== [2/4] Installing the OpenWorldLib requirements (transformers_low extra) ==="
pip install -e ".[transformers_low]"

echo "=== [3/4] Installing Cambrian-S runtime dependencies ==="
pip install sentencepiece decord

echo "=== [4/4] Installing the flash attention ==="
pip install "flash-attn==2.5.9.post1" --no-build-isolation

echo "=== Setup completed! ==="
112 changes: 112 additions & 0 deletions src/openworldlib/operators/cambrian_s_operator.py
@@ -0,0 +1,112 @@
from pathlib import Path
from typing import Any

import torch

from .base_operator import BaseOperator
from ..reasoning.spatial_reasoning.cambrian_s.constants import DEFAULT_SYSTEM_PROMPT
from ..reasoning.spatial_reasoning.cambrian_s.conversation import (
build_qwen_chat_prompt,
extract_media_inputs,
)
from ..reasoning.spatial_reasoning.cambrian_s.mm_utils import (
preprocess_single_image,
preprocess_video_frames,
tokenizer_image_token,
)


class CambrianSOperator(BaseOperator):
"""
Lightweight operator placeholder for Cambrian-S.
It tracks interactions and converts OpenWorldLib chat messages into
Cambrian-S prompt tokens plus image/video tensors.
"""

def __init__(self, operation_types=None, interaction_template=None):
super().__init__(operation_types=operation_types or ["reasoning"])
self.interaction_template = interaction_template or []
self.interaction_template_init()

@classmethod
def from_pretrained(cls, *args, **kwargs) -> "CambrianSOperator":
return cls()

def check_interaction(self, interaction):
return True

def get_interaction(self, interaction):
if self.check_interaction(interaction):
self.current_interaction.append(interaction)

def process_interaction(self, *args, **kwargs):
return self.current_interaction

def process_perception(
self,
messages: list[dict[str, Any]],
tokenizer,
image_processors: list[Any],
model_config: Any = None,
system_prompt: str = DEFAULT_SYSTEM_PROMPT,
video_max_frames: int | None = None,
) -> dict[str, Any]:
prompt = build_qwen_chat_prompt(messages, system_prompt=system_prompt)
media_inputs = extract_media_inputs(messages)

if media_inputs and not image_processors:
raise RuntimeError("Cambrian-S received visual inputs, but no image processor is available.")

image_tensors = []
image_sizes = []
if media_inputs:
processor = image_processors[0]
image_count = sum(1 for media in media_inputs if media.get("type") == "image")
image_aspect_ratio = getattr(model_config, "image_aspect_ratio", "pad")
anyres_max_subimages = int(getattr(model_config, "anyres_max_subimages", 1))
for media in media_inputs:
media_type = media.get("type")
if media_type == "image":
use_anyres = image_count == 1 and image_aspect_ratio == "anyres"
pixel_values, original_size = preprocess_single_image(
media["image"],
processor,
image_aspect_ratio="anyres" if use_anyres else "pad",
anyres_max_subimages=anyres_max_subimages,
)
elif media_type == "video":
video_input = media["video"]
num_threads = -1
resolved_video_max_frames = (
video_max_frames
if video_max_frames is not None
else int(getattr(model_config, "video_max_frames", 32))
)
if isinstance(video_input, (str, Path)):
video_name = str(video_input)
if "Ego4D" in video_name or "video_mmmu" in video_name:
num_threads = 1
pixel_values, original_size = preprocess_video_frames(
video_input,
processor,
max_frames=resolved_video_max_frames,
model_config=model_config,
num_threads=num_threads,
)
else:
continue
image_tensors.append(pixel_values)
image_sizes.append(original_size)

input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt").unsqueeze(0)
attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
return {
"prompt": prompt,
"input_ids": input_ids,
"attention_mask": attention_mask,
"images": image_tensors,
"image_sizes": image_sizes,
}

def delete_last_interaction(self):
super().delete_last_interaction()
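For reference, a minimal sketch of the chat-message structure that process_perception consumes, mirroring what CambrianSPipeline._build_messages (added below) produces; the media values are hypothetical placeholders.

# Sketch of the input consumed by CambrianSOperator.process_perception;
# paths are hypothetical placeholders.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "example_frame.jpg"},
            {"type": "video", "video": "example_clip.mp4"},
            {"type": "text", "text": "Describe the spatial layout of the scene."},
        ],
    }
]
# model_inputs = operator.process_perception(
#     messages,
#     tokenizer=tokenizer,
#     image_processors=image_processors,
#     model_config=model.config,
# )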
107 changes: 107 additions & 0 deletions src/openworldlib/pipelines/cambrian_s/pipeline_cambrian_s.py
@@ -0,0 +1,107 @@
from typing import List, Optional, Sequence, Union

from PIL import Image as PILImage

from ...operators.cambrian_s_operator import CambrianSOperator
from ...reasoning.spatial_reasoning.cambrian_s.cambrian_s_reasoning import CambrianSReasoning


class CambrianSPipeline:
"""
Pipeline that builds Cambrian-S multimodal inputs and runs Cambrian-S reasoning.
"""

def __init__(self, reasoning: CambrianSReasoning, operator: CambrianSOperator):
self.reasoning = reasoning
self.operator = operator

@classmethod
def from_pretrained(
cls,
model_path: str = "nyu-visionx/Cambrian-S-7B",
device: Optional[Union[str, "torch.device"]] = None,
        weight_dtype: Optional["torch.dtype"] = None,
**kwargs,
) -> "CambrianSPipeline":
reasoning = CambrianSReasoning.from_pretrained(
model_path=model_path,
device=device,
weight_dtype=weight_dtype,
**kwargs,
)
operator = CambrianSOperator.from_pretrained()
return cls(reasoning=reasoning, operator=operator)

def _build_messages(
self,
images: Optional[Union[str, PILImage.Image, Sequence[Union[str, PILImage.Image]]]],
videos: Optional[
Union[
str,
list[PILImage.Image],
Sequence[Union[str, list[PILImage.Image]]],
]
],
prompt: str,
):
if images is None:
images = []
if videos is None:
videos = []

if isinstance(images, (str, PILImage.Image)):
images = [images]
if isinstance(videos, str):
videos = [videos]
elif isinstance(videos, list) and videos and isinstance(videos[0], PILImage.Image):
videos = [videos]

content = [{"type": "image", "image": image} for image in images]
content += [{"type": "video", "video": video} for video in videos]
content.append({"type": "text", "text": prompt})
return [{"role": "user", "content": content}]

def __call__(
self,
prompt: str,
images: Optional[Union[str, PILImage.Image, Sequence[Union[str, PILImage.Image]]]] = None,
videos: Optional[
Union[
str,
list[PILImage.Image],
Sequence[Union[str, list[PILImage.Image]]],
]
] = None,
max_new_tokens: int = 2048,
messages: Optional[list] = None,
generation_kwargs: Optional[dict] = None,
) -> List[str]:
self.operator.get_interaction(prompt)
self.operator.process_interaction()

if messages is None:
batched_messages = [self._build_messages(images=images, videos=videos, prompt=prompt)]
else:
if not messages:
raise ValueError("messages must be non-empty.")
batched_messages = [messages] if isinstance(messages[0], dict) else messages

outputs: List[str] = []
for sample_messages in batched_messages:
model_config = getattr(getattr(self.reasoning, "model", None), "config", None)
model_inputs = self.operator.process_perception(
sample_messages,
tokenizer=self.reasoning.tokenizer,
image_processors=self.reasoning.image_processors,
model_config=model_config,
)
outputs.extend(
self.reasoning.inference(
inputs=model_inputs,
max_new_tokens=max_new_tokens,
generation_kwargs=generation_kwargs,
)
)

self.operator.delete_last_interaction()
return outputs
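A minimal end-to-end usage sketch of the new pipeline; the import path follows the file location in this PR, and the prompt and video path are illustrative.

from openworldlib.pipelines.cambrian_s.pipeline_cambrian_s import CambrianSPipeline

# Load the pipeline (downloads the checkpoint if it is not cached locally).
pipeline = CambrianSPipeline.from_pretrained(
    model_path="nyu-visionx/Cambrian-S-7B",
    device="cuda",
)

# Ask a question about a video file; the path is a hypothetical example.
answers = pipeline(
    prompt="How many people appear in the clip?",
    videos="example_clip.mp4",
    max_new_tokens=256,
)
print(answers[0])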
137 changes: 137 additions & 0 deletions src/openworldlib/reasoning/spatial_reasoning/cambrian_s/cambrian_s_reasoning.py
@@ -0,0 +1,137 @@


from typing import Any, List, Optional, Union

import torch
from transformers import AutoTokenizer

from ...base_reasoning import BaseReasoning
from .constants import (
DEFAULT_IMAGE_PATCH_TOKEN,
DEFAULT_IM_END_TOKEN,
DEFAULT_IM_START_TOKEN,
)
from .mm_utils import validate_cambrian_s_environment
from .modeling_cambrian_s import CambrianSForCausalLM


class CambrianSReasoning(BaseReasoning):
"""
Cambrian-S: https://arxiv.org/abs/2511.04670
"""

def __init__(
self,
model: CambrianSForCausalLM,
tokenizer: Any,
image_processors: list[Any],
device: Union[str, "torch.device"] = "cuda",
):
super().__init__()
self.model = model
self.tokenizer = tokenizer
self.image_processors = image_processors
self.processor = image_processors[0] if image_processors else None
self.device = torch.device(device)

@classmethod
def from_pretrained(
cls,
model_path: str = "nyu-visionx/Cambrian-S-7B",
device: Optional[Union[str, "torch.device"]] = None,
        weight_dtype: Optional["torch.dtype"] = None,
attn_implementation: Optional[str] = None,
**kwargs,
) -> "CambrianSReasoning":
validate_cambrian_s_environment(require_video=False)

config_override_names = (
"video_max_frames",
"video_fps",
"video_force_sample",
"add_time_instruction",
"miv_token_len",
"si_token_len",
"image_aspect_ratio",
"anyres_max_subimages",
)
config_overrides = {
attr_name: kwargs.pop(attr_name)
for attr_name in config_override_names
if attr_name in kwargs
}

if device is None:
device = "cuda" if torch.cuda.is_available() else "cpu"
if weight_dtype is None:
weight_dtype = torch.float16

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = CambrianSForCausalLM.from_pretrained(
model_path,
torch_dtype=weight_dtype,
low_cpu_mem_usage=kwargs.pop("low_cpu_mem_usage", True),
attn_implementation=attn_implementation,
**kwargs,
)
mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
if mm_use_im_patch_token:
tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
if mm_use_im_start_end:
tokenizer.add_tokens(
[DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN],
special_tokens=True,
)
model.resize_token_embeddings(len(tokenizer))
for attr_name, attr_value in config_overrides.items():
setattr(model.config, attr_name, attr_value)
model = model.to(device)
model.load_vision_towers(device=device, dtype=weight_dtype)
image_processors = [tower.image_processor for tower in model.get_model().get_vision_tower_aux_list()]
return cls(model=model, tokenizer=tokenizer, image_processors=image_processors, device=device)

def api_init(self, api_key, endpoint):
raise NotImplementedError("API init is not supported for Cambrian-S.")

def _get_default_device(self):
if torch.cuda.is_available():
return torch.device("cuda")
return torch.device("cpu")

@torch.no_grad()
def inference(
self,
inputs: dict[str, Any],
max_new_tokens: int = 2048,
generation_kwargs: Optional[dict] = None,
) -> List[str]:
generation_config = {"max_new_tokens": max_new_tokens}
if generation_kwargs:
generation_config.update(generation_kwargs)

input_ids = inputs["input_ids"].to(self.device)
attention_mask = inputs.get("attention_mask")
if attention_mask is not None:
attention_mask = attention_mask.to(self.device)

images = inputs.get("images") or []
model_dtype = next(self.model.parameters()).dtype
images = [image.to(device=self.device, dtype=model_dtype) for image in images]
image_sizes = inputs.get("image_sizes") or []

generated_ids = self.model.generate(
inputs=input_ids,
attention_mask=attention_mask,
images=images,
image_sizes=image_sizes,
**generation_config,
)
if hasattr(generated_ids, "sequences"):
generated_ids = generated_ids.sequences

return self.tokenizer.batch_decode(
generated_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)
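The config-override kwargs accepted by from_pretrained let callers tune video sampling without editing the checkpoint config; a small sketch with illustrative values (the import path follows the file location in this PR).

from openworldlib.reasoning.spatial_reasoning.cambrian_s.cambrian_s_reasoning import CambrianSReasoning

# Illustrative overrides; any name in config_override_names is forwarded
# onto model.config after the weights are loaded.
reasoning = CambrianSReasoning.from_pretrained(
    model_path="nyu-visionx/Cambrian-S-7B",
    device="cuda",
    video_max_frames=64,
    image_aspect_ratio="anyres",
)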
14 changes: 14 additions & 0 deletions src/openworldlib/reasoning/spatial_reasoning/cambrian_s/constants.py
@@ -0,0 +1,14 @@
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200

DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant."

SUPPORTED_TRANSFORMERS_MIN = "4.39.2"
SUPPORTED_TRANSFORMERS_MAX = "4.45.1"
SUPPORTED_TRANSFORMERS_RANGE = (
f">={SUPPORTED_TRANSFORMERS_MIN},<={SUPPORTED_TRANSFORMERS_MAX}"
)
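A hedged sketch of the kind of version guard these constants support; the actual check lives in mm_utils.validate_cambrian_s_environment and may differ.

# Hypothetical version guard built on the constants above.
from packaging import version
import transformers

_installed = version.parse(transformers.__version__)
if not (
    version.parse(SUPPORTED_TRANSFORMERS_MIN)
    <= _installed
    <= version.parse(SUPPORTED_TRANSFORMERS_MAX)
):
    raise RuntimeError(
        f"Cambrian-S expects transformers{SUPPORTED_TRANSFORMERS_RANGE}, "
        f"found {transformers.__version__}"
    )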