2 changes: 2 additions & 0 deletions ROADMAP.md
@@ -65,6 +65,7 @@
- [ ] 支持DPO对齐训练
- [ ] 支持colocate RL训练
- [ ] Preprocess支持batched
- [ ] 对多replica的支持和粘滞路由

### 网络能力

@@ -84,5 +85,6 @@
- [ ] Support for DPO alignment training
- [ ] Support for colocate RL training
- [ ] Support for batched preprocessing
- [ ] Support for multiple replicas and sticky routing

### Networking Capabilities
7 changes: 2 additions & 5 deletions cookbook/client/tinker/megatron/server_config.yaml
@@ -33,7 +33,6 @@ applications:
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
DEVICE_COUNT_PER_PHYSICAL_NODE: "8"

# 3. Sampler Service - Runs inference / sampling using vLLM engine
# Used for generating text from the model (e.g., evaluating LoRA results).
@@ -52,7 +51,7 @@
device_group: # Logical device group for the sampler
name: sampler
gpus_per_worker: 1
ranks: [0,1,2,3] # GPU rank indices to use
ranks: 4 # GPU rank indices to use
device_type: cuda
device_mesh:
device_type: cuda
@@ -71,7 +70,6 @@
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
DEVICE_COUNT_PER_PHYSICAL_NODE: "8"

# 2. Model Service (commented out) - Would host the base model for training.
# Uncomment and configure if you need a training model worker.
@@ -86,7 +84,7 @@
nproc_per_node: 4 # Number of GPU processes per node
device_group:
name: model
ranks: [4,5,6,7] # GPU rank indices
ranks: 4 # GPU rank indices
device_type: cuda
device_mesh:
device_type: cuda
@@ -111,4 +109,3 @@
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
2 changes: 0 additions & 2 deletions cookbook/client/tinker/megatron/server_config_7b.yaml
@@ -67,7 +67,6 @@ applications:
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
DEVICE_COUNT_PER_PHYSICAL_NODE: "8"

# 3. Sampler Service - Runs inference / sampling using vLLM engine
# Used for generating text from the model (e.g., evaluating LoRA results).
@@ -104,4 +103,3 @@
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
11 changes: 6 additions & 5 deletions cookbook/client/tinker/self_congnition.py
@@ -44,7 +44,7 @@ def train():

# Connect to the Twinkle server running locally
service_client = init_tinker_compat_client(
base_url='http://www.modelscope.cn/twinkle', api_key=os.environ.get('MODELSCOPE_TOKEN'))
base_url='localhost:9000', api_key=os.environ.get('MODELSCOPE_TOKEN'))

# Create a LoRA training client for the base model (rank=16 for the LoRA adapter)
training_client = service_client.create_lora_training_client(base_model=base_model, rank=16)
@@ -68,9 +68,10 @@ def train():
optim_result = optim_future.result()

# Compute weighted average log-loss per token for monitoring
logprobs = np.concatenate([output['logprobs'].tolist() for output in fwdbwd_result.loss_fn_outputs])
weights = np.concatenate([example.loss_fn_inputs['weights'].tolist() for example in input_datum])
print(f'Loss per token: {-np.dot(logprobs, weights) / weights.sum():.4f}')
# logprobs = np.concatenate([output['logprobs'].tolist() for output in fwdbwd_result.loss_fn_outputs])
# weights = np.concatenate([example.loss_fn_inputs['weights'].tolist() for example in input_datum])
# print(f'Loss per token: {-np.dot(logprobs, weights) / weights.sum():.4f}')
print(f'Training Metrics: {optim_result}')

# Save a checkpoint after each epoch
save_future = training_client.save_state(f'twinkle-lora-{epoch}')
@@ -85,7 +86,7 @@ def eval():
weight_path = 'twinkle://20260212_174205-Qwen_Qwen2_5-7B-Instruct-51edc9ed/weights/twinkle-lora-2'

# Connect to the server and create a sampling client with the trained weights
service_client = init_tinker_compat_client(base_url='http://localhost:8000')
service_client = init_tinker_compat_client(base_url='http://localhost:9000')
sampling_client = service_client.create_sampling_client(model_path=weight_path, base_model=base_model)

# Step 2: Prepare the chat prompt
2 changes: 0 additions & 2 deletions cookbook/client/tinker/transformer/server_config.yaml
@@ -65,7 +65,6 @@ applications:
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
DEVICE_COUNT_PER_PHYSICAL_NODE: "8"

# 3. Sampler Service - Runs inference / sampling using vLLM engine
# Used for generating text from the model (e.g., evaluating LoRA results).
@@ -102,4 +101,3 @@
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
3 changes: 0 additions & 3 deletions cookbook/client/twinkle/transformer/server_config.yaml
@@ -61,7 +61,6 @@ applications:
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
DEVICE_COUNT_PER_PHYSICAL_NODE: "8"

# 3. Processor Service - Handles data preprocessing on CPU
# Runs tokenization, template application, and other CPU-bound tasks.
@@ -90,7 +89,6 @@
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
DEVICE_COUNT_PER_PHYSICAL_NODE: "8"

# 4. Sampler Service - Handles text generation inference
# Uses vLLM for efficient batched generation with optional LoRA adapters.
@@ -125,4 +123,3 @@
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
DEVICE_COUNT_PER_PHYSICAL_NODE: "8"
7 changes: 0 additions & 7 deletions docs/source_en/Usage Guide/Server and Client/Server.md
@@ -55,12 +55,9 @@ This configuration starts 3 nodes:
Before starting the Server, you need to set the following environment variables:

```bash
export DEVICE_COUNT_PER_PHYSICAL_NODE=8 # Specify the total number of GPUs on each physical machine
export TWINKLE_TRUST_REMOTE_CODE=0 # Whether to trust remote code (security consideration)
```

> **Important Note**: `DEVICE_COUNT_PER_PHYSICAL_NODE` must be set to the actual number of physical GPUs on the machine, which is crucial for correctly parsing the `ranks` configuration.

### Node Rank in YAML Configuration

In the YAML configuration file, **each component needs to occupy a separate Node**.
@@ -117,7 +114,6 @@ applications:
**Important notes:**
- The `ranks` configuration uses **physical GPU card numbers**, directly corresponding to the actual GPU devices on the machine
- The `device_mesh` configuration uses parameters like `dp_size`, `tp_size`, `pp_size`, `ep_size` instead of the original `mesh` and `mesh_dim_names`
- The environment variable `DEVICE_COUNT_PER_PHYSICAL_NODE` must be set to inform the system of the total number of physical GPUs on each machine
- Different components will be automatically assigned to different Nodes
- Ray will automatically schedule to the appropriate Node based on resource requirements (`num_gpus`, `num_cpus` in `ray_actor_options`)
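A quick way to read these settings together: the ranks in a `device_group` should be covered by the product of the parallel sizes in its `device_mesh`. The sketch below is a minimal sanity check of that usual convention, not part of Twinkle's API; the function name, `ep_size` omission, and values are illustrative.

```python
# Minimal sanity-check sketch (illustrative, not Twinkle API): by the usual
# convention, dp_size * tp_size * pp_size covers the ranks in the device group.
def check_device_group(ranks, dp_size=1, tp_size=1, pp_size=1):
    world_size = len(ranks)
    assert dp_size * tp_size * pp_size == world_size, (
        f'device_mesh sizes {dp_size}*{tp_size}*{pp_size} do not match {world_size} ranks')
    return world_size

check_device_group(ranks=[0, 1, 2, 3], dp_size=2, tp_size=2)  # 2 * 2 * 1 == 4
```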

@@ -393,7 +389,6 @@ applications:
num_cpus: 0.1
runtime_env:
env_vars:
DEVICE_COUNT_PER_PHYSICAL_NODE: "8" # Total number of physical GPUs on each machine

# 3. Sampler service (optional, for inference sampling)
- name: sampler-Qwen2.5-0.5B-Instruct
@@ -425,7 +420,6 @@ applications:
num_gpus: 1 # Sampler needs independent GPU
runtime_env:
env_vars:
DEVICE_COUNT_PER_PHYSICAL_NODE: "8" # Total number of physical GPUs on each machine
```

## Configuration Item Description
@@ -471,6 +465,5 @@ device_mesh:
**Environment variables:**

```bash
export DEVICE_COUNT_PER_PHYSICAL_NODE=8 # Total number of GPUs on each physical machine (must be set)
export TWINKLE_TRUST_REMOTE_CODE=0 # Whether to trust remote code
```
@@ -25,7 +25,7 @@ for item in service_client.get_server_capabilities().supported_models:
When calling `init_tinker_compat_client`, the following operations are automatically executed:

1. **Patch Tinker SDK**: Bypass Tinker's `tinker://` prefix validation, allowing it to connect to standard HTTP addresses
2. **Set Request Headers**: Inject necessary authentication headers such as `X-Ray-Serve-Request-Id` and `Authorization`
2. **Set Request Headers**: Inject necessary authentication headers such as `serve_multiplexed_model_id` and `Authorization`
3. **Return `ServiceClient`**: Returns a standard Tinker `ServiceClient` object, subsequent operations are completely identical to native Tinker

This means that after initialization, **all existing Tinker training code can be used directly** without any modifications.
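As a concrete illustration, the minimal flow below mirrors the cookbook script touched in this PR (`cookbook/client/tinker/self_congnition.py`); the import path, server address, and model name are placeholders, so treat it as a sketch rather than a verified recipe.

```python
import os

# Assumed import path -- see the cookbook script for the exact module.
from twinkle.client import init_tinker_compat_client

# Connect to a locally running Twinkle server (address is an example).
service_client = init_tinker_compat_client(
    base_url='localhost:9000', api_key=os.environ.get('MODELSCOPE_TOKEN'))

# List the models the server exposes.
for item in service_client.get_server_capabilities().supported_models:
    print(item)

# From here on, the standard Tinker API applies unchanged, e.g. a LoRA training client:
training_client = service_client.create_lora_training_client(
    base_model='Qwen/Qwen2.5-7B-Instruct', rank=16)
```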
@@ -25,7 +25,7 @@ for item in service_client.get_server_capabilities().supported_models:
调用 `init_tinker_compat_client` 时,会自动执行以下操作:

1. **Patch Tinker SDK**:绕过 Tinker 的 `tinker://` 前缀校验,使其可以连接到标准 HTTP 地址
2. **设置请求头**:注入 `X-Ray-Serve-Request-Id` 和 `Authorization` 等必要的认证头
2. **设置请求头**:注入 `serve_multiplexed_model_id` 和 `Authorization` 等必要的认证头
3. **返回 `ServiceClient`**:返回一个标准的 Tinker `ServiceClient` 对象,后续操作与原生 Tinker 完全一致

这意味着在初始化之后,**所有已有的 Tinker 训练代码都可以直接使用**,无需任何修改。
@@ -55,12 +55,9 @@ ray start --address=10.28.252.9:6379 --num-gpus=0
在启动 Server 之前,需要设置以下环境变量:

```bash
export DEVICE_COUNT_PER_PHYSICAL_NODE=8 # 指定每台物理机上的 GPU 总数
export TWINKLE_TRUST_REMOTE_CODE=0 # 是否信任远程代码(安全考虑)
```

> **重要提示**:`DEVICE_COUNT_PER_PHYSICAL_NODE` 必须设置为机器上实际的物理 GPU 数量,这对于正确解析 `ranks` 配置至关重要。

### YAML 配置中的 Node Rank

在 YAML 配置文件中,**每个组件需要占用一个独立的 Node**。
@@ -117,7 +114,6 @@ applications:
**重要提示:**
- `ranks` 配置使用**物理 GPU 卡号**,直接对应机器上的实际 GPU 设备
- `device_mesh` 配置使用 `dp_size`、`tp_size`、`pp_size`、`ep_size` 等参数替代原来的 `mesh` 和 `mesh_dim_names`
- 必须设置环境变量 `DEVICE_COUNT_PER_PHYSICAL_NODE` 来告知系统每台机器的物理 GPU 总数
- 不同组件会自动分配到不同的 Node 上
- Ray 会根据资源需求(`ray_actor_options` 中的 `num_gpus`、`num_cpus`)自动调度到合适的 Node

@@ -336,7 +332,6 @@ applications:
num_cpus: 0.1
runtime_env:
env_vars:
DEVICE_COUNT_PER_PHYSICAL_NODE: "8" # 每台机器的物理 GPU 总数

# 3. Sampler 服务(可选,用于推理采样)
- name: sampler-Qwen2.5-0.5B-Instruct
@@ -368,7 +363,6 @@ applications:
num_gpus: 1 # Sampler 需要独立 GPU
runtime_env:
env_vars:
DEVICE_COUNT_PER_PHYSICAL_NODE: "8" # 每台机器的物理 GPU 总数
```

## 配置项说明
@@ -414,6 +408,5 @@ device_mesh:
**环境变量:**

```bash
export DEVICE_COUNT_PER_PHYSICAL_NODE=8 # 每台物理机上的 GPU 总数(必须设置)
export TWINKLE_TRUST_REMOTE_CODE=0 # 是否信任远程代码
```
15 changes: 1 addition & 14 deletions src/twinkle/infra/_ray/ray_helper.py
@@ -229,19 +229,6 @@ def has_ref(args, kwargs) -> bool:
return True
return False

@staticmethod
def _noset_env():
return {
'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': '1',
'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': '1',
'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': '1',
'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': '1',
'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': '1',
'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': '1',
'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': '1',
'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': '1',
}

@staticmethod
def create_workers(worker_cls: Type[T],
group: str,
@@ -320,7 +307,7 @@ def create_workers(worker_cls: Type[T],

# Prevent Ray from overriding CUDA_VISIBLE_DEVICES set in runtime_env
# This is critical for multi-GPU workers (gpus_per_worker > 1)
env_vars.update(RayHelper._noset_env())
env_vars.update(ResourceManager.noset_env())

runtime_env = RuntimeEnv(env_vars=env_vars)

47 changes: 39 additions & 8 deletions src/twinkle/infra/_ray/resource_manager.py
@@ -137,6 +137,28 @@ def __init__(self, nproc_per_node: int, ncpu_proc_per_node: int, groups: List[De
if self.node_ranks.count(0) > 1:
self.node_ranks = list(range(len(self.placement_groups)))

self.visible_devices = []

@ray.remote
def get_visible_devices():
return os.environ.get(Platform.get_platform(group.device_type).visible_device_env())

if self.placement_groups:
self.visible_devices = ray.get([
get_visible_devices.options(placement_group=pg, runtime_env={
'env_vars': self.noset_env()
}).remote() for pg in self.placement_groups
])

visible_devices = []
for visible_device in self.visible_devices:
if visible_device:
visible_device = [int(device) for device in visible_device.split(',')]
else:
visible_device = list(range(nproc_per_node))
visible_devices.append(visible_device)
self.visible_devices = visible_devices

self.node2pg: Dict[int, PlacementGroup] = {}
# Map actual node indices to placement groups
# For GPU/NPU groups, node indices start from self.min_node_idx
@@ -151,12 +173,8 @@ def __init__(self, nproc_per_node: int, ncpu_proc_per_node: int, groups: List[De

self.device_groups = {}
ray_address = str(ray.get_runtime_context().gcs_address)
if 'DEVICE_COUNT_PER_PHYSICAL_NODE' in os.environ:
# Sometimes, multiply nodes are in one physical node, there may be error in `gpu_rank`
device_per_node = int(os.environ['DEVICE_COUNT_PER_PHYSICAL_NODE'])
else:
device_per_node = nproc_per_node
for group in groups:
assert len(groups) == len(visible_devices)
for group, visible_device_list in zip(groups, self.visible_devices):
if group.device_type != 'CPU':
ranks = group.ranks
gpus_per_worker = getattr(group, 'gpus_per_worker', 1)
@@ -178,7 +196,7 @@ def __init__(self, nproc_per_node: int, ncpu_proc_per_node: int, groups: List[De

# All GPUs for a worker should be on the same node
node_ranks = [r // nproc_per_node for r in worker_ranks]
gpu_ranks_local = [r % device_per_node for r in worker_ranks]
gpu_ranks_local = [visible_device_list[r % nproc_per_node] for r in worker_ranks]

if len(set(node_ranks)) > 1:
raise ValueError(f"DeviceGroup '{group.name}': GPUs {worker_ranks} span multiple nodes. "
@@ -193,7 +211,7 @@ def __init__(self, nproc_per_node: int, ncpu_proc_per_node: int, groups: List[De
else:
for alloc_rank in normalized_ranks:
node_rank = alloc_rank // nproc_per_node
gpu_rank = alloc_rank % device_per_node
gpu_rank = visible_device_list[alloc_rank % nproc_per_node]
local_device_groups.append(
dict(gpu_rank=[gpu_rank], placement_group=self.node2pg[node_rank], ray_address=ray_address))

@@ -221,6 +239,19 @@ def __init__(self, nproc_per_node: int, ncpu_proc_per_node: int, groups: List[De
logger.info(f'node_ranks: {self.node_ranks}')
logger.info(f'node2pg keys: {list(self.node2pg.keys())}')

@staticmethod
def noset_env():
return {
'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': '1',
'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': '1',
'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': '1',
'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': '1',
'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': '1',
'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': '1',
'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': '1',
'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': '1',
}

def get_config(self, group: str):
for config in self.group_configs:
if config.name == group:
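The effect of the new visible-device handling is easiest to see with concrete numbers: each placement group reports the visibility variable its node actually sees, and allocation ranks are then translated into a node index plus a physical device ID. The helper below is a standalone illustrative sketch of that mapping, not part of the module:

```python
def map_rank(alloc_rank, nproc_per_node, visible_device_list):
    """Illustrative sketch of the mapping performed in ResourceManager.__init__."""
    node_rank = alloc_rank // nproc_per_node
    gpu_rank = visible_device_list[alloc_rank % nproc_per_node]
    return node_rank, gpu_rank

# A node whose workers see CUDA_VISIBLE_DEVICES="4,5,6,7" (so nproc_per_node == 4):
assert map_rank(0, 4, [4, 5, 6, 7]) == (0, 4)
assert map_rank(3, 4, [4, 5, 6, 7]) == (0, 7)
# If the variable is unset, the code falls back to list(range(nproc_per_node)):
assert map_rank(2, 4, [0, 1, 2, 3]) == (0, 2)
```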