From 8c157596b7cc7894f3763a7a484f067bbb62a43f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Mon, 29 Jun 2026 19:21:30 +0800 Subject: [PATCH 1/7] wip --- .dev_scripts/ci_container_test.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index 4e8c464d..7fb5b9f4 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -1,5 +1,5 @@ install_twinkle_with_kernels() { - pip install ".[kernels,test,tinker]" -i https://mirrors.aliyun.com/pypi/simple/ || pip install ".[kernels,test,tinker]" + pip install ".[test,client,server]" -i https://mirrors.aliyun.com/pypi/simple/ || pip install ".[test,client,server]" } if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then @@ -28,6 +28,9 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then pip uninstall tensorflow -y # Pin kernels<0.15 to avoid transformers' hub_kernels.py LayerRepository # crash (huggingface/transformers#46291). + # Also pin huggingface_hub<0.31 to avoid strict dataclass validator + # rejecting PEP 604 union types (str | None) used in kernels. + pip install 'huggingface_hub<0.31' pip install 'kernels<0.15' pip install ray==2.48 pip install optimum @@ -41,6 +44,7 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then else install_twinkle_with_kernels # Same kernels pin and peft bump for the release-image branch. + pip install 'huggingface_hub<0.31' pip install 'kernels<0.15' pip install --upgrade 'peft>=0.19.1' echo "Running case in release image, run case directly!" From 53f810bbd8468e6da6a94a3c0302a83c33222af2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Mon, 29 Jun 2026 19:25:01 +0800 Subject: [PATCH 2/7] fix --- .github/workflows/citest.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/citest.yaml b/.github/workflows/citest.yaml index bd560302..4e16ebdc 100644 --- a/.github/workflows/citest.yaml +++ b/.github/workflows/citest.yaml @@ -11,7 +11,6 @@ on: - "requirements/**" - "docs/**" - "tools/**" - - ".dev_scripts/**" - "README.md" - "README_*.md" - "NOTICE" @@ -25,7 +24,6 @@ on: - "requirements/**" - "docs/**" - "tools/**" - - ".dev_scripts/**" - "README.md" - "README_*.md" - "NOTICE" From 439a558ef6baadab075fcff8c229b66a8dd21770 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Mon, 29 Jun 2026 19:30:35 +0800 Subject: [PATCH 3/7] fix --- .dev_scripts/ci_container_test.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index 7fb5b9f4..0a785e4b 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -26,12 +26,6 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then pip uninstall autoawq -y pip uninstall lmdeploy -y pip uninstall tensorflow -y - # Pin kernels<0.15 to avoid transformers' hub_kernels.py LayerRepository - # crash (huggingface/transformers#46291). - # Also pin huggingface_hub<0.31 to avoid strict dataclass validator - # rejecting PEP 604 union types (str | None) used in kernels. - pip install 'huggingface_hub<0.31' - pip install 'kernels<0.15' pip install ray==2.48 pip install optimum @@ -41,12 +35,18 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then # `from transformers import HybridCache` at peft_model.py:37 which # crashes on transformers v5. 0.19.1 dropped that top-level import. pip install --upgrade 'peft>=0.19.1' + # Pin huggingface_hub AFTER main install to prevent transitive upgrade. + # kernels<0.15 uses str | None (PEP 604) which newer huggingface_hub's + # strict dataclass validator rejects (huggingface/transformers#46291). + pip install 'huggingface_hub<0.31' + pip install 'kernels<0.15' else install_twinkle_with_kernels # Same kernels pin and peft bump for the release-image branch. + pip install --upgrade 'peft>=0.19.1' + # Pin huggingface_hub AFTER main install (same reason as debug branch). pip install 'huggingface_hub<0.31' pip install 'kernels<0.15' - pip install --upgrade 'peft>=0.19.1' echo "Running case in release image, run case directly!" fi # remove torch_extensions folder to avoid ci hang. From 0e54dad014b2506ea3c93eed13506d197d100d9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Mon, 29 Jun 2026 19:37:21 +0800 Subject: [PATCH 4/7] fix --- .dev_scripts/ci_container_test.sh | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index 0a785e4b..beb5fe53 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -35,18 +35,16 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then # `from transformers import HybridCache` at peft_model.py:37 which # crashes on transformers v5. 0.19.1 dropped that top-level import. pip install --upgrade 'peft>=0.19.1' - # Pin huggingface_hub AFTER main install to prevent transitive upgrade. - # kernels<0.15 uses str | None (PEP 604) which newer huggingface_hub's - # strict dataclass validator rejects (huggingface/transformers#46291). - pip install 'huggingface_hub<0.31' - pip install 'kernels<0.15' + # Uninstall kernels: kernels>=0.15 crashes transformers' hub_kernels.py + # (huggingface/transformers#46291), and kernels<0.15 requires + # huggingface_hub>=1.10.0 which conflicts with transformers' <1.0 cap. + # transformers gracefully skips hub_kernels when kernels is absent. + pip uninstall kernels kernels-data -y 2>/dev/null || true else install_twinkle_with_kernels - # Same kernels pin and peft bump for the release-image branch. + # Same peft bump and kernels removal for the release-image branch. pip install --upgrade 'peft>=0.19.1' - # Pin huggingface_hub AFTER main install (same reason as debug branch). - pip install 'huggingface_hub<0.31' - pip install 'kernels<0.15' + pip uninstall kernels kernels-data -y 2>/dev/null || true echo "Running case in release image, run case directly!" fi # remove torch_extensions folder to avoid ci hang. From 3fda95cd11014e78eb63211dd2b5c0710d51578d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= Date: Mon, 29 Jun 2026 20:19:55 +0800 Subject: [PATCH 5/7] fix ci --- docs/source_en/Usage Guide/Quick-Start.md | 2 +- ...53\351\200\237\345\274\200\345\247\213.md" | 2 +- src/twinkle/dataset/base.py | 5 +- src/twinkle/kernel/__init__.py | 2 +- src/twinkle/kernel/base.py | 2 + src/twinkle/template/base.py | 3 ++ .../preprocessor/refuse_filter.py | 2 +- tests/dataloader/test_dataloader.py | 9 ++-- tests/dataloader/test_multimodal.py | 9 ++-- tests/dataset/test_lazy.py | 17 +++--- tests/dataset/test_loading.py | 2 + tests/dataset/test_multimodal.py | 53 ++++++++++--------- tests/dataset/test_packing.py | 11 ++-- tests/dataset/test_ray.py | 14 ++--- tests/kernel/test_kernel.py | 6 +++ tests/preprocessor/test_refuse_filter.py | 3 +- tests/preprocessor/test_token_soup.py | 27 +++++----- .../server/contract/client_api_baseline.json | 18 +++++++ tests/template/test_deepseek_v4_tool_call.py | 4 +- .../twinkle_agentic/test_extract_condensed.py | 2 +- tests/twinkle_agentic/test_model_condenser.py | 25 ++++----- 21 files changed, 134 insertions(+), 84 deletions(-) diff --git a/docs/source_en/Usage Guide/Quick-Start.md b/docs/source_en/Usage Guide/Quick-Start.md index 70747391..ff7b8727 100644 --- a/docs/source_en/Usage Guide/Quick-Start.md +++ b/docs/source_en/Usage Guide/Quick-Start.md @@ -473,7 +473,7 @@ python train.py A major feature of Twinkle is support for multi-tenant mixed training. Specifically, multiple users can use a single base model for LoRA training, which can greatly reduce server-side deployment costs. -Checkpoint resumption is also supported in client-server training. The recommended flow is to call `model.resume_from_checkpoint(resume_path)` to restore weights and optimizer state, then call `dataloader.resume_from_checkpoint(progress['consumed_train_samples'])` to skip consumed data. See [Twinkle-Client](./Server%20and%20Client/Twinkle-Client.md) and [self_cognition.py](../../../cookbook/server_mode/twinkle/self_host/self_cognition.py). +Checkpoint resumption is also supported in client-server training. The recommended flow is to call `model.resume_from_checkpoint(resume_path)` to restore weights and optimizer state, then call `dataloader.resume_from_checkpoint(progress['consumed_train_samples'])` to skip consumed data. See [Twinkle-Client](./Server%20and%20Client/Twinkle-Client.md) and [self_cognition.py](https://github.com/modelscope/twinkle/blob/main/cookbook/server_mode/twinkle/self_host/self_cognition.py). Suppose we start a service using eight GPUs. First, we need to start the Ray cluster: diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" index 3bc5c4ba..11cb5bc9 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" @@ -472,7 +472,7 @@ python train.py ``` ### 远程训练 -client-server 训练场景同样支持断点续训。推荐流程是调用 `model.resume_from_checkpoint(resume_path)` 恢复权重和优化器状态,再调用 `dataloader.resume_from_checkpoint(progress['consumed_train_samples'])` 跳过已消费数据。详细示例可参考 [Twinkle客户端](./服务端和客户端/Twinkle客户端.md) 和 [self_cognition.py](../../../cookbook/server_mode/twinkle/self_host/self_cognition.py)。 +client-server 训练场景同样支持断点续训。推荐流程是调用 `model.resume_from_checkpoint(resume_path)` 恢复权重和优化器状态,再调用 `dataloader.resume_from_checkpoint(progress['consumed_train_samples'])` 跳过已消费数据。详细示例可参考 [Twinkle客户端](./服务端和客户端/Twinkle客户端.md) 和 [self_cognition.py](https://github.com/modelscope/twinkle/blob/main/cookbook/server_mode/twinkle/self_host/self_cognition.py)。 Twinkle 的一大特色是支持多租户用户混合训练。具体来说,多个用户可以使用一个基模进行 LoRA 训练,这样可以极大减小服务端部署成本。 diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py index 5e88a6cf..c0fceb52 100644 --- a/src/twinkle/dataset/base.py +++ b/src/twinkle/dataset/base.py @@ -331,12 +331,13 @@ def mix_dataset(self, interleave=True): dataset_types) or not any(dataset_types), 'All datasets must be all streaming=True or streaming=False' if not any(dataset_types): dsets = list(self.datasets.values()) - # Align features + # Align features: only cast when columns match but types differ ref_features = dsets[0].features aligned = [] for ds in dsets: if ds.features != ref_features: - ds = ds.cast(ref_features) + if sorted(ds.features.keys()) == sorted(ref_features.keys()): + ds = ds.cast(ref_features) aligned.append(ds) else: aligned = list(self.datasets.values()) diff --git a/src/twinkle/kernel/__init__.py b/src/twinkle/kernel/__init__.py index c7262eb0..1fe787a6 100644 --- a/src/twinkle/kernel/__init__.py +++ b/src/twinkle/kernel/__init__.py @@ -104,7 +104,7 @@ def _is_npu_device(model=None) -> bool: param_device = next(model.parameters()).device if param_device.type == 'npu': return True - except StopIteration: + except (StopIteration, TypeError): pass # Priority 2: Fallback to global NPU availability diff --git a/src/twinkle/kernel/base.py b/src/twinkle/kernel/base.py index 6da669d5..b06c4a7e 100644 --- a/src/twinkle/kernel/base.py +++ b/src/twinkle/kernel/base.py @@ -54,6 +54,8 @@ def to_kernels_mode(mode: ModeType) -> Any: def validate_mode(mode: str) -> None: + if not is_kernels_available(): + return from kernels.layer.mode import Mode mode = to_kernels_mode(mode) diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py index a809c88b..3c6c29f6 100644 --- a/src/twinkle/template/base.py +++ b/src/twinkle/template/base.py @@ -237,6 +237,9 @@ def _to_standard_reasoning_content(self, trajectory: Trajectory) -> List[Traject def _extract_reasoning_content(messages: list[Message]) -> List[Message]: result = [] for message in messages: + if not isinstance(message, dict): + result.append(message) + continue message = message.copy() if message.get('role') == 'assistant': content = message.get('content', '') diff --git a/src/twinkle_agentic/preprocessor/refuse_filter.py b/src/twinkle_agentic/preprocessor/refuse_filter.py index f2e7de64..842aae12 100644 --- a/src/twinkle_agentic/preprocessor/refuse_filter.py +++ b/src/twinkle_agentic/preprocessor/refuse_filter.py @@ -116,7 +116,7 @@ def _text(content: Any) -> str: return content if isinstance(content, str) else '' -def _is_refusal(text: str, check_window: int) -> bool: +def _is_refusal(text: str, check_window: int = 600) -> bool: """Return True if the text contains a self-referential refusal signal.""" window = text[:check_window] return any(p.search(window) for p in _ALL_PATTERNS) diff --git a/tests/dataloader/test_dataloader.py b/tests/dataloader/test_dataloader.py index 2da0a4f8..ba5b5baf 100644 --- a/tests/dataloader/test_dataloader.py +++ b/tests/dataloader/test_dataloader.py @@ -30,9 +30,12 @@ def _disable_process_pool(monkeypatch): SKIP_MODEL_DOWNLOAD = os.getenv('SKIP_MODEL_DOWNLOAD', 'false').lower() == 'true' -def convert_to_messages(example): - text = example.get('text', '') - return {'messages': [Message(role='user', content=text), Message(role='assistant', content='Response')]} +def convert_to_messages(examples): + """Batched map function: receives dict of lists, returns dict of lists.""" + messages_batch = [] + for text in examples.get('text', []): + messages_batch.append([Message(role='user', content=text), Message(role='assistant', content='Response')]) + return {'messages': messages_batch} def _build_resume_rows(): diff --git a/tests/dataloader/test_multimodal.py b/tests/dataloader/test_multimodal.py index 0031b150..27d87b9f 100644 --- a/tests/dataloader/test_multimodal.py +++ b/tests/dataloader/test_multimodal.py @@ -14,9 +14,12 @@ SKIP_MODEL_DOWNLOAD = os.getenv('SKIP_MODEL_DOWNLOAD', 'false').lower() == 'true' -def create_multimodal_messages(example): - text = example.get('text', '') - return {'messages': [{'role': 'user', 'content': f'\n{text}'}, {'role': 'assistant', 'content': 'Response'}]} +def create_multimodal_messages(examples): + """Batched map function: receives dict of lists, returns dict of lists.""" + messages_batch = [] + for text in examples.get('text', []): + messages_batch.append([{'role': 'user', 'content': f'\n{text}'}, {'role': 'assistant', 'content': 'Response'}]) + return {'messages': messages_batch} class TestDataLoaderMultimodal: diff --git a/tests/dataset/test_lazy.py b/tests/dataset/test_lazy.py index 47e39843..7eaa3245 100644 --- a/tests/dataset/test_lazy.py +++ b/tests/dataset/test_lazy.py @@ -10,12 +10,14 @@ SKIP_MODEL_DOWNLOAD = os.getenv('SKIP_MODEL_DOWNLOAD', 'false').lower() == 'true' -def convert_to_messages(example): - text = example.get('text', '') - if not text: - text = str(example.get('question', example.get('title', ''))) - - return {'messages': [Message(role='user', content=text), Message(role='assistant', content='Response')]} +def convert_to_messages(examples): + """Batched map function: receives dict of lists, returns dict of lists.""" + texts = examples.get('text', None) or examples.get('question', None) or examples.get('title', []) + messages_batch = [] + for text in texts: + text = text or '' + messages_batch.append([Message(role='user', content=str(text)), Message(role='assistant', content='Response')]) + return {'messages': messages_batch} class TestLazyDataset: @@ -48,8 +50,7 @@ def test_lazy_dataset_encode_flag(self): dataset.encode() - # Lazy load: encode() only sets flag, actual encoding on access; raw dataset has no input_ids - assert 'messages' in dataset.dataset[0] + # Lazy load: both map and encode are deferred; raw dataset has neither messages nor input_ids assert 'input_ids' not in dataset.dataset[0] item = dataset[0] assert 'input_ids' in item diff --git a/tests/dataset/test_loading.py b/tests/dataset/test_loading.py index 34bdaf54..aaf77fb2 100644 --- a/tests/dataset/test_loading.py +++ b/tests/dataset/test_loading.py @@ -43,12 +43,14 @@ def test_load_local_json(self): def test_load_local_lance(self): """Test loading local Lance file""" + pytest.importorskip('lance') lance_path = str(TEST_DATA_DIR / '1.lance') dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=lance_path)) assert len(dataset) == 2 def test_load_local_lance_dir(self): """Test loading local Lance dir""" + pytest.importorskip('lance') lance_path = str(TEST_DATA_DIR / 'lance') dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=lance_path)) assert len(dataset) == 2 diff --git a/tests/dataset/test_multimodal.py b/tests/dataset/test_multimodal.py index 5fca8f4e..f10f401d 100644 --- a/tests/dataset/test_multimodal.py +++ b/tests/dataset/test_multimodal.py @@ -9,12 +9,14 @@ SKIP_MODEL_DOWNLOAD = os.getenv('SKIP_MODEL_DOWNLOAD', 'false').lower() == 'true' -def create_multimodal_messages(example): - text = example.get('text', '') - if not text: - text = str(example.get('question', example.get('title', ''))) - - return {'messages': [{'role': 'user', 'content': f'\n{text}'}, {'role': 'assistant', 'content': 'Response'}]} +def create_multimodal_messages(examples): + """Batched map function: receives dict of lists, returns dict of lists.""" + texts = examples.get('text', None) or examples.get('question', None) or examples.get('title', []) + messages_batch = [] + for text in texts: + text = text or '' + messages_batch.append([{'role': 'user', 'content': f'\n{text}'}, {'role': 'assistant', 'content': 'Response'}]) + return {'messages': messages_batch} class TestMultimodalDataset: @@ -87,17 +89,18 @@ def test_multimodal_dataset_multiple_image_placeholders(self): csv_path = str(TEST_DATA_DIR / 'test.csv') dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=csv_path)) - def create_multi_image_messages(example): - text = example.get('text', '') - return { - 'messages': [{ + def create_multi_image_messages(examples): + messages_batch = [] + for text in examples.get('text', []): + text = text or '' + messages_batch.append([{ 'role': 'user', 'content': f'\n{text}\n' }, { 'role': 'assistant', 'content': 'Response' - }] - } + }]) + return {'messages': messages_batch} dataset.map(create_multi_image_messages) @@ -110,17 +113,18 @@ def test_multimodal_dataset_video_placeholder(self): csv_path = str(TEST_DATA_DIR / 'test.csv') dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=csv_path)) - def create_video_messages(example): - text = example.get('text', '') - return { - 'messages': [{ + def create_video_messages(examples): + messages_batch = [] + for text in examples.get('text', []): + text = text or '' + messages_batch.append([{ 'role': 'user', 'content': f'