diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index 4e8c464d2..beb5fe530 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -1,5 +1,5 @@ install_twinkle_with_kernels() { - pip install ".[kernels,test,tinker]" -i https://mirrors.aliyun.com/pypi/simple/ || pip install ".[kernels,test,tinker]" + pip install ".[test,client,server]" -i https://mirrors.aliyun.com/pypi/simple/ || pip install ".[test,client,server]" } if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then @@ -26,9 +26,6 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then pip uninstall autoawq -y pip uninstall lmdeploy -y pip uninstall tensorflow -y - # Pin kernels<0.15 to avoid transformers' hub_kernels.py LayerRepository - # crash (huggingface/transformers#46291). - pip install 'kernels<0.15' pip install ray==2.48 pip install optimum @@ -38,11 +35,16 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then # `from transformers import HybridCache` at peft_model.py:37 which # crashes on transformers v5. 0.19.1 dropped that top-level import. pip install --upgrade 'peft>=0.19.1' + # Uninstall kernels: kernels>=0.15 crashes transformers' hub_kernels.py + # (huggingface/transformers#46291), and kernels<0.15 requires + # huggingface_hub>=1.10.0 which conflicts with transformers' <1.0 cap. + # transformers gracefully skips hub_kernels when kernels is absent. + pip uninstall kernels kernels-data -y 2>/dev/null || true else install_twinkle_with_kernels - # Same kernels pin and peft bump for the release-image branch. - pip install 'kernels<0.15' + # Same peft bump and kernels removal for the release-image branch. pip install --upgrade 'peft>=0.19.1' + pip uninstall kernels kernels-data -y 2>/dev/null || true echo "Running case in release image, run case directly!" fi # remove torch_extensions folder to avoid ci hang. diff --git a/.github/workflows/citest.yaml b/.github/workflows/citest.yaml index bd560302e..4e16ebdcd 100644 --- a/.github/workflows/citest.yaml +++ b/.github/workflows/citest.yaml @@ -11,7 +11,6 @@ on: - "requirements/**" - "docs/**" - "tools/**" - - ".dev_scripts/**" - "README.md" - "README_*.md" - "NOTICE" @@ -25,7 +24,6 @@ on: - "requirements/**" - "docs/**" - "tools/**" - - ".dev_scripts/**" - "README.md" - "README_*.md" - "NOTICE" diff --git a/.github/workflows/citest_npu.yaml b/.github/workflows/citest_npu.yaml index d48c7421d..a3878dae9 100644 --- a/.github/workflows/citest_npu.yaml +++ b/.github/workflows/citest_npu.yaml @@ -42,7 +42,7 @@ jobs: runs-on: [linux-aarch64-a2-1] timeout-minutes: 240 container: - image: 'ascendai/cann:8.3.rc2-910b-ubuntu22.04-py3.11' + image: 'ascendai/cann:9.0.0-910-ubuntu22.04-py3.11' steps: - name: Config mirrors run: | @@ -67,7 +67,7 @@ jobs: run: | set -e export IMAGE_NAME=ascendai/cann - export IMAGE_VERSION=8.3.rc2-910b-ubuntu22.04-py3.11 + export IMAGE_VERSION=9.0.0-910-ubuntu22.04-py3.11 export TEST_LEVEL=0 mkdir -p ~/.cache export MODELSCOPE_CACHE=~/.cache diff --git a/docs/source_en/Usage Guide/Quick-Start.md b/docs/source_en/Usage Guide/Quick-Start.md index 707473910..44c4dedf2 100644 --- a/docs/source_en/Usage Guide/Quick-Start.md +++ b/docs/source_en/Usage Guide/Quick-Start.md @@ -473,7 +473,7 @@ python train.py A major feature of Twinkle is support for multi-tenant mixed training. Specifically, multiple users can use a single base model for LoRA training, which can greatly reduce server-side deployment costs. -Checkpoint resumption is also supported in client-server training. The recommended flow is to call `model.resume_from_checkpoint(resume_path)` to restore weights and optimizer state, then call `dataloader.resume_from_checkpoint(progress['consumed_train_samples'])` to skip consumed data. See [Twinkle-Client](./Server%20and%20Client/Twinkle-Client.md) and [self_cognition.py](../../../cookbook/server_mode/twinkle/self_host/self_cognition.py). +Checkpoint resumption is also supported in client-server training. The recommended flow is to call `model.resume_from_checkpoint(resume_path)` to restore weights and optimizer state, then call `dataloader.resume_from_checkpoint(progress['consumed_train_samples'])` to skip consumed data. See [Twinkle-Client](./Server%20and%20Client/Twinkle-Client.md) and [self_cognition.py](https://github.com/modelscope/twinkle/blob/main/cookbook/client/twinkle/self_host/self_cognition.py). Suppose we start a service using eight GPUs. First, we need to start the Ray cluster: diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" index 3bc5c4ba4..b4bbcd496 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" @@ -472,7 +472,7 @@ python train.py ``` ### 远程训练 -client-server 训练场景同样支持断点续训。推荐流程是调用 `model.resume_from_checkpoint(resume_path)` 恢复权重和优化器状态,再调用 `dataloader.resume_from_checkpoint(progress['consumed_train_samples'])` 跳过已消费数据。详细示例可参考 [Twinkle客户端](./服务端和客户端/Twinkle客户端.md) 和 [self_cognition.py](../../../cookbook/server_mode/twinkle/self_host/self_cognition.py)。 +client-server 训练场景同样支持断点续训。推荐流程是调用 `model.resume_from_checkpoint(resume_path)` 恢复权重和优化器状态,再调用 `dataloader.resume_from_checkpoint(progress['consumed_train_samples'])` 跳过已消费数据。详细示例可参考 [Twinkle客户端](./服务端和客户端/Twinkle客户端.md) 和 [self_cognition.py](https://github.com/modelscope/twinkle/blob/main/cookbook/client/twinkle/self_host/self_cognition.py)。 Twinkle 的一大特色是支持多租户用户混合训练。具体来说,多个用户可以使用一个基模进行 LoRA 训练,这样可以极大减小服务端部署成本。 diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py index 5e88a6cf3..c0fceb52d 100644 --- a/src/twinkle/dataset/base.py +++ b/src/twinkle/dataset/base.py @@ -331,12 +331,13 @@ def mix_dataset(self, interleave=True): dataset_types) or not any(dataset_types), 'All datasets must be all streaming=True or streaming=False' if not any(dataset_types): dsets = list(self.datasets.values()) - # Align features + # Align features: only cast when columns match but types differ ref_features = dsets[0].features aligned = [] for ds in dsets: if ds.features != ref_features: - ds = ds.cast(ref_features) + if sorted(ds.features.keys()) == sorted(ref_features.keys()): + ds = ds.cast(ref_features) aligned.append(ds) else: aligned = list(self.datasets.values()) diff --git a/src/twinkle/kernel/__init__.py b/src/twinkle/kernel/__init__.py index c7262eb07..1fe787a6f 100644 --- a/src/twinkle/kernel/__init__.py +++ b/src/twinkle/kernel/__init__.py @@ -104,7 +104,7 @@ def _is_npu_device(model=None) -> bool: param_device = next(model.parameters()).device if param_device.type == 'npu': return True - except StopIteration: + except (StopIteration, TypeError): pass # Priority 2: Fallback to global NPU availability diff --git a/src/twinkle/kernel/base.py b/src/twinkle/kernel/base.py index 6da669d5c..b06c4a7eb 100644 --- a/src/twinkle/kernel/base.py +++ b/src/twinkle/kernel/base.py @@ -54,6 +54,8 @@ def to_kernels_mode(mode: ModeType) -> Any: def validate_mode(mode: str) -> None: + if not is_kernels_available(): + return from kernels.layer.mode import Mode mode = to_kernels_mode(mode) diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py index a809c88b0..3c6c29f6c 100644 --- a/src/twinkle/template/base.py +++ b/src/twinkle/template/base.py @@ -237,6 +237,9 @@ def _to_standard_reasoning_content(self, trajectory: Trajectory) -> List[Traject def _extract_reasoning_content(messages: list[Message]) -> List[Message]: result = [] for message in messages: + if not isinstance(message, dict): + result.append(message) + continue message = message.copy() if message.get('role') == 'assistant': content = message.get('content', '') diff --git a/src/twinkle_agentic/preprocessor/refuse_filter.py b/src/twinkle_agentic/preprocessor/refuse_filter.py index f2e7de647..842aae121 100644 --- a/src/twinkle_agentic/preprocessor/refuse_filter.py +++ b/src/twinkle_agentic/preprocessor/refuse_filter.py @@ -116,7 +116,7 @@ def _text(content: Any) -> str: return content if isinstance(content, str) else '' -def _is_refusal(text: str, check_window: int) -> bool: +def _is_refusal(text: str, check_window: int = 600) -> bool: """Return True if the text contains a self-referential refusal signal.""" window = text[:check_window] return any(p.search(window) for p in _ALL_PATTERNS) diff --git a/tests/dataloader/test_dataloader.py b/tests/dataloader/test_dataloader.py index 2da0a4f88..ba5b5baf4 100644 --- a/tests/dataloader/test_dataloader.py +++ b/tests/dataloader/test_dataloader.py @@ -30,9 +30,12 @@ def _disable_process_pool(monkeypatch): SKIP_MODEL_DOWNLOAD = os.getenv('SKIP_MODEL_DOWNLOAD', 'false').lower() == 'true' -def convert_to_messages(example): - text = example.get('text', '') - return {'messages': [Message(role='user', content=text), Message(role='assistant', content='Response')]} +def convert_to_messages(examples): + """Batched map function: receives dict of lists, returns dict of lists.""" + messages_batch = [] + for text in examples.get('text', []): + messages_batch.append([Message(role='user', content=text), Message(role='assistant', content='Response')]) + return {'messages': messages_batch} def _build_resume_rows(): diff --git a/tests/dataloader/test_multimodal.py b/tests/dataloader/test_multimodal.py index 0031b1506..27d87b9f3 100644 --- a/tests/dataloader/test_multimodal.py +++ b/tests/dataloader/test_multimodal.py @@ -14,9 +14,12 @@ SKIP_MODEL_DOWNLOAD = os.getenv('SKIP_MODEL_DOWNLOAD', 'false').lower() == 'true' -def create_multimodal_messages(example): - text = example.get('text', '') - return {'messages': [{'role': 'user', 'content': f'\n{text}'}, {'role': 'assistant', 'content': 'Response'}]} +def create_multimodal_messages(examples): + """Batched map function: receives dict of lists, returns dict of lists.""" + messages_batch = [] + for text in examples.get('text', []): + messages_batch.append([{'role': 'user', 'content': f'\n{text}'}, {'role': 'assistant', 'content': 'Response'}]) + return {'messages': messages_batch} class TestDataLoaderMultimodal: diff --git a/tests/dataset/test_lazy.py b/tests/dataset/test_lazy.py index 47e398431..7eaa32451 100644 --- a/tests/dataset/test_lazy.py +++ b/tests/dataset/test_lazy.py @@ -10,12 +10,14 @@ SKIP_MODEL_DOWNLOAD = os.getenv('SKIP_MODEL_DOWNLOAD', 'false').lower() == 'true' -def convert_to_messages(example): - text = example.get('text', '') - if not text: - text = str(example.get('question', example.get('title', ''))) - - return {'messages': [Message(role='user', content=text), Message(role='assistant', content='Response')]} +def convert_to_messages(examples): + """Batched map function: receives dict of lists, returns dict of lists.""" + texts = examples.get('text', None) or examples.get('question', None) or examples.get('title', []) + messages_batch = [] + for text in texts: + text = text or '' + messages_batch.append([Message(role='user', content=str(text)), Message(role='assistant', content='Response')]) + return {'messages': messages_batch} class TestLazyDataset: @@ -48,8 +50,7 @@ def test_lazy_dataset_encode_flag(self): dataset.encode() - # Lazy load: encode() only sets flag, actual encoding on access; raw dataset has no input_ids - assert 'messages' in dataset.dataset[0] + # Lazy load: both map and encode are deferred; raw dataset has neither messages nor input_ids assert 'input_ids' not in dataset.dataset[0] item = dataset[0] assert 'input_ids' in item diff --git a/tests/dataset/test_loading.py b/tests/dataset/test_loading.py index 34bdaf547..aaf77fb20 100644 --- a/tests/dataset/test_loading.py +++ b/tests/dataset/test_loading.py @@ -43,12 +43,14 @@ def test_load_local_json(self): def test_load_local_lance(self): """Test loading local Lance file""" + pytest.importorskip('lance') lance_path = str(TEST_DATA_DIR / '1.lance') dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=lance_path)) assert len(dataset) == 2 def test_load_local_lance_dir(self): """Test loading local Lance dir""" + pytest.importorskip('lance') lance_path = str(TEST_DATA_DIR / 'lance') dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=lance_path)) assert len(dataset) == 2 diff --git a/tests/dataset/test_multimodal.py b/tests/dataset/test_multimodal.py index 5fca8f4e4..f10f401d1 100644 --- a/tests/dataset/test_multimodal.py +++ b/tests/dataset/test_multimodal.py @@ -9,12 +9,14 @@ SKIP_MODEL_DOWNLOAD = os.getenv('SKIP_MODEL_DOWNLOAD', 'false').lower() == 'true' -def create_multimodal_messages(example): - text = example.get('text', '') - if not text: - text = str(example.get('question', example.get('title', ''))) - - return {'messages': [{'role': 'user', 'content': f'\n{text}'}, {'role': 'assistant', 'content': 'Response'}]} +def create_multimodal_messages(examples): + """Batched map function: receives dict of lists, returns dict of lists.""" + texts = examples.get('text', None) or examples.get('question', None) or examples.get('title', []) + messages_batch = [] + for text in texts: + text = text or '' + messages_batch.append([{'role': 'user', 'content': f'\n{text}'}, {'role': 'assistant', 'content': 'Response'}]) + return {'messages': messages_batch} class TestMultimodalDataset: @@ -87,17 +89,18 @@ def test_multimodal_dataset_multiple_image_placeholders(self): csv_path = str(TEST_DATA_DIR / 'test.csv') dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=csv_path)) - def create_multi_image_messages(example): - text = example.get('text', '') - return { - 'messages': [{ + def create_multi_image_messages(examples): + messages_batch = [] + for text in examples.get('text', []): + text = text or '' + messages_batch.append([{ 'role': 'user', 'content': f'\n{text}\n' }, { 'role': 'assistant', 'content': 'Response' - }] - } + }]) + return {'messages': messages_batch} dataset.map(create_multi_image_messages) @@ -110,17 +113,18 @@ def test_multimodal_dataset_video_placeholder(self): csv_path = str(TEST_DATA_DIR / 'test.csv') dataset = Dataset(dataset_meta=DatasetMeta(dataset_id=csv_path)) - def create_video_messages(example): - text = example.get('text', '') - return { - 'messages': [{ + def create_video_messages(examples): + messages_batch = [] + for text in examples.get('text', []): + text = text or '' + messages_batch.append([{ 'role': 'user', 'content': f'