diff --git a/.gitignore b/.gitignore index 9c6801c43..62d5e25f2 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ CMake*.json python/build/ python/ctranslate2.egg-info/ python/dist/ +python/ctranslate2/ .cache docs/build/ docs/python/ diff --git a/CMakeLists.txt b/CMakeLists.txt index cf80e37b5..785cb362c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,6 +214,10 @@ set(SOURCES src/ops/awq/gemv.cc src/ops/awq/gemv_cpu.cc src/ops/sum.cc + src/cpu/clifford_ops.cc + src/cpu/clifford_kernels.cc + src/ops/rotor_quant_kv.cc + src/ops/rotor_quant_kv_cpu.cc src/padder.cc src/profiler.cc src/random.cc @@ -255,6 +259,9 @@ set(CUDA_SOURCES src/ops/awq/gemm_gpu.cu src/ops/awq/gemv_gpu.cu src/ops/awq/dequantize_gpu.cu + src/ops/rotor_quant_kv_gpu.cu + src/cuda/rotor_quant_kernel.cu + src/cuda/rotor_attention_kernel.cu ) set(LIBRARIES ${CMAKE_THREAD_LIBS_INIT} diff --git a/docs/win_build.md b/docs/win_build.md new file mode 100644 index 000000000..6fc96ac7b --- /dev/null +++ b/docs/win_build.md @@ -0,0 +1,168 @@ +# Windows でのビルド手順 + +このドキュメントでは、Windows 環境において CTranslate2 の C++ ライブラリをビルドし、その後 Python の wheel パッケージを作成するまでの手順を説明します。 + +--- + +## 前提条件 + +以下のツール・ライブラリを事前にインストールしてください。 + +| ツール | 最低バージョン | 備考 | +|--------|--------------|------| +| Visual Studio | 2019 以降 | 「C++ によるデスクトップ開発」ワークロードが必要 | +| CMake | 3.15 以降 | [cmake.org](https://cmake.org/download/) からインストール | +| Python | 3.9 以降 | | +| Intel oneAPI MKL | 2019.5 以降 | CPU バックエンドとして使用(デフォルト)。[Intel oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) に含まれる | +| CUDA Toolkit | 11.0 以降 | GPU サポートが必要な場合のみ | +| cuDNN | 8 以降 | 畳み込みモデル(音声認識等)を使用する場合のみ | + +> **Note:** 以降のコマンドはすべて **x64 Native Tools Command Prompt for VS 2019**(または VS 2022)で実行してください。 +> [スタートメニュー] → [Visual Studio 20xx] → [x64 Native Tools Command Prompt for VS 20xx] + +--- + +## 1. 
ソースコードの取得 + +```cmd +git clone --recursive https://github.com/OpenNMT/CTranslate2.git +cd CTranslate2 +``` + +サブモジュールを含めてクローンするため `--recursive` が必要です。 + +--- + +## 2. C++ ライブラリのビルド + +### 2-1. ビルドディレクトリを作成して CMake を実行 + +**CPU のみ(Intel MKL バックエンド):** + +```cmd +mkdir build +cd build +cmake .. -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="%CD%\..\install" "-DCMAKE_POLICY_VERSION_MINIMUM=3.5" +``` + +Visual Studio 2022 を使用する場合は `-G "Visual Studio 17 2022"` に変更してください。 + +**CPU + GPU(CUDA バックエンド):** + +```cmd +mkdir build +cd build +cmake .. -G "Visual Studio 16 2019" -A x64 ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DCMAKE_INSTALL_PREFIX="%CD%\..\install" ^ + -DWITH_CUDA=ON ^ + -DWITH_CUDNN=ON ^ + "-DCMAKE_POLICY_VERSION_MINIMUM=3.5" +``` + +### 2-2. ビルドとインストール + +```cmd +cmake --build . --config Release --parallel +cmake --install . --config Release +``` + +成功すると `install\` ディレクトリ以下に以下が生成されます。 + +``` +install\ + bin\ + ctranslate2_translator.exe ← CLI ツール + include\ + ctranslate2\ ← C++ ヘッダー + lib\ + ctranslate2.lib ← インポートライブラリ + bin\ + ctranslate2.dll ← 共有ライブラリ(wheel のビルドで必要) +``` + +> **Tip:** `install\bin` を `PATH` に追加しておくと、Python ラッパーが実行時に DLL を見つけやすくなります。 + +--- + +## 3. Python wheel のビルド + +### 3-1. 依存 DLL をパッケージディレクトリにコピー + +`ctranslate2.dll` は `libiomp5md.dll`(Intel OpenMP ランタイム)に依存しています。 +wheel に同梱するため、ビルド前にパッケージディレクトリへコピーします。 + +```cmd +copy "%CTRANSLATE2_ROOT%\bin\ctranslate2.dll" python\ctranslate2\ +copy "%ONEAPI_ROOT%compiler\latest\bin\libiomp5md.dll" python\ctranslate2\ +``` + +`ONEAPI_ROOT` が未設定の場合は `%ProgramFiles(x86)%\Intel\oneAPI\` を使用してください。 + +### 3-2. ビルド依存パッケージのインストール + +`uv` を使ってビルドに必要なパッケージをインストールします。 + +```cmd +cd python +uv pip install --system setuptools wheel pybind11==2.11.1 +``` + +### 3-3. 
wheel のビルド + +`build_with_msvc.py` スクリプトが `vcvarsall.bat` の環境を自動で取り込み、 +`uv build --wheel --no-build-isolation` を実行します。 + +```cmd +set CTRANSLATE2_ROOT= +python build_with_msvc.py +``` + +> **Note:** `python setup.py bdist_wheel`(旧来の方法)の代わりに `uv build` を使っています。 +> `--no-build-isolation` は MSVC コンパイラ環境を現在のシェルから継承するために必要です。 + +ビルドが完了すると `dist\` ディレクトリに `.whl` ファイルが生成されます。 + +``` +python\ + dist\ + ctranslate2-X.Y.Z-cpXX-cpXX-win_amd64.whl +``` + +### 3-4. wheel のインストール + +```cmd +uv pip install --system dist\ctranslate2-*.whl +``` + +--- + +## 4. 動作確認 + +```cmd +python -c "import ctranslate2; print(ctranslate2.__version__)" +``` + +バージョン番号が表示されれば正常にインストールされています。 + +--- + +## 5. よくあるエラーと対処 + +| エラー | 原因 | 対処 | +|--------|------|------| +| `MKL not found` | MKLROOT 環境変数が未設定 | Intel oneAPI の `setvars.bat` を実行してから cmake を再実行 | +| `CUDA not found` | CUDA_PATH 環境変数が未設定 | CUDA Toolkit の再インストール、または `-DCUDA_TOOLKIT_ROOT_DIR=` を追加 | +| `DLL not found` 実行時エラー | `install\bin` が PATH に未追加 | `set PATH=%CTRANSLATE2_ROOT%\bin;%PATH%` を実行 | +| `cl.exe not found` | x64 Native Tools Prompt を使用していない | x64 Native Tools Command Prompt から再実行 | +| `Invalid CMAKE_POLICY_VERSION_MINIMUM value "3"` | `3.5` が `3` と `.5` に分割される | `-D` オプション全体をダブルクォートで囲む: `"-DCMAKE_POLICY_VERSION_MINIMUM=3.5"` | +| `Unable to find a compatible Visual Studio installation` | 日本語 Windows で `vcvarsall.bat` の出力が UTF-16LE として正しくデコードできない | `build_with_msvc.py` を使用してビルドする(ANSI モードで環境を取得) | +| `Could not find module 'ctranslate2.dll' (or one of its dependencies)` | `libiomp5md.dll` が wheel に含まれていない | `libiomp5md.dll` を `python\ctranslate2\` にコピーしてから再ビルド | + +--- + +## 参考 + +- [Build options 一覧](installation.md#build-options) +- [Intel oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) +- [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) diff --git a/include/ctranslate2/devices.h b/include/ctranslate2/devices.h index 232307a90..f9b126633 100644 --- 
a/include/ctranslate2/devices.h +++ b/include/ctranslate2/devices.h @@ -26,6 +26,7 @@ namespace ctranslate2 { void synchronize_device(Device device, int index); void synchronize_stream(Device device); + void destroy_context(Device device); class ScopedDeviceSetter { public: diff --git a/include/ctranslate2/layers/attention.h b/include/ctranslate2/layers/attention.h index f77e0d4c5..3b8a95cc9 100644 --- a/include/ctranslate2/layers/attention.h +++ b/include/ctranslate2/layers/attention.h @@ -1,6 +1,9 @@ #pragma once +#include + #include "ctranslate2/layers/attention_layer.h" +#include "ctranslate2/ops/rotor_quant_kv.h" #include "ctranslate2/padder.h" #include "ctranslate2/layers/transformer.h" @@ -88,6 +91,10 @@ namespace ctranslate2 { const dim_t _cache_time_dim; std::unique_ptr _q_norm; // Query normalization std::unique_ptr _k_norm; // Key normalization + + // RotorQuant KV-cache compression (nullptr when disabled). + // Enabled by setting the environment variable CT2_ROTOR_QUANT_BITS=3 or =4. + std::unique_ptr _rotor_quant; }; } } diff --git a/include/ctranslate2/ops/ops.h b/include/ctranslate2/ops/ops.h index f48e56014..62adaea64 100644 --- a/include/ctranslate2/ops/ops.h +++ b/include/ctranslate2/ops/ops.h @@ -46,3 +46,4 @@ #include "awq/dequantize_awq.h" #endif #include "sum.h" +#include "rotor_quant_kv.h" diff --git a/include/ctranslate2/ops/rotor_quant_kv.h b/include/ctranslate2/ops/rotor_quant_kv.h new file mode 100644 index 000000000..4b735e780 --- /dev/null +++ b/include/ctranslate2/ops/rotor_quant_kv.h @@ -0,0 +1,102 @@ +#pragma once + +// RotorQuant KV-cache compression operator. +// +// Compresses float16/float32 KV-cache tensors using Clifford Cl(3,0) rotor +// rotation followed by per-token min-max quantization to 3 or 4 bits. 
+// +// Phase 1 (this implementation): +// - Identity rotors (no actual rotation applied, rotor sandwich = identity) +// - Per-token symmetric min-max quantization +// - No QJL residual correction +// Phase 2 (future): +// - Learned / random unit rotors per group +// - Lloyd-Max codebook quantization +// - QJL residual correction +// +// Packed buffer layout (per token vector of d_head elements): +// [0 .. codes_bytes-1] : quantized codes, LSB-packed at `bits` bits/dim +// [codes_bytes .. +3] : float32 min_val (raw bytes) +// [codes_bytes+4 .. +7] : float32 scale = max_val - min_val (raw bytes) +// +// codes_bytes = (d_head * bits + 7) / 8 +// total packed_stride = codes_bytes + 8 +// +// Cached buffers use DataType::INT8 as raw byte storage. +// The caller detects compression by checking cache.dtype() == DataType::INT8. + +#include +#include + +#include "ctranslate2/storage_view.h" + +namespace ctranslate2 { + namespace ops { + + class RotorQuantKV { + public: + struct Config { + int bits = 4; // quantisation bits per dimension (3 or 4) + }; + + explicit RotorQuantKV(dim_t d_head, const Config& cfg = {}); + + // Returns the packed stride (bytes per token) for the given d_head / config. + static dim_t compute_packed_stride(dim_t d_head, int bits); + + dim_t packed_stride() const { return _packed_stride; } + int bits() const { return _cfg.bits; } + dim_t d_head() const { return _d_head; } + + // Detect whether a cache StorageView is in compressed format. + static bool is_packed(const StorageView& v) { + return !v.empty() && v.dtype() == DataType::INT8; + } + + // encode: kv [*, d_head] float → packed [*, packed_stride] INT8 + // Works for any leading dimensions (batch * heads * time flattened). 
+ void encode(const StorageView& kv, StorageView& packed) const; + + // decode: packed [*, packed_stride] INT8 → kv [*, d_head] same dtype/device as `kv_out` + void decode(const StorageView& packed, + StorageView& kv_out, + DataType out_dtype, + Device out_device) const; + + // append: encode `new_kv` and concat to existing packed cache along the time dimension. + // packed_cache: [batch, heads, time_old, packed_stride] INT8 + // new_kv: [batch, heads, 1, d_head] float + // Result: [batch, heads, time_old+1, packed_stride] INT8 + void append(const StorageView& new_kv, + StorageView& packed_cache) const; + + private: + dim_t _d_head; + Config _cfg; + dim_t _packed_stride; + + // Per-group rotors: shape [n_groups, 4] (s, b12, b13, b23) + // Initialised to identity; future phases will populate from learned weights. + std::vector> _rotors; // size = n_groups + + template + void encode_cpu(const T* kv_ptr, + int8_t* packed_ptr, + dim_t n_tokens) const; + + template + void decode_cpu(const int8_t* packed_ptr, + T* kv_ptr, + dim_t n_tokens) const; + +#ifdef CT2_WITH_CUDA + template + void encode_cuda(const T* kv_ptr, int8_t* packed_ptr, dim_t n_tokens) const; + + template + void decode_cuda(const int8_t* packed_ptr, T* kv_ptr, dim_t n_tokens) const; +#endif + }; + + } // ops +} // ctranslate2 diff --git a/include/ctranslate2/replica_pool.h b/include/ctranslate2/replica_pool.h index 8c8e15d8e..2c3cdc53f 100644 --- a/include/ctranslate2/replica_pool.h +++ b/include/ctranslate2/replica_pool.h @@ -346,14 +346,31 @@ namespace ctranslate2 { _allocator = &get_allocator(_device); } + // #2027: Set the shutting-down flag before the queue closes so that idle() + // skips synchronize_stream() and avoids a deadlock on Windows with CUDA. 
+ void prepare_shutdown() override { + _shutting_down.store(true, std::memory_order_release); + } + void idle() override { // When no new jobs are immediately available, we synchronize the CUDA stream // so that the CudaAsyncAllocator can release some memory. - synchronize_stream(_device); + // #2027: Skip during shutdown to prevent blocking while the mutex is held, + // which would deadlock against ThreadPool::~ThreadPool() calling queue.close(). + if (!_shutting_down.load(std::memory_order_acquire)) + synchronize_stream(_device); } void finalize() override { + // #2027: Ensure the shutting-down flag is set before releasing the replica. + _shutting_down.store(true, std::memory_order_release); + _replica.reset(); + + // #1912: Explicitly free thread-local CUDA resources (cuRAND states) before + // the thread is destroyed. Releasing them after thread exit crashes on Windows + // because the CUDA context is already invalid (stack buffer overrun 0xC0000409). + destroy_context(_device); } private: @@ -362,6 +379,7 @@ namespace ctranslate2 { const size_t _num_threads; Allocator* _allocator; std::unique_ptr _replica; + std::atomic _shutting_down{false}; }; } diff --git a/include/ctranslate2/thread_pool.h b/include/ctranslate2/thread_pool.h index 826b7e57f..3af918214 100644 --- a/include/ctranslate2/thread_pool.h +++ b/include/ctranslate2/thread_pool.h @@ -61,6 +61,9 @@ namespace ctranslate2 { void start(JobQueue& job_queue, int thread_affinity = -1); void join(); + // Called before the job queue is closed, to allow workers to prepare for shutdown. + virtual void prepare_shutdown() {} + protected: // Called before the work loop. virtual void initialize() {} diff --git a/python/build_wheel.bat b/python/build_wheel.bat new file mode 100644 index 000000000..f3bfafb34 --- /dev/null +++ b/python/build_wheel.bat @@ -0,0 +1,29 @@ +@echo off +setlocal + +set SCRIPT_DIR=%~dp0 +set LOGFILE=%SCRIPT_DIR%build_log.txt + +echo [1] Initializing MSVC... 
> %LOGFILE% + +for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath`) do ( + set VS_INSTALL_DIR=%%i +) + +if not defined VS_INSTALL_DIR ( + echo ERROR: Visual Studio not found >> %LOGFILE% + exit /b 1 +) + +call "%VS_INSTALL_DIR%\VC\Auxiliary\Build\vcvarsall.bat" x64 >> %LOGFILE% 2>&1 +echo [2] MSVC done >> %LOGFILE% + +set CTRANSLATE2_ROOT=%SCRIPT_DIR%..\build\install +echo [3] CTRANSLATE2_ROOT=%CTRANSLATE2_ROOT% >> %LOGFILE% + +cd /d %SCRIPT_DIR% +echo [4] Running uv build... >> %LOGFILE% +uv build --wheel --no-build-isolation >> %LOGFILE% 2>&1 +echo [5] Exit code: %ERRORLEVEL% >> %LOGFILE% + +endlocal diff --git a/python/build_with_msvc.py b/python/build_with_msvc.py new file mode 100644 index 000000000..6ca193f27 --- /dev/null +++ b/python/build_with_msvc.py @@ -0,0 +1,75 @@ +""" +vcvarsall.bat の環境変数を Python プロセスに取り込み、uv build を実行するスクリプト。 + +日本語 Windows (cp932) では setuptools の cmd /u /c 方式が文字化けするため、 +cmd /a /c (ANSI) で環境を取得して上書きする。 +""" +import os +import subprocess +import sys + + +def get_msvc_env(arch: str = "x64") -> dict: + vswhere = os.path.join( + os.environ.get("ProgramFiles(x86)", r"C:\Program Files (x86)"), + "Microsoft Visual Studio", "Installer", "vswhere.exe", + ) + install_path = subprocess.check_output( + [vswhere, "-latest", "-requires", + "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", + "-property", "installationPath", "-products", "*"], + stderr=subprocess.DEVNULL, + ).decode("mbcs").strip() + + vcvarsall = os.path.join(install_path, "VC", "Auxiliary", "Build", "vcvarsall.bat") + + # cmd /a = ANSI モード(cp932 として正しくデコードできる) + out = subprocess.check_output( + f'cmd /a /c ""{vcvarsall}" {arch} && set"', + stderr=subprocess.STDOUT, + ).decode("mbcs", errors="replace") + + env = {} + for line in out.splitlines(): + key, _, value = line.partition("=") + if key and value: + env[key] = value 
+ return env + + +def main(): + python_version = sys.argv[1] if len(sys.argv) > 1 else None + + print("[build_with_msvc] Loading MSVC environment via vcvarsall.bat (ANSI mode)...") + msvc_env = get_msvc_env("x64") + if "PATH" not in msvc_env: + print("[build_with_msvc] ERROR: Failed to load MSVC environment.") + sys.exit(1) + + # 現在の環境に MSVC 環境をマージ(MSVC のものを優先) + merged = {**os.environ, **msvc_env} + # CTRANSLATE2_ROOT は明示的に保持 + if "CTRANSLATE2_ROOT" in os.environ: + merged["CTRANSLATE2_ROOT"] = os.environ["CTRANSLATE2_ROOT"] + # setuptools に vcvarsall.bat の再実行をスキップさせ、この環境をそのまま使わせる + merged["DISTUTILS_USE_SDK"] = "1" + + cmd = ["uv", "build", "--wheel"] + if python_version: + cmd += ["--python", python_version] + else: + cmd += ["--no-build-isolation"] + + print(f"[build_with_msvc] cl.exe: {msvc_env.get('PATH', '').split(';')[0]}") + print(f"[build_with_msvc] Running: {' '.join(cmd)}") + + result = subprocess.run( + cmd, + env=merged, + cwd=os.path.dirname(os.path.abspath(__file__)), + ) + sys.exit(result.returncode) + + +if __name__ == "__main__": + main() diff --git a/python/pyproject.toml b/python/pyproject.toml index d62e25b9b..b9e3ebd38 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,3 +1,49 @@ [build-system] requires = ["setuptools", "wheel", "pybind11==2.11.1"] build-backend = "setuptools.build_meta" + +[project] +name = "ctranslate2" +version = "4.7.1" +description = "Fast inference engine for Transformer models" +license = "MIT" +authors = [{ name = "OpenNMT" }] +requires-python = ">=3.9" +dependencies = [ + "numpy", + "pyyaml>=5.3,<7", +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.4", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: 
Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] +keywords = ["opennmt", "nmt", "neural machine translation", "cuda", "mkl", "inference", "quantization"] + +[project.urls] +Documentation = "https://opennmt.net/CTranslate2" +Forum = "https://forum.opennmt.net" +Gitter = "https://gitter.im/OpenNMT/CTranslate2" +Source = "https://github.com/OpenNMT/CTranslate2" + +[project.scripts] +ct2-fairseq-converter = "ctranslate2.converters.fairseq:main" +ct2-marian-converter = "ctranslate2.converters.marian:main" +ct2-openai-gpt2-converter = "ctranslate2.converters.openai_gpt2:main" +ct2-opennmt-py-converter = "ctranslate2.converters.opennmt_py:main" +ct2-opennmt-tf-converter = "ctranslate2.converters.opennmt_tf:main" +ct2-opus-mt-converter = "ctranslate2.converters.opus_mt:main" +ct2-transformers-converter = "ctranslate2.converters.transformers:main" + +[tool.setuptools.packages.find] +include = ["ctranslate2*"] diff --git a/python/setup.py b/python/setup.py index d161d59da..9aa86ea82 100644 --- a/python/setup.py +++ b/python/setup.py @@ -5,29 +5,12 @@ import pybind11 from pybind11.setup_helpers import ParallelCompile -from setuptools import Extension, find_packages, setup +from setuptools import Extension, setup -base_dir = os.path.dirname(os.path.abspath(__file__)) include_dirs = [pybind11.get_include()] library_dirs = [] -def _get_long_description(): - readme_path = os.path.join(base_dir, "README.md") - if not os.path.exists(readme_path): - return "" - with open(readme_path, encoding="utf-8") as readme_file: - return readme_file.read() - - -def _get_project_version(): - version_path = os.path.join(base_dir, "ctranslate2", "version.py") - version = {} - with open(version_path, encoding="utf-8") as fp: - exec(fp.read(), version) - return version["__version__"] - - def _maybe_add_library_root(lib_name): if 
"%s_ROOT" % lib_name in os.environ: root = os.environ["%s_ROOT" % lib_name] @@ -68,54 +51,6 @@ def _maybe_add_library_root(lib_name): ParallelCompile("CMAKE_BUILD_PARALLEL_LEVEL").install() setup( - name="ctranslate2", - version=_get_project_version(), - license="MIT", - description="Fast inference engine for Transformer models", - long_description=_get_long_description(), - long_description_content_type="text/markdown", - author="OpenNMT", - url="https://opennmt.net", - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.4", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "Programming Language :: Python :: 3.14", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - ], - project_urls={ - "Documentation": "https://opennmt.net/CTranslate2", - "Forum": "https://forum.opennmt.net", - "Gitter": "https://gitter.im/OpenNMT/CTranslate2", - "Source": "https://github.com/OpenNMT/CTranslate2", - }, - keywords="opennmt nmt neural machine translation cuda mkl inference quantization", - packages=find_packages(exclude=["bin"]), package_data=package_data, ext_modules=[ctranslate2_module], - python_requires=">=3.9", - install_requires=[ - "setuptools", - "numpy", - "pyyaml>=5.3,<7", - ], - entry_points={ - "console_scripts": [ - "ct2-fairseq-converter=ctranslate2.converters.fairseq:main", - "ct2-marian-converter=ctranslate2.converters.marian:main", - "ct2-openai-gpt2-converter=ctranslate2.converters.openai_gpt2:main", - "ct2-opennmt-py-converter=ctranslate2.converters.opennmt_py:main", - "ct2-opennmt-tf-converter=ctranslate2.converters.opennmt_tf:main", - 
"ct2-opus-mt-converter=ctranslate2.converters.opus_mt:main", - "ct2-transformers-converter=ctranslate2.converters.transformers:main", - ], - }, ) diff --git a/src/cpu/clifford_kernels.cc b/src/cpu/clifford_kernels.cc new file mode 100644 index 000000000..c133e680b --- /dev/null +++ b/src/cpu/clifford_kernels.cc @@ -0,0 +1,56 @@ +// SIMD-optimised Clifford Cl(3,0) rotor operations. +// +// This file is compiled once per ISA (similar to kernels.cc) and provides +// vectorised encode/decode for groups of 3 dimensions. +// +// Phase 2 uses these to replace the scalar loop in rotor_quant_kv_cpu.cc. +// +// Currently provides the fallback scalar + portable optimised paths. +// AVX2 / NEON specialisations are added incrementally. + +#include "clifford_ops.h" +#include "clifford_kernels.h" + +#include +#include + +namespace ctranslate2 { + namespace cpu { + namespace clifford { + + // ----------------------------------------------------------------------- + // rotate_groups + // + // Apply the rotor sandwich to each group of 3 consecutive floats in `vec`, + // reading rotor parameters from `rotors` (n_groups × 4 floats: s,b12,b13,b23). + // ----------------------------------------------------------------------- + void rotate_groups(float* vec, + const float* rotors, // [n_groups][4] + int64_t d, + bool inverse) { + const int n_groups = static_cast((d + 2) / 3); + for (int g = 0; g < n_groups; ++g) { + const float* R = rotors + g * 4; + const Rotor rotor{R[0], R[1], R[2], R[3]}; + + const int base = g * 3; + // Handle last (possibly partial) group. + float a = vec[base]; + float b = (base + 1 < static_cast(d)) ? vec[base + 1] : 0.f; + float c = (base + 2 < static_cast(d)) ? 
vec[base + 2] : 0.f; + + float y0, y1, y2; + if (!inverse) + rotor_sandwich(rotor, a, b, c, y0, y1, y2); + else + rotor_sandwich_inv(rotor, a, b, c, y0, y1, y2); + + vec[base] = y0; + if (base + 1 < static_cast(d)) vec[base + 1] = y1; + if (base + 2 < static_cast(d)) vec[base + 2] = y2; + } + } + + } // clifford + } // cpu +} // ctranslate2 diff --git a/src/cpu/clifford_kernels.h b/src/cpu/clifford_kernels.h new file mode 100644 index 000000000..358593fbb --- /dev/null +++ b/src/cpu/clifford_kernels.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +namespace ctranslate2 { + namespace cpu { + namespace clifford { + + // Apply Clifford rotor sandwich to groups of 3 consecutive floats. + // vec: in/out buffer of length `d` + // rotors: [n_groups x 4] packed as (s, b12, b13, b23) per group + // inverse: if true, apply the inverse sandwich + void rotate_groups(float* vec, + const float* rotors, + int64_t d, + bool inverse); + + } // clifford + } // cpu +} // ctranslate2 diff --git a/src/cpu/clifford_ops.cc b/src/cpu/clifford_ops.cc new file mode 100644 index 000000000..37456c8b5 --- /dev/null +++ b/src/cpu/clifford_ops.cc @@ -0,0 +1,136 @@ +#include "clifford_ops.h" + +#include +#include + +namespace ctranslate2 { + namespace cpu { + namespace clifford { + + // ----------------------------------------------------------------------- + // Rotor::normalised + // ----------------------------------------------------------------------- + Rotor Rotor::normalised() const { + const float norm = std::sqrt(s*s + b12*b12 + b13*b13 + b23*b23); + if (norm < 1e-9f) + return {}; // fall back to identity + const float inv = 1.f / norm; + return {s*inv, b12*inv, b13*inv, b23*inv}; + } + + // ----------------------------------------------------------------------- + // Forward sandwich: y = R · v · R̃ + // + // Derivation (Cl(3,0) geometric product, explicit component expansion): + // + // Step 1 — T = R · v where v = a·e1 + b·e2 + c·e3 + // T[e1] = s·a + p12·b + p13·c + // T[e2] = s·b - 
p12·a + p23·c + // T[e3] = s·c - p13·a - p23·b + // T[e123] = p12·c - p13·b + p23·a + // + // Step 2 — y = T · R̃ where R̃ = s - p12·e12 - p13·e13 - p23·e23 + // y[e1] = s·T1 + p12·T2 + p13·T3 + p23·T123 + // y[e2] = -p12·T1 + s·T2 + p23·T3 - p13·T123 + // y[e3] = -p13·T1 - p23·T2 + s·T3 + p12·T123 + // + // (verified against quaternion rotation for 90° test cases) + // ----------------------------------------------------------------------- + void rotor_sandwich(const Rotor& R, + float a, float b, float c, + float& y0, float& y1, float& y2) { + const float s = R.s; + const float p12 = R.b12; + const float p13 = R.b13; + const float p23 = R.b23; + + // Step 1: T = R · v + const float T1 = s*a + p12*b + p13*c; + const float T2 = s*b - p12*a + p23*c; + const float T3 = s*c - p13*a - p23*b; + const float T123 = p12*c - p13*b + p23*a; + + // Step 2: y = T · R̃ + y0 = s*T1 + p12*T2 + p13*T3 + p23*T123; + y1 = -p12*T1 + s*T2 + p23*T3 - p13*T123; + y2 = -p13*T1 - p23*T2 + s*T3 + p12*T123; + } + + // ----------------------------------------------------------------------- + // Inverse sandwich: y = R̃ · v · R + // + // Step 1 — T = R̃ · v (sign-flip on bivector components) + // T[e1] = s·a - p12·b - p13·c + // T[e2] = s·b + p12·a - p23·c + // T[e3] = s·c + p13·a + p23·b + // T[e123] = -p12·c + p13·b - p23·a + // + // Step 2 — y = T · R + // y[e1] = s·T1 - p12·T2 - p13·T3 - p23·T123 + // y[e2] = p12·T1 + s·T2 + p13·T123 - p23·T3 + // y[e3] = -p12·T123 + p13·T1 + p23·T2 + s·T3 + // ----------------------------------------------------------------------- + void rotor_sandwich_inv(const Rotor& R, + float a, float b, float c, + float& y0, float& y1, float& y2) { + const float s = R.s; + const float p12 = R.b12; + const float p13 = R.b13; + const float p23 = R.b23; + + // Step 1: T = R̃ · v + const float T1 = s*a - p12*b - p13*c; + const float T2 = s*b + p12*a - p23*c; + const float T3 = s*c + p13*a + p23*b; + const float T123 = -p12*c + p13*b - p23*a; + + // Step 2: y = T · R + y0 
= s*T1 - p12*T2 - p13*T3 - p23*T123; + y1 = p12*T1 + s*T2 - p23*T3 + p13*T123; + y2 = p13*T1 + p23*T2 + s*T3 - p12*T123; + } + + // ----------------------------------------------------------------------- + // random_rotor + // + // Deterministic "random" unit rotor from a simple hash of the indices. + // When all indices are 0, returns the identity rotor. + // ----------------------------------------------------------------------- + Rotor random_rotor(int group_idx, int head_idx, int layer_idx) { + if (group_idx == 0 && head_idx == 0 && layer_idx == 0) + return {}; + + // Deterministic hash: mix indices into a 32-bit seed. + uint32_t seed = static_cast(group_idx) + ^ (static_cast(head_idx) * 2654435761u) + ^ (static_cast(layer_idx) * 1234567891u); + // Xorshift32 + auto xorshift = [](uint32_t x) -> uint32_t { + x ^= x << 13; x ^= x >> 17; x ^= x << 5; + return x; + }; + seed = xorshift(seed ? seed : 1u); + const float u0 = static_cast(seed) / static_cast(0xFFFFFFFFu); + seed = xorshift(seed); + const float u1 = static_cast(seed) / static_cast(0xFFFFFFFFu); + seed = xorshift(seed); + const float u2 = static_cast(seed) / static_cast(0xFFFFFFFFu); + seed = xorshift(seed); + const float u3 = static_cast(seed) / static_cast(0xFFFFFFFFu); + + // Uniform distribution on S³ via Marsaglia (1972): + const float sq1 = std::sqrt(1.f - u0); + const float sq2 = std::sqrt(u0); + const float a = sq1 * std::sin(2.f * 3.14159265358979323846f * u1); + const float b = sq1 * std::cos(2.f * 3.14159265358979323846f * u1); + const float c = sq2 * std::sin(2.f * 3.14159265358979323846f * u2); + const float d = sq2 * std::cos(2.f * 3.14159265358979323846f * u2); + (void)u3; + + // Map (a,b,c,d) → (s, b12, b13, b23) + return Rotor{a, b, c, d}; + } + + } // clifford + } // cpu +} // ctranslate2 diff --git a/src/cpu/clifford_ops.h b/src/cpu/clifford_ops.h new file mode 100644 index 000000000..d2636d6f7 --- /dev/null +++ b/src/cpu/clifford_ops.h @@ -0,0 +1,56 @@ +#pragma once + +// Clifford 
algebra Cl(3,0) rotor operations for RotorQuant KV-cache compression. +// +// A rotor R in Cl(3,0) lives in the even sub-algebra: +// R = s·1 + b12·e12 + b13·e13 + b23·e23 +// with norm s²+b12²+b13²+b23² = 1 (unit rotor). +// +// The sandwich product R·v·R̃ rotates a grade-1 vector v ∈ span{e1,e2,e3} +// to another grade-1 vector. R̃ is the reverse (conjugate) of R. +// +// Correspondence with quaternions (for reference): +// s ↔ w, b23 ↔ i, -b13 ↔ j, b12 ↔ k + +namespace ctranslate2 { + namespace cpu { + namespace clifford { + + // Clifford Cl(3,0) rotor: R = s + b12·e12 + b13·e13 + b23·e23 + // Identity rotor: s=1, b12=b13=b23=0 + struct Rotor { + float s = 1.f; + float b12 = 0.f; + float b13 = 0.f; + float b23 = 0.f; + + Rotor() = default; + Rotor(float s_, float b12_, float b13_, float b23_) + : s(s_), b12(b12_), b13(b13_), b23(b23_) {} + + // Return normalised version (unit rotor). + Rotor normalised() const; + + // Return the reverse/conjugate: R̃ = s - b12·e12 - b13·e13 - b23·e23 + Rotor reversed() const { return {s, -b12, -b13, -b23}; } + }; + + // Forward sandwich: y = R · [a,b,c]ᵀ · R̃ + // (rotation of the grade-1 vector a·e1+b·e2+c·e3) + void rotor_sandwich(const Rotor& R, + float a, float b, float c, + float& y0, float& y1, float& y2); + + // Inverse sandwich: y = R̃ · [a,b,c]ᵀ · R + // Undoes the forward rotation. + void rotor_sandwich_inv(const Rotor& R, + float a, float b, float c, + float& y0, float& y1, float& y2); + + // Generate a random unit rotor seeded by (group_idx, head_idx, layer_idx). + // Returns identity when all indices are 0. 
+ Rotor random_rotor(int group_idx, int head_idx, int layer_idx = 0); + + } // clifford + } // cpu +} // ctranslate2 diff --git a/src/cuda/random.cu b/src/cuda/random.cu index f016bb447..a0ff684de 100644 --- a/src/cuda/random.cu +++ b/src/cuda/random.cu @@ -48,12 +48,17 @@ namespace ctranslate2 { curandState* _states; }; + static thread_local std::unique_ptr> states; + curandStatePhilox4_32_10_t* get_curand_states(size_t num_states) { - static thread_local std::unique_ptr> states; if (!states || num_states > states->num_states()) states = std::make_unique>(num_states); return states->states(); } + void free_curand_states() { + states.reset(); + } + } } diff --git a/src/cuda/random.h b/src/cuda/random.h index 11ceba3be..a75401745 100644 --- a/src/cuda/random.h +++ b/src/cuda/random.h @@ -13,6 +13,7 @@ namespace ctranslate2 { namespace cuda { curandStatePhilox4_32_10_t* get_curand_states(size_t num_states); + void free_curand_states(); } } diff --git a/src/cuda/rotor_attention_kernel.cu b/src/cuda/rotor_attention_kernel.cu new file mode 100644 index 000000000..36b104646 --- /dev/null +++ b/src/cuda/rotor_attention_kernel.cu @@ -0,0 +1,133 @@ +// Fused RotorQuant Attention CUDA kernel. +// +// Computes: score(q, K_packed) = q · decode(K_packed)^T in one pass, +// without fully materialising the decoded K matrix. +// +// Converted from the Triton implementation in RotorQuant +// turboquant/fused_attention.py (MIT licence). +// +// Two-term estimator (from TurboQuant / RotorQuant): +// score_i = term1_i + term2_i +// term1_i = (MSE reconstruction) +// term2_i = residual_norm_i * sqrt(pi/2) / m +// * sum_j(qjl_sign_ij * ) (QJL correction) +// +// Phase 1 (this file): term1 only (no QJL), using min-max decoded keys. +// Phase 2 (TODO): add QJL term2, use Lloyd-Max codebook. 
+ +#include "ctranslate2/ops/rotor_quant_kv.h" + +#ifdef CT2_WITH_CUDA + +#include +#include "cuda/helpers.h" +#include "cuda/utils.h" + +namespace ctranslate2 { + namespace cuda { + + // ----------------------------------------------------------------------- + // rotor_fused_attn_kernel + // + // Each block computes attention scores for one (batch_head) pair. + // Threads in the block iterate over key tokens. + // + // Grid: (batch * heads,) + // Block: (BLOCK_T,) where BLOCK_T tiles the key-sequence dimension. + // + // Output scores: [batch, heads, 1, seq_k] (single query step) + // ----------------------------------------------------------------------- + template + __global__ void rotor_fused_attn_kernel( + const T* __restrict__ q, // [batch, heads, 1, d_head] + const int8_t* __restrict__ k_packed, // [batch, heads, seq_k, packed_stride] + float* __restrict__ scores, // [batch, heads, 1, seq_k] + int d_head, + int seq_k, + int packed_stride, + int bits, + int codes_bytes, + float scale_factor) { // 1 / sqrt(d_head) + + const int bh = blockIdx.x; // flattened batch-head index + const int t_base = threadIdx.x; + + // Pointer to this batch-head's query vector. + const T* q_bh = q + (long long)bh * d_head; + const int8_t* k_bh = k_packed + (long long)bh * seq_k * packed_stride; + float* s_bh = scores + (long long)bh * seq_k; + + // Load query to registers (up to d_head elements, tiled by BLOCK_T). + // For simplicity, load full d_head in each thread (works well when d_head ≤ 256). + // A production implementation would use shared memory. + + for (int t = t_base; t < seq_k; t += BLOCK_T) { + const int8_t* k_tok = k_bh + (long long)t * packed_stride; + + // Decode key token on-the-fly. + float mn, scale_kv; + memcpy(&mn, k_tok + codes_bytes, sizeof(float)); + memcpy(&scale_kv, k_tok + codes_bytes + 4, sizeof(float)); + + const float qmax = static_cast((1 << bits) - 1); + const uint32_t mask = (1u << bits) - 1u; + + // Dot product q · decode(k_tok). 
+ float dot = 0.f; + uint32_t buf = 0u; + int avail = 0; + const int8_t* p = k_tok; + + for (int d = 0; d < d_head; ++d) { + while (avail < bits) { + buf |= static_cast(static_cast(*p++)) << avail; + avail += 8; + } + const uint32_t code = buf & mask; + buf >>= bits; + avail -= bits; + const float k_val = static_cast(code) / qmax * scale_kv + mn; + dot += static_cast(q_bh[d]) * k_val; + } + + s_bh[t] = dot * scale_factor; + } + } + + // ----------------------------------------------------------------------- + // launch_rotor_fused_attn + // ----------------------------------------------------------------------- + template + void launch_rotor_fused_attn( + const T* q, // [batch, heads, 1, d_head] + const int8_t* k_packed, // [batch, heads, seq_k, packed_stride] + float* scores, // [batch, heads, 1, seq_k] (caller-allocated) + int batch, + int heads, + int seq_k, + int d_head, + int packed_stride, + int bits) { + if (seq_k == 0) return; + + const int codes_bytes = (d_head * bits + 7) / 8; + const float scale_factor = 1.f / sqrtf(static_cast(d_head)); + const int bh_count = batch * heads; + const int block_t = 64; + + rotor_fused_attn_kernel, 64> + <<>>( + cuda::device_cast(q), k_packed, scores, + d_head, seq_k, packed_stride, bits, codes_bytes, scale_factor); + } + + // Explicit instantiations. + template void launch_rotor_fused_attn( + const float*, const int8_t*, float*, int, int, int, int, int, int); + template void launch_rotor_fused_attn( + const float16_t*, const int8_t*, float*, int, int, int, int, int, int); + + } // cuda +} // ctranslate2 + +#endif // CT2_WITH_CUDA diff --git a/src/cuda/rotor_quant_kernel.cu b/src/cuda/rotor_quant_kernel.cu new file mode 100644 index 000000000..5e98c148a --- /dev/null +++ b/src/cuda/rotor_quant_kernel.cu @@ -0,0 +1,214 @@ +// RotorQuant encode/decode CUDA kernels. +// +// Ported from RotorQuant csrc/rotor_fused_kernel.cu (MIT licence). +// PyTorch / ATen dependencies replaced with CTranslate2 CUDA helpers. 
+//
+// Phase 1: per-token min-max quantisation, identity rotors.
+// Phase 2 (TODO): per-group Clifford rotor sandwich, Lloyd-Max codebook.
+//
+// Kernel signatures follow CTranslate2 conventions:
+//  - Raw device pointers (no torch::Tensor)
+//  - Float16 via cuda::device_type = __half
+//  - CUDA stream via cuda::get_cuda_stream()
+
+#include "ctranslate2/ops/rotor_quant_kv.h"
+
+#ifdef CT2_WITH_CUDA
+
+#include 
+#include 
+
+#include "cuda/helpers.h"
+#include "cuda/utils.h"
+
+namespace ctranslate2 {
+  namespace cuda {
+
+    // -----------------------------------------------------------------------
+    // Bit-pack helpers (device-side)
+    //
+    // NOTE(review): neither helper is called by the encode/decode kernels in
+    // this file — those kernels stream codes through a local bit-buffer
+    // instead.  These are presumably kept for the Phase-2 per-thread-per-
+    // element layout; confirm before relying on them.
+    // -----------------------------------------------------------------------
+
+    // Read `bits`-bit code at position `idx` from packed byte array `src`.
+    // Codes are LSB-first, so a code may straddle a byte boundary.
+    __device__ inline uint32_t read_code(const int8_t* src, int idx, int bits) {
+      const int bit_pos = idx * bits;
+      const int byte_idx = bit_pos >> 3;
+      const int shift = bit_pos & 7;
+      // Read 2 bytes to handle cross-byte codes (bits ≤ 8 so 2 bytes always sufficient).
+      // NOTE(review): for the last code this reads src[byte_idx + 1] even when
+      // that byte is past the packed codes — safe only because the encoder
+      // stores 8 metadata bytes immediately after the codes; verify if the
+      // layout changes.
+      uint32_t val = static_cast(static_cast(src[byte_idx]));
+      if (shift + bits > 8)
+        val |= static_cast(static_cast(src[byte_idx + 1])) << 8;
+      return (val >> shift) & ((1u << bits) - 1u);
+    }
+
+    // Write `bits`-bit value `v` at position `idx` in packed byte array `dst`.
+    // Note: assumes the target bytes are zero-initialised before writing.
+    // NOTE(review): atomicOr operates on a 32-bit word, so `dst + byte_idx`
+    // must be 4-byte aligned and the OR touches up to 3 bytes beyond byte_idx
+    // — for codes near the end of a token's code region that word overlaps
+    // the min/scale metadata bytes.  Confirm alignment and add padding (or a
+    // byte-wise fallback) before wiring this helper into a kernel.
+    __device__ inline void write_code(int8_t* dst, int idx, int bits, uint32_t v) {
+      const int bit_pos = idx * bits;
+      const int byte_idx = bit_pos >> 3;
+      const int shift = bit_pos & 7;
+      atomicOr(reinterpret_cast(dst + byte_idx),
+               (v & ((1u << bits) - 1u)) << shift);
+    }
+
+    // -----------------------------------------------------------------------
+    // rotor_encode_kernel
+    //
+    // Each thread processes one token (one row of shape [d_head]).
+    // Grid:  (ceil(n_tokens / BLOCK),), Block: (BLOCK,)
+    // The last block is only partially full, so the kernel must bound-check
+    // the flattened thread index against the real token count.
+    // -----------------------------------------------------------------------
+    template <typename T>
+    __global__ void rotor_encode_kernel(const T* __restrict__ kv,        // [n_tokens, d_head]
+                                        int8_t* __restrict__ packed,     // [n_tokens, packed_stride]
+                                        int n_tokens,
+                                        int d_head,
+                                        int packed_stride,
+                                        int bits,
+                                        int codes_bytes) {
+      const int t = blockIdx.x * blockDim.x + threadIdx.x;
+      // BUGFIX: the previous guard compared t against gridDim.x * blockDim.x,
+      // which is always false, so tail threads of the last block read kv and
+      // wrote packed out of bounds.  Bound against the real token count
+      // (now passed in as n_tokens) instead.
+      if (t >= n_tokens)
+        return;
+
+      const T* src = kv + (long long)t * d_head;
+      int8_t* dst = packed + (long long)t * packed_stride;
+
+      // ---- Step 1: find min / max ----
+      // BUGFIX: do not round-trip the first element through __half
+      // (__half2float(__float2half(...))): for float inputs that silently
+      // truncated the minimum to fp16 precision.
+      float mn = static_cast<float>(src[0]);
+      float mx = mn;
+      for (int i = 1; i < d_head; ++i) {
+        const float v = static_cast<float>(src[i]);
+        if (v < mn) mn = v;
+        if (v > mx) mx = v;
+      }
+      float scale = mx - mn;
+      if (scale < 1e-9f)
+        scale = 1e-9f;  // avoid division by zero on constant rows
+
+      // ---- Step 2: quantise + pack ----
+      const float qmax = static_cast<float>((1 << bits) - 1);
+      const uint32_t mask = (1u << bits) - 1u;
+
+      // Zero the code bytes.
+      for (int b = 0; b < codes_bytes; ++b)
+        dst[b] = 0;
+
+      // Stream codes LSB-first through a local bit buffer.
+      uint32_t buf = 0u;
+      int fill = 0;
+      int byte_out = 0;
+      for (int i = 0; i < d_head; ++i) {
+        float q = (static_cast<float>(src[i]) - mn) / scale * qmax;
+        if (q < 0.f) q = 0.f;
+        if (q > qmax) q = qmax;
+        const uint32_t code = static_cast<uint32_t>(__float2int_rn(q));
+        buf |= (code & mask) << fill;
+        fill += bits;
+        while (fill >= 8) {
+          dst[byte_out++] = static_cast<int8_t>(buf & 0xFFu);
+          buf >>= 8;
+          fill -= 8;
+        }
+      }
+      if (fill > 0)
+        dst[byte_out] = static_cast<int8_t>(buf & 0xFFu);
+
+      // ---- Step 3: write metadata (float32 min, float32 scale) ----
+      memcpy(dst + codes_bytes, &mn, sizeof(float));
+      memcpy(dst + codes_bytes + 4, &scale, sizeof(float));
+    }
+
+    // -----------------------------------------------------------------------
+    // rotor_decode_kernel
+    // -----------------------------------------------------------------------
+    template <typename T>
+    __global__ void rotor_decode_kernel(const int8_t* __restrict__ packed,  // [n_tokens, packed_stride]
+                                        T* __restrict__ kv,                 // [n_tokens, d_head]
+                                        int n_tokens,
+                                        int d_head,
+                                        int packed_stride,
+                                        int bits,
+                                        int codes_bytes) {
+      const int t = blockIdx.x * blockDim.x + threadIdx.x;
+      // BUGFIX: same tail-thread out-of-bounds issue as rotor_encode_kernel;
+      // guard against the real token count.
+      if (t >= n_tokens)
+        return;
+
+      const int8_t* src = packed + (long long)t * packed_stride;
+      T* dst = kv + (long long)t * d_head;
+
+      // Read metadata.
+      float mn, scale;
+      memcpy(&mn, src + codes_bytes, sizeof(float));
+      memcpy(&scale, src + codes_bytes + 4, sizeof(float));
+
+      // Unpack codes and dequantise.
+      const float qmax = static_cast<float>((1 << bits) - 1);
+      const uint32_t mask = (1u << bits) - 1u;
+
+      uint32_t buf = 0u;
+      int avail = 0;
+      const int8_t* p = src;
+      for (int i = 0; i < d_head; ++i) {
+        while (avail < bits) {
+          buf |= static_cast<uint32_t>(static_cast<uint8_t>(*p++)) << avail;
+          avail += 8;
+        }
+        const uint32_t code = buf & mask;
+        buf >>= bits;
+        avail -= bits;
+        const float val = static_cast<float>(code) / qmax * scale + mn;
+        dst[i] = static_cast<T>(val);
+      }
+    }
+
+    // -----------------------------------------------------------------------
+    // Launch wrappers (signatures unchanged; they now forward n_tokens to the
+    // kernels so the per-thread bound check is correct).
+    // -----------------------------------------------------------------------
+    template <typename T>
+    void launch_rotor_encode(const T* kv,
+                             int8_t* packed,
+                             int n_tokens,
+                             int d_head,
+                             int packed_stride,
+                             int bits) {
+      if (n_tokens == 0)
+        return;
+      const int codes_bytes = (d_head * bits + 7) / 8;
+      const int threads = 128;
+      const int blocks = (n_tokens + threads - 1) / threads;
+      rotor_encode_kernel<cuda::device_type<T>>
+        <<<blocks, threads, 0, cuda::get_cuda_stream()>>>(
+          cuda::device_cast(kv), packed,
+          n_tokens, d_head, packed_stride, bits, codes_bytes);
+    }
+
+    template <typename T>
+    void launch_rotor_decode(const int8_t* packed,
+                             T* kv,
+                             int n_tokens,
+                             int d_head,
+                             int packed_stride,
+                             int bits) {
+      if (n_tokens == 0)
+        return;
+      const int codes_bytes = (d_head * bits + 7) / 8;
+      const int threads = 128;
+      const int blocks = (n_tokens + threads - 1) / threads;
+      rotor_decode_kernel<cuda::device_type<T>>
+        <<<blocks, threads, 0, cuda::get_cuda_stream()>>>(
+          packed, cuda::device_cast(kv),
+          n_tokens, d_head, packed_stride, bits, codes_bytes);
+    }
+
+    // Explicit instantiations.
+ template void launch_rotor_encode( + const float*, int8_t*, int, int, int, int); + template void launch_rotor_encode( + const float16_t*, int8_t*, int, int, int, int); + + template void launch_rotor_decode( + const int8_t*, float*, int, int, int, int); + template void launch_rotor_decode( + const int8_t*, float16_t*, int, int, int, int); + + } // cuda +} // ctranslate2 + +// ----------------------------------------------------------------------- +// Hook rotor_quant_kv_gpu.cu encode_cuda / decode_cuda into the kernels. +// The Phase-1 GPU stub in rotor_quant_kv_gpu.cu round-trips through CPU. +// This file provides the real CUDA path but is wired in Phase 2 only +// (see WITH_ROTOR_QUANT_CUDA define below). +// ----------------------------------------------------------------------- + +#endif // CT2_WITH_CUDA diff --git a/src/devices.cc b/src/devices.cc index a2936e0a6..d49a86f97 100644 --- a/src/devices.cc +++ b/src/devices.cc @@ -2,6 +2,7 @@ #ifdef CT2_WITH_CUDA # include "cuda/utils.h" +# include "cuda/random.h" #endif #ifdef CT2_WITH_TENSOR_PARALLEL # include @@ -118,6 +119,16 @@ namespace ctranslate2 { (void)device; #endif } + + void destroy_context(Device device) { +#ifdef CT2_WITH_CUDA + if (device == Device::CUDA) { + cuda::free_curand_states(); + } +#else + (void)device; +#endif + } // Initialize the static member variable #ifdef CT2_WITH_TENSOR_PARALLEL std::vector ScopedMPISetter::_nccl_comms; diff --git a/src/layers/attention.cc b/src/layers/attention.cc index dbbf1ed38..23f4abc21 100644 --- a/src/layers/attention.cc +++ b/src/layers/attention.cc @@ -9,6 +9,7 @@ #include "dispatch.h" #include "cpu/parallel.h" +#include "env.h" namespace ctranslate2 { namespace layers { @@ -324,6 +325,14 @@ namespace ctranslate2 { scope + "/relative_attention_max_distance"); else _maximum_relative_position = 0; + + // RotorQuant KV-cache compression: enabled by CT2_ROTOR_QUANT_BITS=3|4. 
+ const int rq_bits = read_int_from_env("CT2_ROTOR_QUANT_BITS", 0); + if (rq_bits == 3 || rq_bits == 4) { + ops::RotorQuantKV::Config rq_cfg; + rq_cfg.bits = rq_bits; + _rotor_quant = std::make_unique(_d_head, rq_cfg); + } } DataType MultiHeadAttention::output_type() const { @@ -503,32 +512,117 @@ namespace ctranslate2 { } if (cached_keys != nullptr) { - if (cached_keys->empty()) { - *cached_keys = std::move(keys_proj); - *cached_values = std::move(values_proj); + if (_rotor_quant && keys_proj.rank() == 4) { + // --- RotorQuant compressed path --- + // Only active for standard 4-D KV tensors [batch, heads, time, d_head]. + // Falls back to standard path for merged-time-head-dims layouts. + // cached_keys / cached_values store INT8 packed buffers. + // Shape convention: [batch, heads, time, packed_stride]. + if (cached_keys->empty()) { + // Prefill: encode the full keys_proj / values_proj. + // keys_proj shape: [batch, heads, time, d_head] (after split_heads) + _rotor_quant->encode(keys_proj, *cached_keys); + _rotor_quant->encode(values_proj, *cached_values); + cached_keys->reshape( + {keys_proj.dim(0), keys_proj.dim(1), + keys_proj.dim(2), _rotor_quant->packed_stride()}); + cached_values->reshape( + {values_proj.dim(0), values_proj.dim(1), + values_proj.dim(2), _rotor_quant->packed_stride()}); + } else { + // Decode existing cache, concat with new tokens, re-encode. + // (Efficient incremental append will be added in Phase 2.) + StorageView& tmp = fused_proj; // Reuse storage. + + // Decode existing packed buffers to float. 
+ StorageView old_keys(dtype, device), old_vals(dtype, device); + const dim_t old_time = cached_keys->dim(2); + const dim_t batch = cached_keys->dim(0); + const dim_t heads = cached_keys->dim(1); + StorageView ck_flat(DataType::INT8, device), cv_flat(DataType::INT8, device); + ck_flat.shallow_copy(*cached_keys); + cv_flat.shallow_copy(*cached_values); + ck_flat.reshape({batch * heads * old_time, _rotor_quant->packed_stride()}); + cv_flat.reshape({batch * heads * old_time, _rotor_quant->packed_stride()}); + _rotor_quant->decode(ck_flat, old_keys, dtype, device); + _rotor_quant->decode(cv_flat, old_vals, dtype, device); + old_keys.reshape({batch, heads, old_time, _d_head}); + old_vals.reshape({batch, heads, old_time, _d_head}); + + // Concat along time. + const ops::Concat concat_op(_cache_time_dim); + tmp = std::move(old_keys); + concat_op({&tmp, &keys_proj}, old_keys); + tmp = std::move(old_vals); + concat_op({&tmp, &values_proj}, old_vals); + + // Apply sliding window if needed. + if (!prefilling && _sliding_window > 0 + && old_keys.shape()[2] > _sliding_window) { + const ops::Slide slide_op(2, 1, old_keys.shape()[2] - 1); + slide_op(old_keys, tmp); old_keys = std::move(tmp); + slide_op(old_vals, tmp); old_vals = std::move(tmp); + } + + // Re-encode the merged tensor. + const dim_t new_time = old_keys.dim(2); + _rotor_quant->encode(old_keys, *cached_keys); + _rotor_quant->encode(old_vals, *cached_values); + cached_keys->reshape({batch, heads, new_time, _rotor_quant->packed_stride()}); + cached_values->reshape({batch, heads, new_time, _rotor_quant->packed_stride()}); + + // keys_proj / values_proj for attention = merged float tensors. + keys_proj = std::move(old_keys); + values_proj = std::move(old_vals); + } } else { - const ops::Concat concat_op(_cache_time_dim); - StorageView& tmp = fused_proj; // Reuse storage. 
- tmp = std::move(*cached_keys); - concat_op({&tmp, &keys_proj}, *cached_keys); - tmp = std::move(*cached_values); - concat_op({&tmp, &values_proj}, *cached_values); - - if (!prefilling && _sliding_window > 0 && cached_keys->shape()[2] > _sliding_window) { - // only for generation - const ops::Slide slide_op(2, 1, cached_keys->shape()[2] - 1); - slide_op(*cached_keys, tmp); - *cached_keys = std::move(tmp); - slide_op(*cached_values, tmp); - *cached_values = std::move(tmp); + // --- Standard (uncompressed) path --- + if (cached_keys->empty()) { + *cached_keys = std::move(keys_proj); + *cached_values = std::move(values_proj); + } else { + const ops::Concat concat_op(_cache_time_dim); + StorageView& tmp = fused_proj; // Reuse storage. + tmp = std::move(*cached_keys); + concat_op({&tmp, &keys_proj}, *cached_keys); + tmp = std::move(*cached_values); + concat_op({&tmp, &values_proj}, *cached_values); + + if (!prefilling && _sliding_window > 0 && cached_keys->shape()[2] > _sliding_window) { + const ops::Slide slide_op(2, 1, cached_keys->shape()[2] - 1); + slide_op(*cached_keys, tmp); + *cached_keys = std::move(tmp); + slide_op(*cached_values, tmp); + *cached_values = std::move(tmp); + } } } } } if (cached_keys) { - keys_proj.shallow_copy(*cached_keys); - values_proj.shallow_copy(*cached_values); + if (_rotor_quant && cached_keys->rank() == 4 + && ops::RotorQuantKV::is_packed(*cached_keys)) { + // Decode the packed cache into float for attention computation. + // (keys_proj / values_proj may already be set in the concat branch above.) 
+ if (keys_proj.empty() || keys_proj.dtype() != dtype) { + const dim_t batch = cached_keys->dim(0); + const dim_t heads = cached_keys->dim(1); + const dim_t time = cached_keys->dim(2); + StorageView ck_flat(DataType::INT8, device), cv_flat(DataType::INT8, device); + ck_flat.shallow_copy(*cached_keys); + cv_flat.shallow_copy(*cached_values); + ck_flat.reshape({batch * heads * time, _rotor_quant->packed_stride()}); + cv_flat.reshape({batch * heads * time, _rotor_quant->packed_stride()}); + _rotor_quant->decode(ck_flat, keys_proj, dtype, device); + _rotor_quant->decode(cv_flat, values_proj, dtype, device); + keys_proj.reshape({batch, heads, time, _d_head}); + values_proj.reshape({batch, heads, time, _d_head}); + } + } else { + keys_proj.shallow_copy(*cached_keys); + values_proj.shallow_copy(*cached_values); + } } StorageView& context = fused_proj; // Reuse storage. diff --git a/src/ops/rotor_quant_kv.cc b/src/ops/rotor_quant_kv.cc new file mode 100644 index 000000000..1f4d9cfc0 --- /dev/null +++ b/src/ops/rotor_quant_kv.cc @@ -0,0 +1,145 @@ +#include "ctranslate2/ops/rotor_quant_kv.h" + +#include +#include + +#include "ctranslate2/types.h" +#include "ctranslate2/ops/concat.h" +#include "dispatch.h" + +namespace ctranslate2 { + namespace ops { + + // ------------------------------------------------------------------------- + // compute_packed_stride + // ------------------------------------------------------------------------- + dim_t RotorQuantKV::compute_packed_stride(dim_t d_head, int bits) { + const dim_t codes_bytes = (d_head * bits + 7) / 8; + return codes_bytes + 8; // +8 for float32 min + float32 scale + } + + // ------------------------------------------------------------------------- + // Constructor + // ------------------------------------------------------------------------- + RotorQuantKV::RotorQuantKV(dim_t d_head, const Config& cfg) + : _d_head(d_head) + , _cfg(cfg) + , _packed_stride(compute_packed_stride(d_head, cfg.bits)) + { + if (cfg.bits != 3 && 
cfg.bits != 4) + throw std::invalid_argument("RotorQuantKV: bits must be 3 or 4"); + + // Initialise rotors to identity for all groups. + const dim_t n_groups = (d_head + 2) / 3; + _rotors.resize(n_groups, {1.f, 0.f, 0.f, 0.f}); + } + + // ------------------------------------------------------------------------- + // encode (dispatch wrapper) + // ------------------------------------------------------------------------- + void RotorQuantKV::encode(const StorageView& kv, StorageView& packed) const { + PROFILE("RotorQuantKV::encode"); + + const dim_t n_tokens = kv.size() / _d_head; + const Device dev = kv.device(); + packed = StorageView({n_tokens, _packed_stride}, DataType::INT8, dev); + if (dev == Device::CPU) { + if (kv.dtype() == DataType::FLOAT32) + encode_cpu(kv.data(), packed.data(), n_tokens); + else if (kv.dtype() == DataType::FLOAT16) + encode_cpu(kv.data(), packed.data(), n_tokens); + else + throw std::invalid_argument("RotorQuantKV::encode: unsupported dtype " + + dtype_name(kv.dtype())); + } +#ifdef CT2_WITH_CUDA + else if (dev == Device::CUDA) { + if (kv.dtype() == DataType::FLOAT32) + encode_cuda(kv.data(), packed.data(), n_tokens); + else if (kv.dtype() == DataType::FLOAT16) + encode_cuda(kv.data(), packed.data(), n_tokens); + else + throw std::invalid_argument("RotorQuantKV::encode (CUDA): unsupported dtype " + + dtype_name(kv.dtype())); + } +#endif + else { + throw std::invalid_argument("RotorQuantKV::encode: unsupported device"); + } + } + + // ------------------------------------------------------------------------- + // decode (dispatch wrapper) + // ------------------------------------------------------------------------- + void RotorQuantKV::decode(const StorageView& packed, + StorageView& kv_out, + DataType out_dtype, + Device out_device) const { + PROFILE("RotorQuantKV::decode"); + + const dim_t n_tokens = packed.size() / _packed_stride; + kv_out = StorageView({n_tokens, _d_head}, out_dtype, out_device); + + if (out_device == Device::CPU) { + 
if (out_dtype == DataType::FLOAT32) + decode_cpu(packed.data(), kv_out.data(), n_tokens); + else if (out_dtype == DataType::FLOAT16) + decode_cpu(packed.data(), kv_out.data(), n_tokens); + else + throw std::invalid_argument("RotorQuantKV::decode: unsupported dtype " + + dtype_name(out_dtype)); + } +#ifdef CT2_WITH_CUDA + else if (out_device == Device::CUDA) { + if (out_dtype == DataType::FLOAT32) + decode_cuda(packed.data(), kv_out.data(), n_tokens); + else if (out_dtype == DataType::FLOAT16) + decode_cuda(packed.data(), kv_out.data(), n_tokens); + else + throw std::invalid_argument("RotorQuantKV::decode (CUDA): unsupported dtype " + + dtype_name(out_dtype)); + } +#endif + else { + throw std::invalid_argument("RotorQuantKV::decode: unsupported device"); + } + } + + // ------------------------------------------------------------------------- + // append + // Encodes new_kv and appends to packed_cache along the time dimension. + // packed_cache: [batch, heads, time_old, packed_stride] INT8 (or empty) + // new_kv: [batch, heads, time_new, d_head] float + // ------------------------------------------------------------------------- + void RotorQuantKV::append(const StorageView& new_kv, + StorageView& packed_cache) const { + PROFILE("RotorQuantKV::append"); + + if (packed_cache.empty()) { + // First call: just encode. + encode(new_kv, packed_cache); + // Restore 4-dim shape [batch, heads, time, packed_stride] if new_kv is 4D. + if (new_kv.rank() == 4) { + packed_cache.reshape({new_kv.dim(0), new_kv.dim(1), + new_kv.dim(2), _packed_stride}); + } + return; + } + + // Encode new tokens. + StorageView new_packed(DataType::INT8, new_kv.device()); + encode(new_kv, new_packed); + if (new_kv.rank() == 4) + new_packed.reshape({new_kv.dim(0), new_kv.dim(1), + new_kv.dim(2), _packed_stride}); + + // Concat along time dimension (dim 2 for 4D, dim 0 for 2D). + const dim_t cat_dim = (packed_cache.rank() == 4) ? 
2 : 0; + StorageView tmp(DataType::INT8, new_kv.device()); + const ops::Concat concat_op(cat_dim); + tmp = std::move(packed_cache); + concat_op({&tmp, &new_packed}, packed_cache); + } + + } // ops +} // ctranslate2 diff --git a/src/ops/rotor_quant_kv_cpu.cc b/src/ops/rotor_quant_kv_cpu.cc new file mode 100644 index 000000000..c3dc1cc5b --- /dev/null +++ b/src/ops/rotor_quant_kv_cpu.cc @@ -0,0 +1,181 @@ +#include "ctranslate2/ops/rotor_quant_kv.h" + +// CPU implementation of RotorQuantKV encode/decode. +// +// Encoding algorithm (per token vector v of d_head floats): +// 1. Apply Clifford rotor sandwich per 3-element group (identity in Phase 1). +// 2. Compute min_val = min(v), scale = max(v) - min(v). +// 3. Quantise each element to [0, 2^bits-1]: +// code_i = round((v_i - min_val) / scale * (2^bits - 1)) +// 4. Pack codes as 'bits'-bit integers into bytes (LSB first). +// 5. Write [packed_codes | min_val_bytes | scale_bytes] to the output buffer. +// +// Decoding reverses the above. + +#include +#include +#include +#include + +#include "ctranslate2/types.h" +#include "cpu/clifford_ops.h" +#include "cpu/parallel.h" + +namespace ctranslate2 { + namespace ops { + + // ----------------------------------------------------------------------- + // Bit-packing helpers + // + // We pack `n_vals` values, each occupying `bits` bits, LSB-first into + // consecutive bytes starting at `out`. 
+ // ----------------------------------------------------------------------- + + static inline void pack_bits(const uint8_t* codes, + int8_t* out, + dim_t n_vals, + int bits) { + uint32_t buf = 0; + int fill = 0; + int8_t* p = out; + for (dim_t i = 0; i < n_vals; ++i) { + buf |= (static_cast(codes[i]) & ((1u << bits) - 1u)) << fill; + fill += bits; + while (fill >= 8) { + *p++ = static_cast(buf & 0xFF); + buf >>= 8; + fill -= 8; + } + } + if (fill > 0) + *p = static_cast(buf & 0xFF); + } + + static inline void unpack_bits(const int8_t* in, + uint8_t* codes, + dim_t n_vals, + int bits) { + uint32_t buf = 0; + int avail = 0; + const int8_t* p = in; + const uint32_t mask = (1u << bits) - 1u; + for (dim_t i = 0; i < n_vals; ++i) { + while (avail < bits) { + buf |= (static_cast(static_cast(*p++))) << avail; + avail += 8; + } + codes[i] = static_cast(buf & mask); + buf >>= bits; + avail -= bits; + } + } + + // ----------------------------------------------------------------------- + // encode_cpu + // ----------------------------------------------------------------------- + template + void RotorQuantKV::encode_cpu(const T* kv_ptr, + int8_t* packed_ptr, + dim_t n_tokens) const { + const dim_t d = _d_head; + const int bits = _cfg.bits; + const dim_t cbytes = (d * bits + 7) / 8; // codes bytes per token + const float qmax = static_cast((1 << bits) - 1); + + cpu::parallel_for(0, n_tokens, 1, [&](dim_t begin, dim_t end) { + std::vector fvec(d); + std::vector codes(d); + + for (dim_t t = begin; t < end; ++t) { + const T* src = kv_ptr + t * d; + int8_t* dst = packed_ptr + t * _packed_stride; + + // Step 1: convert to float and optionally apply rotor. + // Phase 1: identity rotors → no rotation, just copy to float. + for (dim_t i = 0; i < d; ++i) + fvec[i] = static_cast(src[i]); + + // (Phase 2 will call clifford::rotor_sandwich per group here.) + + // Step 2: find min / max. 
+ float mn = fvec[0], mx = fvec[0]; + for (dim_t i = 1; i < d; ++i) { + if (fvec[i] < mn) mn = fvec[i]; + if (fvec[i] > mx) mx = fvec[i]; + } + float scale = mx - mn; + if (scale < 1e-9f) scale = 1e-9f; // avoid division by zero + + // Step 3: quantise. + for (dim_t i = 0; i < d; ++i) { + float q = (fvec[i] - mn) / scale * qmax; + q = std::max(0.f, std::min(qmax, std::round(q))); + codes[i] = static_cast(q); + } + + // Step 4: pack bits. + pack_bits(codes.data(), dst, d, bits); + + // Step 5: write metadata (min and scale as raw float bytes). + std::memcpy(dst + cbytes, &mn, sizeof(float)); + std::memcpy(dst + cbytes + 4, &scale, sizeof(float)); + } + }); + } + + // ----------------------------------------------------------------------- + // decode_cpu + // ----------------------------------------------------------------------- + template + void RotorQuantKV::decode_cpu(const int8_t* packed_ptr, + T* kv_ptr, + dim_t n_tokens) const { + const dim_t d = _d_head; + const int bits = _cfg.bits; + const dim_t cbytes = (d * bits + 7) / 8; + const float qmax = static_cast((1 << bits) - 1); + + cpu::parallel_for(0, n_tokens, 1, [&](dim_t begin, dim_t end) { + std::vector codes(d); + std::vector fvec(d); + + for (dim_t t = begin; t < end; ++t) { + const int8_t* src = packed_ptr + t * _packed_stride; + T* dst = kv_ptr + t * d; + + // Unpack codes. + unpack_bits(src, codes.data(), d, bits); + + // Read metadata. + float mn, scale; + std::memcpy(&mn, src + cbytes, sizeof(float)); + std::memcpy(&scale, src + cbytes + 4, sizeof(float)); + + // Dequantise. + for (dim_t i = 0; i < d; ++i) + fvec[i] = static_cast(codes[i]) / qmax * scale + mn; + + // (Phase 2: inverse rotor sandwich here.) + + // Convert back to T. + for (dim_t i = 0; i < d; ++i) + dst[i] = static_cast(fvec[i]); + } + }); + } + + // ----------------------------------------------------------------------- + // Explicit instantiations for the types used by CTranslate2. 
+ // ----------------------------------------------------------------------- + template void RotorQuantKV::encode_cpu( + const float*, int8_t*, dim_t) const; + template void RotorQuantKV::encode_cpu( + const float16_t*, int8_t*, dim_t) const; + + template void RotorQuantKV::decode_cpu( + const int8_t*, float*, dim_t) const; + template void RotorQuantKV::decode_cpu( + const int8_t*, float16_t*, dim_t) const; + + } // ops +} // ctranslate2 diff --git a/src/ops/rotor_quant_kv_gpu.cu b/src/ops/rotor_quant_kv_gpu.cu new file mode 100644 index 000000000..d38712f6b --- /dev/null +++ b/src/ops/rotor_quant_kv_gpu.cu @@ -0,0 +1,68 @@ +// GPU implementation of RotorQuantKV encode/decode. +// +// Dispatches to native CUDA kernels declared in cuda/rotor_quant_kernel.cu. + +#include "ctranslate2/ops/rotor_quant_kv.h" + +#ifdef CT2_WITH_CUDA + +#include +#include +#include + +#include "ctranslate2/types.h" + +// Forward-declare the kernel launchers from rotor_quant_kernel.cu. +namespace ctranslate2 { + namespace cuda { + template + void launch_rotor_encode(const T* kv, int8_t* packed, + int n_tokens, int d_head, + int packed_stride, int bits); + template + void launch_rotor_decode(const int8_t* packed, T* kv, + int n_tokens, int d_head, + int packed_stride, int bits); + } +} + +namespace ctranslate2 { + namespace ops { + + template + void RotorQuantKV::encode_cuda(const T* kv_ptr, + int8_t* packed_ptr, + dim_t n_tokens) const { + cuda::launch_rotor_encode(kv_ptr, packed_ptr, + static_cast(n_tokens), + static_cast(_d_head), + static_cast(_packed_stride), + _cfg.bits); + } + + template + void RotorQuantKV::decode_cuda(const int8_t* packed_ptr, + T* kv_ptr, + dim_t n_tokens) const { + cuda::launch_rotor_decode(packed_ptr, kv_ptr, + static_cast(n_tokens), + static_cast(_d_head), + static_cast(_packed_stride), + _cfg.bits); + } + + // Explicit instantiations. 
+ template void RotorQuantKV::encode_cuda( + const float*, int8_t*, dim_t) const; + template void RotorQuantKV::encode_cuda( + const float16_t*, int8_t*, dim_t) const; + + template void RotorQuantKV::decode_cuda( + const int8_t*, float*, dim_t) const; + template void RotorQuantKV::decode_cuda( + const int8_t*, float16_t*, dim_t) const; + + } // ops +} // ctranslate2 + +#endif // CT2_WITH_CUDA diff --git a/src/thread_pool.cc b/src/thread_pool.cc index d0aad775b..d1cc7e20a 100644 --- a/src/thread_pool.cc +++ b/src/thread_pool.cc @@ -1,5 +1,10 @@ #include "ctranslate2/thread_pool.h" +#include +#include + +#include + #include "ctranslate2/utils.h" namespace ctranslate2 { @@ -47,7 +52,7 @@ namespace ctranslate2 { std::unique_lock lock(_mutex); if (!can_get_job()) { - if (before_wait) + if (before_wait && !_request_end) before_wait(); _can_get_job.wait(lock, [this]{ return can_get_job(); }); } @@ -103,7 +108,32 @@ namespace ctranslate2 { } void Worker::join() { - _thread.join(); + if (!_thread.joinable()) + return; + + // Move the thread out of Worker so the helper thread fully owns its lifetime, + // preventing UAF on 'this' and the TOCTOU race between detach() and join(). + auto shared_thread = std::make_shared(std::move(_thread)); + + // Use shared_ptr for the promise so it outlives this function if we detach, + // preventing the dangling-reference UB from the original PR #2027 implementation. + auto shared_promise = std::make_shared>(); + auto join_future = shared_promise->get_future(); + + std::thread join_helper([shared_thread, shared_promise]() mutable { + if (shared_thread->joinable()) + shared_thread->join(); + shared_promise->set_value(); + }); + + constexpr auto timeout = std::chrono::milliseconds(5000); + if (join_future.wait_for(timeout) == std::future_status::timeout) { + spdlog::warn("Worker thread did not finish within timeout; detaching."); + join_helper.detach(); + // shared_thread and shared_promise are kept alive by join_helper's closure. 
+ } else { + join_helper.join(); + } } void Worker::run(JobQueue& job_queue) { @@ -146,6 +176,10 @@ namespace ctranslate2 { } ThreadPool::~ThreadPool() { + // Notify workers before closing the queue so they can stop blocking operations + // (e.g. synchronize_stream) that would cause a deadlock on Windows with CUDA. + for (auto& worker : _workers) + worker->prepare_shutdown(); _queue.close(); for (auto& worker : _workers) worker->join();