OpenNMT · PyRin-c · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026
diff --git a/.gitignore b/.gitignore
@@ -7,6 +7,7 @@ CMake*.json
 python/build/
 python/ctranslate2.egg-info/
 python/dist/
+python/ctranslate2/
 .cache
 docs/build/
 docs/python/
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -214,6 +214,10 @@ set(SOURCES
   src/ops/awq/gemv.cc
   src/ops/awq/gemv_cpu.cc
   src/ops/sum.cc
+  src/cpu/clifford_ops.cc
+  src/cpu/clifford_kernels.cc
+  src/ops/rotor_quant_kv.cc
+  src/ops/rotor_quant_kv_cpu.cc
   src/padder.cc
   src/profiler.cc
   src/random.cc
@@ -255,6 +259,9 @@ set(CUDA_SOURCES
   src/ops/awq/gemm_gpu.cu
   src/ops/awq/gemv_gpu.cu
   src/ops/awq/dequantize_gpu.cu
+  src/ops/rotor_quant_kv_gpu.cu
+  src/cuda/rotor_quant_kernel.cu
+  src/cuda/rotor_attention_kernel.cu
 )
 set(LIBRARIES
   ${CMAKE_THREAD_LIBS_INIT}

diff --git a/docs/win_build.md b/docs/win_build.md
@@ -0,0 +1,168 @@
+# Windows でのビルド手順
+
+このドキュメントでは、Windows 環境において CTranslate2 の C++ ライブラリをビルドし、その後 Python の wheel パッケージを作成するまでの手順を説明します。
+
+---
+
+## 前提条件
+
+以下のツール・ライブラリを事前にインストールしてください。
+
+| ツール | 最低バージョン | 備考 |
+|--------|--------------|------|
+| Visual Studio | 2019 以降 | 「C++ によるデスクトップ開発」ワークロードが必要 |
+| CMake | 3.15 以降 | [cmake.org](https://cmake.org/download/) からインストール |
+| Python | 3.9 以降 | |
+| Intel oneAPI MKL | 2019.5 以降 | CPU バックエンドとして使用（デフォルト）。[Intel oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) に含まれる |
+| CUDA Toolkit | 11.0 以降 | GPU サポートが必要な場合のみ |
+| cuDNN | 8 以降 | 畳み込みモデル（音声認識等）を使用する場合のみ |
+
+> **Note:** 以降のコマンドはすべて **x64 Native Tools Command Prompt for VS 2019**（または VS 2022）で実行してください。
+> [スタートメニュー] → [Visual Studio 20xx] → [x64 Native Tools Command Prompt for VS 20xx]
+
+---
+
+## 1. ソースコードの取得
+
+```cmd
+git clone --recursive https://github.com/OpenNMT/CTranslate2.git
+cd CTranslate2
+```
+
+サブモジュールを含めてクローンするため `--recursive` が必要です。
+
+---
+
+## 2. C++ ライブラリのビルド
+
+### 2-1. ビルドディレクトリを作成して CMake を実行
+
+**CPU のみ（Intel MKL バックエンド）:**
+
+```cmd
+mkdir build
+cd build
+cmake .. -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="%CD%\..\install" "-DCMAKE_POLICY_VERSION_MINIMUM=3.5"
+```
+
+Visual Studio 2022 を使用する場合は `-G "Visual Studio 17 2022"` に変更してください。
+
+**CPU + GPU（CUDA バックエンド）:**
+
+```cmd
+mkdir build
+cd build
+cmake .. -G "Visual Studio 16 2019" -A x64 ^
+    -DCMAKE_BUILD_TYPE=Release ^
+    -DCMAKE_INSTALL_PREFIX="%CD%\..\install" ^
+    -DWITH_CUDA=ON ^
+    -DWITH_CUDNN=ON ^
+    "-DCMAKE_POLICY_VERSION_MINIMUM=3.5"
+```
+
+### 2-2. ビルドとインストール
+
+```cmd
+cmake --build . --config Release --parallel
+cmake --install . --config Release
+```
+
+成功すると `install\` ディレクトリ配下に次のファイルが生成されます。
+
+```
+install\
+  bin\
+    ctranslate2_translator.exe   ← CLI ツール
+    ctranslate2.dll              ← 共有ライブラリ（wheel のビルドで必要）
+  include\
+    ctranslate2\                 ← C++ ヘッダー
+  lib\
+    ctranslate2.lib              ← インポートライブラリ
+```
+
+> **Tip:** `install\bin` を `PATH` に追加しておくと、Python ラッパーが実行時に DLL を見つけやすくなります。
+
+---
+
+## 3. Python wheel のビルド
+
+### 3-1. 依存 DLL をパッケージディレクトリにコピー
+
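+`ctranslate2.dll` は `libiomp5md.dll`（Intel OpenMP ランタイム）に依存しています。
+wheel に同梱するため、ビルド前に両方の DLL をパッケージディレクトリへコピーします。
+以下のコマンドはリポジトリのルートディレクトリで実行してください。また、`CTRANSLATE2_ROOT` には手順 2 のインストール先（`install` ディレクトリ）を事前に設定しておく必要があります。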
+
+```cmd
+copy "%CTRANSLATE2_ROOT%\bin\ctranslate2.dll" python\ctranslate2\
+copy "%ONEAPI_ROOT%compiler\latest\bin\libiomp5md.dll" python\ctranslate2\
+```
+
+`ONEAPI_ROOT` が未設定の場合は `%ProgramFiles(x86)%\Intel\oneAPI\` を使用してください。
+
+### 3-2. ビルド依存パッケージのインストール
+
+`uv` を使ってビルドに必要なパッケージをインストールします。
+
+```cmd
+cd python
+uv pip install --system setuptools wheel pybind11==2.11.1
+```
+
+### 3-3. wheel のビルド
+
+`build_with_msvc.py` スクリプトが `vcvarsall.bat` の環境を自動で取り込み、
+`uv build --wheel --no-build-isolation` を実行します。
+
+```cmd
+set CTRANSLATE2_ROOT=<C++ライブラリのインストールパス>
+python build_with_msvc.py
+```
+
+> **Note:** `python setup.py bdist_wheel`（旧来の方法）の代わりに `uv build` を使っています。
+> `--no-build-isolation` は MSVC コンパイラ環境を現在のシェルから継承するために必要です。
+
+ビルドが完了すると `dist\` ディレクトリに `.whl` ファイルが生成されます。
+
+```
+python\
+  dist\
+    ctranslate2-X.Y.Z-cpXX-cpXX-win_amd64.whl
+```
+
+### 3-4. wheel のインストール
+
+```cmd
+rem コマンドプロンプトはワイルドカード（*）を展開しないため、生成された .whl のファイル名をそのまま指定してください。
+uv pip install --system dist\ctranslate2-X.Y.Z-cpXX-cpXX-win_amd64.whl
+```
+
+---
+
+## 4. 動作確認
+
+```cmd
+python -c "import ctranslate2; print(ctranslate2.__version__)"
+```
+
+バージョン番号が表示されれば正常にインストールされています。
+
+---
+
+## 5. よくあるエラーと対処
+
+| エラー | 原因 | 対処 |
+|--------|------|------|
+| `MKL not found` | MKLROOT 環境変数が未設定 | Intel oneAPI の `setvars.bat` を実行してから cmake を再実行 |
+| `CUDA not found` | CUDA_PATH 環境変数が未設定 | CUDA Toolkit の再インストール、または `-DCUDA_TOOLKIT_ROOT_DIR=<path>` を追加 |
+| `DLL not found` 実行時エラー | `install\bin` が PATH に未追加 | `set PATH=%CTRANSLATE2_ROOT%\bin;%PATH%` を実行 |
+| `cl.exe not found` | x64 Native Tools Prompt を使用していない | x64 Native Tools Command Prompt から再実行 |
+| `Invalid CMAKE_POLICY_VERSION_MINIMUM value "3"` | `3.5` が `3` と `.5` に分割される | `-D` オプション全体をダブルクォートで囲む: `"-DCMAKE_POLICY_VERSION_MINIMUM=3.5"` |
+| `Unable to find a compatible Visual Studio installation` | 日本語 Windows で `vcvarsall.bat` の出力が UTF-16LE として正しくデコードできない | `build_with_msvc.py` を使用してビルドする（ANSI モードで環境を取得） |
+| `Could not find module 'ctranslate2.dll' (or one of its dependencies)` | `libiomp5md.dll` が wheel に含まれていない | `libiomp5md.dll` を `python\ctranslate2\` にコピーしてから再ビルド |
+
+---
+
+## 参考
+
+- [Build options 一覧](installation.md#build-options)
+- [Intel oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html)
+- [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit)
diff --git a/include/ctranslate2/devices.h b/include/ctranslate2/devices.h
@@ -26,6 +26,7 @@ namespace ctranslate2 {
 
   void synchronize_device(Device device, int index);
   void synchronize_stream(Device device);
+  void destroy_context(Device device);
 
   class ScopedDeviceSetter {
   public:

diff --git a/include/ctranslate2/layers/attention.h b/include/ctranslate2/layers/attention.h
@@ -1,6 +1,9 @@
 #pragma once
 
+#include <memory>
+
 #include "ctranslate2/layers/attention_layer.h"
+#include "ctranslate2/ops/rotor_quant_kv.h"
 #include "ctranslate2/padder.h"
 #include "ctranslate2/layers/transformer.h"
 
@@ -88,6 +91,10 @@ namespace ctranslate2 {
       const dim_t _cache_time_dim;
       std::unique_ptr<const LayerNorm> _q_norm;  // Query normalization
       std::unique_ptr<const LayerNorm> _k_norm;  // Key normalization
+
+      // RotorQuant KV-cache compression (nullptr when disabled).
+      // Enabled by setting the environment variable CT2_ROTOR_QUANT_BITS=3 or =4.
+      std::unique_ptr<ops::RotorQuantKV> _rotor_quant;
     };
   }
 }
diff --git a/include/ctranslate2/ops/ops.h b/include/ctranslate2/ops/ops.h
@@ -46,3 +46,4 @@
 #include "awq/dequantize_awq.h"
 #endif
 #include "sum.h"
+#include "rotor_quant_kv.h"
diff --git a/include/ctranslate2/ops/rotor_quant_kv.h b/include/ctranslate2/ops/rotor_quant_kv.h
@@ -0,0 +1,102 @@
+#pragma once
+
+// RotorQuant KV-cache compression operator.
+//
+// Compresses float16/float32 KV-cache tensors using Clifford Cl(3,0) rotor
+// rotation followed by per-token min-max quantization to 3 or 4 bits.
+//
+// Phase 1 (this implementation):
+//   - Identity rotors (no actual rotation applied, rotor sandwich = identity)
+//   - Per-token min-max quantization (offset = min_val, scale = max_val - min_val)
+//   - No QJL residual correction
+// Phase 2 (future):
+//   - Learned / random unit rotors per group
+//   - Lloyd-Max codebook quantization
+//   - QJL residual correction
+//
+// Packed buffer layout (per token vector of d_head elements):
+//   [0 .. codes_bytes-1]               : quantized codes, LSB-packed at `bits` bits/dim
+//   [codes_bytes .. codes_bytes+3]     : float32 min_val (raw bytes)
+//   [codes_bytes+4 .. codes_bytes+7]   : float32 scale = max_val - min_val (raw bytes)
+//
+//   codes_bytes = (d_head * bits + 7) / 8
+//   total packed_stride = codes_bytes + 8
+//
+// Cached buffers use DataType::INT8 as raw byte storage.
+// The caller detects compression by checking cache.dtype() == DataType::INT8.
+
+#include <array>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "ctranslate2/storage_view.h"
+
+namespace ctranslate2 {
+  namespace ops {
+
+    // Encodes KV-cache tensors into the packed INT8 byte layout described at the top of
+    // this header, and decodes packed buffers back into float tensors.
+    class RotorQuantKV {
+    public:
+      struct Config {
+        int bits = 4;  // quantisation bits per dimension (3 or 4)
+      };
+
+      // Builds an operator for head dimension `d_head` with a default-constructed Config.
+      // This is an inline delegating overload rather than a `cfg = {}` default argument:
+      // GCC and Clang reject a default argument that needs the default member initializer
+      // of a nested class (Config::bits) before the enclosing class is complete, whereas
+      // a constructor body is only parsed once the class is complete.
+      explicit RotorQuantKV(dim_t d_head)
+        : RotorQuantKV(d_head, Config{}) {
+      }
+
+      // Builds an operator for head dimension `d_head` with the given configuration.
+      explicit RotorQuantKV(dim_t d_head, const Config& cfg);
+
+      // Returns the packed stride (bytes per token) for the given d_head / config.
+      static dim_t compute_packed_stride(dim_t d_head, int bits);
+
+      dim_t packed_stride() const { return _packed_stride; }
+      int   bits()          const { return _cfg.bits; }
+      dim_t d_head()        const { return _d_head; }
+
+      // Detect whether a cache StorageView is in compressed format:
+      // any non-empty view whose dtype is INT8 is treated as packed.
+      static bool is_packed(const StorageView& v) {
+        return !v.empty() && v.dtype() == DataType::INT8;
+      }
+
+      // encode: kv [*, d_head] float → packed [*, packed_stride] INT8
+      // Works for any leading dimensions (batch * heads * time flattened).
+      void encode(const StorageView& kv, StorageView& packed) const;
+
+      // decode: packed [*, packed_stride] INT8 → kv_out [*, d_head]
+      // NOTE(review): kv_out is presumably produced with dtype `out_dtype` on device
+      // `out_device` — confirm against the definition.
+      void decode(const StorageView& packed,
+                  StorageView& kv_out,
+                  DataType out_dtype,
+                  Device   out_device) const;
+
+      // append: encode `new_kv` and concat to existing packed cache along the time dimension.
+      // packed_cache: [batch, heads, time_old, packed_stride] INT8
+      // new_kv:       [batch, heads, 1,        d_head]       float
+      // Result:       [batch, heads, time_old+1, packed_stride] INT8
+      void append(const StorageView& new_kv,
+                  StorageView& packed_cache) const;
+
+    private:
+      dim_t  _d_head;
+      Config _cfg;
+      dim_t  _packed_stride;
+
+      // Per-group rotors: shape [n_groups, 4]  (s, b12, b13, b23)
+      // Initialised to identity; future phases will populate from learned weights.
+      std::vector<std::array<float, 4>> _rotors;  // size = n_groups
+
+      // CPU kernels operating on raw pointers for `n_tokens` token vectors.
+      template <typename T>
+      void encode_cpu(const T* kv_ptr,
+                      int8_t*  packed_ptr,
+                      dim_t    n_tokens) const;
+
+      template <typename T>
+      void decode_cpu(const int8_t* packed_ptr,
+                      T*            kv_ptr,
+                      dim_t         n_tokens) const;
+
+#ifdef CT2_WITH_CUDA
+      // CUDA counterparts, only declared when the library is built with CUDA support.
+      template <typename T>
+      void encode_cuda(const T* kv_ptr, int8_t* packed_ptr, dim_t n_tokens) const;
+
+      template <typename T>
+      void decode_cuda(const int8_t* packed_ptr, T* kv_ptr, dim_t n_tokens) const;
+#endif
+    };
+
+  } // ops
+} // ctranslate2
diff --git a/include/ctranslate2/replica_pool.h b/include/ctranslate2/replica_pool.h
@@ -346,14 +346,31 @@ namespace ctranslate2 {
       _allocator = &get_allocator(_device);
     }
 
+    // #2027: Raise the shutting-down flag before the job queue closes so that idle() skips
+    // synchronize_stream() (deadlock on Windows with CUDA); pairs with the acquire load in idle().
+    void prepare_shutdown() override {
+      _shutting_down.store(true, std::memory_order_release);
+    }
+
     void idle() override {
       // When no new jobs are immediately available, we synchronize the CUDA stream
       // so that the CudaAsyncAllocator can release some memory.
-      synchronize_stream(_device);
+      // #2027: Skip once _shutting_down is set (see prepare_shutdown()): blocking here while the
+      // mutex is held would deadlock against ThreadPool::~ThreadPool() calling queue.close().
+      if (!_shutting_down.load(std::memory_order_acquire))
+        synchronize_stream(_device);
     }
 
     void finalize() override {
+      // #2027: Set the flag here as well, so it is raised before the replica is released.
+      _shutting_down.store(true, std::memory_order_release);
+
       _replica.reset();
+
+      // #1912: Explicitly free thread-local CUDA resources (cuRAND states) while this thread
+      // is still alive. Releasing them after thread exit crashes on Windows because the
+      // CUDA context is already invalid (stack buffer overrun 0xC0000409).
+      destroy_context(_device);
     }
 
   private:
@@ -362,6 +379,7 @@ namespace ctranslate2 {
     const size_t _num_threads;
     Allocator* _allocator;
     std::unique_ptr<Replica> _replica;
+    std::atomic<bool> _shutting_down{false};  // Set by prepare_shutdown()/finalize(); read by idle().
   };
 
 }
diff --git a/include/ctranslate2/thread_pool.h b/include/ctranslate2/thread_pool.h
@@ -61,6 +61,9 @@ namespace ctranslate2 {
     void start(JobQueue& job_queue, int thread_affinity = -1);
     void join();
 
+    // Hook called before the job queue is closed so that a worker can prepare for shutdown. Does nothing by default.
+    virtual void prepare_shutdown() {}
+
   protected:
     // Called before the work loop.
     virtual void initialize() {}