diff --git a/.gitignore b/.gitignore index 9c6801c43..62d5e25f2 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ CMake*.json python/build/ python/ctranslate2.egg-info/ python/dist/ +python/ctranslate2/ .cache docs/build/ docs/python/ diff --git a/CMakeLists.txt b/CMakeLists.txt index cf80e37b5..785cb362c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,6 +214,10 @@ set(SOURCES src/ops/awq/gemv.cc src/ops/awq/gemv_cpu.cc src/ops/sum.cc + src/cpu/clifford_ops.cc + src/cpu/clifford_kernels.cc + src/ops/rotor_quant_kv.cc + src/ops/rotor_quant_kv_cpu.cc src/padder.cc src/profiler.cc src/random.cc @@ -255,6 +259,9 @@ set(CUDA_SOURCES src/ops/awq/gemm_gpu.cu src/ops/awq/gemv_gpu.cu src/ops/awq/dequantize_gpu.cu + src/ops/rotor_quant_kv_gpu.cu + src/cuda/rotor_quant_kernel.cu + src/cuda/rotor_attention_kernel.cu ) set(LIBRARIES ${CMAKE_THREAD_LIBS_INIT} diff --git a/docs/win_build.md b/docs/win_build.md new file mode 100644 index 000000000..6fc96ac7b --- /dev/null +++ b/docs/win_build.md @@ -0,0 +1,168 @@ +# Windows でのビルド手順 + +このドキュメントでは、Windows 環境において CTranslate2 の C++ ライブラリをビルドし、その後 Python の wheel パッケージを作成するまでの手順を説明します。 + +--- + +## 前提条件 + +以下のツール・ライブラリを事前にインストールしてください。 + +| ツール | 最低バージョン | 備考 | +|--------|--------------|------| +| Visual Studio | 2019 以降 | 「C++ によるデスクトップ開発」ワークロードが必要 | +| CMake | 3.15 以降 | [cmake.org](https://cmake.org/download/) からインストール | +| Python | 3.9 以降 | | +| Intel oneAPI MKL | 2019.5 以降 | CPU バックエンドとして使用(デフォルト)。[Intel oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) に含まれる | +| CUDA Toolkit | 11.0 以降 | GPU サポートが必要な場合のみ | +| cuDNN | 8 以降 | 畳み込みモデル(音声認識等)を使用する場合のみ | + +> **Note:** 以降のコマンドはすべて **x64 Native Tools Command Prompt for VS 2019**(または VS 2022)で実行してください。 +> [スタートメニュー] → [Visual Studio 20xx] → [x64 Native Tools Command Prompt for VS 20xx] + +--- + +## 1. 
ソースコードの取得 + +```cmd +git clone --recursive https://github.com/OpenNMT/CTranslate2.git +cd CTranslate2 +``` + +サブモジュールを含めてクローンするため `--recursive` が必要です。 + +--- + +## 2. C++ ライブラリのビルド + +### 2-1. ビルドディレクトリを作成して CMake を実行 + +**CPU のみ(Intel MKL バックエンド):** + +```cmd +mkdir build +cd build +cmake .. -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="%CD%\..\install" "-DCMAKE_POLICY_VERSION_MINIMUM=3.5" +``` + +Visual Studio 2022 を使用する場合は `-G "Visual Studio 17 2022"` に変更してください。 + +**CPU + GPU(CUDA バックエンド):** + +```cmd +mkdir build +cd build +cmake .. -G "Visual Studio 16 2019" -A x64 ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DCMAKE_INSTALL_PREFIX="%CD%\..\install" ^ + -DWITH_CUDA=ON ^ + -DWITH_CUDNN=ON ^ + "-DCMAKE_POLICY_VERSION_MINIMUM=3.5" +``` + +### 2-2. ビルドとインストール + +```cmd +cmake --build . --config Release --parallel +cmake --install . --config Release +``` + +成功すると `install\` ディレクトリ以下に以下が生成されます。 + +``` +install\ + bin\ + ctranslate2_translator.exe ← CLI ツール + include\ + ctranslate2\ ← C++ ヘッダー + lib\ + ctranslate2.lib ← インポートライブラリ + bin\ + ctranslate2.dll ← 共有ライブラリ(wheel のビルドで必要) +``` + +> **Tip:** `install\bin` を `PATH` に追加しておくと、Python ラッパーが実行時に DLL を見つけやすくなります。 + +--- + +## 3. Python wheel のビルド + +### 3-1. 依存 DLL をパッケージディレクトリにコピー + +`ctranslate2.dll` は `libiomp5md.dll`(Intel OpenMP ランタイム)に依存しています。 +wheel に同梱するため、ビルド前にパッケージディレクトリへコピーします。 + +```cmd +copy "%CTRANSLATE2_ROOT%\bin\ctranslate2.dll" python\ctranslate2\ +copy "%ONEAPI_ROOT%compiler\latest\bin\libiomp5md.dll" python\ctranslate2\ +``` + +`ONEAPI_ROOT` が未設定の場合は `%ProgramFiles(x86)%\Intel\oneAPI\` を使用してください。 + +### 3-2. ビルド依存パッケージのインストール + +`uv` を使ってビルドに必要なパッケージをインストールします。 + +```cmd +cd python +uv pip install --system setuptools wheel pybind11==2.11.1 +``` + +### 3-3. 
wheel のビルド + +`build_with_msvc.py` スクリプトが `vcvarsall.bat` の環境を自動で取り込み、 +`uv build --wheel --no-build-isolation` を実行します。 + +```cmd +set CTRANSLATE2_ROOT= +python build_with_msvc.py +``` + +> **Note:** `python setup.py bdist_wheel`(旧来の方法)の代わりに `uv build` を使っています。 +> `--no-build-isolation` は MSVC コンパイラ環境を現在のシェルから継承するために必要です。 + +ビルドが完了すると `dist\` ディレクトリに `.whl` ファイルが生成されます。 + +``` +python\ + dist\ + ctranslate2-X.Y.Z-cpXX-cpXX-win_amd64.whl +``` + +### 3-4. wheel のインストール + +```cmd +uv pip install --system dist\ctranslate2-*.whl +``` + +--- + +## 4. 動作確認 + +```cmd +python -c "import ctranslate2; print(ctranslate2.__version__)" +``` + +バージョン番号が表示されれば正常にインストールされています。 + +--- + +## 5. よくあるエラーと対処 + +| エラー | 原因 | 対処 | +|--------|------|------| +| `MKL not found` | MKLROOT 環境変数が未設定 | Intel oneAPI の `setvars.bat` を実行してから cmake を再実行 | +| `CUDA not found` | CUDA_PATH 環境変数が未設定 | CUDA Toolkit の再インストール、または `-DCUDA_TOOLKIT_ROOT_DIR=` を追加 | +| `DLL not found` 実行時エラー | `install\bin` が PATH に未追加 | `set PATH=%CTRANSLATE2_ROOT%\bin;%PATH%` を実行 | +| `cl.exe not found` | x64 Native Tools Prompt を使用していない | x64 Native Tools Command Prompt から再実行 | +| `Invalid CMAKE_POLICY_VERSION_MINIMUM value "3"` | `3.5` が `3` と `.5` に分割される | `-D` オプション全体をダブルクォートで囲む: `"-DCMAKE_POLICY_VERSION_MINIMUM=3.5"` | +| `Unable to find a compatible Visual Studio installation` | 日本語 Windows で `vcvarsall.bat` の出力が UTF-16LE として正しくデコードできない | `build_with_msvc.py` を使用してビルドする(ANSI モードで環境を取得) | +| `Could not find module 'ctranslate2.dll' (or one of its dependencies)` | `libiomp5md.dll` が wheel に含まれていない | `libiomp5md.dll` を `python\ctranslate2\` にコピーしてから再ビルド | + +--- + +## 参考 + +- [Build options 一覧](installation.md#build-options) +- [Intel oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) +- [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) diff --git a/include/ctranslate2/devices.h b/include/ctranslate2/devices.h index 232307a90..f9b126633 100644 --- 
a/include/ctranslate2/devices.h +++ b/include/ctranslate2/devices.h @@ -26,6 +26,7 @@ namespace ctranslate2 { void synchronize_device(Device device, int index); void synchronize_stream(Device device); + void destroy_context(Device device); class ScopedDeviceSetter { public: diff --git a/include/ctranslate2/layers/attention.h b/include/ctranslate2/layers/attention.h index f77e0d4c5..3b8a95cc9 100644 --- a/include/ctranslate2/layers/attention.h +++ b/include/ctranslate2/layers/attention.h @@ -1,6 +1,9 @@ #pragma once +#include + #include "ctranslate2/layers/attention_layer.h" +#include "ctranslate2/ops/rotor_quant_kv.h" #include "ctranslate2/padder.h" #include "ctranslate2/layers/transformer.h" @@ -88,6 +91,10 @@ namespace ctranslate2 { const dim_t _cache_time_dim; std::unique_ptr _q_norm; // Query normalization std::unique_ptr _k_norm; // Key normalization + + // RotorQuant KV-cache compression (nullptr when disabled). + // Enabled by setting the environment variable CT2_ROTOR_QUANT_BITS=3 or =4. + std::unique_ptr _rotor_quant; }; } } diff --git a/include/ctranslate2/ops/ops.h b/include/ctranslate2/ops/ops.h index f48e56014..62adaea64 100644 --- a/include/ctranslate2/ops/ops.h +++ b/include/ctranslate2/ops/ops.h @@ -46,3 +46,4 @@ #include "awq/dequantize_awq.h" #endif #include "sum.h" +#include "rotor_quant_kv.h" diff --git a/include/ctranslate2/ops/rotor_quant_kv.h b/include/ctranslate2/ops/rotor_quant_kv.h new file mode 100644 index 000000000..4b735e780 --- /dev/null +++ b/include/ctranslate2/ops/rotor_quant_kv.h @@ -0,0 +1,102 @@ +#pragma once + +// RotorQuant KV-cache compression operator. +// +// Compresses float16/float32 KV-cache tensors using Clifford Cl(3,0) rotor +// rotation followed by per-token min-max quantization to 3 or 4 bits. 
+// +// Phase 1 (this implementation): +// - Identity rotors (no actual rotation applied, rotor sandwich = identity) +// - Per-token symmetric min-max quantization +// - No QJL residual correction +// Phase 2 (future): +// - Learned / random unit rotors per group +// - Lloyd-Max codebook quantization +// - QJL residual correction +// +// Packed buffer layout (per token vector of d_head elements): +// [0 .. codes_bytes-1] : quantized codes, LSB-packed at `bits` bits/dim +// [codes_bytes .. +3] : float32 min_val (raw bytes) +// [codes_bytes+4 .. +7] : float32 scale = max_val - min_val (raw bytes) +// +// codes_bytes = (d_head * bits + 7) / 8 +// total packed_stride = codes_bytes + 8 +// +// Cached buffers use DataType::INT8 as raw byte storage. +// The caller detects compression by checking cache.dtype() == DataType::INT8. + +#include +#include + +#include "ctranslate2/storage_view.h" + +namespace ctranslate2 { + namespace ops { + + class RotorQuantKV { + public: + struct Config { + int bits = 4; // quantisation bits per dimension (3 or 4) + }; + + explicit RotorQuantKV(dim_t d_head, const Config& cfg = {}); + + // Returns the packed stride (bytes per token) for the given d_head / config. + static dim_t compute_packed_stride(dim_t d_head, int bits); + + dim_t packed_stride() const { return _packed_stride; } + int bits() const { return _cfg.bits; } + dim_t d_head() const { return _d_head; } + + // Detect whether a cache StorageView is in compressed format. + static bool is_packed(const StorageView& v) { + return !v.empty() && v.dtype() == DataType::INT8; + } + + // encode: kv [*, d_head] float → packed [*, packed_stride] INT8 + // Works for any leading dimensions (batch * heads * time flattened). 
+ void encode(const StorageView& kv, StorageView& packed) const; + + // decode: packed [*, packed_stride] INT8 → kv [*, d_head] same dtype/device as `kv_out` + void decode(const StorageView& packed, + StorageView& kv_out, + DataType out_dtype, + Device out_device) const; + + // append: encode `new_kv` and concat to existing packed cache along the time dimension. + // packed_cache: [batch, heads, time_old, packed_stride] INT8 + // new_kv: [batch, heads, 1, d_head] float + // Result: [batch, heads, time_old+1, packed_stride] INT8 + void append(const StorageView& new_kv, + StorageView& packed_cache) const; + + private: + dim_t _d_head; + Config _cfg; + dim_t _packed_stride; + + // Per-group rotors: shape [n_groups, 4] (s, b12, b13, b23) + // Initialised to identity; future phases will populate from learned weights. + std::vector> _rotors; // size = n_groups + + template + void encode_cpu(const T* kv_ptr, + int8_t* packed_ptr, + dim_t n_tokens) const; + + template + void decode_cpu(const int8_t* packed_ptr, + T* kv_ptr, + dim_t n_tokens) const; + +#ifdef CT2_WITH_CUDA + template + void encode_cuda(const T* kv_ptr, int8_t* packed_ptr, dim_t n_tokens) const; + + template + void decode_cuda(const int8_t* packed_ptr, T* kv_ptr, dim_t n_tokens) const; +#endif + }; + + } // ops +} // ctranslate2 diff --git a/include/ctranslate2/replica_pool.h b/include/ctranslate2/replica_pool.h index 8c8e15d8e..2c3cdc53f 100644 --- a/include/ctranslate2/replica_pool.h +++ b/include/ctranslate2/replica_pool.h @@ -346,14 +346,31 @@ namespace ctranslate2 { _allocator = &get_allocator(_device); } + // #2027: Set the shutting-down flag before the queue closes so that idle() + // skips synchronize_stream() and avoids a deadlock on Windows with CUDA. 
+ void prepare_shutdown() override { + _shutting_down.store(true, std::memory_order_release); + } + void idle() override { // When no new jobs are immediately available, we synchronize the CUDA stream // so that the CudaAsyncAllocator can release some memory. - synchronize_stream(_device); + // #2027: Skip during shutdown to prevent blocking while the mutex is held, + // which would deadlock against ThreadPool::~ThreadPool() calling queue.close(). + if (!_shutting_down.load(std::memory_order_acquire)) + synchronize_stream(_device); } void finalize() override { + // #2027: Ensure the shutting-down flag is set before releasing the replica. + _shutting_down.store(true, std::memory_order_release); + _replica.reset(); + + // #1912: Explicitly free thread-local CUDA resources (cuRAND states) before + // the thread is destroyed. Releasing them after thread exit crashes on Windows + // because the CUDA context is already invalid (stack buffer overrun 0xC0000409). + destroy_context(_device); } private: @@ -362,6 +379,7 @@ namespace ctranslate2 { const size_t _num_threads; Allocator* _allocator; std::unique_ptr _replica; + std::atomic _shutting_down{false}; }; } diff --git a/include/ctranslate2/thread_pool.h b/include/ctranslate2/thread_pool.h index 826b7e57f..3af918214 100644 --- a/include/ctranslate2/thread_pool.h +++ b/include/ctranslate2/thread_pool.h @@ -61,6 +61,9 @@ namespace ctranslate2 { void start(JobQueue& job_queue, int thread_affinity = -1); void join(); + // Called before the job queue is closed, to allow workers to prepare for shutdown. + virtual void prepare_shutdown() {} + protected: // Called before the work loop. virtual void initialize() {} diff --git a/python/build_wheel.bat b/python/build_wheel.bat new file mode 100644 index 000000000..f3bfafb34 --- /dev/null +++ b/python/build_wheel.bat @@ -0,0 +1,29 @@ +@echo off +setlocal + +set SCRIPT_DIR=%~dp0 +set LOGFILE=%SCRIPT_DIR%build_log.txt + +echo [1] Initializing MSVC... 
> %LOGFILE% + +for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath`) do ( + set VS_INSTALL_DIR=%%i +) + +if not defined VS_INSTALL_DIR ( + echo ERROR: Visual Studio not found >> %LOGFILE% + exit /b 1 +) + +call "%VS_INSTALL_DIR%\VC\Auxiliary\Build\vcvarsall.bat" x64 >> %LOGFILE% 2>&1 +echo [2] MSVC done >> %LOGFILE% + +set CTRANSLATE2_ROOT=%SCRIPT_DIR%..\build\install +echo [3] CTRANSLATE2_ROOT=%CTRANSLATE2_ROOT% >> %LOGFILE% + +cd /d %SCRIPT_DIR% +echo [4] Running uv build... >> %LOGFILE% +uv build --wheel --no-build-isolation >> %LOGFILE% 2>&1 +echo [5] Exit code: %ERRORLEVEL% >> %LOGFILE% + +endlocal diff --git a/python/build_with_msvc.py b/python/build_with_msvc.py new file mode 100644 index 000000000..6ca193f27 --- /dev/null +++ b/python/build_with_msvc.py @@ -0,0 +1,75 @@ +""" +vcvarsall.bat の環境変数を Python プロセスに取り込み、uv build を実行するスクリプト。 + +日本語 Windows (cp932) では setuptools の cmd /u /c 方式が文字化けするため、 +cmd /a /c (ANSI) で環境を取得して上書きする。 +""" +import os +import subprocess +import sys + + +def get_msvc_env(arch: str = "x64") -> dict: + vswhere = os.path.join( + os.environ.get("ProgramFiles(x86)", r"C:\Program Files (x86)"), + "Microsoft Visual Studio", "Installer", "vswhere.exe", + ) + install_path = subprocess.check_output( + [vswhere, "-latest", "-requires", + "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", + "-property", "installationPath", "-products", "*"], + stderr=subprocess.DEVNULL, + ).decode("mbcs").strip() + + vcvarsall = os.path.join(install_path, "VC", "Auxiliary", "Build", "vcvarsall.bat") + + # cmd /a = ANSI モード(cp932 として正しくデコードできる) + out = subprocess.check_output( + f'cmd /a /c ""{vcvarsall}" {arch} && set"', + stderr=subprocess.STDOUT, + ).decode("mbcs", errors="replace") + + env = {} + for line in out.splitlines(): + key, _, value = line.partition("=") + if key and value: + env[key] = value 
+ return env + + +def main(): + python_version = sys.argv[1] if len(sys.argv) > 1 else None + + print("[build_with_msvc] Loading MSVC environment via vcvarsall.bat (ANSI mode)...") + msvc_env = get_msvc_env("x64") + if "PATH" not in msvc_env: + print("[build_with_msvc] ERROR: Failed to load MSVC environment.") + sys.exit(1) + + # 現在の環境に MSVC 環境をマージ(MSVC のものを優先) + merged = {**os.environ, **msvc_env} + # CTRANSLATE2_ROOT は明示的に保持 + if "CTRANSLATE2_ROOT" in os.environ: + merged["CTRANSLATE2_ROOT"] = os.environ["CTRANSLATE2_ROOT"] + # setuptools に vcvarsall.bat の再実行をスキップさせ、この環境をそのまま使わせる + merged["DISTUTILS_USE_SDK"] = "1" + + cmd = ["uv", "build", "--wheel"] + if python_version: + cmd += ["--python", python_version] + else: + cmd += ["--no-build-isolation"] + + print(f"[build_with_msvc] cl.exe: {msvc_env.get('PATH', '').split(';')[0]}") + print(f"[build_with_msvc] Running: {' '.join(cmd)}") + + result = subprocess.run( + cmd, + env=merged, + cwd=os.path.dirname(os.path.abspath(__file__)), + ) + sys.exit(result.returncode) + + +if __name__ == "__main__": + main() diff --git a/python/pyproject.toml b/python/pyproject.toml index d62e25b9b..b9e3ebd38 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,3 +1,49 @@ [build-system] requires = ["setuptools", "wheel", "pybind11==2.11.1"] build-backend = "setuptools.build_meta" + +[project] +name = "ctranslate2" +version = "4.7.1" +description = "Fast inference engine for Transformer models" +license = "MIT" +authors = [{ name = "OpenNMT" }] +requires-python = ">=3.9" +dependencies = [ + "numpy", + "pyyaml>=5.3,<7", +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.4", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: 
Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] +keywords = ["opennmt", "nmt", "neural machine translation", "cuda", "mkl", "inference", "quantization"] + +[project.urls] +Documentation = "https://opennmt.net/CTranslate2" +Forum = "https://forum.opennmt.net" +Gitter = "https://gitter.im/OpenNMT/CTranslate2" +Source = "https://github.com/OpenNMT/CTranslate2" + +[project.scripts] +ct2-fairseq-converter = "ctranslate2.converters.fairseq:main" +ct2-marian-converter = "ctranslate2.converters.marian:main" +ct2-openai-gpt2-converter = "ctranslate2.converters.openai_gpt2:main" +ct2-opennmt-py-converter = "ctranslate2.converters.opennmt_py:main" +ct2-opennmt-tf-converter = "ctranslate2.converters.opennmt_tf:main" +ct2-opus-mt-converter = "ctranslate2.converters.opus_mt:main" +ct2-transformers-converter = "ctranslate2.converters.transformers:main" + +[tool.setuptools.packages.find] +include = ["ctranslate2*"] diff --git a/python/setup.py b/python/setup.py index d161d59da..9aa86ea82 100644 --- a/python/setup.py +++ b/python/setup.py @@ -5,29 +5,12 @@ import pybind11 from pybind11.setup_helpers import ParallelCompile -from setuptools import Extension, find_packages, setup +from setuptools import Extension, setup -base_dir = os.path.dirname(os.path.abspath(__file__)) include_dirs = [pybind11.get_include()] library_dirs = [] -def _get_long_description(): - readme_path = os.path.join(base_dir, "README.md") - if not os.path.exists(readme_path): - return "" - with open(readme_path, encoding="utf-8") as readme_file: - return readme_file.read() - - -def _get_project_version(): - version_path = os.path.join(base_dir, "ctranslate2", "version.py") - version = {} - with open(version_path, encoding="utf-8") as fp: - exec(fp.read(), version) - return version["__version__"] - - def _maybe_add_library_root(lib_name): if 
"%s_ROOT" % lib_name in os.environ: root = os.environ["%s_ROOT" % lib_name] @@ -68,54 +51,6 @@ def _maybe_add_library_root(lib_name): ParallelCompile("CMAKE_BUILD_PARALLEL_LEVEL").install() setup( - name="ctranslate2", - version=_get_project_version(), - license="MIT", - description="Fast inference engine for Transformer models", - long_description=_get_long_description(), - long_description_content_type="text/markdown", - author="OpenNMT", - url="https://opennmt.net", - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.4", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "Programming Language :: Python :: 3.14", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - ], - project_urls={ - "Documentation": "https://opennmt.net/CTranslate2", - "Forum": "https://forum.opennmt.net", - "Gitter": "https://gitter.im/OpenNMT/CTranslate2", - "Source": "https://github.com/OpenNMT/CTranslate2", - }, - keywords="opennmt nmt neural machine translation cuda mkl inference quantization", - packages=find_packages(exclude=["bin"]), package_data=package_data, ext_modules=[ctranslate2_module], - python_requires=">=3.9", - install_requires=[ - "setuptools", - "numpy", - "pyyaml>=5.3,<7", - ], - entry_points={ - "console_scripts": [ - "ct2-fairseq-converter=ctranslate2.converters.fairseq:main", - "ct2-marian-converter=ctranslate2.converters.marian:main", - "ct2-openai-gpt2-converter=ctranslate2.converters.openai_gpt2:main", - "ct2-opennmt-py-converter=ctranslate2.converters.opennmt_py:main", - "ct2-opennmt-tf-converter=ctranslate2.converters.opennmt_tf:main", - 
"ct2-opus-mt-converter=ctranslate2.converters.opus_mt:main", - "ct2-transformers-converter=ctranslate2.converters.transformers:main", - ], - }, ) diff --git a/src/cpu/clifford_kernels.cc b/src/cpu/clifford_kernels.cc new file mode 100644 index 000000000..c133e680b --- /dev/null +++ b/src/cpu/clifford_kernels.cc @@ -0,0 +1,56 @@ +// SIMD-optimised Clifford Cl(3,0) rotor operations. +// +// This file is compiled once per ISA (similar to kernels.cc) and provides +// vectorised encode/decode for groups of 3 dimensions. +// +// Phase 2 uses these to replace the scalar loop in rotor_quant_kv_cpu.cc. +// +// Currently provides the fallback scalar + portable optimised paths. +// AVX2 / NEON specialisations are added incrementally. + +#include "clifford_ops.h" +#include "clifford_kernels.h" + +#include +#include + +namespace ctranslate2 { + namespace cpu { + namespace clifford { + + // ----------------------------------------------------------------------- + // rotate_groups + // + // Apply the rotor sandwich to each group of 3 consecutive floats in `vec`, + // reading rotor parameters from `rotors` (n_groups × 4 floats: s,b12,b13,b23). + // ----------------------------------------------------------------------- + void rotate_groups(float* vec, + const float* rotors, // [n_groups][4] + int64_t d, + bool inverse) { + const int n_groups = static_cast((d + 2) / 3); + for (int g = 0; g < n_groups; ++g) { + const float* R = rotors + g * 4; + const Rotor rotor{R[0], R[1], R[2], R[3]}; + + const int base = g * 3; + // Handle last (possibly partial) group. + float a = vec[base]; + float b = (base + 1 < static_cast(d)) ? vec[base + 1] : 0.f; + float c = (base + 2 < static_cast(d)) ? 
vec[base + 2] : 0.f; + + float y0, y1, y2; + if (!inverse) + rotor_sandwich(rotor, a, b, c, y0, y1, y2); + else + rotor_sandwich_inv(rotor, a, b, c, y0, y1, y2); + + vec[base] = y0; + if (base + 1 < static_cast(d)) vec[base + 1] = y1; + if (base + 2 < static_cast(d)) vec[base + 2] = y2; + } + } + + } // clifford + } // cpu +} // ctranslate2 diff --git a/src/cpu/clifford_kernels.h b/src/cpu/clifford_kernels.h new file mode 100644 index 000000000..358593fbb --- /dev/null +++ b/src/cpu/clifford_kernels.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +namespace ctranslate2 { + namespace cpu { + namespace clifford { + + // Apply Clifford rotor sandwich to groups of 3 consecutive floats. + // vec: in/out buffer of length `d` + // rotors: [n_groups x 4] packed as (s, b12, b13, b23) per group + // inverse: if true, apply the inverse sandwich + void rotate_groups(float* vec, + const float* rotors, + int64_t d, + bool inverse); + + } // clifford + } // cpu +} // ctranslate2 diff --git a/src/cpu/clifford_ops.cc b/src/cpu/clifford_ops.cc new file mode 100644 index 000000000..37456c8b5 --- /dev/null +++ b/src/cpu/clifford_ops.cc @@ -0,0 +1,136 @@ +#include "clifford_ops.h" + +#include +#include + +namespace ctranslate2 { + namespace cpu { + namespace clifford { + + // ----------------------------------------------------------------------- + // Rotor::normalised + // ----------------------------------------------------------------------- + Rotor Rotor::normalised() const { + const float norm = std::sqrt(s*s + b12*b12 + b13*b13 + b23*b23); + if (norm < 1e-9f) + return {}; // fall back to identity + const float inv = 1.f / norm; + return {s*inv, b12*inv, b13*inv, b23*inv}; + } + + // ----------------------------------------------------------------------- + // Forward sandwich: y = R · v · R̃ + // + // Derivation (Cl(3,0) geometric product, explicit component expansion): + // + // Step 1 — T = R · v where v = a·e1 + b·e2 + c·e3 + // T[e1] = s·a + p12·b + p13·c + // T[e2] = s·b - 
p12·a + p23·c + // T[e3] = s·c - p13·a - p23·b + // T[e123] = p12·c - p13·b + p23·a + // + // Step 2 — y = T · R̃ where R̃ = s - p12·e12 - p13·e13 - p23·e23 + // y[e1] = s·T1 + p12·T2 + p13·T3 + p23·T123 + // y[e2] = -p12·T1 + s·T2 + p23·T3 - p13·T123 + // y[e3] = -p13·T1 - p23·T2 + s·T3 + p12·T123 + // + // (verified against quaternion rotation for 90° test cases) + // ----------------------------------------------------------------------- + void rotor_sandwich(const Rotor& R, + float a, float b, float c, + float& y0, float& y1, float& y2) { + const float s = R.s; + const float p12 = R.b12; + const float p13 = R.b13; + const float p23 = R.b23; + + // Step 1: T = R · v + const float T1 = s*a + p12*b + p13*c; + const float T2 = s*b - p12*a + p23*c; + const float T3 = s*c - p13*a - p23*b; + const float T123 = p12*c - p13*b + p23*a; + + // Step 2: y = T · R̃ + y0 = s*T1 + p12*T2 + p13*T3 + p23*T123; + y1 = -p12*T1 + s*T2 + p23*T3 - p13*T123; + y2 = -p13*T1 - p23*T2 + s*T3 + p12*T123; + } + + // ----------------------------------------------------------------------- + // Inverse sandwich: y = R̃ · v · R + // + // Step 1 — T = R̃ · v (sign-flip on bivector components) + // T[e1] = s·a - p12·b - p13·c + // T[e2] = s·b + p12·a - p23·c + // T[e3] = s·c + p13·a + p23·b + // T[e123] = -p12·c + p13·b - p23·a + // + // Step 2 — y = T · R + // y[e1] = s·T1 - p12·T2 - p13·T3 - p23·T123 + // y[e2] = p12·T1 + s·T2 + p13·T123 - p23·T3 + // y[e3] = -p12·T123 + p13·T1 + p23·T2 + s·T3 + // ----------------------------------------------------------------------- + void rotor_sandwich_inv(const Rotor& R, + float a, float b, float c, + float& y0, float& y1, float& y2) { + const float s = R.s; + const float p12 = R.b12; + const float p13 = R.b13; + const float p23 = R.b23; + + // Step 1: T = R̃ · v + const float T1 = s*a - p12*b - p13*c; + const float T2 = s*b + p12*a - p23*c; + const float T3 = s*c + p13*a + p23*b; + const float T123 = -p12*c + p13*b - p23*a; + + // Step 2: y = T · R + y0 
= s*T1 - p12*T2 - p13*T3 - p23*T123; + y1 = p12*T1 + s*T2 - p23*T3 + p13*T123; + y2 = p13*T1 + p23*T2 + s*T3 - p12*T123; + } + + // ----------------------------------------------------------------------- + // random_rotor + // + // Deterministic "random" unit rotor from a simple hash of the indices. + // When all indices are 0, returns the identity rotor. + // ----------------------------------------------------------------------- + Rotor random_rotor(int group_idx, int head_idx, int layer_idx) { + if (group_idx == 0 && head_idx == 0 && layer_idx == 0) + return {}; + + // Deterministic hash: mix indices into a 32-bit seed. + uint32_t seed = static_cast(group_idx) + ^ (static_cast(head_idx) * 2654435761u) + ^ (static_cast(layer_idx) * 1234567891u); + // Xorshift32 + auto xorshift = [](uint32_t x) -> uint32_t { + x ^= x << 13; x ^= x >> 17; x ^= x << 5; + return x; + }; + seed = xorshift(seed ? seed : 1u); + const float u0 = static_cast(seed) / static_cast(0xFFFFFFFFu); + seed = xorshift(seed); + const float u1 = static_cast(seed) / static_cast(0xFFFFFFFFu); + seed = xorshift(seed); + const float u2 = static_cast(seed) / static_cast(0xFFFFFFFFu); + seed = xorshift(seed); + const float u3 = static_cast(seed) / static_cast(0xFFFFFFFFu); + + // Uniform distribution on S³ via Marsaglia (1972): + const float sq1 = std::sqrt(1.f - u0); + const float sq2 = std::sqrt(u0); + const float a = sq1 * std::sin(2.f * 3.14159265358979323846f * u1); + const float b = sq1 * std::cos(2.f * 3.14159265358979323846f * u1); + const float c = sq2 * std::sin(2.f * 3.14159265358979323846f * u2); + const float d = sq2 * std::cos(2.f * 3.14159265358979323846f * u2); + (void)u3; + + // Map (a,b,c,d) → (s, b12, b13, b23) + return Rotor{a, b, c, d}; + } + + } // clifford + } // cpu +} // ctranslate2 diff --git a/src/cpu/clifford_ops.h b/src/cpu/clifford_ops.h new file mode 100644 index 000000000..d2636d6f7 --- /dev/null +++ b/src/cpu/clifford_ops.h @@ -0,0 +1,56 @@ +#pragma once + +// Clifford 
algebra Cl(3,0) rotor operations for RotorQuant KV-cache compression. +// +// A rotor R in Cl(3,0) lives in the even sub-algebra: +// R = s·1 + b12·e12 + b13·e13 + b23·e23 +// with norm s²+b12²+b13²+b23² = 1 (unit rotor). +// +// The sandwich product R·v·R̃ rotates a grade-1 vector v ∈ span{e1,e2,e3} +// to another grade-1 vector. R̃ is the reverse (conjugate) of R. +// +// Correspondence with quaternions (for reference): +// s ↔ w, b23 ↔ i, -b13 ↔ j, b12 ↔ k + +namespace ctranslate2 { + namespace cpu { + namespace clifford { + + // Clifford Cl(3,0) rotor: R = s + b12·e12 + b13·e13 + b23·e23 + // Identity rotor: s=1, b12=b13=b23=0 + struct Rotor { + float s = 1.f; + float b12 = 0.f; + float b13 = 0.f; + float b23 = 0.f; + + Rotor() = default; + Rotor(float s_, float b12_, float b13_, float b23_) + : s(s_), b12(b12_), b13(b13_), b23(b23_) {} + + // Return normalised version (unit rotor). + Rotor normalised() const; + + // Return the reverse/conjugate: R̃ = s - b12·e12 - b13·e13 - b23·e23 + Rotor reversed() const { return {s, -b12, -b13, -b23}; } + }; + + // Forward sandwich: y = R · [a,b,c]ᵀ · R̃ + // (rotation of the grade-1 vector a·e1+b·e2+c·e3) + void rotor_sandwich(const Rotor& R, + float a, float b, float c, + float& y0, float& y1, float& y2); + + // Inverse sandwich: y = R̃ · [a,b,c]ᵀ · R + // Undoes the forward rotation. + void rotor_sandwich_inv(const Rotor& R, + float a, float b, float c, + float& y0, float& y1, float& y2); + + // Generate a random unit rotor seeded by (group_idx, head_idx, layer_idx). + // Returns identity when all indices are 0. 
+ Rotor random_rotor(int group_idx, int head_idx, int layer_idx = 0); + + } // clifford + } // cpu +} // ctranslate2 diff --git a/src/cuda/random.cu b/src/cuda/random.cu index f016bb447..a0ff684de 100644 --- a/src/cuda/random.cu +++ b/src/cuda/random.cu @@ -48,12 +48,17 @@ namespace ctranslate2 { curandState* _states; }; + static thread_local std::unique_ptr> states; + curandStatePhilox4_32_10_t* get_curand_states(size_t num_states) { - static thread_local std::unique_ptr> states; if (!states || num_states > states->num_states()) states = std::make_unique>(num_states); return states->states(); } + void free_curand_states() { + states.reset(); + } + } } diff --git a/src/cuda/random.h b/src/cuda/random.h index 11ceba3be..a75401745 100644 --- a/src/cuda/random.h +++ b/src/cuda/random.h @@ -13,6 +13,7 @@ namespace ctranslate2 { namespace cuda { curandStatePhilox4_32_10_t* get_curand_states(size_t num_states); + void free_curand_states(); } } diff --git a/src/cuda/rotor_attention_kernel.cu b/src/cuda/rotor_attention_kernel.cu new file mode 100644 index 000000000..36b104646 --- /dev/null +++ b/src/cuda/rotor_attention_kernel.cu @@ -0,0 +1,133 @@ +// Fused RotorQuant Attention CUDA kernel. +// +// Computes: score(q, K_packed) = q · decode(K_packed)^T in one pass, +// without fully materialising the decoded K matrix. +// +// Converted from the Triton implementation in RotorQuant +// turboquant/fused_attention.py (MIT licence). +// +// Two-term estimator (from TurboQuant / RotorQuant): +// score_i = term1_i + term2_i +// term1_i = (MSE reconstruction) +// term2_i = residual_norm_i * sqrt(pi/2) / m +// * sum_j(qjl_sign_ij * ) (QJL correction) +// +// Phase 1 (this file): term1 only (no QJL), using min-max decoded keys. +// Phase 2 (TODO): add QJL term2, use Lloyd-Max codebook. 
+ +#include "ctranslate2/ops/rotor_quant_kv.h" + +#ifdef CT2_WITH_CUDA + +#include +#include "cuda/helpers.h" +#include "cuda/utils.h" + +namespace ctranslate2 { + namespace cuda { + + // ----------------------------------------------------------------------- + // rotor_fused_attn_kernel + // + // Each block computes attention scores for one (batch_head) pair. + // Threads in the block iterate over key tokens. + // + // Grid: (batch * heads,) + // Block: (BLOCK_T,) where BLOCK_T tiles the key-sequence dimension. + // + // Output scores: [batch, heads, 1, seq_k] (single query step) + // ----------------------------------------------------------------------- + template + __global__ void rotor_fused_attn_kernel( + const T* __restrict__ q, // [batch, heads, 1, d_head] + const int8_t* __restrict__ k_packed, // [batch, heads, seq_k, packed_stride] + float* __restrict__ scores, // [batch, heads, 1, seq_k] + int d_head, + int seq_k, + int packed_stride, + int bits, + int codes_bytes, + float scale_factor) { // 1 / sqrt(d_head) + + const int bh = blockIdx.x; // flattened batch-head index + const int t_base = threadIdx.x; + + // Pointer to this batch-head's query vector. + const T* q_bh = q + (long long)bh * d_head; + const int8_t* k_bh = k_packed + (long long)bh * seq_k * packed_stride; + float* s_bh = scores + (long long)bh * seq_k; + + // Load query to registers (up to d_head elements, tiled by BLOCK_T). + // For simplicity, load full d_head in each thread (works well when d_head ≤ 256). + // A production implementation would use shared memory. + + for (int t = t_base; t < seq_k; t += BLOCK_T) { + const int8_t* k_tok = k_bh + (long long)t * packed_stride; + + // Decode key token on-the-fly. + float mn, scale_kv; + memcpy(&mn, k_tok + codes_bytes, sizeof(float)); + memcpy(&scale_kv, k_tok + codes_bytes + 4, sizeof(float)); + + const float qmax = static_cast((1 << bits) - 1); + const uint32_t mask = (1u << bits) - 1u; + + // Dot product q · decode(k_tok). 
+ float dot = 0.f; + uint32_t buf = 0u; + int avail = 0; + const int8_t* p = k_tok; + + for (int d = 0; d < d_head; ++d) { + while (avail < bits) { + buf |= static_cast(static_cast(*p++)) << avail; + avail += 8; + } + const uint32_t code = buf & mask; + buf >>= bits; + avail -= bits; + const float k_val = static_cast(code) / qmax * scale_kv + mn; + dot += static_cast(q_bh[d]) * k_val; + } + + s_bh[t] = dot * scale_factor; + } + } + + // ----------------------------------------------------------------------- + // launch_rotor_fused_attn + // ----------------------------------------------------------------------- + template + void launch_rotor_fused_attn( + const T* q, // [batch, heads, 1, d_head] + const int8_t* k_packed, // [batch, heads, seq_k, packed_stride] + float* scores, // [batch, heads, 1, seq_k] (caller-allocated) + int batch, + int heads, + int seq_k, + int d_head, + int packed_stride, + int bits) { + if (seq_k == 0) return; + + const int codes_bytes = (d_head * bits + 7) / 8; + const float scale_factor = 1.f / sqrtf(static_cast(d_head)); + const int bh_count = batch * heads; + const int block_t = 64; + + rotor_fused_attn_kernel, 64> + <<>>( + cuda::device_cast(q), k_packed, scores, + d_head, seq_k, packed_stride, bits, codes_bytes, scale_factor); + } + + // Explicit instantiations. + template void launch_rotor_fused_attn( + const float*, const int8_t*, float*, int, int, int, int, int, int); + template void launch_rotor_fused_attn( + const float16_t*, const int8_t*, float*, int, int, int, int, int, int); + + } // cuda +} // ctranslate2 + +#endif // CT2_WITH_CUDA diff --git a/src/cuda/rotor_quant_kernel.cu b/src/cuda/rotor_quant_kernel.cu new file mode 100644 index 000000000..5e98c148a --- /dev/null +++ b/src/cuda/rotor_quant_kernel.cu @@ -0,0 +1,214 @@ +// RotorQuant encode/decode CUDA kernels. +// +// Ported from RotorQuant csrc/rotor_fused_kernel.cu (MIT licence). +// PyTorch / ATen dependencies replaced with CTranslate2 CUDA helpers. 
+//
+// Phase 1: per-token min-max quantisation, identity rotors.
+// Phase 2 (TODO): per-group Clifford rotor sandwich, Lloyd-Max codebook.
+//
+// Kernel signatures follow CTranslate2 conventions:
+//  - Raw device pointers (no torch::Tensor)
+//  - Float16 via cuda::device_type = __half
+//  - CUDA stream via cuda::get_cuda_stream()
+
+#include "ctranslate2/ops/rotor_quant_kv.h"
+
+#ifdef CT2_WITH_CUDA
+
+#include 
+#include 
+
+#include "cuda/helpers.h"
+#include "cuda/utils.h"
+
+namespace ctranslate2 {
+  namespace cuda {
+
+    // -----------------------------------------------------------------------
+    // Bit-pack helpers (device-side)
+    //
+    // NOTE(review): neither helper is called by the encode/decode kernels in
+    // this file — those kernels stream codes through a local bit-buffer
+    // instead.  These are presumably kept for the Phase-2 per-thread-per-
+    // element layout; confirm before relying on them.
+    // -----------------------------------------------------------------------
+
+    // Read `bits`-bit code at position `idx` from packed byte array `src`.
+    // Codes are LSB-first, so a code may straddle a byte boundary.
+    __device__ inline uint32_t read_code(const int8_t* src, int idx, int bits) {
+      const int bit_pos = idx * bits;
+      const int byte_idx = bit_pos >> 3;
+      const int shift = bit_pos & 7;
+      // Read 2 bytes to handle cross-byte codes (bits ≤ 8 so 2 bytes always sufficient).
+      // NOTE(review): for the last code this reads src[byte_idx + 1] even when
+      // that byte is past the packed codes — safe only because the encoder
+      // stores 8 metadata bytes immediately after the codes; verify if the
+      // layout changes.
+      uint32_t val = static_cast(static_cast(src[byte_idx]));
+      if (shift + bits > 8)
+        val |= static_cast(static_cast(src[byte_idx + 1])) << 8;
+      return (val >> shift) & ((1u << bits) - 1u);
+    }
+
+    // Write `bits`-bit value `v` at position `idx` in packed byte array `dst`.
+    // Note: assumes the target bytes are zero-initialised before writing.
+    // NOTE(review): atomicOr operates on a 32-bit word, so `dst + byte_idx`
+    // must be 4-byte aligned and the OR touches up to 3 bytes beyond byte_idx
+    // — for codes near the end of a token's code region that word overlaps
+    // the min/scale metadata bytes.  Confirm alignment and add padding (or a
+    // byte-wise fallback) before wiring this helper into a kernel.
+    __device__ inline void write_code(int8_t* dst, int idx, int bits, uint32_t v) {
+      const int bit_pos = idx * bits;
+      const int byte_idx = bit_pos >> 3;
+      const int shift = bit_pos & 7;
+      atomicOr(reinterpret_cast(dst + byte_idx),
+               (v & ((1u << bits) - 1u)) << shift);
+    }
+
+    // -----------------------------------------------------------------------
+    // rotor_encode_kernel
+    //
+    // Each thread processes one token (one row of shape [d_head]).
+    // Grid:  (ceil(n_tokens / BLOCK),), Block: (BLOCK,)
+    // The last block is only partially full, so the kernel must bound-check
+    // the flattened thread index against the real token count.
+    // -----------------------------------------------------------------------
+    template <typename T>
+    __global__ void rotor_encode_kernel(const T* __restrict__ kv,        // [n_tokens, d_head]
+                                        int8_t* __restrict__ packed,     // [n_tokens, packed_stride]
+                                        int n_tokens,
+                                        int d_head,
+                                        int packed_stride,
+                                        int bits,
+                                        int codes_bytes) {
+      const int t = blockIdx.x * blockDim.x + threadIdx.x;
+      // BUGFIX: the previous guard compared t against gridDim.x * blockDim.x,
+      // which is always false, so tail threads of the last block read kv and
+      // wrote packed out of bounds.  Bound against the real token count
+      // (now passed in as n_tokens) instead.
+      if (t >= n_tokens)
+        return;
+
+      const T* src = kv + (long long)t * d_head;
+      int8_t* dst = packed + (long long)t * packed_stride;
+
+      // ---- Step 1: find min / max ----
+      // BUGFIX: do not round-trip the first element through __half
+      // (__half2float(__float2half(...))): for float inputs that silently
+      // truncated the minimum to fp16 precision.
+      float mn = static_cast<float>(src[0]);
+      float mx = mn;
+      for (int i = 1; i < d_head; ++i) {
+        const float v = static_cast<float>(src[i]);
+        if (v < mn) mn = v;
+        if (v > mx) mx = v;
+      }
+      float scale = mx - mn;
+      if (scale < 1e-9f)
+        scale = 1e-9f;  // avoid division by zero on constant rows
+
+      // ---- Step 2: quantise + pack ----
+      const float qmax = static_cast<float>((1 << bits) - 1);
+      const uint32_t mask = (1u << bits) - 1u;
+
+      // Zero the code bytes.
+      for (int b = 0; b < codes_bytes; ++b)
+        dst[b] = 0;
+
+      // Stream codes LSB-first through a local bit buffer.
+      uint32_t buf = 0u;
+      int fill = 0;
+      int byte_out = 0;
+      for (int i = 0; i < d_head; ++i) {
+        float q = (static_cast<float>(src[i]) - mn) / scale * qmax;
+        if (q < 0.f) q = 0.f;
+        if (q > qmax) q = qmax;
+        const uint32_t code = static_cast<uint32_t>(__float2int_rn(q));
+        buf |= (code & mask) << fill;
+        fill += bits;
+        while (fill >= 8) {
+          dst[byte_out++] = static_cast<int8_t>(buf & 0xFFu);
+          buf >>= 8;
+          fill -= 8;
+        }
+      }
+      if (fill > 0)
+        dst[byte_out] = static_cast<int8_t>(buf & 0xFFu);
+
+      // ---- Step 3: write metadata (float32 min, float32 scale) ----
+      memcpy(dst + codes_bytes, &mn, sizeof(float));
+      memcpy(dst + codes_bytes + 4, &scale, sizeof(float));
+    }
+
+    // -----------------------------------------------------------------------
+    // rotor_decode_kernel
+    // -----------------------------------------------------------------------
+    template <typename T>
+    __global__ void rotor_decode_kernel(const int8_t* __restrict__ packed,  // [n_tokens, packed_stride]
+                                        T* __restrict__ kv,                 // [n_tokens, d_head]
+                                        int n_tokens,
+                                        int d_head,
+                                        int packed_stride,
+                                        int bits,
+                                        int codes_bytes) {
+      const int t = blockIdx.x * blockDim.x + threadIdx.x;
+      // BUGFIX: same tail-thread out-of-bounds issue as rotor_encode_kernel;
+      // guard against the real token count.
+      if (t >= n_tokens)
+        return;
+
+      const int8_t* src = packed + (long long)t * packed_stride;
+      T* dst = kv + (long long)t * d_head;
+
+      // Read metadata.
+      float mn, scale;
+      memcpy(&mn, src + codes_bytes, sizeof(float));
+      memcpy(&scale, src + codes_bytes + 4, sizeof(float));
+
+      // Unpack codes and dequantise.
+      const float qmax = static_cast<float>((1 << bits) - 1);
+      const uint32_t mask = (1u << bits) - 1u;
+
+      uint32_t buf = 0u;
+      int avail = 0;
+      const int8_t* p = src;
+      for (int i = 0; i < d_head; ++i) {
+        while (avail < bits) {
+          buf |= static_cast<uint32_t>(static_cast<uint8_t>(*p++)) << avail;
+          avail += 8;
+        }
+        const uint32_t code = buf & mask;
+        buf >>= bits;
+        avail -= bits;
+        const float val = static_cast<float>(code) / qmax * scale + mn;
+        dst[i] = static_cast<T>(val);
+      }
+    }
+
+    // -----------------------------------------------------------------------
+    // Launch wrappers (signatures unchanged; they now forward n_tokens to the
+    // kernels so the per-thread bound check is correct).
+    // -----------------------------------------------------------------------
+    template <typename T>
+    void launch_rotor_encode(const T* kv,
+                             int8_t* packed,
+                             int n_tokens,
+                             int d_head,
+                             int packed_stride,
+                             int bits) {
+      if (n_tokens == 0)
+        return;
+      const int codes_bytes = (d_head * bits + 7) / 8;
+      const int threads = 128;
+      const int blocks = (n_tokens + threads - 1) / threads;
+      rotor_encode_kernel<cuda::device_type<T>>
+        <<<blocks, threads, 0, cuda::get_cuda_stream()>>>(
+          cuda::device_cast(kv), packed,
+          n_tokens, d_head, packed_stride, bits, codes_bytes);
+    }
+
+    template <typename T>
+    void launch_rotor_decode(const int8_t* packed,
+                             T* kv,
+                             int n_tokens,
+                             int d_head,
+                             int packed_stride,
+                             int bits) {
+      if (n_tokens == 0)
+        return;
+      const int codes_bytes = (d_head * bits + 7) / 8;
+      const int threads = 128;
+      const int blocks = (n_tokens + threads - 1) / threads;
+      rotor_decode_kernel<cuda::device_type<T>>
+        <<<blocks, threads, 0, cuda::get_cuda_stream()>>>(
+          packed, cuda::device_cast(kv),
+          n_tokens, d_head, packed_stride, bits, codes_bytes);
+    }
+
+    // Explicit instantiations.
+ template void launch_rotor_encode( + const float*, int8_t*, int, int, int, int); + template void launch_rotor_encode( + const float16_t*, int8_t*, int, int, int, int); + + template void launch_rotor_decode( + const int8_t*, float*, int, int, int, int); + template void launch_rotor_decode( + const int8_t*, float16_t*, int, int, int, int); + + } // cuda +} // ctranslate2 + +// ----------------------------------------------------------------------- +// Hook rotor_quant_kv_gpu.cu encode_cuda / decode_cuda into the kernels. +// The Phase-1 GPU stub in rotor_quant_kv_gpu.cu round-trips through CPU. +// This file provides the real CUDA path but is wired in Phase 2 only +// (see WITH_ROTOR_QUANT_CUDA define below). +// ----------------------------------------------------------------------- + +#endif // CT2_WITH_CUDA diff --git a/src/devices.cc b/src/devices.cc index a2936e0a6..d49a86f97 100644 --- a/src/devices.cc +++ b/src/devices.cc @@ -2,6 +2,7 @@ #ifdef CT2_WITH_CUDA # include "cuda/utils.h" +# include "cuda/random.h" #endif #ifdef CT2_WITH_TENSOR_PARALLEL # include @@ -118,6 +119,16 @@ namespace ctranslate2 { (void)device; #endif } + + void destroy_context(Device device) { +#ifdef CT2_WITH_CUDA + if (device == Device::CUDA) { + cuda::free_curand_states(); + } +#else + (void)device; +#endif + } // Initialize the static member variable #ifdef CT2_WITH_TENSOR_PARALLEL std::vector ScopedMPISetter::_nccl_comms; diff --git a/src/layers/attention.cc b/src/layers/attention.cc index dbbf1ed38..23f4abc21 100644 --- a/src/layers/attention.cc +++ b/src/layers/attention.cc @@ -9,6 +9,7 @@ #include "dispatch.h" #include "cpu/parallel.h" +#include "env.h" namespace ctranslate2 { namespace layers { @@ -324,6 +325,14 @@ namespace ctranslate2 { scope + "/relative_attention_max_distance"); else _maximum_relative_position = 0; + + // RotorQuant KV-cache compression: enabled by CT2_ROTOR_QUANT_BITS=3|4. 
+ const int rq_bits = read_int_from_env("CT2_ROTOR_QUANT_BITS", 0); + if (rq_bits == 3 || rq_bits == 4) { + ops::RotorQuantKV::Config rq_cfg; + rq_cfg.bits = rq_bits; + _rotor_quant = std::make_unique(_d_head, rq_cfg); + } } DataType MultiHeadAttention::output_type() const { @@ -503,32 +512,117 @@ namespace ctranslate2 { } if (cached_keys != nullptr) { - if (cached_keys->empty()) { - *cached_keys = std::move(keys_proj); - *cached_values = std::move(values_proj); + if (_rotor_quant && keys_proj.rank() == 4) { + // --- RotorQuant compressed path --- + // Only active for standard 4-D KV tensors [batch, heads, time, d_head]. + // Falls back to standard path for merged-time-head-dims layouts. + // cached_keys / cached_values store INT8 packed buffers. + // Shape convention: [batch, heads, time, packed_stride]. + if (cached_keys->empty()) { + // Prefill: encode the full keys_proj / values_proj. + // keys_proj shape: [batch, heads, time, d_head] (after split_heads) + _rotor_quant->encode(keys_proj, *cached_keys); + _rotor_quant->encode(values_proj, *cached_values); + cached_keys->reshape( + {keys_proj.dim(0), keys_proj.dim(1), + keys_proj.dim(2), _rotor_quant->packed_stride()}); + cached_values->reshape( + {values_proj.dim(0), values_proj.dim(1), + values_proj.dim(2), _rotor_quant->packed_stride()}); + } else { + // Decode existing cache, concat with new tokens, re-encode. + // (Efficient incremental append will be added in Phase 2.) + StorageView& tmp = fused_proj; // Reuse storage. + + // Decode existing packed buffers to float. 
+ StorageView old_keys(dtype, device), old_vals(dtype, device); + const dim_t old_time = cached_keys->dim(2); + const dim_t batch = cached_keys->dim(0); + const dim_t heads = cached_keys->dim(1); + StorageView ck_flat(DataType::INT8, device), cv_flat(DataType::INT8, device); + ck_flat.shallow_copy(*cached_keys); + cv_flat.shallow_copy(*cached_values); + ck_flat.reshape({batch * heads * old_time, _rotor_quant->packed_stride()}); + cv_flat.reshape({batch * heads * old_time, _rotor_quant->packed_stride()}); + _rotor_quant->decode(ck_flat, old_keys, dtype, device); + _rotor_quant->decode(cv_flat, old_vals, dtype, device); + old_keys.reshape({batch, heads, old_time, _d_head}); + old_vals.reshape({batch, heads, old_time, _d_head}); + + // Concat along time. + const ops::Concat concat_op(_cache_time_dim); + tmp = std::move(old_keys); + concat_op({&tmp, &keys_proj}, old_keys); + tmp = std::move(old_vals); + concat_op({&tmp, &values_proj}, old_vals); + + // Apply sliding window if needed. + if (!prefilling && _sliding_window > 0 + && old_keys.shape()[2] > _sliding_window) { + const ops::Slide slide_op(2, 1, old_keys.shape()[2] - 1); + slide_op(old_keys, tmp); old_keys = std::move(tmp); + slide_op(old_vals, tmp); old_vals = std::move(tmp); + } + + // Re-encode the merged tensor. + const dim_t new_time = old_keys.dim(2); + _rotor_quant->encode(old_keys, *cached_keys); + _rotor_quant->encode(old_vals, *cached_values); + cached_keys->reshape({batch, heads, new_time, _rotor_quant->packed_stride()}); + cached_values->reshape({batch, heads, new_time, _rotor_quant->packed_stride()}); + + // keys_proj / values_proj for attention = merged float tensors. + keys_proj = std::move(old_keys); + values_proj = std::move(old_vals); + } } else { - const ops::Concat concat_op(_cache_time_dim); - StorageView& tmp = fused_proj; // Reuse storage. 
- tmp = std::move(*cached_keys); - concat_op({&tmp, &keys_proj}, *cached_keys); - tmp = std::move(*cached_values); - concat_op({&tmp, &values_proj}, *cached_values); - - if (!prefilling && _sliding_window > 0 && cached_keys->shape()[2] > _sliding_window) { - // only for generation - const ops::Slide slide_op(2, 1, cached_keys->shape()[2] - 1); - slide_op(*cached_keys, tmp); - *cached_keys = std::move(tmp); - slide_op(*cached_values, tmp); - *cached_values = std::move(tmp); + // --- Standard (uncompressed) path --- + if (cached_keys->empty()) { + *cached_keys = std::move(keys_proj); + *cached_values = std::move(values_proj); + } else { + const ops::Concat concat_op(_cache_time_dim); + StorageView& tmp = fused_proj; // Reuse storage. + tmp = std::move(*cached_keys); + concat_op({&tmp, &keys_proj}, *cached_keys); + tmp = std::move(*cached_values); + concat_op({&tmp, &values_proj}, *cached_values); + + if (!prefilling && _sliding_window > 0 && cached_keys->shape()[2] > _sliding_window) { + const ops::Slide slide_op(2, 1, cached_keys->shape()[2] - 1); + slide_op(*cached_keys, tmp); + *cached_keys = std::move(tmp); + slide_op(*cached_values, tmp); + *cached_values = std::move(tmp); + } } } } } if (cached_keys) { - keys_proj.shallow_copy(*cached_keys); - values_proj.shallow_copy(*cached_values); + if (_rotor_quant && cached_keys->rank() == 4 + && ops::RotorQuantKV::is_packed(*cached_keys)) { + // Decode the packed cache into float for attention computation. + // (keys_proj / values_proj may already be set in the concat branch above.) 
+ if (keys_proj.empty() || keys_proj.dtype() != dtype) { + const dim_t batch = cached_keys->dim(0); + const dim_t heads = cached_keys->dim(1); + const dim_t time = cached_keys->dim(2); + StorageView ck_flat(DataType::INT8, device), cv_flat(DataType::INT8, device); + ck_flat.shallow_copy(*cached_keys); + cv_flat.shallow_copy(*cached_values); + ck_flat.reshape({batch * heads * time, _rotor_quant->packed_stride()}); + cv_flat.reshape({batch * heads * time, _rotor_quant->packed_stride()}); + _rotor_quant->decode(ck_flat, keys_proj, dtype, device); + _rotor_quant->decode(cv_flat, values_proj, dtype, device); + keys_proj.reshape({batch, heads, time, _d_head}); + values_proj.reshape({batch, heads, time, _d_head}); + } + } else { + keys_proj.shallow_copy(*cached_keys); + values_proj.shallow_copy(*cached_values); + } } StorageView& context = fused_proj; // Reuse storage. diff --git a/src/ops/rotor_quant_kv.cc b/src/ops/rotor_quant_kv.cc new file mode 100644 index 000000000..1f4d9cfc0 --- /dev/null +++ b/src/ops/rotor_quant_kv.cc @@ -0,0 +1,145 @@ +#include "ctranslate2/ops/rotor_quant_kv.h" + +#include +#include + +#include "ctranslate2/types.h" +#include "ctranslate2/ops/concat.h" +#include "dispatch.h" + +namespace ctranslate2 { + namespace ops { + + // ------------------------------------------------------------------------- + // compute_packed_stride + // ------------------------------------------------------------------------- + dim_t RotorQuantKV::compute_packed_stride(dim_t d_head, int bits) { + const dim_t codes_bytes = (d_head * bits + 7) / 8; + return codes_bytes + 8; // +8 for float32 min + float32 scale + } + + // ------------------------------------------------------------------------- + // Constructor + // ------------------------------------------------------------------------- + RotorQuantKV::RotorQuantKV(dim_t d_head, const Config& cfg) + : _d_head(d_head) + , _cfg(cfg) + , _packed_stride(compute_packed_stride(d_head, cfg.bits)) + { + if (cfg.bits != 3 && 
cfg.bits != 4) + throw std::invalid_argument("RotorQuantKV: bits must be 3 or 4"); + + // Initialise rotors to identity for all groups. + const dim_t n_groups = (d_head + 2) / 3; + _rotors.resize(n_groups, {1.f, 0.f, 0.f, 0.f}); + } + + // ------------------------------------------------------------------------- + // encode (dispatch wrapper) + // ------------------------------------------------------------------------- + void RotorQuantKV::encode(const StorageView& kv, StorageView& packed) const { + PROFILE("RotorQuantKV::encode"); + + const dim_t n_tokens = kv.size() / _d_head; + const Device dev = kv.device(); + packed = StorageView({n_tokens, _packed_stride}, DataType::INT8, dev); + if (dev == Device::CPU) { + if (kv.dtype() == DataType::FLOAT32) + encode_cpu(kv.data(), packed.data(), n_tokens); + else if (kv.dtype() == DataType::FLOAT16) + encode_cpu(kv.data(), packed.data(), n_tokens); + else + throw std::invalid_argument("RotorQuantKV::encode: unsupported dtype " + + dtype_name(kv.dtype())); + } +#ifdef CT2_WITH_CUDA + else if (dev == Device::CUDA) { + if (kv.dtype() == DataType::FLOAT32) + encode_cuda(kv.data(), packed.data(), n_tokens); + else if (kv.dtype() == DataType::FLOAT16) + encode_cuda(kv.data(), packed.data(), n_tokens); + else + throw std::invalid_argument("RotorQuantKV::encode (CUDA): unsupported dtype " + + dtype_name(kv.dtype())); + } +#endif + else { + throw std::invalid_argument("RotorQuantKV::encode: unsupported device"); + } + } + + // ------------------------------------------------------------------------- + // decode (dispatch wrapper) + // ------------------------------------------------------------------------- + void RotorQuantKV::decode(const StorageView& packed, + StorageView& kv_out, + DataType out_dtype, + Device out_device) const { + PROFILE("RotorQuantKV::decode"); + + const dim_t n_tokens = packed.size() / _packed_stride; + kv_out = StorageView({n_tokens, _d_head}, out_dtype, out_device); + + if (out_device == Device::CPU) { + 
if (out_dtype == DataType::FLOAT32) + decode_cpu(packed.data(), kv_out.data(), n_tokens); + else if (out_dtype == DataType::FLOAT16) + decode_cpu(packed.data(), kv_out.data(), n_tokens); + else + throw std::invalid_argument("RotorQuantKV::decode: unsupported dtype " + + dtype_name(out_dtype)); + } +#ifdef CT2_WITH_CUDA + else if (out_device == Device::CUDA) { + if (out_dtype == DataType::FLOAT32) + decode_cuda(packed.data(), kv_out.data(), n_tokens); + else if (out_dtype == DataType::FLOAT16) + decode_cuda(packed.data(), kv_out.data(), n_tokens); + else + throw std::invalid_argument("RotorQuantKV::decode (CUDA): unsupported dtype " + + dtype_name(out_dtype)); + } +#endif + else { + throw std::invalid_argument("RotorQuantKV::decode: unsupported device"); + } + } + + // ------------------------------------------------------------------------- + // append + // Encodes new_kv and appends to packed_cache along the time dimension. + // packed_cache: [batch, heads, time_old, packed_stride] INT8 (or empty) + // new_kv: [batch, heads, time_new, d_head] float + // ------------------------------------------------------------------------- + void RotorQuantKV::append(const StorageView& new_kv, + StorageView& packed_cache) const { + PROFILE("RotorQuantKV::append"); + + if (packed_cache.empty()) { + // First call: just encode. + encode(new_kv, packed_cache); + // Restore 4-dim shape [batch, heads, time, packed_stride] if new_kv is 4D. + if (new_kv.rank() == 4) { + packed_cache.reshape({new_kv.dim(0), new_kv.dim(1), + new_kv.dim(2), _packed_stride}); + } + return; + } + + // Encode new tokens. + StorageView new_packed(DataType::INT8, new_kv.device()); + encode(new_kv, new_packed); + if (new_kv.rank() == 4) + new_packed.reshape({new_kv.dim(0), new_kv.dim(1), + new_kv.dim(2), _packed_stride}); + + // Concat along time dimension (dim 2 for 4D, dim 0 for 2D). + const dim_t cat_dim = (packed_cache.rank() == 4) ? 
2 : 0; + StorageView tmp(DataType::INT8, new_kv.device()); + const ops::Concat concat_op(cat_dim); + tmp = std::move(packed_cache); + concat_op({&tmp, &new_packed}, packed_cache); + } + + } // ops +} // ctranslate2 diff --git a/src/ops/rotor_quant_kv_cpu.cc b/src/ops/rotor_quant_kv_cpu.cc new file mode 100644 index 000000000..c3dc1cc5b --- /dev/null +++ b/src/ops/rotor_quant_kv_cpu.cc @@ -0,0 +1,181 @@ +#include "ctranslate2/ops/rotor_quant_kv.h" + +// CPU implementation of RotorQuantKV encode/decode. +// +// Encoding algorithm (per token vector v of d_head floats): +// 1. Apply Clifford rotor sandwich per 3-element group (identity in Phase 1). +// 2. Compute min_val = min(v), scale = max(v) - min(v). +// 3. Quantise each element to [0, 2^bits-1]: +// code_i = round((v_i - min_val) / scale * (2^bits - 1)) +// 4. Pack codes as 'bits'-bit integers into bytes (LSB first). +// 5. Write [packed_codes | min_val_bytes | scale_bytes] to the output buffer. +// +// Decoding reverses the above. + +#include +#include +#include +#include + +#include "ctranslate2/types.h" +#include "cpu/clifford_ops.h" +#include "cpu/parallel.h" + +namespace ctranslate2 { + namespace ops { + + // ----------------------------------------------------------------------- + // Bit-packing helpers + // + // We pack `n_vals` values, each occupying `bits` bits, LSB-first into + // consecutive bytes starting at `out`. 
+ // ----------------------------------------------------------------------- + + static inline void pack_bits(const uint8_t* codes, + int8_t* out, + dim_t n_vals, + int bits) { + uint32_t buf = 0; + int fill = 0; + int8_t* p = out; + for (dim_t i = 0; i < n_vals; ++i) { + buf |= (static_cast(codes[i]) & ((1u << bits) - 1u)) << fill; + fill += bits; + while (fill >= 8) { + *p++ = static_cast(buf & 0xFF); + buf >>= 8; + fill -= 8; + } + } + if (fill > 0) + *p = static_cast(buf & 0xFF); + } + + static inline void unpack_bits(const int8_t* in, + uint8_t* codes, + dim_t n_vals, + int bits) { + uint32_t buf = 0; + int avail = 0; + const int8_t* p = in; + const uint32_t mask = (1u << bits) - 1u; + for (dim_t i = 0; i < n_vals; ++i) { + while (avail < bits) { + buf |= (static_cast(static_cast(*p++))) << avail; + avail += 8; + } + codes[i] = static_cast(buf & mask); + buf >>= bits; + avail -= bits; + } + } + + // ----------------------------------------------------------------------- + // encode_cpu + // ----------------------------------------------------------------------- + template + void RotorQuantKV::encode_cpu(const T* kv_ptr, + int8_t* packed_ptr, + dim_t n_tokens) const { + const dim_t d = _d_head; + const int bits = _cfg.bits; + const dim_t cbytes = (d * bits + 7) / 8; // codes bytes per token + const float qmax = static_cast((1 << bits) - 1); + + cpu::parallel_for(0, n_tokens, 1, [&](dim_t begin, dim_t end) { + std::vector fvec(d); + std::vector codes(d); + + for (dim_t t = begin; t < end; ++t) { + const T* src = kv_ptr + t * d; + int8_t* dst = packed_ptr + t * _packed_stride; + + // Step 1: convert to float and optionally apply rotor. + // Phase 1: identity rotors → no rotation, just copy to float. + for (dim_t i = 0; i < d; ++i) + fvec[i] = static_cast(src[i]); + + // (Phase 2 will call clifford::rotor_sandwich per group here.) + + // Step 2: find min / max. 
+ float mn = fvec[0], mx = fvec[0]; + for (dim_t i = 1; i < d; ++i) { + if (fvec[i] < mn) mn = fvec[i]; + if (fvec[i] > mx) mx = fvec[i]; + } + float scale = mx - mn; + if (scale < 1e-9f) scale = 1e-9f; // avoid division by zero + + // Step 3: quantise. + for (dim_t i = 0; i < d; ++i) { + float q = (fvec[i] - mn) / scale * qmax; + q = std::max(0.f, std::min(qmax, std::round(q))); + codes[i] = static_cast(q); + } + + // Step 4: pack bits. + pack_bits(codes.data(), dst, d, bits); + + // Step 5: write metadata (min and scale as raw float bytes). + std::memcpy(dst + cbytes, &mn, sizeof(float)); + std::memcpy(dst + cbytes + 4, &scale, sizeof(float)); + } + }); + } + + // ----------------------------------------------------------------------- + // decode_cpu + // ----------------------------------------------------------------------- + template + void RotorQuantKV::decode_cpu(const int8_t* packed_ptr, + T* kv_ptr, + dim_t n_tokens) const { + const dim_t d = _d_head; + const int bits = _cfg.bits; + const dim_t cbytes = (d * bits + 7) / 8; + const float qmax = static_cast((1 << bits) - 1); + + cpu::parallel_for(0, n_tokens, 1, [&](dim_t begin, dim_t end) { + std::vector codes(d); + std::vector fvec(d); + + for (dim_t t = begin; t < end; ++t) { + const int8_t* src = packed_ptr + t * _packed_stride; + T* dst = kv_ptr + t * d; + + // Unpack codes. + unpack_bits(src, codes.data(), d, bits); + + // Read metadata. + float mn, scale; + std::memcpy(&mn, src + cbytes, sizeof(float)); + std::memcpy(&scale, src + cbytes + 4, sizeof(float)); + + // Dequantise. + for (dim_t i = 0; i < d; ++i) + fvec[i] = static_cast(codes[i]) / qmax * scale + mn; + + // (Phase 2: inverse rotor sandwich here.) + + // Convert back to T. + for (dim_t i = 0; i < d; ++i) + dst[i] = static_cast(fvec[i]); + } + }); + } + + // ----------------------------------------------------------------------- + // Explicit instantiations for the types used by CTranslate2. 
+ // ----------------------------------------------------------------------- + template void RotorQuantKV::encode_cpu( + const float*, int8_t*, dim_t) const; + template void RotorQuantKV::encode_cpu( + const float16_t*, int8_t*, dim_t) const; + + template void RotorQuantKV::decode_cpu( + const int8_t*, float*, dim_t) const; + template void RotorQuantKV::decode_cpu( + const int8_t*, float16_t*, dim_t) const; + + } // ops +} // ctranslate2 diff --git a/src/ops/rotor_quant_kv_gpu.cu b/src/ops/rotor_quant_kv_gpu.cu new file mode 100644 index 000000000..d38712f6b --- /dev/null +++ b/src/ops/rotor_quant_kv_gpu.cu @@ -0,0 +1,68 @@ +// GPU implementation of RotorQuantKV encode/decode. +// +// Dispatches to native CUDA kernels declared in cuda/rotor_quant_kernel.cu. + +#include "ctranslate2/ops/rotor_quant_kv.h" + +#ifdef CT2_WITH_CUDA + +#include +#include +#include + +#include "ctranslate2/types.h" + +// Forward-declare the kernel launchers from rotor_quant_kernel.cu. +namespace ctranslate2 { + namespace cuda { + template + void launch_rotor_encode(const T* kv, int8_t* packed, + int n_tokens, int d_head, + int packed_stride, int bits); + template + void launch_rotor_decode(const int8_t* packed, T* kv, + int n_tokens, int d_head, + int packed_stride, int bits); + } +} + +namespace ctranslate2 { + namespace ops { + + template + void RotorQuantKV::encode_cuda(const T* kv_ptr, + int8_t* packed_ptr, + dim_t n_tokens) const { + cuda::launch_rotor_encode(kv_ptr, packed_ptr, + static_cast(n_tokens), + static_cast(_d_head), + static_cast(_packed_stride), + _cfg.bits); + } + + template + void RotorQuantKV::decode_cuda(const int8_t* packed_ptr, + T* kv_ptr, + dim_t n_tokens) const { + cuda::launch_rotor_decode(packed_ptr, kv_ptr, + static_cast(n_tokens), + static_cast(_d_head), + static_cast(_packed_stride), + _cfg.bits); + } + + // Explicit instantiations. 
+ template void RotorQuantKV::encode_cuda( + const float*, int8_t*, dim_t) const; + template void RotorQuantKV::encode_cuda( + const float16_t*, int8_t*, dim_t) const; + + template void RotorQuantKV::decode_cuda( + const int8_t*, float*, dim_t) const; + template void RotorQuantKV::decode_cuda( + const int8_t*, float16_t*, dim_t) const; + + } // ops +} // ctranslate2 + +#endif // CT2_WITH_CUDA diff --git a/src/thread_pool.cc b/src/thread_pool.cc index d0aad775b..d1cc7e20a 100644 --- a/src/thread_pool.cc +++ b/src/thread_pool.cc @@ -1,5 +1,10 @@ #include "ctranslate2/thread_pool.h" +#include +#include + +#include + #include "ctranslate2/utils.h" namespace ctranslate2 { @@ -47,7 +52,7 @@ namespace ctranslate2 { std::unique_lock lock(_mutex); if (!can_get_job()) { - if (before_wait) + if (before_wait && !_request_end) before_wait(); _can_get_job.wait(lock, [this]{ return can_get_job(); }); } @@ -103,7 +108,32 @@ namespace ctranslate2 { } void Worker::join() { - _thread.join(); + if (!_thread.joinable()) + return; + + // Move the thread out of Worker so the helper thread fully owns its lifetime, + // preventing UAF on 'this' and the TOCTOU race between detach() and join(). + auto shared_thread = std::make_shared(std::move(_thread)); + + // Use shared_ptr for the promise so it outlives this function if we detach, + // preventing the dangling-reference UB from the original PR #2027 implementation. + auto shared_promise = std::make_shared>(); + auto join_future = shared_promise->get_future(); + + std::thread join_helper([shared_thread, shared_promise]() mutable { + if (shared_thread->joinable()) + shared_thread->join(); + shared_promise->set_value(); + }); + + constexpr auto timeout = std::chrono::milliseconds(5000); + if (join_future.wait_for(timeout) == std::future_status::timeout) { + spdlog::warn("Worker thread did not finish within timeout; detaching."); + join_helper.detach(); + // shared_thread and shared_promise are kept alive by join_helper's closure. 
+ } else { + join_helper.join(); + } } void Worker::run(JobQueue& job_queue) { @@ -146,6 +176,10 @@ namespace ctranslate2 { } ThreadPool::~ThreadPool() { + // Notify workers before closing the queue so they can stop blocking operations + // (e.g. synchronize_stream) that would cause a deadlock on Windows with CUDA. + for (auto& worker : _workers) + worker->prepare_shutdown(); _queue.close(); for (auto& worker : _workers) worker->join();