diff --git a/.gitmodules b/.gitmodules
index a4db716..6889aed 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,3 @@
 [submodule "test/googletest"]
 	path = test/googletest
 	url = https://github.com/google/googletest.git
-[submodule "third_party/TinyFA"]
-	path = third_party/TinyFA
-	url = https://github.com/keith2018/TinyFA.git
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 76ea47c..aec31e2 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -94,11 +94,3 @@ if (TINYTORCH_USE_NCCL)
     target_include_directories(${PROJECT_NAME} PUBLIC ${NCCL_INCLUDE_DIRS})
     target_link_libraries(${PROJECT_NAME} PUBLIC ${NCCL_LIBRARY})
 endif ()
-
-# TinyFA
-if (TINYTORCH_USE_CUDA)
-    set(TFA_BUILD_TESTS OFF CACHE BOOL "" FORCE)
-    set(TFA_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
-    add_subdirectory(${THIRD_PARTY_DIR}/TinyFA TinyFA)
-    target_link_libraries(${PROJECT_NAME} PRIVATE TinyFA::tinyfa)
-endif ()
diff --git a/src/Function/FuncFused.h b/src/Function/FuncFused.h
index 4ac04ad..b8d9f76 100644
--- a/src/Function/FuncFused.h
+++ b/src/Function/FuncFused.h
@@ -18,6 +18,19 @@ class FuncSiluMul : public Function<FuncSiluMul> {
   static void backward(AutogradContext* ctx, const Tensor& grad) { NOT_IMPLEMENTED(); }
 };
 
+class FuncFusedAddRmsNorm : public Function<FuncFusedAddRmsNorm> {
+ public:
+  static void forward(AutogradContext* ctx, Tensor& input, Tensor& residual, const Tensor& weight, float eps) {
+    op::fusedAddRmsNorm(input, residual, weight, eps);
+  }
+
+  static void backward(AutogradContext* ctx, const Tensor& grad) { NOT_IMPLEMENTED(); }
+};
+
 inline Tensor siluMul(const Tensor& x) { return FuncSiluMul::apply(x); }
 
+inline void fusedAddRmsNorm(Tensor& input, Tensor& residual, const Tensor& weight, float eps) {
+  FuncFusedAddRmsNorm::apply(input, residual, weight, eps);
+}
+
 }  // namespace tinytorch::function
diff --git a/src/Function/FuncNNLayer.h b/src/Function/FuncNNLayer.h
index f9ac33d..8791e14 100644
--- a/src/Function/FuncNNLayer.h
+++ b/src/Function/FuncNNLayer.h
@@ -267,24 +267,6 @@ class FuncSDPAttention : public Function<FuncSDPAttention> {
   static void backward(AutogradContext* ctx, const Tensor& grad) { NOT_IMPLEMENTED(); }
 };
 
-class FuncFlashAttention : public Function<FuncFlashAttention> {
- public:
-  static Tensor forward(AutogradContext* ctx, const Tensor& query, const Tensor& key, const Tensor& value,
-                        bool isCausal) {
-    return op::flashAttention(query, key, value, isCausal);
-  }
-  static void backward(AutogradContext* ctx, const Tensor& grad) { NOT_IMPLEMENTED(); }
-};
-
-class FuncRoPE : public Function<FuncRoPE> {
- public:
-  static Tensor forward(AutogradContext* ctx, const Tensor& input, const Tensor& rope, int64_t offset,
-                        QKVLayout layout) {
-    return op::ropeApply(input, rope, offset, layout);
-  }
-  static void backward(AutogradContext* ctx, const Tensor& grad) { NOT_IMPLEMENTED(); }
-};
-
 inline Tensor linear(const Tensor& input, const Tensor& weight, const Tensor& bias = {}) {
   return FuncLinear::apply(input, weight, bias);
 }
@@ -318,12 +300,5 @@ inline Tensor sdpAttention(const Tensor& query, const Tensor& key, const Tensor&
                            std::optional<float> scale = std::nullopt) {
   return FuncSDPAttention::apply(query, key, value, isCausal, attnMask, dropoutP, scale);
 }
-inline Tensor flashAttention(const Tensor& query, const Tensor& key, const Tensor& value, bool isCausal = false) {
-  return FuncFlashAttention::apply(query, key, value, isCausal);
-}
-inline Tensor ropeApply(const Tensor& input, const Tensor& rope, int64_t offset = 0,
-                        QKVLayout layout = QKVLayout::BHSD) {
-  return FuncRoPE::apply(input, rope, offset, layout);
-}
 
 }  // namespace tinytorch::function
diff --git a/src/Module/Basic.cpp b/src/Module/Basic.cpp
index dd73f1f..6449515 100644
--- a/src/Module/Basic.cpp
+++ b/src/Module/Basic.cpp
@@ -94,20 +94,4 @@ std::vector<std::pair<std::string, TensorPtr>> Embedding::namedParameters_() { r
 
 void Embedding::resetParameters() { Initializer::normal(weight_); }
 
-RoPE::RoPE(int64_t headDim, int64_t contextLength, float thetaBase, std::optional<RopeScalingConfig> scaling,
-           Options options)
-    : headDim_(headDim), contextLength_(contextLength), thetaBase_(thetaBase), scaling_(scaling), options_(options) {
-  RoPE::resetParameters();
-}
-
-Tensor RoPE::forward(const Tensor &input) { return function::ropeApply(input, rope_); }
-
-Tensor RoPE::forward(const Tensor &input, int64_t offset, QKVLayout layout) {
-  return function::ropeApply(input, rope_, offset, layout);
-}
-
-void RoPE::resetParameters() { rope_ = op::ropeInit(headDim_, contextLength_, thetaBase_, scaling_, options_); }
-
-std::vector<std::pair<std::string, TensorPtr>> RoPE::namedStates_() { return {{"rope", &rope_}}; }
-
 }  // namespace tinytorch::nn
\ No newline at end of file
diff --git a/src/Module/Basic.h b/src/Module/Basic.h
index c861520..4e50dc9 100644
--- a/src/Module/Basic.h
+++ b/src/Module/Basic.h
@@ -138,34 +138,4 @@ class Embedding : public Module {
   Tensor weight_;
 };
 
-class RoPE : public Module {
- public:
-  explicit RoPE(int64_t headDim, int64_t contextLength = 4096, float thetaBase = 10000.0f,
-                std::optional<RopeScalingConfig> scaling = std::nullopt, Options options = {});
-
-  using Module::forward;
-  Tensor forward(const Tensor &input) override;
-  Tensor forward(const Tensor &input, int64_t offset, QKVLayout layout = QKVLayout::BHSD);
-
-  using Module::operator();
-  Tensor operator()(const Tensor &input, int64_t offset, QKVLayout layout = QKVLayout::BHSD) {
-    return forward(input, offset, layout);
-  }
-
-  void resetParameters() override;
-
-  Tensor &cache() { return rope_; }
-
- protected:
-  std::vector<std::pair<std::string, TensorPtr>> namedStates_() override;
-
-  int64_t headDim_;
-  int64_t contextLength_;
-  float thetaBase_;
-  std::optional<RopeScalingConfig> scaling_;
-  Options options_;
-
-  Tensor rope_;
-};
-
 }  // namespace tinytorch::nn
diff --git a/src/Module/Norm.h b/src/Module/Norm.h
index ff7d131..d50a3b3 100644
--- a/src/Module/Norm.h
+++ b/src/Module/Norm.h
@@ -38,6 +38,7 @@ class RMSNorm : public Module {
   void resetParameters() override;
 
   Tensor &weight() { return weight_; }
+  float eps() const { return eps_; }
 
  protected:
   std::vector<std::pair<std::string, TensorPtr>> namedParameters_() override;
diff --git a/src/Operation/OpFused.h b/src/Operation/OpFused.h
index 6fed16b..6188b40 100644
--- a/src/Operation/OpFused.h
+++ b/src/Operation/OpFused.h
@@ -13,9 +13,14 @@ namespace tinytorch::op {
 
 using SiluMulOpFn = Tensor (*)(const Tensor& self);
 
+using FusedAddRmsNormOpFn = void (*)(Tensor& input, Tensor& residual, const Tensor& weight, float eps);
+
 // siluMul
 DEFINE_OP(siluMul, SiluMulOpFn)
 
+// fusedAddRmsNorm
+DEFINE_OP(fusedAddRmsNorm, FusedAddRmsNormOpFn)
+
 void registerFusedCpu();
 STATIC_CALL(registerFusedCpu);
 
diff --git a/src/Operation/OpFusedCpu.cpp b/src/Operation/OpFusedCpu.cpp
index 8ad2207..cc44f3e 100644
--- a/src/Operation/OpFusedCpu.cpp
+++ b/src/Operation/OpFusedCpu.cpp
@@ -16,6 +16,9 @@ namespace tinytorch::op {
 void registerFusedCpu() {
   // siluMul
   REG_FUSED_CPU_FLT(siluMul, siluMulOpCpuImpl);
+
+  // fusedAddRmsNorm
+  REG_FUSED_CPU_FLT(fusedAddRmsNorm, fusedAddRmsNormOpCpuImpl);
 }
 
 }  // namespace tinytorch::op
\ No newline at end of file
diff --git a/src/Operation/OpFusedCpu.h b/src/Operation/OpFusedCpu.h
index e8deef5..d9a7338 100644
--- a/src/Operation/OpFusedCpu.h
+++ b/src/Operation/OpFusedCpu.h
@@ -38,4 +38,35 @@ Tensor siluMulOpCpuImpl(const Tensor& self) {
   return ret;
 }
 
+template <typename T>
+void fusedAddRmsNormOpCpuImpl(Tensor& input, Tensor& residual, const Tensor& weight, float eps) {
+  ASSERT(input.shape() == residual.shape());
+  int64_t dim = input.size(-1);
+  int64_t numRows = input.numel() / dim;
+
+  T* inputPtr = input.dataPtr<T>();
+  T* residualPtr = residual.dataPtr<T>();
+  const T* weightPtr = weight.dataPtr<T>();
+
+  for (int64_t row = 0; row < numRows; row++) {
+    int64_t base = row * dim;
+
+    // add residual + accumulate sum‑of‑squares
+    float sumSq = 0.f;
+    for (int64_t i = 0; i < dim; i++) {
+      float r = static_cast<float>(inputPtr[base + i]) + static_cast<float>(residualPtr[base + i]);
+      residualPtr[base + i] = static_cast<T>(r);
+      sumSq += r * r;
+    }
+
+    float invRms = 1.f / std::sqrt(sumSq / static_cast<float>(dim) + eps);
+
+    // normalize + affine
+    for (int64_t i = 0; i < dim; i++) {
+      auto r = static_cast<float>(residualPtr[base + i]);
+      inputPtr[base + i] = static_cast<T>(r * invRms * static_cast<float>(weightPtr[i]));
+    }
+  }
+}
+
 }  // namespace tinytorch::op
diff --git a/src/Operation/OpFusedCuda.cu b/src/Operation/OpFusedCuda.cu
index 43541b5..9c884f1 100644
--- a/src/Operation/OpFusedCuda.cu
+++ b/src/Operation/OpFusedCuda.cu
@@ -16,6 +16,9 @@ namespace tinytorch::op {
 void registerFusedCuda() {
   // siluMul
   REG_FUSED_CUDA_FLT(siluMul, siluMulOpCudaImpl);
+
+  // fusedAddRmsNorm
+  REG_FUSED_CUDA_FLT(fusedAddRmsNorm, fusedAddRmsNormOpCudaImpl);
 }
 
 }  // namespace tinytorch::op
\ No newline at end of file
diff --git a/src/Operation/OpFusedCuda.cuh b/src/Operation/OpFusedCuda.cuh
index f871c1b..620f6f5 100644
--- a/src/Operation/OpFusedCuda.cuh
+++ b/src/Operation/OpFusedCuda.cuh
@@ -8,12 +8,46 @@
 
 #include "OpElemWiseCuda.cuh"
 #include "OpFused.h"
+#include "OpNNLayerCuda.cuh"
 #include "Utils/CUDAUtils.h"
 
 namespace tinytorch::op {
 
+template <typename T, int VEC_ELEMENTS>
+__global__ void kSiluMulVec(T* __restrict__ outPtr, const T* __restrict__ selfPtr, const int d) {
+  const unsigned int row = blockIdx.x;
+  const T* gatePtr = selfPtr + static_cast<int64_t>(row) * d * 2;
+  const T* upPtr = gatePtr + d;
+  T* rowOut = outPtr + static_cast<int64_t>(row) * d;
+
+  const int numVecs = d / VEC_ELEMENTS;
+
+  const int4* gateVec = reinterpret_cast<const int4*>(gatePtr);
+  const int4* upVec = reinterpret_cast<const int4*>(upPtr);
+  int4* outVec = reinterpret_cast<int4*>(rowOut);
+
+  for (auto i = threadIdx.x; i < numVecs; i += blockDim.x) {
+    // 128-bit load via read-only cache (__ldg)
+    int4 gv = __ldg(&gateVec[i]);
+    int4 uv = __ldg(&upVec[i]);
+
+    const T* g = reinterpret_cast<const T*>(&gv);
+    const T* u = reinterpret_cast<const T*>(&uv);
+    T r[VEC_ELEMENTS];
+
+#pragma unroll
+    for (int j = 0; j < VEC_ELEMENTS; ++j) {
+      auto gf = static_cast<float>(g[j]);
+      auto uf = static_cast<float>(u[j]);
+      r[j] = static_cast<T>(gf / (1.f + __expf(-gf)) * uf);
+    }
+
+    outVec[i] = *reinterpret_cast<const int4*>(r);
+  }
+}
+
 template <typename T>
-__global__ void kSiluMul(T* retPtr, const T* selfPtr, const int64_t halfLastDim, const int64_t n) {
+__global__ void kSiluMulScalar(T* retPtr, const T* selfPtr, const int64_t halfLastDim, const int64_t n) {
   const auto index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < n) {
     const int64_t sliceIdx = index / halfLastDim;
@@ -42,9 +76,69 @@ Tensor siluMulOpCudaImpl(const Tensor& self) {
   const int64_t lastDim = self.size(-1);
   const int64_t halfLastDim = lastDim / 2;
   const int64_t n = ret.numel();
-  auto params = cuda::getKernelLaunchParams(self.device().index, n);
-  CUDA_LAUNCH_KERNEL(kSiluMul<CudaT>, params, retPtr, selfPtr, halfLastDim, n);
+  const int64_t numRows = n / halfLastDim;
+
+  constexpr int kVecBytes = 16;  // int4
+  constexpr int kVecElements = kVecBytes / static_cast<int>(sizeof(CudaT));
+
+  const bool useVec = (halfLastDim % kVecElements == 0);
+
+  if (useVec) {
+    const int d = static_cast<int>(halfLastDim);
+    dim3 grid(static_cast<unsigned>(numRows));
+    dim3 block(std::min(d / kVecElements, 1024));
+    auto stream = cuda::getCurrentCUDAStream(self.device().index).stream();
+    kSiluMulVec<CudaT, kVecElements><<<grid, block, 0, stream>>>(retPtr, selfPtr, d);
+    CUDA_KERNEL_CHECK();
+  } else {
+    auto params = cuda::getKernelLaunchParams(self.device().index, n);
+    CUDA_LAUNCH_KERNEL(kSiluMulScalar<CudaT>, params, retPtr, selfPtr, halfLastDim, n);
+  }
+
   return ret;
 }
 
+template <typename T>
+__global__ void kFusedAddRMSNorm(T* __restrict__ input, T* __restrict__ residual, const T* __restrict__ weight,
+                                 int64_t dim, float eps) {
+  const auto row = blockIdx.x;
+  const auto tid = threadIdx.x;
+  const auto base = row * dim;
+
+  // add residual + accumulate sum‑of‑squares
+  float sumSq = 0.f;
+  for (auto i = tid; i < dim; i += blockDim.x) {
+    float r = static_cast<float>(input[base + i]) + static_cast<float>(residual[base + i]);
+    residual[base + i] = static_cast<T>(r);
+    sumSq += r * r;
+  }
+
+  sumSq = cudaBlockReduce<float, OpCudaReduceSum>(sumSq, 0.f);
+  float invRms = cuda::rsqrt(sumSq / static_cast<float>(dim) + eps);
+
+  // normalize + affine
+  for (auto i = tid; i < dim; i += blockDim.x) {
+    auto r = static_cast<float>(residual[base + i]);
+    input[base + i] = static_cast<T>(r * invRms * static_cast<float>(weight[i]));
+  }
+}
+
+template <typename T>
+void fusedAddRmsNormOpCudaImpl(Tensor& input, Tensor& residual, const Tensor& weight, float eps) {
+  ASSERT(input.shape() == residual.shape());
+  int64_t dim = input.size(-1);
+  int64_t numRows = input.numel() / dim;
+
+  using CudaT = typename cuda::CudaTypeCast<T>::type;
+  CudaT* inputPtr = input.dataPtr<CudaT>();
+  CudaT* residualPtr = residual.dataPtr<CudaT>();
+  const CudaT* weightPtr = weight.dataPtr<CudaT>();
+
+  auto stream = cuda::getCurrentCUDAStream(input.device().index).stream();
+  dim3 blockSize(std::clamp(nextPow2(dim), 32u, 1024u));
+  dim3 gridSize(numRows);
+  kFusedAddRMSNorm<CudaT><<<gridSize, blockSize, 0, stream>>>(inputPtr, residualPtr, weightPtr, dim, eps);
+  CUDA_KERNEL_CHECK();
+}
+
 }  // namespace tinytorch::op
diff --git a/src/Operation/OpNNLayer.h b/src/Operation/OpNNLayer.h
index 5f71fe4..5e5da04 100644
--- a/src/Operation/OpNNLayer.h
+++ b/src/Operation/OpNNLayer.h
@@ -8,25 +8,6 @@
 
 #include "Tensor/Dispatch.h"
 
-namespace tinytorch {
-
-struct RopeScalingConfig {
-  float factor;
-  float highFreqFactor;
-  float lowFreqFactor;
-  int64_t originalContextLength;
-
-  RopeScalingConfig(float f, float hf, float lf, int64_t len)
-      : factor(f), highFreqFactor(hf), lowFreqFactor(lf), originalContextLength(len) {}
-};
-
-enum class QKVLayout {
-  BHSD,  // [batch, numHead, seqLen, headDim]
-  BSHD   // [batch, seqLen, numHead, headDim]
-};
-
-}  // namespace tinytorch
-
 namespace tinytorch::op {
 
 enum class SoftmaxType : uint8_t {
@@ -59,12 +40,6 @@ using LayerNormOpFn = Tensor (*)(const Tensor& self, IntArrayView normalizedShap
 
 using RMSNormOpFn = Tensor (*)(const Tensor& self, IntArrayView normalizedShape, const Tensor& weight, float eps);
 
-using RopeInitOpFn = Tensor (*)(int64_t headDim, int64_t contextLength, float thetaBase,
-                                std::optional<RopeScalingConfig> scaling, Options options);
-using RopeApplyOpFn = Tensor (*)(const Tensor& input, const Tensor& rope, int64_t offset, QKVLayout layout);
-
-using FlashAttentionOpFn = Tensor (*)(const Tensor& query, const Tensor& key, const Tensor& value, bool isCausal);
-
 // softmax
 DEFINE_OP(softmax, SoftmaxOpFn);
 DEFINE_OP(softmaxOut, SoftmaxOpOutFn);
@@ -85,13 +60,6 @@ DEFINE_OP(layerNorm, LayerNormOpFn);
 // rmsNorm
 DEFINE_OP(rmsNorm, RMSNormOpFn);
 
-// rope
-DEFINE_OP(ropeInit, RopeInitOpFn);
-DEFINE_OP(ropeApply, RopeApplyOpFn);
-
-// flashAttention
-DEFINE_OP(flashAttention, FlashAttentionOpFn);
-
 void registerNNLayerCpu();
 STATIC_CALL(registerNNLayerCpu);
 
diff --git a/src/Operation/OpNNLayerCpu.cpp b/src/Operation/OpNNLayerCpu.cpp
index af3eaf1..afd69e8 100644
--- a/src/Operation/OpNNLayerCpu.cpp
+++ b/src/Operation/OpNNLayerCpu.cpp
@@ -33,10 +33,6 @@ void registerNNLayerCpu() {
 
   // rmsNorm
   REG_NN_LAYER_CPU_FLT(rmsNorm, rmsNormOpCpuImpl);
-
-  // rope
-  REG_NN_LAYER_CPU_FLT(ropeInit, ropeInitOpCpuImpl);
-  REG_NN_LAYER_CPU_FLT(ropeApply, ropeApplyOpCpuImpl);
 }
 
 }  // namespace tinytorch::op
diff --git a/src/Operation/OpNNLayerCpu.h b/src/Operation/OpNNLayerCpu.h
index e8c1b34..2835a0c 100644
--- a/src/Operation/OpNNLayerCpu.h
+++ b/src/Operation/OpNNLayerCpu.h
@@ -247,130 +247,4 @@ Tensor rmsNormOpCpuImpl(const Tensor& self, IntArrayView normalizedShape, const
   return normOpCpuImplDetail<T, NormType::RMSNorm>(self, normalizedShape, weight, {}, eps);
 }
 
-template <typename T>
-Tensor ropeInitOpCpuImpl(int64_t headDim, int64_t contextLength, float thetaBase,
-                         std::optional<RopeScalingConfig> scaling, Options options) {
-  ASSERT(!options.requiresGrad_);
-  ASSERT(options.device_.type == DeviceType::CPU);
-
-  ASSERT(headDim % 2 == 0);
-  int64_t halfDim = headDim >> 1;
-
-  // inverse frequency
-  Tensor invFreq({halfDim}, options);
-  auto* invFreqPtr = invFreq.dataPtr<T>();
-  for (int64_t i = 0; i < halfDim; i++) {
-    invFreqPtr[i] = 1.f / std::pow(thetaBase, static_cast<float>(i << 1) / static_cast<float>(headDim));
-  }
-
-  // apply scaling if needed
-  if (scaling.has_value()) {
-    auto originCtxLen = static_cast<float>(scaling->originalContextLength);
-    auto lowWaveLen = originCtxLen / scaling->lowFreqFactor;
-    auto highWaveLen = originCtxLen / scaling->highFreqFactor;
-    for (int64_t i = 0; i < halfDim; i++) {
-      auto waveLen = 2.f * static_cast<float>(M_PI) / invFreqPtr[i];
-      if (waveLen > lowWaveLen) {
-        invFreqPtr[i] /= scaling->factor;
-      } else if (waveLen < highWaveLen) {
-        // do nothing
-      } else {
-        auto smoothFactor =
-            (originCtxLen / waveLen - scaling->lowFreqFactor) / (scaling->highFreqFactor - scaling->lowFreqFactor);
-        auto scaled = invFreqPtr[i] / scaling->factor;
-        invFreqPtr[i] = (1.f - smoothFactor) * scaled + smoothFactor * invFreqPtr[i];
-      }
-    }
-  }
-
-  // Shape: [contextLength, headDim, 2] where last dim is [cos, sin]
-  // Memory layout: [cos0, sin0, cos1, sin1, cos2, sin2, ...]
-  Tensor rope({contextLength, headDim, 2}, options);
-  auto* ropePtr = rope.dataPtr<T>();
-
-  for (int64_t pos = 0; pos < contextLength; pos++) {
-    for (int64_t i = 0; i < halfDim; i++) {
-      float angle = static_cast<T>(pos) * invFreqPtr[i];
-      float cosVal = std::cos(angle);
-      float sinVal = std::sin(angle);
-
-      int64_t offset1 = (pos * headDim + i) * 2;
-      int64_t offset2 = (pos * headDim + halfDim + i) * 2;
-
-      ropePtr[offset1] = cosVal;
-      ropePtr[offset1 + 1] = sinVal;
-      ropePtr[offset2] = cosVal;
-      ropePtr[offset2 + 1] = sinVal;
-    }
-  }
-
-  return rope;
-}
-
-template <typename T>
-Tensor ropeApplyOpCpuImpl(const Tensor& input, const Tensor& rope, int64_t offset, QKVLayout layout) {
-  const auto& shape = input.shape();
-  ASSERT(shape.size() == 4);
-
-  int64_t batch, numHead, seqLen, headDim;
-  int64_t strideB, strideH, strideT;
-
-  if (layout == QKVLayout::BHSD) {
-    // [B, N, S, D]
-    batch = shape[0];
-    numHead = shape[1];
-    seqLen = shape[2];
-    headDim = shape[3];
-
-    strideT = headDim;
-    strideH = seqLen * headDim;
-    strideB = numHead * seqLen * headDim;
-  } else {
-    // [B, S, N, D]
-    batch = shape[0];
-    seqLen = shape[1];
-    numHead = shape[2];
-    headDim = shape[3];
-
-    strideH = headDim;
-    strideT = numHead * headDim;
-    strideB = seqLen * numHead * headDim;
-  }
-
-  ASSERT(headDim % 2 == 0);
-  const int64_t halfDim = headDim >> 1;
-
-  const auto* inputPtr = input.dataPtr<T>();
-  const auto* ropePtr = rope.dataPtr<T>();
-
-  Tensor out(shape, input.options().noGrad());
-  auto* outPtr = out.dataPtr<T>();
-
-  for (int64_t b = 0; b < batch; b++) {
-    for (int64_t h = 0; h < numHead; h++) {
-      for (int64_t t = 0; t < seqLen; t++) {
-        const int64_t base = b * strideB + h * strideH + t * strideT;
-        const T* xPtr = inputPtr + base;
-        T* yPtr = outPtr + base;
-
-        int64_t posIndex = offset + t;
-        const T* ropeRow = ropePtr + posIndex * headDim * 2;
-
-        for (int64_t i = 0; i < halfDim; i++) {
-          float x1 = xPtr[i];
-          float x2 = xPtr[halfDim + i];
-
-          int64_t idx = i * 2;
-          float c = ropeRow[idx];      // cos
-          float s = ropeRow[idx + 1];  // sin
-
-          yPtr[i] = x1 * c - x2 * s;
-          yPtr[halfDim + i] = x2 * c + x1 * s;
-        }
-      }
-    }
-  }
-  return out;
-}
-
 }  // namespace tinytorch::op
diff --git a/src/Operation/OpNNLayerCuda.cu b/src/Operation/OpNNLayerCuda.cu
index 523edbc..3ef5834 100644
--- a/src/Operation/OpNNLayerCuda.cu
+++ b/src/Operation/OpNNLayerCuda.cu
@@ -4,51 +4,10 @@
  *
  */
 
-#include "flash_attn/flash_api.cuh"
 #include "OpNNLayerCuda.cuh"
 
 namespace tinytorch::op {
 
-template <typename T>
-Tensor flashAttentionOpCudaImpl(const Tensor& query, const Tensor& key, const Tensor& value, bool isCausal) {
-  const auto& qShape = query.shape();  // [batch, seqLenQ, numHeadsQ, headDim]
-  const auto& kShape = key.shape();    // [batch, seqLenKV, numHeadsKV, headDim]
-  ASSERT(qShape.size() == 4);
-  ASSERT(kShape.size() == 4);
-
-  auto batch = qShape[0];
-  auto numHeadsQ = qShape[2];
-  auto numHeadsKV = kShape[2];
-  auto headDim = qShape[3];
-
-  ASSERT(numHeadsQ % numHeadsKV == 0);  // GQA
-
-  auto seqLenQ = query.size(1);
-  auto seqLenKV = key.size(1);
-
-  using CudaT = typename cuda::CudaTypeCast<T>::type;
-
-  Tensor out(qShape, query.options().noGrad());
-  auto* outPtr = out.dataPtr<CudaT>();
-
-  const auto* qPtr = query.dataPtr<CudaT>();
-  const auto* kPtr = key.dataPtr<CudaT>();
-  const auto* vPtr = value.dataPtr<CudaT>();
-  auto* oPtr = out.dataPtr<CudaT>();
-
-  auto stream = cuda::getCurrentCUDAStream(query.device().index).stream();
-  tfa::flashAttn<CudaT>(qPtr, kPtr, vPtr, oPtr, batch, seqLenQ, seqLenKV, numHeadsQ, numHeadsKV, headDim, isCausal,
-                        stream);
-  CUDA_KERNEL_CHECK();
-  return out;
-}
-
-#define INSTANTIATE_FLASHATTEN_OP(T)                                                                       \
-  template Tensor flashAttentionOpCudaImpl<T>(const Tensor& query, const Tensor& key, const Tensor& value, \
-                                              bool isCausal);
-FOR_FLT_TYPES(INSTANTIATE_FLASHATTEN_OP)
-#undef INSTANTIATE_FLASHATTEN_OP
-
 #define REG_NN_LAYER_CUDA_FLT(NAME, FUNC)                                       \
   REGISTER_OP_IMPL(NAME, CUDA, Float32, &(FUNC<DTypeToType_t<DType::Float32>>)) \
   REGISTER_OP_IMPL(NAME, CUDA, Float16, &(FUNC<DTypeToType_t<DType::Float16>>)) \
@@ -74,13 +33,6 @@ void registerNNLayerCuda() {
 
   // rmsNorm
   REG_NN_LAYER_CUDA_FLT(rmsNorm, rmsNormOpCudaImpl);
-
-  // rope
-  REG_NN_LAYER_CUDA_FLT(ropeInit, ropeInitOpCudaImpl);
-  REG_NN_LAYER_CUDA_FLT(ropeApply, ropeApplyOpCudaImpl);
-
-  // flashAttention
-  REG_NN_LAYER_CUDA_FLT(flashAttention, flashAttentionOpCudaImpl);
 }
 
 }  // namespace tinytorch::op
diff --git a/src/Operation/OpNNLayerCuda.cuh b/src/Operation/OpNNLayerCuda.cuh
index b1ef1d7..a817ec9 100644
--- a/src/Operation/OpNNLayerCuda.cuh
+++ b/src/Operation/OpNNLayerCuda.cuh
@@ -356,89 +356,6 @@ __global__ void kNormLarge(T* out, const T* input, const T* weight, const T* bia
   }
 }
 
-template <typename T>
-__global__ void kRopeComputeInvFreq(T* invFreqPtr, int64_t halfDim, float thetaBase) {
-  const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < halfDim) {
-    invFreqPtr[idx] =
-        static_cast<T>(1.f / cuda::pow(thetaBase, static_cast<float>(idx << 1) / static_cast<float>(halfDim << 1)));
-  }
-}
-
-template <typename T>
-__global__ void kRopeApplyScaling(T* invFreqPtr, int64_t halfDim, float originalContextLength, float lowFreqFactor,
-                                  float highFreqFactor, float scalingFactor) {
-  const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < halfDim) {
-    auto invFreq = static_cast<float>(invFreqPtr[idx]);
-    auto waveLen = 2.f * static_cast<float>(M_PI) / invFreq;
-    auto lowWaveLen = originalContextLength / lowFreqFactor;
-    auto highWaveLen = originalContextLength / highFreqFactor;
-
-    if (waveLen > lowWaveLen) {
-      invFreqPtr[idx] /= static_cast<T>(scalingFactor);
-    } else if (waveLen < highWaveLen) {
-      // do nothing
-    } else {
-      auto smoothFactor = (originalContextLength / waveLen - lowFreqFactor) / (highFreqFactor - lowFreqFactor);
-      auto scaled = invFreq / scalingFactor;
-      invFreqPtr[idx] = static_cast<T>((1.f - smoothFactor) * scaled + smoothFactor * invFreq);
-    }
-  }
-}
-
-template <typename T>
-__global__ void kRopePrecomputeCosSin(const T* invFreqPtr, T* ropePtr, int64_t contextLength, int64_t headDim) {
-  const int64_t pos = blockIdx.x;
-  if (pos < contextLength) {
-    auto halfDim = headDim >> 1;
-    for (auto i = threadIdx.x; i < halfDim; i += blockDim.x) {
-      float angle = static_cast<float>(pos) * static_cast<float>(invFreqPtr[i]);
-
-      T cosVal = static_cast<T>(::cosf(angle));
-      T sinVal = static_cast<T>(::sinf(angle));
-
-      int64_t offset1 = (pos * headDim + i) * 2;
-      int64_t offset2 = (pos * headDim + halfDim + i) * 2;
-
-      ropePtr[offset1] = cosVal;
-      ropePtr[offset1 + 1] = sinVal;
-      ropePtr[offset2] = cosVal;
-      ropePtr[offset2 + 1] = sinVal;
-    }
-  }
-}
-
-template <typename T>
-__global__ void kRopeApply(const T* input, const float* rope, T* output, int64_t batch, int64_t numHead, int64_t seqLen,
-                           int64_t headDim, int64_t strideB, int64_t strideH, int64_t strideT, int64_t offset) {
-  const int64_t b = blockIdx.x / numHead;
-  const int64_t h = blockIdx.x % numHead;
-  const int64_t t = blockIdx.y * blockDim.x + threadIdx.x;
-
-  if (b < batch && t < seqLen) {
-    const int64_t base = b * strideB + h * strideH + t * strideT;
-    const T* xPtr = input + base;
-    T* yPtr = output + base;
-
-    const int64_t posIndex = offset + t;
-    const float* ropeRow = rope + posIndex * headDim * 2;
-
-    const int64_t halfDim = headDim >> 1;
-    for (int64_t i = 0; i < halfDim; i++) {
-      auto x1 = static_cast<float>(xPtr[i]);
-      auto x2 = static_cast<float>(xPtr[halfDim + i]);
-
-      int64_t idx = i * 2;
-      float c = ropeRow[idx];      // cos
-      float s = ropeRow[idx + 1];  // sin
-
-      yPtr[i] = static_cast<T>(x1 * c - x2 * s);
-      yPtr[halfDim + i] = static_cast<T>(x2 * c + x1 * s);
-    }
-  }
-}
-
 template <typename T, SoftmaxType type>
 void softmaxForwardCudaImpl(Tensor& out, const Tensor& self, int64_t dim) {
   ASSERT(out.shape() == self.shape());
@@ -618,96 +535,4 @@ Tensor rmsNormOpCudaImpl(const Tensor& self, IntArrayView normalizedShape, const
   return normOpCudaImplDetail<T, NormType::RMSNorm>(self, normalizedShape, weight, {}, eps);
 }
 
-template <typename T>
-Tensor ropeInitOpCudaImpl(int64_t headDim, int64_t contextLength, float thetaBase,
-                          std::optional<RopeScalingConfig> scaling, Options options) {
-  ASSERT(!options.requiresGrad_);
-  ASSERT(options.device_.type == DeviceType::CUDA);
-  options.dtype_ = DType::Float32;
-
-  ASSERT(headDim % 2 == 0);
-  int64_t halfDim = headDim >> 1;
-
-  // inverse frequency
-  Tensor invFreq({halfDim}, options);
-  auto* invFreqPtr = invFreq.dataPtr<float>();
-
-  auto params = cuda::getKernelLaunchParams(options.device_.index, halfDim);
-  CUDA_LAUNCH_KERNEL(kRopeComputeInvFreq<float>, params, invFreqPtr, halfDim, thetaBase);
-
-  // apply scaling if needed
-  if (scaling.has_value()) {
-    CUDA_LAUNCH_KERNEL(kRopeApplyScaling<float>, params, invFreqPtr, halfDim,
-                       static_cast<float>(scaling->originalContextLength), scaling->lowFreqFactor,
-                       scaling->highFreqFactor, scaling->factor);
-  }
-
-  // Shape: [contextLength, headDim, 2] where last dim is [cos, sin]
-  // Memory layout: [cos0, sin0, cos1, sin1, cos2, sin2, ...]
-  Tensor rope({contextLength, headDim, 2}, options);
-  auto* ropePtr = rope.dataPtr<float>();
-
-  auto blockSize = cuda::getKernelBlockSize(options.device_.index);
-  auto stream = cuda::getCurrentCUDAStream(options.device_.index).stream();
-  kRopePrecomputeCosSin<float><<<contextLength, blockSize, 0, stream>>>(invFreqPtr, ropePtr, contextLength, headDim);
-  CUDA_KERNEL_CHECK();
-
-  return rope;
-}
-
-template <typename T>
-Tensor ropeApplyOpCudaImpl(const Tensor& input, const Tensor& rope, int64_t offset, QKVLayout layout) {
-  const auto& shape = input.shape();
-  ASSERT(shape.size() == 4);
-
-  int64_t batch, numHead, seqLen, headDim;
-  int64_t strideB, strideH, strideT;
-
-  if (layout == QKVLayout::BHSD) {
-    // [B, N, S, D]
-    batch = shape[0];
-    numHead = shape[1];
-    seqLen = shape[2];
-    headDim = shape[3];
-
-    strideT = headDim;
-    strideH = seqLen * headDim;
-    strideB = numHead * seqLen * headDim;
-  } else {
-    // [B, S, N, D]
-    batch = shape[0];
-    seqLen = shape[1];
-    numHead = shape[2];
-    headDim = shape[3];
-
-    strideH = headDim;
-    strideT = numHead * headDim;
-    strideB = seqLen * numHead * headDim;
-  }
-
-  ASSERT(headDim % 2 == 0);
-
-  using CudaT = typename cuda::CudaTypeCast<T>::type;
-  const auto* inputPtr = input.dataPtr<CudaT>();
-  const auto* ropePtr = rope.dataPtr<float>();
-
-  Tensor out(shape, input.options().noGrad());
-  auto* outPtr = out.dataPtr<CudaT>();
-
-  const auto blockSize = cuda::getKernelBlockSize(input.device().index);
-  const auto blocksPerSeq = cuda::getKernelGridSize(blockSize, seqLen);
-
-  dim3 gridDim(batch * numHead, blocksPerSeq);
-  dim3 blockDim(blockSize);
-
-  auto stream = cuda::getCurrentCUDAStream(input.device().index).stream();
-  kRopeApply<CudaT><<<gridDim, blockDim, 0, stream>>>(inputPtr, ropePtr, outPtr, batch, numHead, seqLen, headDim,
-                                                      strideB, strideH, strideT, offset);
-  CUDA_KERNEL_CHECK();
-  return out;
-}
-
-template <typename T>
-Tensor flashAttentionOpCudaImpl(const Tensor& query, const Tensor& key, const Tensor& value, bool isCausal);
-
 }  // namespace tinytorch::op
diff --git a/src/Operation/OpTransform.cpp b/src/Operation/OpTransform.cpp
index b97ea9c..95ac019 100644
--- a/src/Operation/OpTransform.cpp
+++ b/src/Operation/OpTransform.cpp
@@ -152,13 +152,28 @@ std::vector<Tensor> splitOpImplDetail(const Tensor& self, IntArrayView sectionSi
   const auto numSections = sectionSizes.size();
   std::vector<Tensor> retTensors(numSections);
 
-  if (dim == 0) {  // share storage
+  bool outerAllOne = (dim == 0);
+  if (!outerAllOne) {
+    outerAllOne = true;
+    for (int64_t i = 0; i < dim; i++) {
+      if (self.shape(i) != 1) {
+        outerAllOne = false;
+        break;
+      }
+    }
+  }
+
+  if (outerAllOne) {  // share storage
+    int64_t innerSize = 1;
+    for (int64_t i = dim + 1; i < self.dim(); i++) {
+      innerSize *= self.shape(i);
+    }
     int64_t startIdx = 0;
     for (size_t i = 0; i < numSections; i++) {
       SizeVector retShape(self.shape());
-      retShape[0] = sectionSizes[i];
+      retShape[dim] = sectionSizes[i];
 
-      int64_t storageOffset = self.storageOffset() + startIdx * self.stride(0);
+      int64_t storageOffset = self.storageOffset() + startIdx * innerSize;
       retTensors[i] = Tensor(retShape, self.options().noGrad(), self.storage(), storageOffset);
 
       startIdx += sectionSizes[i];
@@ -416,8 +431,20 @@ Tensor narrowOpImpl(const Tensor& self, int64_t dim, int64_t start, int64_t leng
   SizeVector retShape(self.shape());
   retShape[dim] = length;
 
-  if (dim == 0) {  // share storage
-    int64_t newOffset = self.storageOffset() + start * self.stride(0);
+  bool outerAllOne = true;
+  for (int64_t i = 0; i < dim; i++) {
+    if (self.shape(i) != 1) {
+      outerAllOne = false;
+      break;
+    }
+  }
+
+  if (dim == 0 || outerAllOne) {  // share storage
+    int64_t innerSize = 1;
+    for (int64_t i = dim + 1; i < self.dim(); i++) {
+      innerSize *= self.shape(i);
+    }
+    int64_t newOffset = self.storageOffset() + start * innerSize;
     return {retShape, self.options().noGrad(), self.storage(), newOffset};
   }
 
diff --git a/src/Tensor/Allocator.cpp b/src/Tensor/Allocator.cpp
index 41fc12a..2728a81 100644
--- a/src/Tensor/Allocator.cpp
+++ b/src/Tensor/Allocator.cpp
@@ -26,7 +26,7 @@ CPUPinnedAllocator::~CPUPinnedAllocator() {
 
 void* CPUPinnedAllocator::allocate(int64_t nbytes) {
   void* ptr = nullptr;
-  CUDA_CHECK(cudaHostAlloc(&ptr, nbytes, cudaHostAllocDefault));
+  CUDA_CHECK(cudaHostAlloc(&ptr, static_cast<size_t>(nbytes), cudaHostAllocDefault));
 #ifndef NDEBUG
   if (ptr) {
     allocatedPtrs_.insert(ptr);
@@ -60,7 +60,7 @@ CUDAAllocator::~CUDAAllocator() {
 void* CUDAAllocator::allocate(int64_t nbytes) {
   void* ptr = nullptr;
   cuda::CudaDeviceGuard guard(deviceIndex_);
-  CUDA_CHECK(cudaMalloc(&ptr, nbytes));
+  CUDA_CHECK(cudaMalloc(&ptr, static_cast<size_t>(nbytes)));
 #ifndef NDEBUG
   if (ptr) {
     allocatedPtrs_.insert(ptr);
@@ -91,33 +91,20 @@ Allocator* getAllocator(const Options& options) {
   if (options.device_.isCpu()) {
     if (options.pinnedMemory_) {
 #ifdef USE_CUDA
-      static CachedAllocator cachedPinnedAllocator(std::make_unique<CPUPinnedAllocator>());
-      return &cachedPinnedAllocator;
+      // leak-on-exit
+      static auto* cachedPinnedAllocator = new CachedAllocator(std::make_unique<CPUPinnedAllocator>());
+      return cachedPinnedAllocator;
 #else
       ASSERT(false && "cuda not support");
       return nullptr;
 #endif
     } else {
-      static CachedAllocator cachedCpuAllocator(std::make_unique<CPUAllocator<>>());
-      return &cachedCpuAllocator;
+      static auto* cachedCpuAllocator = new CachedAllocator(std::make_unique<CPUAllocator<>>());
+      return cachedCpuAllocator;
     }
   } else if (options.device_.isCuda()) {
 #ifdef USE_CUDA
-    auto deviceCount = cuda::getDeviceCount();
-    static std::vector<CachedAllocator> deviceAllocators;
-    static std::once_flag flag;
-    std::call_once(flag, [&]() {
-      deviceAllocators.reserve(deviceCount);
-      for (auto i = 0; i < deviceCount; i++) {
-        deviceAllocators.emplace_back(std::make_unique<CUDAAllocator>(i));
-      }
-    });
-    auto deviceIndex = options.device_.index;
-    if (deviceIndex < 0 || static_cast<size_t>(deviceIndex) >= deviceAllocators.size()) {
-      LOGE("getAllocator error: Invalid CUDA device index %d", deviceIndex);
-      return nullptr;
-    }
-    return &deviceAllocators[deviceIndex];
+    return getCUDACachedAllocator(options.device_.index);
 #else
     ASSERT(false && "cuda not support");
     return nullptr;
@@ -127,4 +114,28 @@ Allocator* getAllocator(const Options& options) {
   return nullptr;
 }
 
+CachedAllocator* getCUDACachedAllocator(int device) {
+#ifdef USE_CUDA
+  auto deviceCount = cuda::getDeviceCount();
+  // leak-on-exit
+  static std::vector<CachedAllocator*>* deviceAllocators = nullptr;
+  static std::once_flag flag;
+  std::call_once(flag, [&]() {
+    deviceAllocators = new std::vector<CachedAllocator*>();
+    deviceAllocators->reserve(deviceCount);
+    for (auto i = 0; i < deviceCount; i++) {
+      deviceAllocators->push_back(new CachedAllocator(std::make_unique<CUDAAllocator>(i)));
+    }
+  });
+  if (device < 0 || static_cast<size_t>(device) >= deviceAllocators->size()) {
+    LOGE("getCUDACachedAllocator error: Invalid CUDA device index %d", device);
+    return nullptr;
+  }
+  return (*deviceAllocators)[device];
+#else
+  (void)device;
+  return nullptr;
+#endif
+}
+
 }  // namespace tinytorch
diff --git a/src/Tensor/Allocator.h b/src/Tensor/Allocator.h
index 2c385f0..fc8a6cc 100644
--- a/src/Tensor/Allocator.h
+++ b/src/Tensor/Allocator.h
@@ -8,7 +8,6 @@
 
 #include <memory>
 
-#include "Device.h"
 #include "Utils/Macros.h"
 #include "ankerl/unordered_dense.h"
 
@@ -49,13 +48,14 @@ template <bool aligned>
 void* CPUAllocator<aligned>::allocate(int64_t nbytes) {
   void* ptr = nullptr;
   if (aligned) {
+    size_t alignedSize = (static_cast<size_t>(nbytes) + alignment_ - 1) & ~(alignment_ - 1);
 #if !defined(_MSC_VER)
-    ptr = std::aligned_alloc(alignment_, nbytes);
+    ptr = std::aligned_alloc(alignment_, alignedSize);
 #else
-    ptr = _aligned_malloc(nbytes, alignment_);
+    ptr = _aligned_malloc(alignedSize, alignment_);
 #endif
   } else {
-    ptr = std::malloc(nbytes);
+    ptr = std::malloc(static_cast<size_t>(nbytes));
   }
 
 #ifndef NDEBUG
@@ -105,13 +105,13 @@ class CPUPinnedAllocator : public Allocator {
 
 class CUDAAllocator : public Allocator {
  public:
-  explicit CUDAAllocator(DeviceIndex index = 0) : deviceIndex_(index) {}
+  explicit CUDAAllocator(int index = 0) : deviceIndex_(index) {}
   ~CUDAAllocator() override;
   void* allocate(int64_t nbytes) override;
   void deallocate(void* ptr) override;
 
  private:
-  DeviceIndex deviceIndex_;
+  int deviceIndex_;
 #ifndef NDEBUG
   ankerl::unordered_dense::set<void*> allocatedPtrs_;
 #endif
@@ -119,6 +119,9 @@ class CUDAAllocator : public Allocator {
 #endif
 
 struct Options;
+class CachedAllocator;
+
 Allocator* getAllocator(const Options& options);
+CachedAllocator* getCUDACachedAllocator(int device);
 
 }  // namespace tinytorch
diff --git a/src/Tensor/CachedAllocator.cpp b/src/Tensor/CachedAllocator.cpp
index 6baa2c3..8a4d76d 100644
--- a/src/Tensor/CachedAllocator.cpp
+++ b/src/Tensor/CachedAllocator.cpp
@@ -7,14 +7,15 @@
 #include "CachedAllocator.h"
 
 #include <array>
-#include <memory>
 #include <set>
+#include <unordered_map>
 
 #include "Utils/Logger.h"
 
 namespace tinytorch {
 
-bool CachedAllocator::cacheEnabled_ = true;
+std::atomic<bool> CachedAllocator::cacheEnabled_{true};
+std::atomic<int> CachedAllocator::nextPoolId_{0};
 
 // Ref:
 // https://github.com/pytorch/pytorch/blob/main/c10/cuda/CUDACachingAllocator.cpp
@@ -70,14 +71,25 @@ struct AllocParams {
 
   AllocParams(size_t size, BlockPool* pool, size_t allocSize)
       : searchKey(size), pool(pool), allocSize(allocSize), retBlock(nullptr) {}
+};
+
+struct PoolState {
+  BlockPool largeBlocks{BlockComparator, false};
+  BlockPool smallBlocks{BlockComparator, true};
+  int refCount{0};
+
+  PoolState() = default;
+
+  PoolState(PoolState&&) noexcept = default;
+  PoolState& operator=(PoolState&&) noexcept = delete;
 
-  size_t size() const { return searchKey.size; }
+  PoolState(const PoolState&) = delete;
+  PoolState& operator=(const PoolState&) = delete;
 };
 
 class CachedAllocatorImpl : public Allocator {
  public:
-  explicit CachedAllocatorImpl(Allocator* base)
-      : base_(base), totalAllocatedSize_(0), largeBlocks(BlockComparator, false), smallBlocks(BlockComparator, true) {}
+  explicit CachedAllocatorImpl(Allocator* base) : base_(base), totalAllocatedSize_(0), activePoolId_(-1) {}
 
   static size_t roundSize(size_t size) {
     if (size < kMinBlockSize) {
@@ -99,7 +111,7 @@ class CachedAllocatorImpl : public Allocator {
   static bool getFreeBlock(AllocParams& p) {
     BlockPool& pool = *p.pool;
 
-    // set-container search, return minium satisfied value.
+    // set-container search, return minimum satisfied value.
     const auto it = pool.blocks.lower_bound(&p.searchKey);
     if (it == pool.blocks.end()) {
       return false;
@@ -109,16 +121,16 @@ class CachedAllocatorImpl : public Allocator {
     return true;
   }
 
-  BlockPool& getPool(size_t size) {
+  static BlockPool& getPool(size_t size, PoolState& poolState) {
     if (size <= kSmallSize) {
-      return smallBlocks;
+      return poolState.smallBlocks;
     }
-    return largeBlocks;
+    return poolState.largeBlocks;
   }
 
   bool allocBlock(AllocParams& p) {
     size_t size = p.allocSize;
-    void* ptr = base_->allocate(size);
+    void* ptr = base_->allocate(static_cast<int64_t>(size));
     if (!ptr) {
       return false;
     }
@@ -136,9 +148,9 @@ class CachedAllocatorImpl : public Allocator {
     return remaining > kSmallSize;
   }
 
-  void releaseCachedBlocks() {
-    releaseBlocks(largeBlocks);
-    releaseBlocks(smallBlocks);
+  void releaseCachedBlocks(PoolState& poolState) {
+    releaseBlocks(poolState.largeBlocks);
+    releaseBlocks(poolState.smallBlocks);
   }
 
   void releaseBlock(Block* block) {
@@ -150,7 +162,7 @@ class CachedAllocatorImpl : public Allocator {
   }
 
   void releaseBlocks(BlockPool& pool) {
-    // Frees all non-split blocks
+    // frees all non-split blocks
     auto it = pool.blocks.begin();
     while (it != pool.blocks.end()) {
       Block* block = *it;
@@ -161,9 +173,23 @@ class CachedAllocatorImpl : public Allocator {
     }
   }
 
-  Block* mallocImpl(size_t origSize) {
+  void releaseAllBlocks(BlockPool& pool) {
+    auto it = pool.blocks.begin();
+    while (it != pool.blocks.end()) {
+      Block* block = *it;
+      ++it;
+      if (!block->prev) {
+        base_->deallocate(block->ptr);
+        totalAllocatedSize_ -= block->size;
+      }
+      pool.blocks.erase(block);
+      delete block;
+    }
+  }
+
+  Block* mallocImpl(size_t origSize, PoolState& poolState) {
     size_t size = roundSize(origSize);
-    auto& pool = getPool(size);
+    auto& pool = getPool(size, poolState);
     const size_t allocSize = getAllocationSize(size);
     AllocParams params(size, &pool, allocSize);
 
@@ -172,13 +198,21 @@ class CachedAllocatorImpl : public Allocator {
     if (!blockFound) {
       blockFound = allocBlock(params);
       if (!blockFound) {
-        // retry after release caches
-        releaseCachedBlocks();
+        // retry after release caches from the same pool
+        releaseCachedBlocks(poolState);
         blockFound = allocBlock(params);
       }
 
       if (!blockFound) {
-        LOGE("Out of memory. failed to allocate size: %lld", allocSize);
+        // last resort: try releasing default pool caches
+        if (activePoolId_ >= 0) {
+          releaseCachedBlocks(defaultPool_);
+          blockFound = allocBlock(params);
+        }
+      }
+
+      if (!blockFound) {
+        LOGE("Out of memory. failed to allocate size: %zu", allocSize);
         return nullptr;
       }
     }
@@ -206,18 +240,13 @@ class CachedAllocatorImpl : public Allocator {
     return block;
   }
 
-  static void freeImpl(Block* block) {
-    block->allocated = false;
-    freeBlock(block);
-  }
-
   static void freeBlock(Block* block) {
     ASSERT(!block->allocated);
     auto& pool = *block->pool;
 
     const std::array<Block*, 2> mergeCandidates = {block->prev, block->next};
     for (Block* candidate : mergeCandidates) {
-      tryMergeBlocks(block, candidate, pool);
+      (void)tryMergeBlocks(block, candidate, pool);
     }
     pool.blocks.insert(block);
   }
@@ -251,43 +280,165 @@ class CachedAllocatorImpl : public Allocator {
   }
 
   void* allocate(int64_t nbytes) override {
-    Block* block = mallocImpl(nbytes);
+    auto& poolState = getActivePoolState();
+    Block* block = mallocImpl(static_cast<size_t>(nbytes), poolState);
     if (block) {
-      activeBlocks[block->ptr] = block;
+      activeBlocks_[block->ptr] = block;
       return block->ptr;
     }
 
-    LOGE("allocate error, size: %lld", nbytes);
+    LOGE("allocate error, size: %lld", static_cast<long long>(nbytes));
     return nullptr;
   }
 
   void deallocate(void* ptr) override {
-    auto it = activeBlocks.find(ptr);
-    if (it != activeBlocks.end()) {
-      freeImpl(it->second);
-      activeBlocks.erase(it);
+    auto it = activeBlocks_.find(ptr);
+    if (it != activeBlocks_.end()) {
+      Block* block = it->second;
+      activeBlocks_.erase(it);
+
+      block->allocated = false;
+      freeBlock(block);
+
+      if (!CachedAllocator::isCacheEnabled()) {
+        if (!block->isSplit()) {
+          releaseBlock(block);
+        }
+      }
     } else {
       LOGE("deallocate error, ptr not valid: %p", ptr);
     }
   }
 
+  void beginAllocateToPool(int poolId) {
+    ASSERT(activePoolId_ < 0 && "Nested pool allocation not supported");
+    activePoolId_ = poolId;
+    // lazy-create pool and increment ref count
+    auto it = graphPools_.find(poolId);
+    if (it == graphPools_.end()) {
+      graphPools_.emplace(poolId, PoolState{});
+      it = graphPools_.find(poolId);
+    }
+    it->second.refCount++;
+  }
+
+  void endAllocateToPool() {
+    ASSERT(activePoolId_ >= 0 && "endAllocateToPool called without matching begin");
+    auto it = graphPools_.find(activePoolId_);
+    if (it != graphPools_.end()) {
+      it->second.refCount--;
+    }
+    activePoolId_ = -1;
+  }
+
+  void freePool(int poolId) {
+    auto it = graphPools_.find(poolId);
+    if (it == graphPools_.end()) {
+      return;
+    }
+
+    // do not free a pool that is still actively referenced
+    if (it->second.refCount > 0) {
+      LOGE("freePool warning: pool %d still has %d active references, forcing release", poolId, it->second.refCount);
+    }
+
+    // release all blocks in the pool back to the base allocator.
+    releaseAllBlocks(it->second.largeBlocks);
+    releaseAllBlocks(it->second.smallBlocks);
+    graphPools_.erase(it);
+  }
+
+  int activePoolId() const { return activePoolId_; }
+
   ~CachedAllocatorImpl() override {
-    releaseCachedBlocks();
-    ASSERT(activeBlocks.empty());
-    ASSERT(largeBlocks.blocks.empty());
-    ASSERT(smallBlocks.blocks.empty());
+    // release all graph pools
+    for (auto& [id, pool] : graphPools_) {
+      releaseAllBlocks(pool.largeBlocks);
+      releaseAllBlocks(pool.smallBlocks);
+    }
+    graphPools_.clear();
+
+    // release any remaining active blocks
+    for (auto& [ptr, block] : activeBlocks_) {
+      base_->deallocate(block->ptr);
+      delete block;
+    }
+    activeBlocks_.clear();
+
+    // release default pool cached blocks
+    releaseAllBlocks(defaultPool_.largeBlocks);
+    releaseAllBlocks(defaultPool_.smallBlocks);
   }
 
  private:
+  PoolState& getActivePoolState() {
+    if (activePoolId_ < 0) {
+      return defaultPool_;
+    }
+    return graphPools_[activePoolId_];
+  }
+
   Allocator* base_;
   uint64_t totalAllocatedSize_;
 
-  BlockPool largeBlocks;
-  BlockPool smallBlocks;
-  ankerl::unordered_dense::map<void*, Block*> activeBlocks;
+  PoolState defaultPool_;
+
+  std::unordered_map<int, PoolState> graphPools_;
+  int activePoolId_;  // -1 = default pool
+
+  ankerl::unordered_dense::map<void*, Block*> activeBlocks_;
 };
 
 CachedAllocator::CachedAllocator(std::unique_ptr<Allocator> base)
     : base_(std::move(base)), impl_(std::make_unique<CachedAllocatorImpl>(base_.get())) {}
 
+CachedAllocator::~CachedAllocator() = default;
+
+CachedAllocator::CachedAllocator(CachedAllocator&& other) noexcept
+    : base_(std::move(other.base_)), impl_(std::move(other.impl_)) {}
+
+CachedAllocator& CachedAllocator::operator=(CachedAllocator&& other) noexcept {
+  if (this != &other) {
+    impl_ = std::move(other.impl_);
+    base_ = std::move(other.base_);
+  }
+  return *this;
+}
+
+void* CachedAllocator::allocate(int64_t nbytes) {
+  std::lock_guard<std::recursive_mutex> lock(mutex_);
+  return impl_->allocate(nbytes);
+}
+
+void CachedAllocator::deallocate(void* ptr) {
+  std::lock_guard<std::recursive_mutex> lock(mutex_);
+  impl_->deallocate(ptr);
+}
+
+void CachedAllocator::beginAllocateToPool(int poolId) {
+  std::lock_guard<std::recursive_mutex> lock(mutex_);
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-static-cast-downcast)
+  static_cast<CachedAllocatorImpl*>(impl_.get())->beginAllocateToPool(poolId);
+}
+
+void CachedAllocator::endAllocateToPool() {
+  std::lock_guard<std::recursive_mutex> lock(mutex_);
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-static-cast-downcast)
+  static_cast<CachedAllocatorImpl*>(impl_.get())->endAllocateToPool();
+}
+
+void CachedAllocator::freePool(int poolId) {
+  std::lock_guard<std::recursive_mutex> lock(mutex_);
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-static-cast-downcast)
+  static_cast<CachedAllocatorImpl*>(impl_.get())->freePool(poolId);
+}
+
+int CachedAllocator::activePoolId() const {
+  std::lock_guard<std::recursive_mutex> lock(mutex_);
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-static-cast-downcast)
+  return static_cast<CachedAllocatorImpl*>(impl_.get())->activePoolId();
+}
+
+int CachedAllocator::newPoolId() { return nextPoolId_.fetch_add(1, std::memory_order_relaxed); }
+
 }  // namespace tinytorch
diff --git a/src/Tensor/CachedAllocator.h b/src/Tensor/CachedAllocator.h
index 63bf375..e5c5719 100644
--- a/src/Tensor/CachedAllocator.h
+++ b/src/Tensor/CachedAllocator.h
@@ -6,6 +6,9 @@
 
 #pragma once
 
+#include <atomic>
+#include <mutex>
+
 #include "Allocator.h"
 
 namespace tinytorch {
@@ -13,28 +16,35 @@ namespace tinytorch {
 class CachedAllocator : public Allocator {
  public:
   explicit CachedAllocator(std::unique_ptr<Allocator> base);
+  ~CachedAllocator() override;
+
+  CachedAllocator(CachedAllocator&&) noexcept;
+  CachedAllocator& operator=(CachedAllocator&&) noexcept;
+
+  CachedAllocator(const CachedAllocator&) = delete;
+  CachedAllocator& operator=(const CachedAllocator&) = delete;
+
+  static void setCacheEnabled(bool enabled) { cacheEnabled_.store(enabled, std::memory_order_release); }
+  static bool isCacheEnabled() { return cacheEnabled_.load(std::memory_order_acquire); }
+
+  void* allocate(int64_t nbytes) override;
+  void deallocate(void* ptr) override;
+
+  void beginAllocateToPool(int poolId);
+  void endAllocateToPool();
 
-  static void setCacheEnabled(bool enabled) { cacheEnabled_ = enabled; }
+  void freePool(int poolId);
 
-  void* allocate(int64_t nbytes) override {
-    if (!cacheEnabled_) {
-      return base_->allocate(nbytes);
-    }
-    return impl_->allocate(nbytes);
-  }
+  int activePoolId() const;
 
-  void deallocate(void* ptr) override {
-    if (!cacheEnabled_) {
-      base_->deallocate(ptr);
-      return;
-    }
-    impl_->deallocate(ptr);
-  }
+  static int newPoolId();
 
  private:
-  static bool cacheEnabled_;
+  static std::atomic<bool> cacheEnabled_;
+  static std::atomic<int> nextPoolId_;
   std::unique_ptr<Allocator> base_;
   std::unique_ptr<Allocator> impl_;
+  mutable std::recursive_mutex mutex_;
 };
 
 }  // namespace tinytorch
diff --git a/src/Tensor/Device.h b/src/Tensor/Device.h
index 664d7d4..227b06f 100644
--- a/src/Tensor/Device.h
+++ b/src/Tensor/Device.h
@@ -39,6 +39,7 @@ struct Device {
   Device(DeviceType t, DeviceIndex i = 0) : type(t), index(i) {}
 
   bool operator==(const Device& other) const { return type == other.type && index == other.index; }
+  bool operator!=(const Device& other) const { return type != other.type || index != other.index; }
 
   bool isCpu() const { return type == DeviceType::CPU; }
   bool isCuda() const { return type == DeviceType::CUDA; }
diff --git a/src/Utils/CUDAGraph.cpp b/src/Utils/CUDAGraph.cpp
new file mode 100644
index 0000000..e4c7678
--- /dev/null
+++ b/src/Utils/CUDAGraph.cpp
@@ -0,0 +1,36 @@
+/*
+ * TinyTorch
+ * @author 	: keith@robot9.me
+ *
+ */
+
+#include "CUDAGraph.h"
+
+#ifdef USE_CUDA
+
+#include "Tensor/Allocator.h"
+#include "Tensor/CachedAllocator.h"
+
+namespace tinytorch::cuda {
+
+CUDAGraphCaptureGuard::CUDAGraphCaptureGuard(CUDAGraph& graph, CUDAStream& stream, int poolId)
+    : graph_(graph), stream_(stream), poolId_(poolId) {
+  CachedAllocator* allocator = getCUDACachedAllocator(stream.deviceIdx());
+  ASSERT(allocator != nullptr && "Failed to get CUDA CachedAllocator");
+  allocator->beginAllocateToPool(poolId_);
+
+  graph_.beginCapture(stream_);
+}
+
+CUDAGraphCaptureGuard::~CUDAGraphCaptureGuard() {
+  CachedAllocator* allocator = getCUDACachedAllocator(stream_.deviceIdx());
+  if (allocator) {
+    allocator->endAllocateToPool();
+  }
+
+  graph_.endCapture(stream_);
+}
+
+}  // namespace tinytorch::cuda
+
+#endif  // USE_CUDA
diff --git a/src/Utils/CUDAGraph.h b/src/Utils/CUDAGraph.h
new file mode 100644
index 0000000..cb37a75
--- /dev/null
+++ b/src/Utils/CUDAGraph.h
@@ -0,0 +1,103 @@
+/*
+ * TinyTorch
+ * @author 	: keith@robot9.me
+ *
+ */
+
+#pragma once
+
+#ifdef USE_CUDA
+
+#include "CUDAUtils.h"
+
+namespace tinytorch::cuda {
+
+class CUDAGraph {
+ public:
+  CUDAGraph() = default;
+  ~CUDAGraph() { reset(); }
+
+  CUDAGraph(CUDAGraph&& other) noexcept : graph_(other.graph_), graphExec_(other.graphExec_) {
+    other.graph_ = nullptr;
+    other.graphExec_ = nullptr;
+  }
+
+  CUDAGraph& operator=(CUDAGraph&& other) noexcept {
+    if (this != &other) {
+      reset();
+      graph_ = other.graph_;
+      graphExec_ = other.graphExec_;
+      other.graph_ = nullptr;
+      other.graphExec_ = nullptr;
+    }
+    return *this;
+  }
+
+  CUDAGraph(const CUDAGraph&) = delete;
+  CUDAGraph& operator=(const CUDAGraph&) = delete;
+
+  void beginCapture(CUDAStream& stream, cudaStreamCaptureMode mode = cudaStreamCaptureModeGlobal) {
+    if (graph_ || graphExec_) {
+      // auto-reset to avoid resource leaks in release mode
+      reset();
+    }
+    ASSERT(stream.valid() && "Stream must be valid for capture");
+    CUDA_CHECK(cudaStreamBeginCapture(stream.stream(), mode));
+  }
+
+  void endCapture(CUDAStream& stream) {
+    ASSERT(!graph_ && !graphExec_ && "Graph already instantiated");
+    CUDA_CHECK(cudaStreamEndCapture(stream.stream(), &graph_));
+    ASSERT(graph_ != nullptr && "cudaStreamEndCapture returned null graph");
+
+    // instantiate: creates the executable graph from the captured graph
+#if CUDART_VERSION >= 12000
+    CUDA_CHECK(cudaGraphInstantiate(&graphExec_, graph_, 0ULL));
+#else
+    CUDA_CHECK(cudaGraphInstantiate(&graphExec_, graph_, nullptr, nullptr, 0));
+#endif
+    ASSERT(graphExec_ != nullptr);
+  }
+
+  void replay(CUDAStream& stream) {
+    ASSERT(graphExec_ != nullptr && "No graph captured; call beginCapture/endCapture first");
+    CUDA_CHECK(cudaGraphLaunch(graphExec_, stream.stream()));
+  }
+
+  void reset() {
+    if (graphExec_) {
+      CUDA_CHECK(cudaGraphExecDestroy(graphExec_));
+      graphExec_ = nullptr;
+    }
+    if (graph_) {
+      CUDA_CHECK(cudaGraphDestroy(graph_));
+      graph_ = nullptr;
+    }
+  }
+
+  bool valid() const { return graphExec_ != nullptr; }
+
+ private:
+  cudaGraph_t graph_ = nullptr;
+  cudaGraphExec_t graphExec_ = nullptr;
+};
+
+class CUDAGraphCaptureGuard {
+ public:
+  CUDAGraphCaptureGuard(CUDAGraph& graph, CUDAStream& stream, int poolId);
+  ~CUDAGraphCaptureGuard();
+
+  CUDAGraphCaptureGuard(const CUDAGraphCaptureGuard&) = delete;
+  CUDAGraphCaptureGuard& operator=(const CUDAGraphCaptureGuard&) = delete;
+  CUDAGraphCaptureGuard(CUDAGraphCaptureGuard&&) = delete;
+  CUDAGraphCaptureGuard& operator=(CUDAGraphCaptureGuard&&) = delete;
+
+ private:
+  CUDAGraph& graph_;
+  CUDAStream& stream_;
+  int poolId_;
+};
+
+}  // namespace tinytorch::cuda
+
+#endif  // USE_CUDA
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 94c4c14..92b2489 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -19,6 +19,7 @@ add_executable(${PROJECT_NAME}
         test_function.cpp
         test_module.cpp
         test_optimizer.cpp
+        test_allocator.cpp
 )
 target_include_directories(${PROJECT_NAME} PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/../src
@@ -28,6 +29,12 @@ target_include_directories(${PROJECT_NAME} PRIVATE
 )
 target_link_libraries(${PROJECT_NAME} TinyTorch_lib gtest_main)
 
+# cuda support
+find_package(CUDAToolkit QUIET)
+if (CUDAToolkit_FOUND)
+    target_compile_definitions(${PROJECT_NAME} PRIVATE USE_CUDA)
+endif ()
+
 include(GoogleTest)
 
 # add tests
diff --git a/test/test_allocator.cpp b/test/test_allocator.cpp
new file mode 100644
index 0000000..6faf82a
--- /dev/null
+++ b/test/test_allocator.cpp
@@ -0,0 +1,462 @@
+/*
+ * TinyTorch
+ * @author 	: keith@robot9.me
+ *
+ */
+
+#include <cstring>
+#include <set>
+#include <thread>
+#include <vector>
+
+#include "Tensor/Allocator.h"
+#include "Tensor/CachedAllocator.h"
+#include "TinyTorch.h"
+#include "Utils/CUDAUtils.h"
+#include "test.h"
+
+namespace tt = tinytorch;
+
+TEST(allocator, cpu_allocate_deallocate) {
+  auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu()));
+  ASSERT_NE(allocator, nullptr);
+
+  // Basic allocation
+  void* ptr = allocator->allocate(1024);
+  ASSERT_NE(ptr, nullptr);
+
+  // Write/read to verify the memory is usable
+  std::memset(ptr, 0xAB, 1024);
+  auto* bytes = static_cast<uint8_t*>(ptr);
+  EXPECT_EQ(bytes[0], 0xAB);
+  EXPECT_EQ(bytes[1023], 0xAB);
+
+  allocator->deallocate(ptr);
+}
+
+TEST(allocator, cpu_zero_size_allocation) {
+  auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu()));
+  ASSERT_NE(allocator, nullptr);
+
+  // Zero-size allocation should still return something valid from the caching layer
+  // (rounded up to kMinBlockSize=512)
+  void* ptr = allocator->allocate(0);
+  // Implementation rounds to at least 512, so this should succeed
+  if (ptr) {
+    allocator->deallocate(ptr);
+  }
+}
+
+TEST(allocator, cpu_alignment) {
+  auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu()));
+  ASSERT_NE(allocator, nullptr);
+
+  // Allocate multiple times and verify alignment (default 32-byte alignment)
+  for (int i = 0; i < 10; i++) {
+    void* ptr = allocator->allocate(64 + i * 7);  // various non-aligned sizes
+    ASSERT_NE(ptr, nullptr);
+    // The underlying memory should be 32-byte aligned (from CPUAllocator)
+    // but CachedAllocator may return sub-block offsets that are 512-aligned
+    // which is a multiple of 32.
+    EXPECT_EQ(reinterpret_cast<uintptr_t>(ptr) % 32, 0u) << "Allocation " << i << " not 32-byte aligned";
+    allocator->deallocate(ptr);
+  }
+}
+
+TEST(allocator, cpu_multiple_allocations) {
+  auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu()));
+  ASSERT_NE(allocator, nullptr);
+
+  std::vector<void*> ptrs;
+  constexpr int kNumAllocs = 100;
+
+  // Allocate many blocks
+  for (int i = 0; i < kNumAllocs; i++) {
+    void* ptr = allocator->allocate(512 * (i + 1));
+    ASSERT_NE(ptr, nullptr);
+    ptrs.push_back(ptr);
+  }
+
+  // Verify all pointers are unique
+  std::set<void*> uniquePtrs(ptrs.begin(), ptrs.end());
+  EXPECT_EQ(uniquePtrs.size(), ptrs.size());
+
+  // Free all
+  for (auto* ptr : ptrs) {
+    allocator->deallocate(ptr);
+  }
+}
+
+TEST(allocator, cache_reuse_same_size) {
+  auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu()));
+  ASSERT_NE(allocator, nullptr);
+
+  // Allocate and free
+  void* ptr1 = allocator->allocate(1024);
+  ASSERT_NE(ptr1, nullptr);
+  allocator->deallocate(ptr1);
+
+  // Allocate same size — should reuse the cached block
+  void* ptr2 = allocator->allocate(1024);
+  ASSERT_NE(ptr2, nullptr);
+  EXPECT_EQ(ptr1, ptr2) << "Expected cached block reuse for same size";
+  allocator->deallocate(ptr2);
+}
+
+TEST(allocator, cache_reuse_smaller_size) {
+  auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu()));
+  ASSERT_NE(allocator, nullptr);
+
+  // Allocate a larger block
+  void* ptr1 = allocator->allocate(4096);
+  ASSERT_NE(ptr1, nullptr);
+  allocator->deallocate(ptr1);
+
+  // Allocate a smaller size — might get the same block (or split from it)
+  void* ptr2 = allocator->allocate(512);
+  ASSERT_NE(ptr2, nullptr);
+  // ptr2 should be at the same base address (beginning of the split block)
+  EXPECT_EQ(ptr1, ptr2) << "Expected block split to reuse base address";
+  allocator->deallocate(ptr2);
+}
+
+TEST(allocator, cache_disabled_no_reuse) {
+  auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu()));
+  ASSERT_NE(allocator, nullptr);
+
+  // Allocate with cache enabled
+  void* ptr1 = allocator->allocate(2048);
+  ASSERT_NE(ptr1, nullptr);
+
+  // Disable cache, then free — block should be released to base allocator
+  tt::CachedAllocator::setCacheEnabled(false);
+  allocator->deallocate(ptr1);
+
+  // Allocate same size — should NOT get the same address (no caching)
+  void* ptr2 = allocator->allocate(2048);
+  ASSERT_NE(ptr2, nullptr);
+  // Note: it's possible (but unlikely) to get the same address from malloc,
+  // so we don't assert inequality. Just ensure it works without crash.
+  allocator->deallocate(ptr2);
+
+  // Re-enable cache
+  tt::CachedAllocator::setCacheEnabled(true);
+}
+
+TEST(allocator, cache_toggle_correctness) {
+  auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu()));
+  ASSERT_NE(allocator, nullptr);
+
+  // Allocate with cache enabled
+  void* ptr1 = allocator->allocate(1024);
+  ASSERT_NE(ptr1, nullptr);
+
+  // Disable cache
+  tt::CachedAllocator::setCacheEnabled(false);
+
+  // Allocate another (still tracked by impl)
+  void* ptr2 = allocator->allocate(1024);
+  ASSERT_NE(ptr2, nullptr);
+
+  // Re-enable cache
+  tt::CachedAllocator::setCacheEnabled(true);
+
+  // Deallocate both — should not crash regardless of cache state changes
+  allocator->deallocate(ptr1);
+  allocator->deallocate(ptr2);
+
+  // Verify reuse now works again
+  void* ptr3 = allocator->allocate(1024);
+  ASSERT_NE(ptr3, nullptr);
+  allocator->deallocate(ptr3);
+}
+
+TEST(allocator, pool_id_uniqueness) {
+  int id1 = tt::CachedAllocator::newPoolId();
+  int id2 = tt::CachedAllocator::newPoolId();
+  int id3 = tt::CachedAllocator::newPoolId();
+  EXPECT_NE(id1, id2);
+  EXPECT_NE(id2, id3);
+  EXPECT_NE(id1, id3);
+}
+
+TEST(allocator, concurrent_allocate_deallocate) {
+  auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu()));
+  ASSERT_NE(allocator, nullptr);
+
+  constexpr int kNumThreads = 8;
+  constexpr int kAllocsPerThread = 100;
+  std::vector<std::thread> threads;
+
+  std::atomic<int> errors{0};
+
+  threads.reserve(kNumThreads);
+  for (int t = 0; t < kNumThreads; t++) {
+    threads.emplace_back([&, t]() {
+      std::vector<void*> ptrs;
+      for (int i = 0; i < kAllocsPerThread; i++) {
+        size_t size = 512 * static_cast<size_t>((t * kAllocsPerThread + i) % 20 + 1);
+        void* ptr = allocator->allocate(static_cast<int64_t>(size));
+        if (!ptr) {
+          errors.fetch_add(1);
+          continue;
+        }
+        // Touch the memory to detect corruption
+        std::memset(ptr, static_cast<int>(t & 0xFF), std::min(size, size_t(64)));
+        ptrs.push_back(ptr);
+      }
+      // Deallocate in reverse order
+      for (auto it = ptrs.rbegin(); it != ptrs.rend(); ++it) {
+        allocator->deallocate(*it);
+      }
+    });
+  }
+
+  for (auto& th : threads) {
+    th.join();
+  }
+
+  EXPECT_EQ(errors.load(), 0) << "Some allocations failed under contention";
+}
+
+TEST(allocator, concurrent_allocate_interleaved) {
+  auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu()));
+  ASSERT_NE(allocator, nullptr);
+
+  constexpr int kNumThreads = 4;
+  constexpr int kIterations = 200;
+  std::vector<std::thread> threads;
+
+  std::atomic<int> errors{0};
+
+  threads.reserve(kNumThreads);
+  for (int t = 0; t < kNumThreads; t++) {
+    threads.emplace_back([&]() {
+      for (int i = 0; i < kIterations; i++) {
+        // Allocate and immediately free — exercises cache reuse under contention
+        void* ptr = allocator->allocate(1024);
+        if (!ptr) {
+          errors.fetch_add(1);
+          continue;
+        }
+        // Small delay to increase interleaving probability
+        std::memset(ptr, 0, 64);
+        allocator->deallocate(ptr);
+      }
+    });
+  }
+
+  for (auto& th : threads) {
+    th.join();
+  }
+
+  EXPECT_EQ(errors.load(), 0);
+}
+
+TEST(allocator, large_allocation) {
+  auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu()));
+  ASSERT_NE(allocator, nullptr);
+
+  // Allocate > kMinLargeAlloc (10 MiB) — uses kRoundLarge rounding
+  constexpr size_t kLargeSize = 16 * 1024 * 1024;  // 16 MiB
+  void* ptr = allocator->allocate(static_cast<int64_t>(kLargeSize));
+  ASSERT_NE(ptr, nullptr);
+
+  // Write to first and last byte
+  auto* bytes = static_cast<uint8_t*>(ptr);
+  bytes[0] = 0x11;
+  bytes[kLargeSize - 1] = 0x22;
+  EXPECT_EQ(bytes[0], 0x11);
+  EXPECT_EQ(bytes[kLargeSize - 1], 0x22);
+
+  allocator->deallocate(ptr);
+}
+
+TEST(allocator, mixed_sizes) {
+  auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu()));
+  ASSERT_NE(allocator, nullptr);
+
+  // Mix of small and large allocations
+  std::vector<std::pair<void*, size_t>> allocs;
+  std::vector<size_t> sizes = {64, 512, 1024, 4096, 1048576, 2097152, 10485760, 20971520};
+
+  for (auto size : sizes) {
+    void* ptr = allocator->allocate(static_cast<int64_t>(size));
+    ASSERT_NE(ptr, nullptr) << "Failed to allocate " << size << " bytes";
+    allocs.emplace_back(ptr, size);
+  }
+
+  // Verify all are writable
+  for (auto& [ptr, size] : allocs) {
+    std::memset(ptr, 0xCD, std::min(size, size_t(128)));
+  }
+
+  // Free in random-ish order
+  for (int i = static_cast<int>(allocs.size()) - 1; i >= 0; i -= 2) {
+    allocator->deallocate(allocs[i].first);
+  }
+  for (int i = 0; i < static_cast<int>(allocs.size()); i += 2) {
+    allocator->deallocate(allocs[i].first);
+  }
+}
+
+TEST(allocator, tensor_allocate_cpu) {
+  auto opts = tt::Options(tt::Device::cpu(), tt::DType::Float32).noGrad();
+
+  // Create tensor — exercises full allocator path
+  tt::Tensor t({100, 100}, opts);
+  ASSERT_NE(t.dataPtr<>(), nullptr);
+
+  // Fill and verify
+  auto data = std::vector<float>(10000, 3.14f);
+  tt::Tensor t2(data, {100, 100}, opts);
+  auto list = t2.toList<float>();
+  EXPECT_EQ(list.size(), 10000u);
+  EXPECT_NEAR(list[0], 3.14f, 1e-5f);
+  EXPECT_NEAR(list[9999], 3.14f, 1e-5f);
+}
+
+TEST(allocator, tensor_reuse_after_destroy) {
+  auto opts = tt::Options(tt::Device::cpu(), tt::DType::Float32).noGrad();
+
+  void* firstPtr = nullptr;
+  {
+    tt::Tensor t({256}, opts);
+    firstPtr = t.dataPtr<>();
+    ASSERT_NE(firstPtr, nullptr);
+  }
+  // Tensor destroyed, memory returned to cache
+
+  // New tensor of same size should reuse the cached block
+  tt::Tensor t2({256}, opts);
+  void* secondPtr = t2.dataPtr<>();
+  EXPECT_EQ(firstPtr, secondPtr) << "Expected cached block reuse for Tensor";
+}
+
+TEST(allocator, tensor_multiple_create_destroy_cycles) {
+  auto opts = tt::Options(tt::Device::cpu(), tt::DType::Float32).noGrad();
+
+  // Stress test: create and destroy many tensors
+  for (int cycle = 0; cycle < 50; cycle++) {
+    std::vector<tt::Tensor> tensors;
+    for (int i = 0; i < 20; i++) {
+      tensors.push_back(tt::Tensor({static_cast<int64_t>(128 * (i + 1))}, opts));
+      ASSERT_NE(tensors.back().dataPtr<>(), nullptr);
+    }
+    // All tensors freed when vector goes out of scope
+  }
+  // Should not crash, leak, or corrupt
+}
+
+#ifdef USE_CUDA
+
+#define SKIP_IF_NO_CUDA()                                         \
+  do {                                                            \
+    if (!tt::cuda::deviceAvailable()) {                           \
+      GTEST_SKIP() << "CUDA device not available; skipping test"; \
+    }                                                             \
+  } while (0)
+
+TEST(allocator, cuda_basic_allocate_deallocate) {
+  SKIP_IF_NO_CUDA();
+
+  auto* allocator = tt::getCUDACachedAllocator(0);
+  ASSERT_NE(allocator, nullptr);
+
+  void* ptr = allocator->allocate(4096);
+  ASSERT_NE(ptr, nullptr);
+  allocator->deallocate(ptr);
+}
+
+TEST(allocator, cuda_cache_reuse) {
+  SKIP_IF_NO_CUDA();
+
+  auto* allocator = tt::getCUDACachedAllocator(0);
+  ASSERT_NE(allocator, nullptr);
+
+  void* ptr1 = allocator->allocate(2048);
+  ASSERT_NE(ptr1, nullptr);
+  allocator->deallocate(ptr1);
+
+  void* ptr2 = allocator->allocate(2048);
+  ASSERT_NE(ptr2, nullptr);
+  EXPECT_EQ(ptr1, ptr2) << "CUDA cached block should be reused";
+  allocator->deallocate(ptr2);
+}
+
+TEST(allocator, cuda_pool_begin_end) {
+  SKIP_IF_NO_CUDA();
+
+  auto* allocator = tt::getCUDACachedAllocator(0);
+  ASSERT_NE(allocator, nullptr);
+
+  EXPECT_EQ(allocator->activePoolId(), -1);
+
+  int poolId = tt::CachedAllocator::newPoolId();
+  allocator->beginAllocateToPool(poolId);
+  EXPECT_EQ(allocator->activePoolId(), poolId);
+
+  void* ptr = allocator->allocate(1024);
+  ASSERT_NE(ptr, nullptr);
+
+  allocator->endAllocateToPool();
+  EXPECT_EQ(allocator->activePoolId(), -1);
+
+  // Free and cleanup
+  allocator->deallocate(ptr);
+  allocator->freePool(poolId);
+}
+
+TEST(allocator, cuda_tensor_integration) {
+  SKIP_IF_NO_CUDA();
+
+  auto opts = tt::Options(tt::Device(tt::DeviceType::CUDA, 0), tt::DType::Float32).noGrad();
+
+  // Create tensors on GPU
+  std::vector<float> data(1024, 1.5f);
+  tt::Tensor t(data, {32, 32}, opts);
+
+  // Copy back and verify
+  auto host = t.toList<float>();
+  EXPECT_EQ(host.size(), 1024u);
+  EXPECT_NEAR(host[0], 1.5f, 1e-5f);
+  EXPECT_NEAR(host[1023], 1.5f, 1e-5f);
+}
+
+TEST(allocator, cuda_concurrent_allocate) {
+  SKIP_IF_NO_CUDA();
+
+  auto* allocator = tt::getCUDACachedAllocator(0);
+  ASSERT_NE(allocator, nullptr);
+
+  constexpr int kNumThreads = 4;
+  constexpr int kAllocsPerThread = 50;
+  std::vector<std::thread> threads;
+  std::atomic<int> errors{0};
+
+  threads.reserve(kNumThreads);
+  for (int t = 0; t < kNumThreads; t++) {
+    threads.emplace_back([&]() {
+      std::vector<void*> ptrs;
+      for (int i = 0; i < kAllocsPerThread; i++) {
+        void* ptr = allocator->allocate(1024 * (i + 1));
+        if (!ptr) {
+          errors.fetch_add(1);
+          continue;
+        }
+        ptrs.push_back(ptr);
+      }
+      for (auto* ptr : ptrs) {
+        allocator->deallocate(ptr);
+      }
+    });
+  }
+
+  for (auto& th : threads) {
+    th.join();
+  }
+
+  EXPECT_EQ(errors.load(), 0);
+}
+
+#endif  // USE_CUDA
diff --git a/test/test_module.cpp b/test/test_module.cpp
index 80b2425..e2a5131 100644
--- a/test/test_module.cpp
+++ b/test/test_module.cpp
@@ -59,33 +59,3 @@ TEST(TEST_Module, dropout) {
   EXPECT_TRUE((output == 0).to(DType::Float32).sum().item<float>() > 0);
 }
 
-TEST(TEST_Module, rope) {
-  auto rope = nn::RoPE(4, 3, 1000.f);
-  EXPECT_THAT(rope.cache().shape(), ElementsAre(3, 4, 2));
-  EXPECT_TRUE(
-      VectorNear(rope.cache().toList<float>(),
-                 {1.0000, 0.0000, 1.0000, 0.0000, 1.0000,  0.0000, 1.0000, 0.0000, 0.5403,  0.8415, 0.9995, 0.0316,
-                  0.5403, 0.8415, 0.9995, 0.0316, -0.4161, 0.9093, 0.9980, 0.0632, -0.4161, 0.9093, 0.9980, 0.0632}));
-
-  RopeScalingConfig scaling{2.f, 4.f, 1.f, 2};
-  rope = nn::RoPE(4, 3, 10000.f, scaling);
-  EXPECT_TRUE(
-      VectorNear(rope.cache().toList<float>(),
-                 {1.0000, 0.0000, 1.0000, 0.0000, 1.0000, 0.0000, 1.0000, 0.0000, 0.8776, 0.4794, 1.0000, 0.0050,
-                  0.8776, 0.4794, 1.0000, 0.0050, 0.5403, 0.8415, 0.9999, 0.0100, 0.5403, 0.8415, 0.9999, 0.0100}));
-
-  auto x = Tensor(Array1d<float>{
-      0.0516,  -0.9695, -0.0861, 1.3223,  0.5351,  0.1768,  -0.0966, 0.2490,  -0.8414, -0.5110, -1.1106, -0.1058,
-      0.2775,  0.4491,  -0.5324, 0.1249,  1.0637,  -1.3959, 0.4438,  2.3408,  -0.0822, 1.0439,  -0.7985, 0.1040,
-      0.7335,  -2.2184, -0.6714, -0.3401, 1.5914,  -0.4572, -0.7603, -0.6322, 1.0201,  -0.5109, 0.8172,  -0.5532,
-      -0.1150, 0.2467,  -0.3851, 0.5726,  -0.1837, -1.5242, 0.4929,  -0.2529, 1.2423,  -0.4946, -0.4185, 1.1320});
-  x.reshape_({2, 2, 3, 4});
-  auto y = rope(x);
-  EXPECT_THAT(y.shape(), ElementsAre(2, 2, 3, 4));
-  EXPECT_TRUE(VectorNear(
-      y.toList<float>(),
-      {0.0516,  -0.9695, -0.0861, 1.3223,  0.5159,  0.1756,  0.1718, 0.2499,  0.4799,  -0.5099, -1.3081, -0.1109,
-       0.2775,  0.4491,  -0.5324, 0.1249,  0.7207,  -1.4076, 0.8994, 2.3338,  0.6274,  1.0428,  -0.5006, 0.1145,
-       0.7335,  -2.2184, -0.6714, -0.3401, 1.7611,  -0.4540, 0.0957, -0.6344, -0.1365, -0.5053, 1.3000,  -0.5583,
-       -0.1150, 0.2467,  -0.3851, 0.5726,  -0.3975, -1.5229, 0.3445, -0.2605, 1.0234,  -0.5059, 0.8192,  1.1270}));
-}
diff --git a/third_party/TinyFA b/third_party/TinyFA
deleted file mode 160000
index 4e18516..0000000
--- a/third_party/TinyFA
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4e18516165acb029076b2ecc7d733c9ebf4d552a