diff --git a/.gitmodules b/.gitmodules index a4db716..6889aed 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ [submodule "test/googletest"] path = test/googletest url = https://github.com/google/googletest.git -[submodule "third_party/TinyFA"] - path = third_party/TinyFA - url = https://github.com/keith2018/TinyFA.git diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 76ea47c..aec31e2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -94,11 +94,3 @@ if (TINYTORCH_USE_NCCL) target_include_directories(${PROJECT_NAME} PUBLIC ${NCCL_INCLUDE_DIRS}) target_link_libraries(${PROJECT_NAME} PUBLIC ${NCCL_LIBRARY}) endif () - -# TinyFA -if (TINYTORCH_USE_CUDA) - set(TFA_BUILD_TESTS OFF CACHE BOOL "" FORCE) - set(TFA_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) - add_subdirectory(${THIRD_PARTY_DIR}/TinyFA TinyFA) - target_link_libraries(${PROJECT_NAME} PRIVATE TinyFA::tinyfa) -endif () diff --git a/src/Function/FuncFused.h b/src/Function/FuncFused.h index 4ac04ad..b8d9f76 100644 --- a/src/Function/FuncFused.h +++ b/src/Function/FuncFused.h @@ -18,6 +18,19 @@ class FuncSiluMul : public Function { static void backward(AutogradContext* ctx, const Tensor& grad) { NOT_IMPLEMENTED(); } }; +class FuncFusedAddRmsNorm : public Function { + public: + static void forward(AutogradContext* ctx, Tensor& input, Tensor& residual, const Tensor& weight, float eps) { + op::fusedAddRmsNorm(input, residual, weight, eps); + } + + static void backward(AutogradContext* ctx, const Tensor& grad) { NOT_IMPLEMENTED(); } +}; + inline Tensor siluMul(const Tensor& x) { return FuncSiluMul::apply(x); } +inline void fusedAddRmsNorm(Tensor& input, Tensor& residual, const Tensor& weight, float eps) { + FuncFusedAddRmsNorm::apply(input, residual, weight, eps); +} + } // namespace tinytorch::function diff --git a/src/Function/FuncNNLayer.h b/src/Function/FuncNNLayer.h index f9ac33d..8791e14 100644 --- a/src/Function/FuncNNLayer.h +++ b/src/Function/FuncNNLayer.h @@ -267,24 +267,6 @@ class FuncSDPAttention : public Function { static void backward(AutogradContext* ctx, const Tensor& grad) { NOT_IMPLEMENTED(); } }; -class FuncFlashAttention : public Function { - public: - static Tensor forward(AutogradContext* ctx, const Tensor& query, const Tensor& key, const Tensor& value, - bool isCausal) { - return op::flashAttention(query, key, value, isCausal); - } - static void backward(AutogradContext* ctx, const Tensor& grad) { NOT_IMPLEMENTED(); } -}; - -class FuncRoPE : public Function { - public: - static Tensor forward(AutogradContext* ctx, const Tensor& input, const Tensor& rope, int64_t offset, - QKVLayout layout) { - return op::ropeApply(input, rope, offset, layout); - } - static void backward(AutogradContext* ctx, const Tensor& grad) { NOT_IMPLEMENTED(); } -}; - inline Tensor linear(const Tensor& input, const Tensor& weight, const Tensor& bias = {}) { return FuncLinear::apply(input, weight, bias); } @@ -318,12 +300,5 @@ inline Tensor sdpAttention(const Tensor& query, const Tensor& key, const Tensor& std::optional scale = std::nullopt) { return FuncSDPAttention::apply(query, key, value, isCausal, attnMask, dropoutP, scale); } -inline Tensor flashAttention(const Tensor& query, const Tensor& key, const Tensor& value, bool isCausal = false) { - return FuncFlashAttention::apply(query, key, value, isCausal); -} -inline Tensor ropeApply(const Tensor& input, const Tensor& rope, int64_t offset = 0, - QKVLayout layout = QKVLayout::BHSD) { - return FuncRoPE::apply(input, rope, offset, layout); -} } // namespace tinytorch::function diff --git a/src/Module/Basic.cpp b/src/Module/Basic.cpp index dd73f1f..6449515 100644 --- a/src/Module/Basic.cpp +++ b/src/Module/Basic.cpp @@ -94,20 +94,4 @@ std::vector> Embedding::namedParameters_() { r void Embedding::resetParameters() { Initializer::normal(weight_); } -RoPE::RoPE(int64_t headDim, int64_t contextLength, float thetaBase, std::optional scaling, - Options options) - : headDim_(headDim), contextLength_(contextLength), thetaBase_(thetaBase), scaling_(scaling), options_(options) { - RoPE::resetParameters(); -} - -Tensor RoPE::forward(const Tensor &input) { return function::ropeApply(input, rope_); } - -Tensor RoPE::forward(const Tensor &input, int64_t offset, QKVLayout layout) { - return function::ropeApply(input, rope_, offset, layout); -} - -void RoPE::resetParameters() { rope_ = op::ropeInit(headDim_, contextLength_, thetaBase_, scaling_, options_); } - -std::vector> RoPE::namedStates_() { return {{"rope", &rope_}}; } - } // namespace tinytorch::nn \ No newline at end of file diff --git a/src/Module/Basic.h b/src/Module/Basic.h index c861520..4e50dc9 100644 --- a/src/Module/Basic.h +++ b/src/Module/Basic.h @@ -138,34 +138,4 @@ class Embedding : public Module { Tensor weight_; }; -class RoPE : public Module { - public: - explicit RoPE(int64_t headDim, int64_t contextLength = 4096, float thetaBase = 10000.0f, - std::optional scaling = std::nullopt, Options options = {}); - - using Module::forward; - Tensor forward(const Tensor &input) override; - Tensor forward(const Tensor &input, int64_t offset, QKVLayout layout = QKVLayout::BHSD); - - using Module::operator(); - Tensor operator()(const Tensor &input, int64_t offset, QKVLayout layout = QKVLayout::BHSD) { - return forward(input, offset, layout); - } - - void resetParameters() override; - - Tensor &cache() { return rope_; } - - protected: - std::vector> namedStates_() override; - - int64_t headDim_; - int64_t contextLength_; - float thetaBase_; - std::optional scaling_; - Options options_; - - Tensor rope_; -}; - } // namespace tinytorch::nn diff --git a/src/Module/Norm.h b/src/Module/Norm.h index ff7d131..d50a3b3 100644 --- a/src/Module/Norm.h +++ b/src/Module/Norm.h @@ -38,6 +38,7 @@ class RMSNorm : public Module { void resetParameters() override; Tensor &weight() { return weight_; } + float eps() const { return eps_; } protected: std::vector> namedParameters_() override; diff --git a/src/Operation/OpFused.h b/src/Operation/OpFused.h index 6fed16b..6188b40 100644 --- a/src/Operation/OpFused.h +++ b/src/Operation/OpFused.h @@ -13,9 +13,14 @@ namespace tinytorch::op { using SiluMulOpFn = Tensor (*)(const Tensor& self); +using FusedAddRmsNormOpFn = void (*)(Tensor& input, Tensor& residual, const Tensor& weight, float eps); + // siluMul DEFINE_OP(siluMul, SiluMulOpFn) +// fusedAddRmsNorm +DEFINE_OP(fusedAddRmsNorm, FusedAddRmsNormOpFn) + void registerFusedCpu(); STATIC_CALL(registerFusedCpu); diff --git a/src/Operation/OpFusedCpu.cpp b/src/Operation/OpFusedCpu.cpp index 8ad2207..cc44f3e 100644 --- a/src/Operation/OpFusedCpu.cpp +++ b/src/Operation/OpFusedCpu.cpp @@ -16,6 +16,9 @@ namespace tinytorch::op { void registerFusedCpu() { // siluMul REG_FUSED_CPU_FLT(siluMul, siluMulOpCpuImpl); + + // fusedAddRmsNorm + REG_FUSED_CPU_FLT(fusedAddRmsNorm, fusedAddRmsNormOpCpuImpl); } } // namespace tinytorch::op \ No newline at end of file diff --git a/src/Operation/OpFusedCpu.h b/src/Operation/OpFusedCpu.h index e8deef5..d9a7338 100644 --- a/src/Operation/OpFusedCpu.h +++ b/src/Operation/OpFusedCpu.h @@ -38,4 +38,35 @@ Tensor siluMulOpCpuImpl(const Tensor& self) { return ret; } +template +void fusedAddRmsNormOpCpuImpl(Tensor& input, Tensor& residual, const Tensor& weight, float eps) { + ASSERT(input.shape() == residual.shape()); + int64_t dim = input.size(-1); + int64_t numRows = input.numel() / dim; + + T* inputPtr = input.dataPtr(); + T* residualPtr = residual.dataPtr(); + const T* weightPtr = weight.dataPtr(); + + for (int64_t row = 0; row < numRows; row++) { + int64_t base = row * dim; + + // add residual + accumulate sum‑of‑squares + float sumSq = 0.f; + for (int64_t i = 0; i < dim; i++) { + float r = static_cast(inputPtr[base + i]) + static_cast(residualPtr[base + i]); + residualPtr[base + i] = static_cast(r); + sumSq += r * r; + } + + float invRms = 1.f / std::sqrt(sumSq / static_cast(dim) + eps); + + // normalize + affine + for (int64_t i = 0; i < dim; i++) { + auto r = static_cast(residualPtr[base + i]); + inputPtr[base + i] = static_cast(r * invRms * static_cast(weightPtr[i])); + } + } +} + } // namespace tinytorch::op diff --git a/src/Operation/OpFusedCuda.cu b/src/Operation/OpFusedCuda.cu index 43541b5..9c884f1 100644 --- a/src/Operation/OpFusedCuda.cu +++ b/src/Operation/OpFusedCuda.cu @@ -16,6 +16,9 @@ namespace tinytorch::op { void registerFusedCuda() { // siluMul REG_FUSED_CUDA_FLT(siluMul, siluMulOpCudaImpl); + + // fusedAddRmsNorm + REG_FUSED_CUDA_FLT(fusedAddRmsNorm, fusedAddRmsNormOpCudaImpl); } } // namespace tinytorch::op \ No newline at end of file diff --git a/src/Operation/OpFusedCuda.cuh b/src/Operation/OpFusedCuda.cuh index f871c1b..620f6f5 100644 --- a/src/Operation/OpFusedCuda.cuh +++ b/src/Operation/OpFusedCuda.cuh @@ -8,12 +8,46 @@ #include "OpElemWiseCuda.cuh" #include "OpFused.h" +#include "OpNNLayerCuda.cuh" #include "Utils/CUDAUtils.h" namespace tinytorch::op { +template +__global__ void kSiluMulVec(T* __restrict__ outPtr, const T* __restrict__ selfPtr, const int d) { + const unsigned int row = blockIdx.x; + const T* gatePtr = selfPtr + static_cast(row) * d * 2; + const T* upPtr = gatePtr + d; + T* rowOut = outPtr + static_cast(row) * d; + + const int numVecs = d / VEC_ELEMENTS; + + const int4* gateVec = reinterpret_cast(gatePtr); + const int4* upVec = reinterpret_cast(upPtr); + int4* outVec = reinterpret_cast(rowOut); + + for (auto i = threadIdx.x; i < numVecs; i += blockDim.x) { + // 128-bit load via read-only cache (__ldg) + int4 gv = __ldg(&gateVec[i]); + int4 uv = __ldg(&upVec[i]); + + const T* g = reinterpret_cast(&gv); + const T* u = reinterpret_cast(&uv); + T r[VEC_ELEMENTS]; + +#pragma unroll + for (int j = 0; j < VEC_ELEMENTS; ++j) { + auto gf = static_cast(g[j]); + auto uf = static_cast(u[j]); + r[j] = static_cast(gf / (1.f + __expf(-gf)) * uf); + } + + outVec[i] = *reinterpret_cast(r); + } +} + template -__global__ void kSiluMul(T* retPtr, const T* selfPtr, const int64_t halfLastDim, const int64_t n) { +__global__ void kSiluMulScalar(T* retPtr, const T* selfPtr, const int64_t halfLastDim, const int64_t n) { const auto index = blockIdx.x * blockDim.x + threadIdx.x; if (index < n) { const int64_t sliceIdx = index / halfLastDim; @@ -42,9 +76,69 @@ Tensor siluMulOpCudaImpl(const Tensor& self) { const int64_t lastDim = self.size(-1); const int64_t halfLastDim = lastDim / 2; const int64_t n = ret.numel(); - auto params = cuda::getKernelLaunchParams(self.device().index, n); - CUDA_LAUNCH_KERNEL(kSiluMul, params, retPtr, selfPtr, halfLastDim, n); + const int64_t numRows = n / halfLastDim; + + constexpr int kVecBytes = 16; // int4 + constexpr int kVecElements = kVecBytes / static_cast(sizeof(CudaT)); + + const bool useVec = (halfLastDim % kVecElements == 0); + + if (useVec) { + const int d = static_cast(halfLastDim); + dim3 grid(static_cast(numRows)); + dim3 block(std::min(d / kVecElements, 1024)); + auto stream = cuda::getCurrentCUDAStream(self.device().index).stream(); + kSiluMulVec<<>>(retPtr, selfPtr, d); + CUDA_KERNEL_CHECK(); + } else { + auto params = cuda::getKernelLaunchParams(self.device().index, n); + CUDA_LAUNCH_KERNEL(kSiluMulScalar, params, retPtr, selfPtr, halfLastDim, n); + } + return ret; } +template +__global__ void kFusedAddRMSNorm(T* __restrict__ input, T* __restrict__ residual, const T* __restrict__ weight, + int64_t dim, float eps) { + const auto row = blockIdx.x; + const auto tid = threadIdx.x; + const auto base = row * dim; + + // add residual + accumulate sum‑of‑squares + float sumSq = 0.f; + for (auto i = tid; i < dim; i += blockDim.x) { + float r = static_cast(input[base + i]) + static_cast(residual[base + i]); + residual[base + i] = static_cast(r); + sumSq += r * r; + } + + sumSq = cudaBlockReduce(sumSq, 0.f); + float invRms = cuda::rsqrt(sumSq / static_cast(dim) + eps); + + // normalize + affine + for (auto i = tid; i < dim; i += blockDim.x) { + auto r = static_cast(residual[base + i]); + input[base + i] = static_cast(r * invRms * static_cast(weight[i])); + } +} + +template +void fusedAddRmsNormOpCudaImpl(Tensor& input, Tensor& residual, const Tensor& weight, float eps) { + ASSERT(input.shape() == residual.shape()); + int64_t dim = input.size(-1); + int64_t numRows = input.numel() / dim; + + using CudaT = typename cuda::CudaTypeCast::type; + CudaT* inputPtr = input.dataPtr(); + CudaT* residualPtr = residual.dataPtr(); + const CudaT* weightPtr = weight.dataPtr(); + + auto stream = cuda::getCurrentCUDAStream(input.device().index).stream(); + dim3 blockSize(std::clamp(nextPow2(dim), 32u, 1024u)); + dim3 gridSize(numRows); + kFusedAddRMSNorm<<>>(inputPtr, residualPtr, weightPtr, dim, eps); + CUDA_KERNEL_CHECK(); +} + } // namespace tinytorch::op diff --git a/src/Operation/OpNNLayer.h b/src/Operation/OpNNLayer.h index 5f71fe4..5e5da04 100644 --- a/src/Operation/OpNNLayer.h +++ b/src/Operation/OpNNLayer.h @@ -8,25 +8,6 @@ #include "Tensor/Dispatch.h" -namespace tinytorch { - -struct RopeScalingConfig { - float factor; - float highFreqFactor; - float lowFreqFactor; - int64_t originalContextLength; - - RopeScalingConfig(float f, float hf, float lf, int64_t len) - : factor(f), highFreqFactor(hf), lowFreqFactor(lf), originalContextLength(len) {} -}; - -enum class QKVLayout { - BHSD, // [batch, numHead, seqLen, headDim] - BSHD // [batch, seqLen, numHead, headDim] -}; - -} // namespace tinytorch - namespace tinytorch::op { enum class SoftmaxType : uint8_t { @@ -59,12 +40,6 @@ using LayerNormOpFn = Tensor (*)(const Tensor& self, IntArrayView normalizedShap using RMSNormOpFn = Tensor (*)(const Tensor& self, IntArrayView normalizedShape, const Tensor& weight, float eps); -using RopeInitOpFn = Tensor (*)(int64_t headDim, int64_t contextLength, float thetaBase, - std::optional scaling, Options options); -using RopeApplyOpFn = Tensor (*)(const Tensor& input, const Tensor& rope, int64_t offset, QKVLayout layout); - -using FlashAttentionOpFn = Tensor (*)(const Tensor& query, const Tensor& key, const Tensor& value, bool isCausal); - // softmax DEFINE_OP(softmax, SoftmaxOpFn); DEFINE_OP(softmaxOut, SoftmaxOpOutFn); @@ -85,13 +60,6 @@ DEFINE_OP(layerNorm, LayerNormOpFn); // rmsNorm DEFINE_OP(rmsNorm, RMSNormOpFn); -// rope -DEFINE_OP(ropeInit, RopeInitOpFn); -DEFINE_OP(ropeApply, RopeApplyOpFn); - -// flashAttention -DEFINE_OP(flashAttention, FlashAttentionOpFn); - void registerNNLayerCpu(); STATIC_CALL(registerNNLayerCpu); diff --git a/src/Operation/OpNNLayerCpu.cpp b/src/Operation/OpNNLayerCpu.cpp index af3eaf1..afd69e8 100644 --- a/src/Operation/OpNNLayerCpu.cpp +++ b/src/Operation/OpNNLayerCpu.cpp @@ -33,10 +33,6 @@ void registerNNLayerCpu() { // rmsNorm REG_NN_LAYER_CPU_FLT(rmsNorm, rmsNormOpCpuImpl); - - // rope - REG_NN_LAYER_CPU_FLT(ropeInit, ropeInitOpCpuImpl); - REG_NN_LAYER_CPU_FLT(ropeApply, ropeApplyOpCpuImpl); } } // namespace tinytorch::op diff --git a/src/Operation/OpNNLayerCpu.h b/src/Operation/OpNNLayerCpu.h index e8c1b34..2835a0c 100644 --- a/src/Operation/OpNNLayerCpu.h +++ b/src/Operation/OpNNLayerCpu.h @@ -247,130 +247,4 @@ Tensor rmsNormOpCpuImpl(const Tensor& self, IntArrayView normalizedShape, const return normOpCpuImplDetail(self, normalizedShape, weight, {}, eps); } -template -Tensor ropeInitOpCpuImpl(int64_t headDim, int64_t contextLength, float thetaBase, - std::optional scaling, Options options) { - ASSERT(!options.requiresGrad_); - ASSERT(options.device_.type == DeviceType::CPU); - - ASSERT(headDim % 2 == 0); - int64_t halfDim = headDim >> 1; - - // inverse frequency - Tensor invFreq({halfDim}, options); - auto* invFreqPtr = invFreq.dataPtr(); - for (int64_t i = 0; i < halfDim; i++) { - invFreqPtr[i] = 1.f / std::pow(thetaBase, static_cast(i << 1) / static_cast(headDim)); - } - - // apply scaling if needed - if (scaling.has_value()) { - auto originCtxLen = static_cast(scaling->originalContextLength); - auto lowWaveLen = originCtxLen / scaling->lowFreqFactor; - auto highWaveLen = originCtxLen / scaling->highFreqFactor; - for (int64_t i = 0; i < halfDim; i++) { - auto waveLen = 2.f * static_cast(M_PI) / invFreqPtr[i]; - if (waveLen > lowWaveLen) { - invFreqPtr[i] /= scaling->factor; - } else if (waveLen < highWaveLen) { - // do nothing - } else { - auto smoothFactor = - (originCtxLen / waveLen - scaling->lowFreqFactor) / (scaling->highFreqFactor - scaling->lowFreqFactor); - auto scaled = invFreqPtr[i] / scaling->factor; - invFreqPtr[i] = (1.f - smoothFactor) * scaled + smoothFactor * invFreqPtr[i]; - } - } - } - - // Shape: [contextLength, headDim, 2] where last dim is [cos, sin] - // Memory layout: [cos0, sin0, cos1, sin1, cos2, sin2, ...] - Tensor rope({contextLength, headDim, 2}, options); - auto* ropePtr = rope.dataPtr(); - - for (int64_t pos = 0; pos < contextLength; pos++) { - for (int64_t i = 0; i < halfDim; i++) { - float angle = static_cast(pos) * invFreqPtr[i]; - float cosVal = std::cos(angle); - float sinVal = std::sin(angle); - - int64_t offset1 = (pos * headDim + i) * 2; - int64_t offset2 = (pos * headDim + halfDim + i) * 2; - - ropePtr[offset1] = cosVal; - ropePtr[offset1 + 1] = sinVal; - ropePtr[offset2] = cosVal; - ropePtr[offset2 + 1] = sinVal; - } - } - - return rope; -} - -template -Tensor ropeApplyOpCpuImpl(const Tensor& input, const Tensor& rope, int64_t offset, QKVLayout layout) { - const auto& shape = input.shape(); - ASSERT(shape.size() == 4); - - int64_t batch, numHead, seqLen, headDim; - int64_t strideB, strideH, strideT; - - if (layout == QKVLayout::BHSD) { - // [B, N, S, D] - batch = shape[0]; - numHead = shape[1]; - seqLen = shape[2]; - headDim = shape[3]; - - strideT = headDim; - strideH = seqLen * headDim; - strideB = numHead * seqLen * headDim; - } else { - // [B, S, N, D] - batch = shape[0]; - seqLen = shape[1]; - numHead = shape[2]; - headDim = shape[3]; - - strideH = headDim; - strideT = numHead * headDim; - strideB = seqLen * numHead * headDim; - } - - ASSERT(headDim % 2 == 0); - const int64_t halfDim = headDim >> 1; - - const auto* inputPtr = input.dataPtr(); - const auto* ropePtr = rope.dataPtr(); - - Tensor out(shape, input.options().noGrad()); - auto* outPtr = out.dataPtr(); - - for (int64_t b = 0; b < batch; b++) { - for (int64_t h = 0; h < numHead; h++) { - for (int64_t t = 0; t < seqLen; t++) { - const int64_t base = b * strideB + h * strideH + t * strideT; - const T* xPtr = inputPtr + base; - T* yPtr = outPtr + base; - - int64_t posIndex = offset + t; - const T* ropeRow = ropePtr + posIndex * headDim * 2; - - for (int64_t i = 0; i < halfDim; i++) { - float x1 = xPtr[i]; - float x2 = xPtr[halfDim + i]; - - int64_t idx = i * 2; - float c = ropeRow[idx]; // cos - float s = ropeRow[idx + 1]; // sin - - yPtr[i] = x1 * c - x2 * s; - yPtr[halfDim + i] = x2 * c + x1 * s; - } - } - } - } - return out; -} - } // namespace tinytorch::op diff --git a/src/Operation/OpNNLayerCuda.cu b/src/Operation/OpNNLayerCuda.cu index 523edbc..3ef5834 100644 --- a/src/Operation/OpNNLayerCuda.cu +++ b/src/Operation/OpNNLayerCuda.cu @@ -4,51 +4,10 @@ * */ -#include "flash_attn/flash_api.cuh" #include "OpNNLayerCuda.cuh" namespace tinytorch::op { -template -Tensor flashAttentionOpCudaImpl(const Tensor& query, const Tensor& key, const Tensor& value, bool isCausal) { - const auto& qShape = query.shape(); // [batch, seqLenQ, numHeadsQ, headDim] - const auto& kShape = key.shape(); // [batch, seqLenKV, numHeadsKV, headDim] - ASSERT(qShape.size() == 4); - ASSERT(kShape.size() == 4); - - auto batch = qShape[0]; - auto numHeadsQ = qShape[2]; - auto numHeadsKV = kShape[2]; - auto headDim = qShape[3]; - - ASSERT(numHeadsQ % numHeadsKV == 0); // GQA - - auto seqLenQ = query.size(1); - auto seqLenKV = key.size(1); - - using CudaT = typename cuda::CudaTypeCast::type; - - Tensor out(qShape, query.options().noGrad()); - auto* outPtr = out.dataPtr(); - - const auto* qPtr = query.dataPtr(); - const auto* kPtr = key.dataPtr(); - const auto* vPtr = value.dataPtr(); - auto* oPtr = out.dataPtr(); - - auto stream = cuda::getCurrentCUDAStream(query.device().index).stream(); - tfa::flashAttn(qPtr, kPtr, vPtr, oPtr, batch, seqLenQ, seqLenKV, numHeadsQ, numHeadsKV, headDim, isCausal, - stream); - CUDA_KERNEL_CHECK(); - return out; -} - -#define INSTANTIATE_FLASHATTEN_OP(T) \ - template Tensor flashAttentionOpCudaImpl(const Tensor& query, const Tensor& key, const Tensor& value, \ - bool isCausal); -FOR_FLT_TYPES(INSTANTIATE_FLASHATTEN_OP) -#undef INSTANTIATE_FLASHATTEN_OP - #define REG_NN_LAYER_CUDA_FLT(NAME, FUNC) \ REGISTER_OP_IMPL(NAME, CUDA, Float32, &(FUNC>)) \ REGISTER_OP_IMPL(NAME, CUDA, Float16, &(FUNC>)) \ @@ -74,13 +33,6 @@ void registerNNLayerCuda() { // rmsNorm REG_NN_LAYER_CUDA_FLT(rmsNorm, rmsNormOpCudaImpl); - - // rope - REG_NN_LAYER_CUDA_FLT(ropeInit, ropeInitOpCudaImpl); - REG_NN_LAYER_CUDA_FLT(ropeApply, ropeApplyOpCudaImpl); - - // flashAttention - REG_NN_LAYER_CUDA_FLT(flashAttention, flashAttentionOpCudaImpl); } } // namespace tinytorch::op diff --git a/src/Operation/OpNNLayerCuda.cuh b/src/Operation/OpNNLayerCuda.cuh index b1ef1d7..a817ec9 100644 --- a/src/Operation/OpNNLayerCuda.cuh +++ b/src/Operation/OpNNLayerCuda.cuh @@ -356,89 +356,6 @@ __global__ void kNormLarge(T* out, const T* input, const T* weight, const T* bia } } -template -__global__ void kRopeComputeInvFreq(T* invFreqPtr, int64_t halfDim, float thetaBase) { - const auto idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < halfDim) { - invFreqPtr[idx] = - static_cast(1.f / cuda::pow(thetaBase, static_cast(idx << 1) / static_cast(halfDim << 1))); - } -} - -template -__global__ void kRopeApplyScaling(T* invFreqPtr, int64_t halfDim, float originalContextLength, float lowFreqFactor, - float highFreqFactor, float scalingFactor) { - const auto idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < halfDim) { - auto invFreq = static_cast(invFreqPtr[idx]); - auto waveLen = 2.f * static_cast(M_PI) / invFreq; - auto lowWaveLen = originalContextLength / lowFreqFactor; - auto highWaveLen = originalContextLength / highFreqFactor; - - if (waveLen > lowWaveLen) { - invFreqPtr[idx] /= static_cast(scalingFactor); - } else if (waveLen < highWaveLen) { - // do nothing - } else { - auto smoothFactor = (originalContextLength / waveLen - lowFreqFactor) / (highFreqFactor - lowFreqFactor); - auto scaled = invFreq / scalingFactor; - invFreqPtr[idx] = static_cast((1.f - smoothFactor) * scaled + smoothFactor * invFreq); - } - } -} - -template -__global__ void kRopePrecomputeCosSin(const T* invFreqPtr, T* ropePtr, int64_t contextLength, int64_t headDim) { - const int64_t pos = blockIdx.x; - if (pos < contextLength) { - auto halfDim = headDim >> 1; - for (auto i = threadIdx.x; i < halfDim; i += blockDim.x) { - float angle = static_cast(pos) * static_cast(invFreqPtr[i]); - - T cosVal = static_cast(::cosf(angle)); - T sinVal = static_cast(::sinf(angle)); - - int64_t offset1 = (pos * headDim + i) * 2; - int64_t offset2 = (pos * headDim + halfDim + i) * 2; - - ropePtr[offset1] = cosVal; - ropePtr[offset1 + 1] = sinVal; - ropePtr[offset2] = cosVal; - ropePtr[offset2 + 1] = sinVal; - } - } -} - -template -__global__ void kRopeApply(const T* input, const float* rope, T* output, int64_t batch, int64_t numHead, int64_t seqLen, - int64_t headDim, int64_t strideB, int64_t strideH, int64_t strideT, int64_t offset) { - const int64_t b = blockIdx.x / numHead; - const int64_t h = blockIdx.x % numHead; - const int64_t t = blockIdx.y * blockDim.x + threadIdx.x; - - if (b < batch && t < seqLen) { - const int64_t base = b * strideB + h * strideH + t * strideT; - const T* xPtr = input + base; - T* yPtr = output + base; - - const int64_t posIndex = offset + t; - const float* ropeRow = rope + posIndex * headDim * 2; - - const int64_t halfDim = headDim >> 1; - for (int64_t i = 0; i < halfDim; i++) { - auto x1 = static_cast(xPtr[i]); - auto x2 = static_cast(xPtr[halfDim + i]); - - int64_t idx = i * 2; - float c = ropeRow[idx]; // cos - float s = ropeRow[idx + 1]; // sin - - yPtr[i] = static_cast(x1 * c - x2 * s); - yPtr[halfDim + i] = static_cast(x2 * c + x1 * s); - } - } -} - template void softmaxForwardCudaImpl(Tensor& out, const Tensor& self, int64_t dim) { ASSERT(out.shape() == self.shape()); @@ -618,96 +535,4 @@ Tensor rmsNormOpCudaImpl(const Tensor& self, IntArrayView normalizedShape, const return normOpCudaImplDetail(self, normalizedShape, weight, {}, eps); } -template -Tensor ropeInitOpCudaImpl(int64_t headDim, int64_t contextLength, float thetaBase, - std::optional scaling, Options options) { - ASSERT(!options.requiresGrad_); - ASSERT(options.device_.type == DeviceType::CUDA); - options.dtype_ = DType::Float32; - - ASSERT(headDim % 2 == 0); - int64_t halfDim = headDim >> 1; - - // inverse frequency - Tensor invFreq({halfDim}, options); - auto* invFreqPtr = invFreq.dataPtr(); - - auto params = cuda::getKernelLaunchParams(options.device_.index, halfDim); - CUDA_LAUNCH_KERNEL(kRopeComputeInvFreq, params, invFreqPtr, halfDim, thetaBase); - - // apply scaling if needed - if (scaling.has_value()) { - CUDA_LAUNCH_KERNEL(kRopeApplyScaling, params, invFreqPtr, halfDim, - static_cast(scaling->originalContextLength), scaling->lowFreqFactor, - scaling->highFreqFactor, scaling->factor); - } - - // Shape: [contextLength, headDim, 2] where last dim is [cos, sin] - // Memory layout: [cos0, sin0, cos1, sin1, cos2, sin2, ...] - Tensor rope({contextLength, headDim, 2}, options); - auto* ropePtr = rope.dataPtr(); - - auto blockSize = cuda::getKernelBlockSize(options.device_.index); - auto stream = cuda::getCurrentCUDAStream(options.device_.index).stream(); - kRopePrecomputeCosSin<<>>(invFreqPtr, ropePtr, contextLength, headDim); - CUDA_KERNEL_CHECK(); - - return rope; -} - -template -Tensor ropeApplyOpCudaImpl(const Tensor& input, const Tensor& rope, int64_t offset, QKVLayout layout) { - const auto& shape = input.shape(); - ASSERT(shape.size() == 4); - - int64_t batch, numHead, seqLen, headDim; - int64_t strideB, strideH, strideT; - - if (layout == QKVLayout::BHSD) { - // [B, N, S, D] - batch = shape[0]; - numHead = shape[1]; - seqLen = shape[2]; - headDim = shape[3]; - - strideT = headDim; - strideH = seqLen * headDim; - strideB = numHead * seqLen * headDim; - } else { - // [B, S, N, D] - batch = shape[0]; - seqLen = shape[1]; - numHead = shape[2]; - headDim = shape[3]; - - strideH = headDim; - strideT = numHead * headDim; - strideB = seqLen * numHead * headDim; - } - - ASSERT(headDim % 2 == 0); - - using CudaT = typename cuda::CudaTypeCast::type; - const auto* inputPtr = input.dataPtr(); - const auto* ropePtr = rope.dataPtr(); - - Tensor out(shape, input.options().noGrad()); - auto* outPtr = out.dataPtr(); - - const auto blockSize = cuda::getKernelBlockSize(input.device().index); - const auto blocksPerSeq = cuda::getKernelGridSize(blockSize, seqLen); - - dim3 gridDim(batch * numHead, blocksPerSeq); - dim3 blockDim(blockSize); - - auto stream = cuda::getCurrentCUDAStream(input.device().index).stream(); - kRopeApply<<>>(inputPtr, ropePtr, outPtr, batch, numHead, seqLen, headDim, - strideB, strideH, strideT, offset); - CUDA_KERNEL_CHECK(); - return out; -} - -template -Tensor flashAttentionOpCudaImpl(const Tensor& query, const Tensor& key, const Tensor& value, bool isCausal); - } // namespace tinytorch::op diff --git a/src/Operation/OpTransform.cpp b/src/Operation/OpTransform.cpp index b97ea9c..95ac019 100644 --- a/src/Operation/OpTransform.cpp +++ b/src/Operation/OpTransform.cpp @@ -152,13 +152,28 @@ std::vector splitOpImplDetail(const Tensor& self, IntArrayView sectionSi const auto numSections = sectionSizes.size(); std::vector retTensors(numSections); - if (dim == 0) { // share storage + bool outerAllOne = (dim == 0); + if (!outerAllOne) { + outerAllOne = true; + for (int64_t i = 0; i < dim; i++) { + if (self.shape(i) != 1) { + outerAllOne = false; + break; + } + } + } + + if (outerAllOne) { // share storage + int64_t innerSize = 1; + for (int64_t i = dim + 1; i < self.dim(); i++) { + innerSize *= self.shape(i); + } int64_t startIdx = 0; for (size_t i = 0; i < numSections; i++) { SizeVector retShape(self.shape()); - retShape[0] = sectionSizes[i]; + retShape[dim] = sectionSizes[i]; - int64_t storageOffset = self.storageOffset() + startIdx * self.stride(0); + int64_t storageOffset = self.storageOffset() + startIdx * innerSize; retTensors[i] = Tensor(retShape, self.options().noGrad(), self.storage(), storageOffset); startIdx += sectionSizes[i]; @@ -416,8 +431,20 @@ Tensor narrowOpImpl(const Tensor& self, int64_t dim, int64_t start, int64_t leng SizeVector retShape(self.shape()); retShape[dim] = length; - if (dim == 0) { // share storage - int64_t newOffset = self.storageOffset() + start * self.stride(0); + bool outerAllOne = true; + for (int64_t i = 0; i < dim; i++) { + if (self.shape(i) != 1) { + outerAllOne = false; + break; + } + } + + if (dim == 0 || outerAllOne) { // share storage + int64_t innerSize = 1; + for (int64_t i = dim + 1; i < self.dim(); i++) { + innerSize *= self.shape(i); + } + int64_t newOffset = self.storageOffset() + start * innerSize; return {retShape, self.options().noGrad(), self.storage(), newOffset}; } diff --git a/src/Tensor/Allocator.cpp b/src/Tensor/Allocator.cpp index 41fc12a..2728a81 100644 --- a/src/Tensor/Allocator.cpp +++ b/src/Tensor/Allocator.cpp @@ -26,7 +26,7 @@ CPUPinnedAllocator::~CPUPinnedAllocator() { void* CPUPinnedAllocator::allocate(int64_t nbytes) { void* ptr = nullptr; - CUDA_CHECK(cudaHostAlloc(&ptr, nbytes, cudaHostAllocDefault)); + CUDA_CHECK(cudaHostAlloc(&ptr, static_cast(nbytes), cudaHostAllocDefault)); #ifndef NDEBUG if (ptr) { allocatedPtrs_.insert(ptr); @@ -60,7 +60,7 @@ CUDAAllocator::~CUDAAllocator() { void* CUDAAllocator::allocate(int64_t nbytes) { void* ptr = nullptr; cuda::CudaDeviceGuard guard(deviceIndex_); - CUDA_CHECK(cudaMalloc(&ptr, nbytes)); + CUDA_CHECK(cudaMalloc(&ptr, static_cast(nbytes))); #ifndef NDEBUG if (ptr) { allocatedPtrs_.insert(ptr); @@ -91,33 +91,20 @@ Allocator* getAllocator(const Options& options) { if (options.device_.isCpu()) { if (options.pinnedMemory_) { #ifdef USE_CUDA - static CachedAllocator cachedPinnedAllocator(std::make_unique()); - return &cachedPinnedAllocator; + // leak-on-exit + static auto* cachedPinnedAllocator = new CachedAllocator(std::make_unique()); + return cachedPinnedAllocator; #else ASSERT(false && "cuda not support"); return nullptr; #endif } else { - static CachedAllocator cachedCpuAllocator(std::make_unique>()); - return &cachedCpuAllocator; + static auto* cachedCpuAllocator = new CachedAllocator(std::make_unique>()); + return cachedCpuAllocator; } } else if (options.device_.isCuda()) { #ifdef USE_CUDA - auto deviceCount = cuda::getDeviceCount(); - static std::vector deviceAllocators; - static std::once_flag flag; - std::call_once(flag, [&]() { - deviceAllocators.reserve(deviceCount); - for (auto i = 0; i < deviceCount; i++) { - deviceAllocators.emplace_back(std::make_unique(i)); - } - }); - auto deviceIndex = options.device_.index; - if (deviceIndex < 0 || static_cast(deviceIndex) >= deviceAllocators.size()) { - LOGE("getAllocator error: Invalid CUDA device index %d", deviceIndex); - return nullptr; - } - return &deviceAllocators[deviceIndex]; + return getCUDACachedAllocator(options.device_.index); #else ASSERT(false && "cuda not support"); return nullptr; @@ -127,4 +114,28 @@ Allocator* getAllocator(const Options& options) { return nullptr; } +CachedAllocator* getCUDACachedAllocator(int device) { +#ifdef USE_CUDA + auto deviceCount = cuda::getDeviceCount(); + // leak-on-exit + static std::vector* deviceAllocators = nullptr; + static std::once_flag flag; + std::call_once(flag, [&]() { + deviceAllocators = new std::vector(); + deviceAllocators->reserve(deviceCount); + for (auto i = 0; i < deviceCount; i++) { + deviceAllocators->push_back(new CachedAllocator(std::make_unique(i))); + } + }); + if (device < 0 || static_cast(device) >= deviceAllocators->size()) { + LOGE("getCUDACachedAllocator error: Invalid CUDA device index %d", device); + return nullptr; + } + return (*deviceAllocators)[device]; +#else + (void)device; + return nullptr; +#endif +} + } // namespace tinytorch diff --git a/src/Tensor/Allocator.h b/src/Tensor/Allocator.h index 2c385f0..fc8a6cc 100644 --- a/src/Tensor/Allocator.h +++ b/src/Tensor/Allocator.h @@ -8,7 +8,6 @@ #include -#include "Device.h" #include "Utils/Macros.h" #include "ankerl/unordered_dense.h" @@ -49,13 +48,14 @@ template void* CPUAllocator::allocate(int64_t nbytes) { void* ptr = nullptr; if (aligned) { + size_t alignedSize = (static_cast(nbytes) + alignment_ - 1) & ~(alignment_ - 1); #if !defined(_MSC_VER) - ptr = std::aligned_alloc(alignment_, nbytes); + ptr = std::aligned_alloc(alignment_, alignedSize); #else - ptr = _aligned_malloc(nbytes, alignment_); + ptr = _aligned_malloc(alignedSize, alignment_); #endif } else { - ptr = std::malloc(nbytes); + ptr = std::malloc(static_cast(nbytes)); } #ifndef NDEBUG @@ -105,13 +105,13 @@ class CPUPinnedAllocator : public Allocator { class CUDAAllocator : public Allocator { public: - explicit CUDAAllocator(DeviceIndex index = 0) : deviceIndex_(index) {} + explicit CUDAAllocator(int index = 0) : deviceIndex_(index) {} ~CUDAAllocator() override; void* allocate(int64_t nbytes) override; void deallocate(void* ptr) override; private: - DeviceIndex deviceIndex_; + int deviceIndex_; #ifndef NDEBUG ankerl::unordered_dense::set allocatedPtrs_; #endif @@ -119,6 +119,9 @@ class CUDAAllocator : public Allocator { #endif struct Options; +class CachedAllocator; + Allocator* getAllocator(const Options& options); +CachedAllocator* getCUDACachedAllocator(int device); } // namespace tinytorch diff --git a/src/Tensor/CachedAllocator.cpp b/src/Tensor/CachedAllocator.cpp index 6baa2c3..8a4d76d 100644 --- a/src/Tensor/CachedAllocator.cpp +++ b/src/Tensor/CachedAllocator.cpp @@ -7,14 +7,15 @@ #include "CachedAllocator.h" #include -#include #include +#include #include "Utils/Logger.h" namespace tinytorch { -bool CachedAllocator::cacheEnabled_ = true; +std::atomic CachedAllocator::cacheEnabled_{true}; +std::atomic CachedAllocator::nextPoolId_{0}; // Ref: // https://github.com/pytorch/pytorch/blob/main/c10/cuda/CUDACachingAllocator.cpp @@ -70,14 +71,25 @@ struct AllocParams { AllocParams(size_t size, BlockPool* pool, size_t allocSize) : searchKey(size), pool(pool), allocSize(allocSize), retBlock(nullptr) {} +}; + +struct PoolState { + BlockPool largeBlocks{BlockComparator, false}; + BlockPool smallBlocks{BlockComparator, true}; + int refCount{0}; + + PoolState() = default; + + PoolState(PoolState&&) noexcept = default; + PoolState& operator=(PoolState&&) noexcept = delete; - size_t size() const { return searchKey.size; } + PoolState(const PoolState&) = delete; + PoolState& operator=(const PoolState&) = delete; }; class CachedAllocatorImpl : public Allocator { public: - explicit CachedAllocatorImpl(Allocator* base) - : base_(base), totalAllocatedSize_(0), largeBlocks(BlockComparator, false), smallBlocks(BlockComparator, true) {} + explicit CachedAllocatorImpl(Allocator* base) : base_(base), totalAllocatedSize_(0), activePoolId_(-1) {} static size_t roundSize(size_t size) { if (size < kMinBlockSize) { @@ -99,7 +111,7 @@ class CachedAllocatorImpl : public Allocator { static bool getFreeBlock(AllocParams& p) { BlockPool& pool = *p.pool; - // set-container search, return minium satisfied value. + // set-container search, return minimum satisfied value. const auto it = pool.blocks.lower_bound(&p.searchKey); if (it == pool.blocks.end()) { return false; @@ -109,16 +121,16 @@ class CachedAllocatorImpl : public Allocator { return true; } - BlockPool& getPool(size_t size) { + static BlockPool& getPool(size_t size, PoolState& poolState) { if (size <= kSmallSize) { - return smallBlocks; + return poolState.smallBlocks; } - return largeBlocks; + return poolState.largeBlocks; } bool allocBlock(AllocParams& p) { size_t size = p.allocSize; - void* ptr = base_->allocate(size); + void* ptr = base_->allocate(static_cast(size)); if (!ptr) { return false; } @@ -136,9 +148,9 @@ class CachedAllocatorImpl : public Allocator { return remaining > kSmallSize; } - void releaseCachedBlocks() { - releaseBlocks(largeBlocks); - releaseBlocks(smallBlocks); + void releaseCachedBlocks(PoolState& poolState) { + releaseBlocks(poolState.largeBlocks); + releaseBlocks(poolState.smallBlocks); } void releaseBlock(Block* block) { @@ -150,7 +162,7 @@ class CachedAllocatorImpl : public Allocator { } void releaseBlocks(BlockPool& pool) { - // Frees all non-split blocks + // frees all non-split blocks auto it = pool.blocks.begin(); while (it != pool.blocks.end()) { Block* block = *it; @@ -161,9 +173,23 @@ class CachedAllocatorImpl : public Allocator { } } - Block* mallocImpl(size_t origSize) { + void releaseAllBlocks(BlockPool& pool) { + auto it = pool.blocks.begin(); + while (it != pool.blocks.end()) { + Block* block = *it; + ++it; + if (!block->prev) { + base_->deallocate(block->ptr); + totalAllocatedSize_ -= block->size; + } + pool.blocks.erase(block); + delete block; + } + } + + Block* mallocImpl(size_t origSize, PoolState& poolState) { size_t size = roundSize(origSize); - auto& pool = getPool(size); + auto& pool = getPool(size, poolState); const size_t allocSize = getAllocationSize(size); AllocParams params(size, &pool, allocSize); @@ -172,13 +198,21 @@ class CachedAllocatorImpl : public Allocator { if (!blockFound) { blockFound = allocBlock(params); if (!blockFound) { - // retry after release caches - releaseCachedBlocks(); + // retry after release caches from the same pool + releaseCachedBlocks(poolState); blockFound = allocBlock(params); } if (!blockFound) { - LOGE("Out of memory. failed to allocate size: %lld", allocSize); + // last resort: try releasing default pool caches + if (activePoolId_ >= 0) { + releaseCachedBlocks(defaultPool_); + blockFound = allocBlock(params); + } + } + + if (!blockFound) { + LOGE("Out of memory. failed to allocate size: %zu", allocSize); return nullptr; } } @@ -206,18 +240,13 @@ class CachedAllocatorImpl : public Allocator { return block; } - static void freeImpl(Block* block) { - block->allocated = false; - freeBlock(block); - } - static void freeBlock(Block* block) { ASSERT(!block->allocated); auto& pool = *block->pool; const std::array mergeCandidates = {block->prev, block->next}; for (Block* candidate : mergeCandidates) { - tryMergeBlocks(block, candidate, pool); + (void)tryMergeBlocks(block, candidate, pool); } pool.blocks.insert(block); } @@ -251,43 +280,165 @@ class CachedAllocatorImpl : public Allocator { } void* allocate(int64_t nbytes) override { - Block* block = mallocImpl(nbytes); + auto& poolState = getActivePoolState(); + Block* block = mallocImpl(static_cast(nbytes), poolState); if (block) { - activeBlocks[block->ptr] = block; + activeBlocks_[block->ptr] = block; return block->ptr; } - LOGE("allocate error, size: %lld", nbytes); + LOGE("allocate error, size: %lld", static_cast(nbytes)); return nullptr; } void deallocate(void* ptr) override { - auto it = activeBlocks.find(ptr); - if (it != activeBlocks.end()) { - freeImpl(it->second); - activeBlocks.erase(it); + auto it = activeBlocks_.find(ptr); + if (it != activeBlocks_.end()) { + Block* block = it->second; + activeBlocks_.erase(it); + + block->allocated = false; + freeBlock(block); + + if (!CachedAllocator::isCacheEnabled()) { + if (!block->isSplit()) { + releaseBlock(block); + } + } } else { LOGE("deallocate error, ptr not valid: %p", ptr); } } + void beginAllocateToPool(int poolId) { + ASSERT(activePoolId_ < 0 && "Nested pool allocation not supported"); + activePoolId_ = poolId; + // lazy-create pool and increment ref count + auto it = graphPools_.find(poolId); + if (it == graphPools_.end()) { + graphPools_.emplace(poolId, PoolState{}); + it = graphPools_.find(poolId); + } + it->second.refCount++; + } + + void endAllocateToPool() { + ASSERT(activePoolId_ >= 0 && "endAllocateToPool called without matching begin"); + auto it = graphPools_.find(activePoolId_); + if (it != graphPools_.end()) { + it->second.refCount--; + } + activePoolId_ = -1; + } + + void freePool(int poolId) { + auto it = graphPools_.find(poolId); + if (it == graphPools_.end()) { + return; + } + + // do not free a pool that is still actively referenced + if (it->second.refCount > 0) { + LOGE("freePool warning: pool %d still has %d active references, forcing release", poolId, it->second.refCount); + } + + // release all blocks in the pool back to the base allocator. + releaseAllBlocks(it->second.largeBlocks); + releaseAllBlocks(it->second.smallBlocks); + graphPools_.erase(it); + } + + int activePoolId() const { return activePoolId_; } + ~CachedAllocatorImpl() override { - releaseCachedBlocks(); - ASSERT(activeBlocks.empty()); - ASSERT(largeBlocks.blocks.empty()); - ASSERT(smallBlocks.blocks.empty()); + // release all graph pools + for (auto& [id, pool] : graphPools_) { + releaseAllBlocks(pool.largeBlocks); + releaseAllBlocks(pool.smallBlocks); + } + graphPools_.clear(); + + // release any remaining active blocks + for (auto& [ptr, block] : activeBlocks_) { + base_->deallocate(block->ptr); + delete block; + } + activeBlocks_.clear(); + + // release default pool cached blocks + releaseAllBlocks(defaultPool_.largeBlocks); + releaseAllBlocks(defaultPool_.smallBlocks); } private: + PoolState& getActivePoolState() { + if (activePoolId_ < 0) { + return defaultPool_; + } + return graphPools_[activePoolId_]; + } + Allocator* base_; uint64_t totalAllocatedSize_; - BlockPool largeBlocks; - BlockPool smallBlocks; - ankerl::unordered_dense::map activeBlocks; + PoolState defaultPool_; + + std::unordered_map graphPools_; + int activePoolId_; // -1 = default pool + + ankerl::unordered_dense::map activeBlocks_; }; CachedAllocator::CachedAllocator(std::unique_ptr base) : base_(std::move(base)), impl_(std::make_unique(base_.get())) {} +CachedAllocator::~CachedAllocator() = default; + +CachedAllocator::CachedAllocator(CachedAllocator&& other) noexcept + : base_(std::move(other.base_)), impl_(std::move(other.impl_)) {} + +CachedAllocator& CachedAllocator::operator=(CachedAllocator&& other) noexcept { + if (this != &other) { + impl_ = std::move(other.impl_); + base_ = std::move(other.base_); + } + return *this; +} + +void* CachedAllocator::allocate(int64_t nbytes) { + std::lock_guard lock(mutex_); + return impl_->allocate(nbytes); +} + +void CachedAllocator::deallocate(void* ptr) { + std::lock_guard lock(mutex_); + impl_->deallocate(ptr); +} + +void CachedAllocator::beginAllocateToPool(int poolId) { + std::lock_guard lock(mutex_); + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-static-cast-downcast) + static_cast(impl_.get())->beginAllocateToPool(poolId); +} + +void CachedAllocator::endAllocateToPool() { + std::lock_guard lock(mutex_); + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-static-cast-downcast) + static_cast(impl_.get())->endAllocateToPool(); +} + +void CachedAllocator::freePool(int poolId) { + std::lock_guard lock(mutex_); + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-static-cast-downcast) + static_cast(impl_.get())->freePool(poolId); +} + +int CachedAllocator::activePoolId() const { + std::lock_guard lock(mutex_); + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-static-cast-downcast) + return static_cast(impl_.get())->activePoolId(); +} + +int CachedAllocator::newPoolId() { return nextPoolId_.fetch_add(1, std::memory_order_relaxed); } + } // namespace tinytorch diff --git a/src/Tensor/CachedAllocator.h b/src/Tensor/CachedAllocator.h index 63bf375..e5c5719 100644 --- a/src/Tensor/CachedAllocator.h +++ b/src/Tensor/CachedAllocator.h @@ -6,6 +6,9 @@ #pragma once +#include +#include + #include "Allocator.h" namespace tinytorch { @@ -13,28 +16,35 @@ namespace tinytorch { class CachedAllocator : public Allocator { public: explicit CachedAllocator(std::unique_ptr base); + ~CachedAllocator() override; + + CachedAllocator(CachedAllocator&&) noexcept; + CachedAllocator& operator=(CachedAllocator&&) noexcept; + + CachedAllocator(const CachedAllocator&) = delete; + CachedAllocator& operator=(const CachedAllocator&) = delete; + + static void setCacheEnabled(bool enabled) { cacheEnabled_.store(enabled, std::memory_order_release); } + static bool isCacheEnabled() { return cacheEnabled_.load(std::memory_order_acquire); } + + void* allocate(int64_t nbytes) override; + void deallocate(void* ptr) override; + + void beginAllocateToPool(int poolId); + void endAllocateToPool(); - static void setCacheEnabled(bool enabled) { cacheEnabled_ = enabled; } + void freePool(int poolId); - void* allocate(int64_t nbytes) override { - if (!cacheEnabled_) { - return base_->allocate(nbytes); - } - return impl_->allocate(nbytes); - } + int activePoolId() const; - void deallocate(void* ptr) override { - if (!cacheEnabled_) { - base_->deallocate(ptr); - return; - } - impl_->deallocate(ptr); - } + static int newPoolId(); private: - static bool cacheEnabled_; + static std::atomic cacheEnabled_; + static std::atomic nextPoolId_; std::unique_ptr base_; std::unique_ptr impl_; + mutable std::recursive_mutex mutex_; }; } // namespace tinytorch diff --git a/src/Tensor/Device.h b/src/Tensor/Device.h index 664d7d4..227b06f 100644 --- a/src/Tensor/Device.h +++ b/src/Tensor/Device.h @@ -39,6 +39,7 @@ struct Device { Device(DeviceType t, DeviceIndex i = 0) : type(t), index(i) {} bool operator==(const Device& other) const { return type == other.type && index == other.index; } + bool operator!=(const Device& other) const { return type != other.type || index != other.index; } bool isCpu() const { return type == DeviceType::CPU; } bool isCuda() const { return type == DeviceType::CUDA; } diff --git a/src/Utils/CUDAGraph.cpp b/src/Utils/CUDAGraph.cpp new file mode 100644 index 0000000..e4c7678 --- /dev/null +++ b/src/Utils/CUDAGraph.cpp @@ -0,0 +1,36 @@ +/* + * TinyTorch + * @author : keith@robot9.me + * + */ + +#include "CUDAGraph.h" + +#ifdef USE_CUDA + +#include "Tensor/Allocator.h" +#include "Tensor/CachedAllocator.h" + +namespace tinytorch::cuda { + +CUDAGraphCaptureGuard::CUDAGraphCaptureGuard(CUDAGraph& graph, CUDAStream& stream, int poolId) + : graph_(graph), stream_(stream), poolId_(poolId) { + CachedAllocator* allocator = getCUDACachedAllocator(stream.deviceIdx()); + ASSERT(allocator != nullptr && "Failed to get CUDA CachedAllocator"); + allocator->beginAllocateToPool(poolId_); + + graph_.beginCapture(stream_); +} + +CUDAGraphCaptureGuard::~CUDAGraphCaptureGuard() { + CachedAllocator* allocator = getCUDACachedAllocator(stream_.deviceIdx()); + if (allocator) { + allocator->endAllocateToPool(); + } + + graph_.endCapture(stream_); +} + +} // namespace tinytorch::cuda + +#endif // USE_CUDA diff --git a/src/Utils/CUDAGraph.h b/src/Utils/CUDAGraph.h new file mode 100644 index 0000000..cb37a75 --- /dev/null +++ b/src/Utils/CUDAGraph.h @@ -0,0 +1,103 @@ +/* + * TinyTorch + * @author : keith@robot9.me + * + */ + +#pragma once + +#ifdef USE_CUDA + +#include "CUDAUtils.h" + +namespace tinytorch::cuda { + +class CUDAGraph { + public: + CUDAGraph() = default; + ~CUDAGraph() { reset(); } + + CUDAGraph(CUDAGraph&& other) noexcept : graph_(other.graph_), graphExec_(other.graphExec_) { + other.graph_ = nullptr; + other.graphExec_ = nullptr; + } + + CUDAGraph& operator=(CUDAGraph&& other) noexcept { + if (this != &other) { + reset(); + graph_ = other.graph_; + graphExec_ = other.graphExec_; + other.graph_ = nullptr; + other.graphExec_ = nullptr; + } + return *this; + } + + CUDAGraph(const CUDAGraph&) = delete; + CUDAGraph& operator=(const CUDAGraph&) = delete; + + void beginCapture(CUDAStream& stream, cudaStreamCaptureMode mode = cudaStreamCaptureModeGlobal) { + if (graph_ || graphExec_) { + // auto-reset to avoid resource leaks in release mode + reset(); + } + ASSERT(stream.valid() && "Stream must be valid for capture"); + CUDA_CHECK(cudaStreamBeginCapture(stream.stream(), mode)); + } + + void endCapture(CUDAStream& stream) { + ASSERT(!graph_ && !graphExec_ && "Graph already instantiated"); + CUDA_CHECK(cudaStreamEndCapture(stream.stream(), &graph_)); + ASSERT(graph_ != nullptr && "cudaStreamEndCapture returned null graph"); + + // instantiate: creates the executable graph from the captured graph +#if CUDART_VERSION >= 12000 + CUDA_CHECK(cudaGraphInstantiate(&graphExec_, graph_, 0ULL)); +#else + CUDA_CHECK(cudaGraphInstantiate(&graphExec_, graph_, nullptr, nullptr, 0)); +#endif + ASSERT(graphExec_ != nullptr); + } + + void replay(CUDAStream& stream) { + ASSERT(graphExec_ != nullptr && "No graph captured; call beginCapture/endCapture first"); + CUDA_CHECK(cudaGraphLaunch(graphExec_, stream.stream())); + } + + void reset() { + if (graphExec_) { + CUDA_CHECK(cudaGraphExecDestroy(graphExec_)); + graphExec_ = nullptr; + } + if (graph_) { + CUDA_CHECK(cudaGraphDestroy(graph_)); + graph_ = nullptr; + } + } + + bool valid() const { return graphExec_ != nullptr; } + + private: + cudaGraph_t graph_ = nullptr; + cudaGraphExec_t graphExec_ = nullptr; +}; + +class CUDAGraphCaptureGuard { + public: + CUDAGraphCaptureGuard(CUDAGraph& graph, CUDAStream& stream, int poolId); + ~CUDAGraphCaptureGuard(); + + CUDAGraphCaptureGuard(const CUDAGraphCaptureGuard&) = delete; + CUDAGraphCaptureGuard& operator=(const CUDAGraphCaptureGuard&) = delete; + CUDAGraphCaptureGuard(CUDAGraphCaptureGuard&&) = delete; + CUDAGraphCaptureGuard& operator=(CUDAGraphCaptureGuard&&) = delete; + + private: + CUDAGraph& graph_; + CUDAStream& stream_; + int poolId_; +}; + +} // namespace tinytorch::cuda + +#endif // USE_CUDA diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 94c4c14..92b2489 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -19,6 +19,7 @@ add_executable(${PROJECT_NAME} test_function.cpp test_module.cpp test_optimizer.cpp + test_allocator.cpp ) target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src @@ -28,6 +29,12 @@ target_include_directories(${PROJECT_NAME} PRIVATE ) target_link_libraries(${PROJECT_NAME} TinyTorch_lib gtest_main) +# cuda support +find_package(CUDAToolkit QUIET) +if (CUDAToolkit_FOUND) + target_compile_definitions(${PROJECT_NAME} PRIVATE USE_CUDA) +endif () + include(GoogleTest) # add tests diff --git a/test/test_allocator.cpp b/test/test_allocator.cpp new file mode 100644 index 0000000..6faf82a --- /dev/null +++ b/test/test_allocator.cpp @@ -0,0 +1,462 @@ +/* + * TinyTorch + * @author : keith@robot9.me + * + */ + +#include +#include +#include +#include + +#include "Tensor/Allocator.h" +#include "Tensor/CachedAllocator.h" +#include "TinyTorch.h" +#include "Utils/CUDAUtils.h" +#include "test.h" + +namespace tt = tinytorch; + +TEST(allocator, cpu_allocate_deallocate) { + auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu())); + ASSERT_NE(allocator, nullptr); + + // Basic allocation + void* ptr = allocator->allocate(1024); + ASSERT_NE(ptr, nullptr); + + // Write/read to verify the memory is usable + std::memset(ptr, 0xAB, 1024); + auto* bytes = static_cast(ptr); + EXPECT_EQ(bytes[0], 0xAB); + EXPECT_EQ(bytes[1023], 0xAB); + + allocator->deallocate(ptr); +} + +TEST(allocator, cpu_zero_size_allocation) { + auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu())); + ASSERT_NE(allocator, nullptr); + + // Zero-size allocation should still return something valid from the caching layer + // (rounded up to kMinBlockSize=512) + void* ptr = allocator->allocate(0); + // Implementation rounds to at least 512, so this should succeed + if (ptr) { + allocator->deallocate(ptr); + } +} + +TEST(allocator, cpu_alignment) { + auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu())); + ASSERT_NE(allocator, nullptr); + + // Allocate multiple times and verify alignment (default 32-byte alignment) + for (int i = 0; i < 10; i++) { + void* ptr = allocator->allocate(64 + i * 7); // various non-aligned sizes + ASSERT_NE(ptr, nullptr); + // The underlying memory should be 32-byte aligned (from CPUAllocator) + // but CachedAllocator may return sub-block offsets that are 512-aligned + // which is a multiple of 32. + EXPECT_EQ(reinterpret_cast(ptr) % 32, 0u) << "Allocation " << i << " not 32-byte aligned"; + allocator->deallocate(ptr); + } +} + +TEST(allocator, cpu_multiple_allocations) { + auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu())); + ASSERT_NE(allocator, nullptr); + + std::vector ptrs; + constexpr int kNumAllocs = 100; + + // Allocate many blocks + for (int i = 0; i < kNumAllocs; i++) { + void* ptr = allocator->allocate(512 * (i + 1)); + ASSERT_NE(ptr, nullptr); + ptrs.push_back(ptr); + } + + // Verify all pointers are unique + std::set uniquePtrs(ptrs.begin(), ptrs.end()); + EXPECT_EQ(uniquePtrs.size(), ptrs.size()); + + // Free all + for (auto* ptr : ptrs) { + allocator->deallocate(ptr); + } +} + +TEST(allocator, cache_reuse_same_size) { + auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu())); + ASSERT_NE(allocator, nullptr); + + // Allocate and free + void* ptr1 = allocator->allocate(1024); + ASSERT_NE(ptr1, nullptr); + allocator->deallocate(ptr1); + + // Allocate same size — should reuse the cached block + void* ptr2 = allocator->allocate(1024); + ASSERT_NE(ptr2, nullptr); + EXPECT_EQ(ptr1, ptr2) << "Expected cached block reuse for same size"; + allocator->deallocate(ptr2); +} + +TEST(allocator, cache_reuse_smaller_size) { + auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu())); + ASSERT_NE(allocator, nullptr); + + // Allocate a larger block + void* ptr1 = allocator->allocate(4096); + ASSERT_NE(ptr1, nullptr); + allocator->deallocate(ptr1); + + // Allocate a smaller size — might get the same block (or split from it) + void* ptr2 = allocator->allocate(512); + ASSERT_NE(ptr2, nullptr); + // ptr2 should be at the same base address (beginning of the split block) + EXPECT_EQ(ptr1, ptr2) << "Expected block split to reuse base address"; + allocator->deallocate(ptr2); +} + +TEST(allocator, cache_disabled_no_reuse) { + auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu())); + ASSERT_NE(allocator, nullptr); + + // Allocate with cache enabled + void* ptr1 = allocator->allocate(2048); + ASSERT_NE(ptr1, nullptr); + + // Disable cache, then free — block should be released to base allocator + tt::CachedAllocator::setCacheEnabled(false); + allocator->deallocate(ptr1); + + // Allocate same size — should NOT get the same address (no caching) + void* ptr2 = allocator->allocate(2048); + ASSERT_NE(ptr2, nullptr); + // Note: it's possible (but unlikely) to get the same address from malloc, + // so we don't assert inequality. Just ensure it works without crash. + allocator->deallocate(ptr2); + + // Re-enable cache + tt::CachedAllocator::setCacheEnabled(true); +} + +TEST(allocator, cache_toggle_correctness) { + auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu())); + ASSERT_NE(allocator, nullptr); + + // Allocate with cache enabled + void* ptr1 = allocator->allocate(1024); + ASSERT_NE(ptr1, nullptr); + + // Disable cache + tt::CachedAllocator::setCacheEnabled(false); + + // Allocate another (still tracked by impl) + void* ptr2 = allocator->allocate(1024); + ASSERT_NE(ptr2, nullptr); + + // Re-enable cache + tt::CachedAllocator::setCacheEnabled(true); + + // Deallocate both — should not crash regardless of cache state changes + allocator->deallocate(ptr1); + allocator->deallocate(ptr2); + + // Verify reuse now works again + void* ptr3 = allocator->allocate(1024); + ASSERT_NE(ptr3, nullptr); + allocator->deallocate(ptr3); +} + +TEST(allocator, pool_id_uniqueness) { + int id1 = tt::CachedAllocator::newPoolId(); + int id2 = tt::CachedAllocator::newPoolId(); + int id3 = tt::CachedAllocator::newPoolId(); + EXPECT_NE(id1, id2); + EXPECT_NE(id2, id3); + EXPECT_NE(id1, id3); +} + +TEST(allocator, concurrent_allocate_deallocate) { + auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu())); + ASSERT_NE(allocator, nullptr); + + constexpr int kNumThreads = 8; + constexpr int kAllocsPerThread = 100; + std::vector threads; + + std::atomic errors{0}; + + threads.reserve(kNumThreads); + for (int t = 0; t < kNumThreads; t++) { + threads.emplace_back([&, t]() { + std::vector ptrs; + for (int i = 0; i < kAllocsPerThread; i++) { + size_t size = 512 * static_cast((t * kAllocsPerThread + i) % 20 + 1); + void* ptr = allocator->allocate(static_cast(size)); + if (!ptr) { + errors.fetch_add(1); + continue; + } + // Touch the memory to detect corruption + std::memset(ptr, static_cast(t & 0xFF), std::min(size, size_t(64))); + ptrs.push_back(ptr); + } + // Deallocate in reverse order + for (auto it = ptrs.rbegin(); it != ptrs.rend(); ++it) { + allocator->deallocate(*it); + } + }); + } + + for (auto& th : threads) { + th.join(); + } + + EXPECT_EQ(errors.load(), 0) << "Some allocations failed under contention"; +} + +TEST(allocator, concurrent_allocate_interleaved) { + auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu())); + ASSERT_NE(allocator, nullptr); + + constexpr int kNumThreads = 4; + constexpr int kIterations = 200; + std::vector threads; + + std::atomic errors{0}; + + threads.reserve(kNumThreads); + for (int t = 0; t < kNumThreads; t++) { + threads.emplace_back([&]() { + for (int i = 0; i < kIterations; i++) { + // Allocate and immediately free — exercises cache reuse under contention + void* ptr = allocator->allocate(1024); + if (!ptr) { + errors.fetch_add(1); + continue; + } + // Small delay to increase interleaving probability + std::memset(ptr, 0, 64); + allocator->deallocate(ptr); + } + }); + } + + for (auto& th : threads) { + th.join(); + } + + EXPECT_EQ(errors.load(), 0); +} + +TEST(allocator, large_allocation) { + auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu())); + ASSERT_NE(allocator, nullptr); + + // Allocate > kMinLargeAlloc (10 MiB) — uses kRoundLarge rounding + constexpr size_t kLargeSize = 16 * 1024 * 1024; // 16 MiB + void* ptr = allocator->allocate(static_cast(kLargeSize)); + ASSERT_NE(ptr, nullptr); + + // Write to first and last byte + auto* bytes = static_cast(ptr); + bytes[0] = 0x11; + bytes[kLargeSize - 1] = 0x22; + EXPECT_EQ(bytes[0], 0x11); + EXPECT_EQ(bytes[kLargeSize - 1], 0x22); + + allocator->deallocate(ptr); +} + +TEST(allocator, mixed_sizes) { + auto* allocator = tt::getAllocator(tt::Options(tt::Device::cpu())); + ASSERT_NE(allocator, nullptr); + + // Mix of small and large allocations + std::vector> allocs; + std::vector sizes = {64, 512, 1024, 4096, 1048576, 2097152, 10485760, 20971520}; + + for (auto size : sizes) { + void* ptr = allocator->allocate(static_cast(size)); + ASSERT_NE(ptr, nullptr) << "Failed to allocate " << size << " bytes"; + allocs.emplace_back(ptr, size); + } + + // Verify all are writable + for (auto& [ptr, size] : allocs) { + std::memset(ptr, 0xCD, std::min(size, size_t(128))); + } + + // Free in random-ish order + for (int i = static_cast(allocs.size()) - 1; i >= 0; i -= 2) { + allocator->deallocate(allocs[i].first); + } + for (int i = 0; i < static_cast(allocs.size()); i += 2) { + allocator->deallocate(allocs[i].first); + } +} + +TEST(allocator, tensor_allocate_cpu) { + auto opts = tt::Options(tt::Device::cpu(), tt::DType::Float32).noGrad(); + + // Create tensor — exercises full allocator path + tt::Tensor t({100, 100}, opts); + ASSERT_NE(t.dataPtr<>(), nullptr); + + // Fill and verify + auto data = std::vector(10000, 3.14f); + tt::Tensor t2(data, {100, 100}, opts); + auto list = t2.toList(); + EXPECT_EQ(list.size(), 10000u); + EXPECT_NEAR(list[0], 3.14f, 1e-5f); + EXPECT_NEAR(list[9999], 3.14f, 1e-5f); +} + +TEST(allocator, tensor_reuse_after_destroy) { + auto opts = tt::Options(tt::Device::cpu(), tt::DType::Float32).noGrad(); + + void* firstPtr = nullptr; + { + tt::Tensor t({256}, opts); + firstPtr = t.dataPtr<>(); + ASSERT_NE(firstPtr, nullptr); + } + // Tensor destroyed, memory returned to cache + + // New tensor of same size should reuse the cached block + tt::Tensor t2({256}, opts); + void* secondPtr = t2.dataPtr<>(); + EXPECT_EQ(firstPtr, secondPtr) << "Expected cached block reuse for Tensor"; +} + +TEST(allocator, tensor_multiple_create_destroy_cycles) { + auto opts = tt::Options(tt::Device::cpu(), tt::DType::Float32).noGrad(); + + // Stress test: create and destroy many tensors + for (int cycle = 0; cycle < 50; cycle++) { + std::vector tensors; + for (int i = 0; i < 20; i++) { + tensors.push_back(tt::Tensor({static_cast(128 * (i + 1))}, opts)); + ASSERT_NE(tensors.back().dataPtr<>(), nullptr); + } + // All tensors freed when vector goes out of scope + } + // Should not crash, leak, or corrupt +} + +#ifdef USE_CUDA + +#define SKIP_IF_NO_CUDA() \ + do { \ + if (!tt::cuda::deviceAvailable()) { \ + GTEST_SKIP() << "CUDA device not available; skipping test"; \ + } \ + } while (0) + +TEST(allocator, cuda_basic_allocate_deallocate) { + SKIP_IF_NO_CUDA(); + + auto* allocator = tt::getCUDACachedAllocator(0); + ASSERT_NE(allocator, nullptr); + + void* ptr = allocator->allocate(4096); + ASSERT_NE(ptr, nullptr); + allocator->deallocate(ptr); +} + +TEST(allocator, cuda_cache_reuse) { + SKIP_IF_NO_CUDA(); + + auto* allocator = tt::getCUDACachedAllocator(0); + ASSERT_NE(allocator, nullptr); + + void* ptr1 = allocator->allocate(2048); + ASSERT_NE(ptr1, nullptr); + allocator->deallocate(ptr1); + + void* ptr2 = allocator->allocate(2048); + ASSERT_NE(ptr2, nullptr); + EXPECT_EQ(ptr1, ptr2) << "CUDA cached block should be reused"; + allocator->deallocate(ptr2); +} + +TEST(allocator, cuda_pool_begin_end) { + SKIP_IF_NO_CUDA(); + + auto* allocator = tt::getCUDACachedAllocator(0); + ASSERT_NE(allocator, nullptr); + + EXPECT_EQ(allocator->activePoolId(), -1); + + int poolId = tt::CachedAllocator::newPoolId(); + allocator->beginAllocateToPool(poolId); + EXPECT_EQ(allocator->activePoolId(), poolId); + + void* ptr = allocator->allocate(1024); + ASSERT_NE(ptr, nullptr); + + allocator->endAllocateToPool(); + EXPECT_EQ(allocator->activePoolId(), -1); + + // Free and cleanup + allocator->deallocate(ptr); + allocator->freePool(poolId); +} + +TEST(allocator, cuda_tensor_integration) { + SKIP_IF_NO_CUDA(); + + auto opts = tt::Options(tt::Device(tt::DeviceType::CUDA, 0), tt::DType::Float32).noGrad(); + + // Create tensors on GPU + std::vector data(1024, 1.5f); + tt::Tensor t(data, {32, 32}, opts); + + // Copy back and verify + auto host = t.toList(); + EXPECT_EQ(host.size(), 1024u); + EXPECT_NEAR(host[0], 1.5f, 1e-5f); + EXPECT_NEAR(host[1023], 1.5f, 1e-5f); +} + +TEST(allocator, cuda_concurrent_allocate) { + SKIP_IF_NO_CUDA(); + + auto* allocator = tt::getCUDACachedAllocator(0); + ASSERT_NE(allocator, nullptr); + + constexpr int kNumThreads = 4; + constexpr int kAllocsPerThread = 50; + std::vector threads; + std::atomic errors{0}; + + threads.reserve(kNumThreads); + for (int t = 0; t < kNumThreads; t++) { + threads.emplace_back([&]() { + std::vector ptrs; + for (int i = 0; i < kAllocsPerThread; i++) { + void* ptr = allocator->allocate(1024 * (i + 1)); + if (!ptr) { + errors.fetch_add(1); + continue; + } + ptrs.push_back(ptr); + } + for (auto* ptr : ptrs) { + allocator->deallocate(ptr); + } + }); + } + + for (auto& th : threads) { + th.join(); + } + + EXPECT_EQ(errors.load(), 0); +} + +#endif // USE_CUDA diff --git a/test/test_module.cpp b/test/test_module.cpp index 80b2425..e2a5131 100644 --- a/test/test_module.cpp +++ b/test/test_module.cpp @@ -59,33 +59,3 @@ TEST(TEST_Module, dropout) { EXPECT_TRUE((output == 0).to(DType::Float32).sum().item() > 0); } -TEST(TEST_Module, rope) { - auto rope = nn::RoPE(4, 3, 1000.f); - EXPECT_THAT(rope.cache().shape(), ElementsAre(3, 4, 2)); - EXPECT_TRUE( - VectorNear(rope.cache().toList(), - {1.0000, 0.0000, 1.0000, 0.0000, 1.0000, 0.0000, 1.0000, 0.0000, 0.5403, 0.8415, 0.9995, 0.0316, - 0.5403, 0.8415, 0.9995, 0.0316, -0.4161, 0.9093, 0.9980, 0.0632, -0.4161, 0.9093, 0.9980, 0.0632})); - - RopeScalingConfig scaling{2.f, 4.f, 1.f, 2}; - rope = nn::RoPE(4, 3, 10000.f, scaling); - EXPECT_TRUE( - VectorNear(rope.cache().toList(), - {1.0000, 0.0000, 1.0000, 0.0000, 1.0000, 0.0000, 1.0000, 0.0000, 0.8776, 0.4794, 1.0000, 0.0050, - 0.8776, 0.4794, 1.0000, 0.0050, 0.5403, 0.8415, 0.9999, 0.0100, 0.5403, 0.8415, 0.9999, 0.0100})); - - auto x = Tensor(Array1d{ - 0.0516, -0.9695, -0.0861, 1.3223, 0.5351, 0.1768, -0.0966, 0.2490, -0.8414, -0.5110, -1.1106, -0.1058, - 0.2775, 0.4491, -0.5324, 0.1249, 1.0637, -1.3959, 0.4438, 2.3408, -0.0822, 1.0439, -0.7985, 0.1040, - 0.7335, -2.2184, -0.6714, -0.3401, 1.5914, -0.4572, -0.7603, -0.6322, 1.0201, -0.5109, 0.8172, -0.5532, - -0.1150, 0.2467, -0.3851, 0.5726, -0.1837, -1.5242, 0.4929, -0.2529, 1.2423, -0.4946, -0.4185, 1.1320}); - x.reshape_({2, 2, 3, 4}); - auto y = rope(x); - EXPECT_THAT(y.shape(), ElementsAre(2, 2, 3, 4)); - EXPECT_TRUE(VectorNear( - y.toList(), - {0.0516, -0.9695, -0.0861, 1.3223, 0.5159, 0.1756, 0.1718, 0.2499, 0.4799, -0.5099, -1.3081, -0.1109, - 0.2775, 0.4491, -0.5324, 0.1249, 0.7207, -1.4076, 0.8994, 2.3338, 0.6274, 1.0428, -0.5006, 0.1145, - 0.7335, -2.2184, -0.6714, -0.3401, 1.7611, -0.4540, 0.0957, -0.6344, -0.1365, -0.5053, 1.3000, -0.5583, - -0.1150, 0.2467, -0.3851, 0.5726, -0.3975, -1.5229, 0.3445, -0.2605, 1.0234, -0.5059, 0.8192, 1.1270})); -} diff --git a/third_party/TinyFA b/third_party/TinyFA deleted file mode 160000 index 4e18516..0000000 --- a/third_party/TinyFA +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4e18516165acb029076b2ecc7d733c9ebf4d552a