diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 3aec1742ee1..6585008f169 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1354,6 +1354,7 @@ struct ggml_backend_cuda_context {
     int device;
     std::string name;
     cudaEvent_t copy_event = nullptr;
+    bool disable_mmq_stream_k_default = false;
 
     cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
     cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index de579d2ed50..423735be081 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -4719,9 +4719,14 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
 }
 
 static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) {
-    GGML_UNUSED(params);
+    const bool disable_mmq_stream_k_default = params != nullptr && strstr(params, "disable_mmq_stream_k_default=1") != nullptr;
     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-    return ggml_backend_cuda_init(ctx->device);
+    ggml_backend_t backend = ggml_backend_cuda_init(ctx->device);
+    if (backend != nullptr) {
+        ggml_backend_cuda_context * backend_ctx = (ggml_backend_cuda_context *) backend->context;
+        backend_ctx->disable_mmq_stream_k_default = disable_mmq_stream_k_default;
+    }
+    return backend;
 }
 
 static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_buffer_type(ggml_backend_dev_t dev) {
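Note: with the change above, the CUDA backend recognizes exactly one key in its init-params string; a nullptr or any other params value keeps the previous behavior. A minimal usage sketch, assuming `dev` is a CUDA device handle obtained from the ggml device registry (the helper name `init_cuda_backend_demo` is illustrative, not part of this patch):

    #include "ggml-backend.h"

    // Pass the opt-out string to disable the stream-k default for this backend
    // instance; passing nullptr keeps the default enabled.
    static ggml_backend_t init_cuda_backend_demo(ggml_backend_dev_t dev, bool disable_stream_k) {
        const char * params = disable_stream_k ? "disable_mmq_stream_k_default=1" : nullptr;
        return ggml_backend_dev_init(dev, params);
    }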
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 3f01ff5bfb0..7a0b3657273 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -118,8 +118,9 @@ void ggml_cuda_mul_mat_q(
     const int64_t s03 = src0->nb[3] / ts_src0;
     const int64_t s3  = dst->nb[3]  / ts_dst;
 
-    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
-        || GGML_CUDA_CC_IS_CDNA(cc);
+    const bool use_stream_k_default = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+        || GGML_CUDA_CC_IS_CDNA(cc))
+        && !ctx.disable_mmq_stream_k_default;
 
     // TODO: tighter pool buffer size vs q8 path
     const bool use_native_mxfp4 = blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4;
@@ -158,7 +159,7 @@ void ggml_cuda_mul_mat_q(
             ne00, ne01, ne1, s01, ne11, s1,
             ne02, ne12, s02, s12, s2,
             ne03, ne13, s03, s13, s3,
-            use_stream_k, ne1};
+            use_stream_k_default, ne1};
         ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
         return;
     }
@@ -218,7 +219,7 @@ void ggml_cuda_mul_mat_q(
         ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
         ne02, ne02, s02, s12, s2,
         ne03, ne13, s03, s13, s3,
-        use_stream_k, ne12};
+        use_stream_k_default, ne12};
 
     ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
 }
@@ -250,8 +251,9 @@ void ggml_cuda_op_mul_mat_q(
     // The stream-k decomposition is only faster for recent NVIDIA GPUs.
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
-        || GGML_CUDA_CC_IS_CDNA(cc))
+    const bool use_stream_k = (((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+        || GGML_CUDA_CC_IS_CDNA(cc))
+        && !ctx.disable_mmq_stream_k_default)
         && src1_ncols == ne11;
     const mmq_args args = {
         src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index b1a319de9be..b47362adcfe 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -3463,7 +3463,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
 
 // The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
 
-template <ggml_type type, int mmq_x, bool need_check>
+template <ggml_type type, int mmq_x, bool need_check, bool use_stream_k>
 #if defined(GGML_USE_HIP)
 #if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
     __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2)
@@ -3514,9 +3514,7 @@ static __global__ void mul_mat_q(
     }
     __syncthreads();
 
-    // On non-CDNA AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
-#if (defined(GGML_USE_HIP) && !defined(CDNA)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
-    {
+    if constexpr (!use_stream_k) {
         const int wt = blockIdx.z / nchannels_y;
         const int zt = blockIdx.z - wt*nchannels_y;
         const int jt = blockIdx.y;
@@ -3569,7 +3567,6 @@ static __global__ void mul_mat_q(
                 tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
         return;
     }
-#endif // (defined(GGML_USE_HIP) && !defined(CDNA4) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
 
     constexpr int ITER_K = get_iter_k(type);
 
@@ -3909,8 +3906,10 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 
     const int nbytes_shared = mmq_get_nbytes_shared(mmq_x, mmq_y, cc, warp_size, nwarps);
 
-    CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, false>), nbytes_shared);
-    CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, true>), nbytes_shared);
+    CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, false, false>), nbytes_shared);
+    CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, false, true>), nbytes_shared);
+    CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, true, false>), nbytes_shared);
+    CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, true, true>), nbytes_shared);
 
     const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
     const int ntx = (args.ncols_max + mmq_x - 1) / mmq_x;
@@ -3925,7 +3924,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
     if (!args.use_stream_k) {
         if (args.nrows_x % mmq_y == 0) {
            constexpr bool need_check = false;
-            mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
+            constexpr bool use_stream_k = false;
+            mul_mat_q<type, mmq_x, need_check, use_stream_k><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
                 (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
                  args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
                  channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
@@ -3933,7 +3933,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
                 args.ncols_max);
         } else {
             constexpr bool need_check = true;
-            mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
+            constexpr bool use_stream_k = false;
+            mul_mat_q<type, mmq_x, need_check, use_stream_k><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
                 (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
                  args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
                  channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
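Note: the kernel previously chose between conventional tiling and stream-k with a preprocessor guard; it now branches at compile time on the new `use_stream_k` template parameter, so the host picks the variant at launch and the untaken path is compiled out per instantiation. A self-contained sketch of this runtime-flag-to-template-parameter dispatch pattern (names are illustrative, not the actual ggml kernel):

    #include <cstdio>

    template <bool use_stream_k>
    void mul_mat_tile_demo() {
        if constexpr (!use_stream_k) {
            std::printf("conventional xy tiling\n"); // mirrors the early-return path in mul_mat_q
            return;
        }
        std::printf("stream-k decomposition\n");
    }

    // The runtime flag selects one of two instantiations; the dead branch
    // inside each instantiation generates no code.
    void launch_demo(bool use_stream_k) {
        if (use_stream_k) {
            mul_mat_tile_demo<true>();
        } else {
            mul_mat_tile_demo<false>();
        }
    }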
@@ -3954,7 +3955,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
         if (args.nrows_x % mmq_y == 0)
         {
             constexpr bool need_check = false;
-            mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
+            constexpr bool use_stream_k = true;
+            mul_mat_q<type, mmq_x, need_check, use_stream_k><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
                 (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
                  args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
                  channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
@@ -3971,7 +3973,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
                 args.ncols_max);
         } else {
             constexpr bool need_check = true;
-            mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
+            constexpr bool use_stream_k = true;
+            mul_mat_q<type, mmq_x, need_check, use_stream_k><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
                 (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
                  args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
                  channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
@@ -4110,4 +4113,3 @@ void ggml_cuda_op_mul_mat_q(
     const int64_t src1_padded_row_size, cudaStream_t stream);
 
 bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t n_experts);
-
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index ee0c29235cd..935b3ec9976 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -219,8 +219,10 @@ llama_context::llama_context(
 
     if (!hparams.vocab_only) {
         // GPU backends
+        const bool disable_mmq_stream_k_default = hparams.n_expert > 0;
+        const char * backend_params = disable_mmq_stream_k_default ? "disable_mmq_stream_k_default=1" : nullptr;
         for (const auto & dev : model.devices) {
-            ggml_backend_t backend = ggml_backend_dev_init(dev.dev, nullptr);
+            ggml_backend_t backend = ggml_backend_dev_init(dev.dev, backend_params);
             if (backend == nullptr) {
                 throw std::runtime_error(format("failed to initialize %s backend", ggml_backend_dev_name(dev.dev)));
             }
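Note: taken together, the patch gates the stream-k default on a single per-context flag that llama.cpp sets for MoE models. A condensed view of the resulting policy (assumption: this mirrors the conditions in mmq.cu and llama-context.cpp above; the free function is illustrative):

    #include <cstdint>

    // arch_ok stands for "NVIDIA with highest compiled arch >= Volta, or AMD CDNA".
    // MoE models (n_expert > 0) opt out of the stream-k default; on top of that,
    // ggml_cuda_op_mul_mat_q still requires src1_ncols == ne11 so the fixup
    // buffer is not shared across parallel CUDA streams.
    static bool mmq_stream_k_policy(bool arch_ok, int64_t n_expert, int64_t src1_ncols, int64_t ne11) {
        const bool disable_default = n_expert > 0;
        return arch_ok && !disable_default && src1_ncols == ne11;
    }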