20 changes: 16 additions & 4 deletions ggml/src/ggml-sycl/CMakeLists.txt
@@ -135,7 +135,11 @@ endif()

 if (GGML_SYCL_TARGET STREQUAL "INTEL")
     add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
-    target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
+    if (NOT GGML_SYCL_DEVICE_ARCH)
+        target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
+    else()
+        message(STATUS "Skipping -ze-intel-greater-than-4GB-buffer-required for spir64_gen AOT")
+    endif()
 
     # Link against Intel oneMKL
     if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
@@ -160,7 +164,15 @@ if (GGML_SYCL_HOST_MEM_FALLBACK)
endif()

 if (GGML_SYCL_DEVICE_ARCH)
-    target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
-    target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
+    message(STATUS "GGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} (AOT via spir64_gen)")
+    target_compile_options(
+        ggml-sycl PRIVATE
+        -fsycl-targets=spir64_gen
+        "SHELL:-Xsycl-target-backend=spir64_gen \"-device ${GGML_SYCL_DEVICE_ARCH}\""
+    )
+    target_link_options(
+        ggml-sycl PRIVATE
+        -fsycl-targets=spir64_gen
+        "SHELL:-Xsycl-target-backend=spir64_gen \"-device ${GGML_SYCL_DEVICE_ARCH}\""
+    )
endif()
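
Note: GGML_SYCL_WARP_SIZE=16 above has to be a sub-group size the target device actually supports, since the mmq.cpp kernels below now pin it with reqd_sub_group_size. A minimal standalone sketch (not part of this PR, assumes a oneAPI DPC++ toolchain) to list the sizes a device exposes:

// check_subgroups.cpp -- build with: icpx -fsycl check_subgroups.cpp
#include <sycl/sycl.hpp>
#include <iostream>

int main() {
    sycl::queue q{sycl::gpu_selector_v};
    auto dev = q.get_device();
    std::cout << "device: " << dev.get_info<sycl::info::device::name>() << '\n';
    // SYCL 2020: the set of values that reqd_sub_group_size may legally request.
    for (std::size_t s : dev.get_info<sycl::info::device::sub_group_sizes>()) {
        std::cout << "supported sub-group size: " << s << '\n';
    }
    return 0;
}

Intel Xe GPUs typically report 8/16/32 here, so 16 is a safe pin for the INTEL target.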

11 changes: 8 additions & 3 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -2178,6 +2178,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
 #endif
     if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) &&
         row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+        // NOTE: Fused dequant+GEMM and MMQ/DPAS were both attempted (Steps 10-11
+        // in optimization-workbook.md) but are slower than dequant+oneDNN.
         ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
         if (src0->type != GGML_TYPE_F16) {
             scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
@@ -3261,9 +3263,12 @@ enum class mul_mat_algo {
};

inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
-    // TODO: accuracy issues in MMQ
-    GGML_UNUSED(type);
-    return false;
+    // DPAS INT8 MMQ kernel exists in mmq.cpp but is slower than dequant+oneDNN.
+    // Disabled pending further optimization. See optimization-workbook.md Step 11.
+    switch (type) {
+        default:
+            return false;
+    }
}

inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
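
The type-only switch keeps the per-type dispatch scaffold in place: if the DPAS path later beats dequant+oneDNN for some quantization, re-enabling it is a matter of adding cases. A hypothetical sketch (not in this PR) of that future state:

// Hypothetical future state: re-enable MMQ only for types where the DPAS
// kernel has been benchmarked as faster than dequant+oneDNN on the target.
#include "ggml.h"

inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:   // example candidates only -- measure first
        case GGML_TYPE_Q8_0:
            return true;
        default:
            return false;
    }
}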
45 changes: 25 additions & 20 deletions ggml/src/ggml-sycl/mmq.cpp
@@ -13,6 +13,11 @@
#include "mmq.hpp"
#include "vecdotq.hpp"

+// Note: MMQ tile layout assumes WARP_SIZE >= 32 (QI4_K = QI5_K = QI6_K = 32).
+// Intel targets set WARP_SIZE=16 (native subgroup size), which makes MMQ dp4a
+// kernels non-functional. MMQ dispatch is disabled in ggml_sycl_supports_mmq().
+// See optimization-workbook.md Step 11 for DPAS upgrade attempt and results.
+
typedef void (*allocate_tiles_sycl_t)(
int** x_ql,
sycl::half2** x_dm,
@@ -1831,7 +1836,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q4_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -1866,7 +1871,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q4_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -1946,7 +1951,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q4_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -1981,7 +1986,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q4_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2061,7 +2066,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q5_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2096,7 +2101,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q5_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2176,7 +2181,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q5_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2211,7 +2216,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q5_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2291,7 +2296,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q8_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2326,7 +2331,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q8_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2408,7 +2413,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q2_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2446,7 +2451,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q2_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2533,7 +2538,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q3_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2574,7 +2579,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q3_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2659,7 +2664,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q4_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2697,7 +2702,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q4_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2780,7 +2785,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q5_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2818,7 +2823,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q5_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2901,7 +2906,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q6_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2939,7 +2944,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,

cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
-[=](sycl::nd_item<3> item_ct1) {
+[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_q6_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
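
All twenty launch sites get the same one-line change: [[intel::reqd_sub_group_size(WARP_SIZE)]] forces the compiler to build each kernel with exactly WARP_SIZE lanes per sub-group instead of letting it pick a width the tile math was not written for. A minimal standalone illustration (not from this PR; assumes DPC++ and a device that supports size 16, and note the SYCL 2020 spelling is [[sycl::reqd_sub_group_size(N)]]):

// subgroup_pin.cpp -- build with: icpx -fsycl subgroup_pin.cpp
#include <sycl/sycl.hpp>
#include <iostream>

constexpr int WARP_SIZE = 16; // matches GGML_SYCL_WARP_SIZE on Intel targets

int main() {
    sycl::queue q{sycl::gpu_selector_v};
    int observed = 0;
    {
        sycl::buffer<int, 1> buf{&observed, 1};
        q.submit([&](sycl::handler &cgh) {
            sycl::accessor out{buf, cgh, sycl::write_only};
            cgh.parallel_for(
                sycl::nd_range<1>{32, 32},
                [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                    // Every sub-group now has exactly WARP_SIZE work-items, so
                    // indexing keyed to WARP_SIZE behaves as the kernel assumes.
                    if (it.get_global_linear_id() == 0) {
                        out[0] = (int) it.get_sub_group().get_max_local_range()[0];
                    }
                });
        });
    } // buffer destructor waits and copies back to `observed`
    std::cout << "sub-group size: " << observed << '\n'; // expect 16
    return 0;
}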