Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -212,4 +212,6 @@ images/
**/generated/**
aphrodite/_version.py
shellcheck-stable/
ep_kernels_workspace/
ep_kernels_workspace/
third-party/
**/spv/**
123 changes: 123 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ cmake_minimum_required(VERSION 3.26)
# cmake --install . --component _C
project(aphrodite_extensions LANGUAGES CXX)

# Optional: build Aphrodite's Vulkan backend wrappers (OFF by default).
# When ON, the GPU extension build swaps the CUDA activation/layernorm/rope sources
# for Vulkan wrappers, requires `glslc` to compile the shaders, and links libvulkan
# when it can be located.
option(APHRODITE_ENABLE_VULKAN "Enable Aphrodite Vulkan backend wrappers" OFF)

# CUDA by default, can be overridden by using -DAPHRODITE_TARGET_DEVICE=... (used by setup.py)
set(APHRODITE_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for Aphrodite")

Expand Down Expand Up @@ -97,6 +100,10 @@ if (NOT APHRODITE_TARGET_DEVICE STREQUAL "cuda" AND
if (APHRODITE_TARGET_DEVICE STREQUAL "cpu")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
else()
# The Vulkan shader rules are part of the GPU extension build, which is skipped on
# this path (we return immediately below). Nothing is generated here, so tell the
# user that the option is being ignored instead of claiming shaders were built.
if(APHRODITE_ENABLE_VULKAN)
  message(WARNING
    "APHRODITE_ENABLE_VULKAN is ON, but no GPU extension is built for "
    "APHRODITE_TARGET_DEVICE='${APHRODITE_TARGET_DEVICE}'; "
    "Vulkan shaders will not be generated.")
endif()
return()
endif()
return()
Expand Down Expand Up @@ -314,6 +321,68 @@ set(APHRODITE_EXT_SRC
SRCS "${APHRODITE_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")

# If Vulkan is enabled, replace CUDA variants of activation/layernorm/rope with Vulkan wrappers
if(APHRODITE_ENABLE_VULKAN)
  list(REMOVE_ITEM APHRODITE_EXT_SRC
    "kernels/pos_encoding_kernels.cu"
    "kernels/activation_kernels.cu"
    "kernels/layernorm_kernels.cu")
  list(APPEND APHRODITE_EXT_SRC
    "kernels/vulkan/activation_vk.cpp"
    "kernels/vulkan/layernorm_vk.cpp"
    "kernels/vulkan/pos_encoding_vk.cpp"
    "kernels/vulkan/vk_dispatch.cpp"
    "kernels/vulkan/vk_runtime.cpp")

  # Compile a minimal set of Vulkan shaders to SPIR-V with glslc.
  find_program(GLSLC glslc REQUIRED)
  set(VK_SHADERS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/kernels/vulkan/shaders")
  # Emit SPIR-V into source tree for a stable path during dev/editable installs
  set(VK_SPV_DIR "${CMAKE_CURRENT_SOURCE_DIR}/kernels/vulkan/spv")
  file(MAKE_DIRECTORY "${VK_SPV_DIR}")

  # Shared GLU sources. Every GLU shader rule below depends on them so that an
  # edit to either file rebuilds all variants, not just the geglu ones.
  set(_vk_glu_common
    "${VK_SHADERS_DIR}/glu_head.comp"
    "${VK_SHADERS_DIR}/glu_main.comp")

  # aphrodite_add_vk_shader(<spv_name> <shader_file> [glslc args...])
  #
  # Adds a build rule producing ${VK_SPV_DIR}/<spv_name>.spv from
  # ${VK_SHADERS_DIR}/<shader_file>; any extra arguments (e.g. -DA_TYPE=float)
  # are forwarded to glslc. The output path is appended to VK_SPV_OUTPUTS in
  # the caller's scope.
  function(aphrodite_add_vk_shader spv_name shader_file)
    set(_spv "${VK_SPV_DIR}/${spv_name}.spv")
    add_custom_command(
      OUTPUT "${_spv}"
      # Recreate the output directory at build time too, in case it was
      # removed after configuration (e.g. by a source-tree clean).
      COMMAND "${CMAKE_COMMAND}" -E make_directory "${VK_SPV_DIR}"
      COMMAND "${GLSLC}" -fshader-stage=compute --target-env=vulkan1.2 -O
              "-I${VK_SHADERS_DIR}"
              ${ARGN}
              -o "${_spv}" "${VK_SHADERS_DIR}/${shader_file}"
      DEPENDS "${VK_SHADERS_DIR}/${shader_file}" ${_vk_glu_common}
      COMMENT "Compiling Vulkan shader ${spv_name}"
      VERBATIM)
    list(APPEND VK_SPV_OUTPUTS "${_spv}")
    set(VK_SPV_OUTPUTS "${VK_SPV_OUTPUTS}" PARENT_SCOPE)
  endfunction()

  # Manually compile a minimal set of shaders (extend as needed)
  set(VK_SPV_OUTPUTS "")
  aphrodite_add_vk_shader(swiglu_f32 swiglu.comp
    -DA_TYPE=float -DD_TYPE=float)
  aphrodite_add_vk_shader(swiglu_f16 swiglu.comp
    -DA_TYPE=float16_t -DD_TYPE=float16_t -DRTE16=1)
  aphrodite_add_vk_shader(geglu_f32 geglu.comp
    -DA_TYPE=float -DD_TYPE=float)
  aphrodite_add_vk_shader(geglu_f16 geglu.comp
    -DA_TYPE=float16_t -DD_TYPE=float16_t -DRTE16=1)

  add_custom_target(aphrodite_vk_shaders ALL DEPENDS ${VK_SPV_OUTPUTS})
endif()

# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
Expand Down Expand Up @@ -765,6 +834,7 @@ if (APHRODITE_GPU_LANG STREQUAL "HIP")
endif()

message(STATUS "Enabling C extension.")

define_gpu_extension_target(
_C
DESTINATION aphrodite
Expand All @@ -777,6 +847,59 @@ define_gpu_extension_target(
USE_SABI 3
WITH_SOABI)

# Wire the Vulkan loader and headers into the _C extension when the Vulkan
# backend is enabled. Tries find_package(Vulkan) first, then a manual probe of
# common system library locations, and finally only warns if neither works.
if(APHRODITE_ENABLE_VULKAN)
  # Optional SDK root from the environment (read at configure time). An empty
  # value is treated as unset so that we never end up probing "/lib" or
  # adding "/include" as an include directory.
  set(_vk_sdk_root "")
  if(DEFINED ENV{VULKAN_SDK} AND NOT "$ENV{VULKAN_SDK}" STREQUAL "")
    set(_vk_sdk_root "$ENV{VULKAN_SDK}")
  endif()

  # Hint CMake where to look for Vulkan
  if(NOT "${_vk_sdk_root}" STREQUAL "")
    list(APPEND CMAKE_PREFIX_PATH "${_vk_sdk_root}")
    list(APPEND CMAKE_LIBRARY_PATH "${_vk_sdk_root}/lib" "${_vk_sdk_root}/lib64")
    list(APPEND CMAKE_INCLUDE_PATH "${_vk_sdk_root}/include")
  endif()

  find_package(Vulkan QUIET)
  if(Vulkan_FOUND)
    target_link_libraries(_C PRIVATE Vulkan::Vulkan)
    get_target_property(_vk_includes Vulkan::Vulkan INTERFACE_INCLUDE_DIRECTORIES)
    if(_vk_includes)
      target_include_directories(_C PRIVATE ${_vk_includes})
    endif()
  else()
    # Fallback: manually locate libvulkan and create an imported target.
    # The native multiarch directory is checked first so that non-x86_64
    # hosts (e.g. aarch64) are covered and never pick a foreign-arch library.
    set(_vk_candidates "")
    if(CMAKE_LIBRARY_ARCHITECTURE)
      list(APPEND _vk_candidates
        "/usr/lib/${CMAKE_LIBRARY_ARCHITECTURE}/libvulkan.so")
    endif()
    list(APPEND _vk_candidates
      /usr/lib/x86_64-linux-gnu/libvulkan.so
      /usr/lib64/libvulkan.so
      /usr/lib/libvulkan.so)
    set(_vk_lib "")
    foreach(p IN LISTS _vk_candidates)
      if(EXISTS "${p}")
        set(_vk_lib "${p}")
        break()
      endif()
    endforeach()
    if(NOT "${_vk_lib}" STREQUAL "")
      set(_vk_inc /usr/include CACHE PATH "Vulkan headers path")
      if(NOT TARGET Vulkan::Vulkan)
        add_library(Vulkan::Vulkan UNKNOWN IMPORTED)
        set_target_properties(Vulkan::Vulkan PROPERTIES
          IMPORTED_LOCATION "${_vk_lib}"
          INTERFACE_INCLUDE_DIRECTORIES "${_vk_inc}")
      endif()
      target_link_libraries(_C PRIVATE Vulkan::Vulkan)
      target_include_directories(_C PRIVATE "${_vk_inc}")
      message(STATUS "Linked Vulkan via fallback: ${_vk_lib}")
    else()
      message(WARNING "Vulkan SDK not fully found; building without linking Vulkan::Vulkan. Make sure runtime has libvulkan.")
    endif()
  endif()

  # As a last resort, add common include roots so <vulkan/vulkan.h> resolves.
  # NOTE(review): this applies to every translation unit of _C, not only the
  # Vulkan ones; an earlier comment here said /usr/include should not be forced
  # globally. Confirm whether this line is still wanted.
  target_include_directories(_C PRIVATE /usr/include)
  if(NOT "${_vk_sdk_root}" STREQUAL "")
    target_include_directories(_C PRIVATE "${_vk_sdk_root}/include")
  endif()

  target_compile_definitions(_C PRIVATE APHRODITE_ENABLE_VULKAN=1)
  # Ensure SPIR-V gets built before linking the extension
  add_dependencies(_C aphrodite_vk_shaders)
  # Provide a compile-time default for the runtime to find SPIR-Vs
  target_compile_definitions(_C PRIVATE APHRODITE_VK_SPV_DIR_DEFAULT="${VK_SPV_DIR}")
endif()

# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# driver API. This causes problems when linking with earlier versions of CUDA.
Expand Down
5 changes: 5 additions & 0 deletions aphrodite/common/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,11 @@ def get_aphrodite_port() -> Optional[int]:
"VERBOSE":
lambda: bool(int(os.getenv('VERBOSE', '0'))),

# Enable Vulkan backend for selected kernels (activations, layernorm, rope)
# APHRODITE_ENABLE_VULKAN=1 to enable
"APHRODITE_ENABLE_VULKAN":
lambda: bool(int(os.getenv("APHRODITE_ENABLE_VULKAN", "0"))),

# Root directory for Aphrodite configuration files
# Defaults to `~/.config/aphrodite` unless `XDG_CONFIG_HOME` is set
# Note that this not only affects how aphrodite finds its configuration files
Expand Down
149 changes: 149 additions & 0 deletions docs/dev/vulkan_kernels.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
## Aphrodite Vulkan kernels reference (internal)


### Conventions
- Dtypes: use fp16 or fp32 variants as needed.
- Shapes use row-major indexing consistent with our CUDA kernels. All buffers are raw device buffers; strides are passed via push constants when the shader requires them.
- For fused operations that do not exist as a single shader, compose multiple dispatches in the same command buffer.

---

## LayerNorm / RMSNorm

CUDA entrypoints:
- `void rms_norm(Tensor& out, Tensor& input, Tensor& weight, double epsilon)`
- `void fused_add_rms_norm(Tensor& input, Tensor& residual, Tensor& weight, double epsilon)`

Vulkan shaders:
- RMSNorm: `rms_norm.comp` → SPIR-V name: `rms_norm_f32`/`rms_norm_f16`
- LayerNorm (mean-variance): `norm.comp` → SPIR-V name: `norm_f32`
- GroupNorm helper: `group_norm.comp` (not required here)

Bindings and parameters (rms_norm.comp):
- Includes `generic_binary_head.comp` and `types.comp`.
- local_size: 512 x 1 x 1
- Push constants via `generic_binary_head.comp` (`p`):
- `ne00` (ncols), `nb01/nb02/nb03` (strides), `param1` (epsilon)
- `get_aoffset()`, `get_boffset()`, `get_doffset()` are header helpers
- Buffers:
- binding 0: `A_TYPE data_a[]` (input)
- binding 1: `B_TYPE data_b[]` (weight) [only if `do_multiply` true]
- binding 3: `D_TYPE data_d[]` (output)
- Specialization constant `do_multiply` (const_id 1):
- false: write `scale * x`
- true: write `scale * x * weight[col]` (handles wrap when `ncols > p.ne10`)

Dispatch shape:
- Workgroups: `[nrows, nchannels, samples]`, as laid out by the Vulkan `*_head.comp` headers
- For the simple 2D case: dispatch `[num_tokens, 1, 1]` workgroups (the value the shader sees as `gl_NumWorkGroups`), each with `BLOCK_SIZE=512` threads

Wrapper mapping:
- `rms_norm`: single dispatch of `rms_norm.comp` with `do_multiply=true`, `data_b=weight`, `p.param1=epsilon`.
- `fused_add_rms_norm`: compose:
1) `add.comp`, writing into `residual` (or a temp): `residual = input + residual` (vectorized add)
2) `rms_norm.comp` with `do_multiply=true` using `weight`, `epsilon` over that sum, writing the normalized result to `input`

Notes:
- CUDA `rms_norm` writes to `out` shaped `[num_tokens, hidden]`. The Vulkan shader flattens the tensor and uses strides; map `input_stride` to `p.nb01` and the contiguous output to a linear `d_offset`.

---

## Rotary embeddings (RoPE)

CUDA entrypoints:
- `void rotary_embedding(Tensor& positions, Tensor& query, optional<Tensor> key, int64_t head_size, Tensor& cos_sin_cache, bool is_neox)`
- `void batched_rotary_embedding(Tensor& positions, Tensor& query, optional<Tensor> key, int64_t head_size, Tensor& cos_sin_cache, bool is_neox, int64_t rot_dim, Tensor& cos_sin_cache_offsets)`

Vulkan shaders:
- Shared header: `rope_head.comp` (bindings, push constants, YARN helpers)
- GPT-NeoX layout (split halves): `rope_neox.comp` → SPV: `rope_neox_f32`/`_f16`
- GPT-J/interleaved pairs: `rope_norm.comp` → SPV: `rope_norm_f32`/`_f16`
- Batched/multi sections: `rope_multi.comp` → SPV: `rope_multi_f32`/`_f16`
- Vision variant: `rope_vision.comp` (not required for LLM)

Bindings (`rope_head.comp`):
- binding 0: `A_TYPE data_a[]` (input Q/K flattened; we dispatch separately per tensor or pack)
- binding 1: `int data_pos[]` (positions per channel/token)
- binding 2: `float data_ff[]` (optional frequency factors; set `has_ff=0` if unused)
- binding 3: `D_TYPE data_d[]` (output)

Push constants struct `p` (rope_head.comp):
- `uint ncols` // columns (full hidden or pair count, depending on the shader)
- `uint n_dims` // rotational dims applied (<= hidden)
- `float freq_scale` // YARN scaling
- `uint p_delta_rows` // rows per channel (seq length per batch slice)
- `float freq_base` // unused in current rope_* kernels
- `float ext_factor` // YARN interpolation factor
- `float attn_factor` // magnitude scale factor
- `float corr_dims[2]` // YARN correction dims
- `float theta_scale` // base^(i/2) like scaler
- `uint has_ff` // 1 if `data_ff` used
- `uint ne02` // auxiliary dim for multi/vision
- `uint s1, s2` // strides to locate row/channel in `data_a`
- `int sections[4]` // per-section sizes for `rope_multi`/`rope_vision`
- `uint is_back` // 1 for backward (negate theta)

Dispatch shape:
- `rope_*` kernels use `layout(local_size_x = 1, local_size_y = 256, local_size_z = 1)` and index by `gl_GlobalInvocationID.x` as row and `gl_GlobalInvocationID.y` stepping col pairs.
- For `[num_tokens, heads, head_size]`, set:
- `p.p_delta_rows = seq_len` or rows per channel in the flattening scheme
- `p.s1` = stride between rows (e.g., head_size or head_size/2, depending on the kernel)
- `p.s2` = stride between channels (heads * stride per head)
- `p.n_dims` = `rot_dim` (<= hidden), `p.ncols` = total columns per row
- Choose shader:
- `is_neox=true` → `rope_neox.comp` (split halves)
- else → `rope_norm.comp` (interleaved pairs)
- batched LoRA offsets → `rope_multi.comp` and populate `sections[]`, `ne02`

Wrapper mapping:
- `rotary_embedding`: one dispatch over Q (and over K if passed, or pack Q/K into a larger buffer and dispatch once by setting shapes appropriately). Feed `data_pos` from `positions` view and set YARN params to no-op if not used: `freq_scale=1`, `ext_factor=0`, `attn_factor=1`, `corr_dims={0,0}`, `theta_scale=base`.
- `batched_rotary_embedding`: prefer `rope_multi.comp` to model per-section offsets; otherwise loop over tokens and dispatch `rope_norm/neox` with position offsets applied in `data_pos` and sections.

Note on cos/sin cache:
- CUDA uses a precomputed `cos_sin_cache`. The Vulkan shaders instead compute cos/sin on the fly from `positions` and the YARN params. To match the numerics of the cache-based paths, set `freq_scale=1`, `ext_factor=0`, `attn_factor=1`, and compute `theta_scale` consistently with how the cache is generated. If we must consume a cache directly, a small adapter shader would be needed.

---

## Activations (elementwise and gated)

CUDA entrypoints:
- Gated GLU-style: `silu_and_mul`, `mul_and_silu`, `gelu_and_mul`, `gelu_tanh_and_mul`, `fatrelu_and_mul`
- Elementwise: `gelu_new`, `gelu_fast`, `gelu_quick`

Vulkan shaders (elementwise):
- `silu.comp` → SPV: `silu_f32`/`_f16` (x / (1 + exp(-x)))
- `gelu.comp` → SPV: `gelu_f32`/`_f16` (tanh-style GELU)
- `gelu_erf.comp` → SPV: `gelu_erf_f32`/`_f16` (erf-style GELU)
- `gelu_quick.comp` → SPV: `gelu_quick_f32`/`_f16` (x*sigmoid(1.702x))
- Also available: `relu.comp`, `leaky_relu.comp`, `tanh.comp`, `sigmoid.comp`

Vulkan shaders (GLU gating):
- Shared infra: `glu_head.comp`, `glu_main.comp`
- SWiGLU (SiLU gate): `swiglu.comp` → SPV: `swiglu_f32`/`_f16`
- GEGLU (tanh GELU gate): `geglu.comp` → SPV: `geglu_f32`/`_f16`
- GEGLU erf: `geglu_erf.comp` → SPV: `geglu_erf_f32`/`_f16`
- GEGLU quick: `geglu_quick.comp` → SPV: `geglu_quick_f32`/`_f16`

Bindings (`glu_head.comp`):
- binding 0: `A_TYPE data_a[]` // primary input [N, 2*d] for default/swap modes
- binding 1: `A_TYPE data_b[]` // optional second input for split mode
- binding 2: `D_TYPE data_d[]` // output [N, d]

Push constants (`glu_head.comp`):
- `uint N` // number of rows (tokens)
- `uint ne00` // columns in `data_a` (2*d when default/swap)
- `uint ne20` // output width `d`
- `uint mode` // 0 default (act on first half), 1 swapped (act on second half), 2 split (a and b separate)
- `float alpha` // unused here
- `float limit` // unused here

Dispatch shape:
- local_size: 512 x 1 x 1. Global size: `[ceil_div(N*ne20, 512), 1, 1]` set via xyz decomposition used in `glu_main.comp` (`i` linearizes row,col).

Wrapper mapping:
- `silu_and_mul`: dispatch `swiglu.comp` with `mode=0`, `N=num_tokens`, `ne00=2*d`, `ne20=d`. Input packs [x, y]; output is `silu(x) * y`.
- `mul_and_silu`: same as above with `mode=1` (act on second half): `x * silu(y)`.
- `gelu_and_mul`: dispatch `geglu.comp` with `mode=0` (tanh GELU gate).
- `gelu_tanh_and_mul`: same as above (matches CUDA tanh approximation).
- Elementwise ops, launched `generic_head.comp`-style with `p.KX = d` and `N = num_tokens`: `gelu_new` (tanh-based) → `gelu.comp`; `gelu_fast` (fast tanh) → `gelu.comp` or `gelu_erf.comp` as needed; `gelu_quick` → `gelu_quick.comp`.
- `fatrelu_and_mul`: not present as GLU shader; implement via a custom variant modeled on `glu_main.comp` with op `fatrelu(a, threshold) * b`, or approximate with `relu.comp` plus custom gate. If needed later, clone `swiglu.comp` pattern.
Loading
Loading