dphnAI · AlpinDale · Aug 14, 2025 · Aug 14, 2025 · Aug 14, 2025
diff --git a/.gitignore b/.gitignore
@@ -212,4 +212,6 @@ images/
 **/generated/**
 aphrodite/_version.py
 shellcheck-stable/
-ep_kernels_workspace/
+ep_kernels_workspace/
+third-party/
+**/spv/**
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -13,6 +13,9 @@ cmake_minimum_required(VERSION 3.26)
 # cmake --install . --component _C
 project(aphrodite_extensions LANGUAGES CXX)
 
+# Optional: flag to enable Aphrodite's Vulkan backend wrappers (the build then requires glslc and links libvulkan when found)
+option(APHRODITE_ENABLE_VULKAN "Enable Aphrodite Vulkan backend wrappers" OFF)
+
 # CUDA by default, can be overridden by using -DAPHRODITE_TARGET_DEVICE=... (used by setup.py)
 set(APHRODITE_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for Aphrodite")
 
@@ -97,6 +100,10 @@ if (NOT APHRODITE_TARGET_DEVICE STREQUAL "cuda" AND
     if (APHRODITE_TARGET_DEVICE STREQUAL "cpu")
         include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
     else()
+        # The Vulkan wrappers and shader rules are configured further down, after this early return
+        if(APHRODITE_ENABLE_VULKAN)
+          message(WARNING "APHRODITE_ENABLE_VULKAN is ignored for target device '${APHRODITE_TARGET_DEVICE}': no GPU extension is built, so no Vulkan shaders are generated")
+        endif()
         return()
     endif()
     return()
@@ -314,6 +321,68 @@ set(APHRODITE_EXT_SRC
     SRCS "${APHRODITE_EXT_SRC}"
     CUDA_ARCHS "${CUDA_ARCHS}")
 
+  # If Vulkan is enabled, replace CUDA variants of activation/layernorm/rope with Vulkan wrappers
+  # and add build rules that compile the GLSL compute shaders to SPIR-V.
+  if(APHRODITE_ENABLE_VULKAN)
+    list(REMOVE_ITEM APHRODITE_EXT_SRC
+      "kernels/pos_encoding_kernels.cu"
+      "kernels/activation_kernels.cu"
+      "kernels/layernorm_kernels.cu")
+    list(APPEND APHRODITE_EXT_SRC
+      "kernels/vulkan/activation_vk.cpp"
+      "kernels/vulkan/layernorm_vk.cpp"
+      "kernels/vulkan/pos_encoding_vk.cpp"
+      "kernels/vulkan/vk_dispatch.cpp"
+      "kernels/vulkan/vk_runtime.cpp")
+
+    # Compile a minimal set of Vulkan shaders with glslc. REQUIRED makes the
+    # configure step fail right away when glslc is not installed.
+    find_program(GLSLC glslc REQUIRED)
+    set(VK_SHADERS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/kernels/vulkan/shaders")
+    # Emit SPIR-V into source tree for a stable path during dev/editable installs
+    set(VK_SPV_DIR "${CMAKE_CURRENT_SOURCE_DIR}/kernels/vulkan/spv")
+    file(MAKE_DIRECTORY "${VK_SPV_DIR}")
+
+    # Shared GLU headers. The geglu rules already tracked them; swiglu is assumed
+    # to include them as well (TODO confirm), so every variant depends on them and
+    # an edit to a shared header cannot leave stale SPIR-V behind.
+    set(_vk_glu_shared_deps
+      "${VK_SHADERS_DIR}/glu_head.comp"
+      "${VK_SHADERS_DIR}/glu_main.comp")
+
+    # Per-dtype preprocessor definitions, looked up as _vk_defs_<dtype> below.
+    set(_vk_defs_f32 -DA_TYPE=float -DD_TYPE=float)
+    set(_vk_defs_f16 -DA_TYPE=float16_t -DD_TYPE=float16_t -DRTE16=1)
+
+    # One build rule per (shader, dtype) pair; extend the two ITEMS lists as needed.
+    # Outputs are collected in the order swiglu_f32, swiglu_f16, geglu_f32, geglu_f16.
+    set(VK_SPV_OUTPUTS "")
+    foreach(_vk_op IN ITEMS swiglu geglu)
+      foreach(_vk_dtype IN ITEMS f32 f16)
+        set(_vk_spv "${VK_SPV_DIR}/${_vk_op}_${_vk_dtype}.spv")
+        add_custom_command(
+          OUTPUT "${_vk_spv}"
+          # Recreate the output directory at build time so the rule still works
+          # if the directory was removed after configuration.
+          COMMAND "${CMAKE_COMMAND}" -E make_directory "${VK_SPV_DIR}"
+          COMMAND "${GLSLC}" -fshader-stage=compute --target-env=vulkan1.2 -O
+                  "-I${VK_SHADERS_DIR}"
+                  ${_vk_defs_${_vk_dtype}}
+                  -o "${_vk_spv}" "${VK_SHADERS_DIR}/${_vk_op}.comp"
+          DEPENDS "${VK_SHADERS_DIR}/${_vk_op}.comp" ${_vk_glu_shared_deps}
+          COMMENT "Compiling Vulkan shader ${_vk_op}_${_vk_dtype}"
+          # VERBATIM keeps argument escaping identical across generators/platforms.
+          VERBATIM
+        )
+        list(APPEND VK_SPV_OUTPUTS "${_vk_spv}")
+      endforeach()
+    endforeach()
+
+    # ALL builds the shaders with the default target; the extension target can
+    # additionally depend on aphrodite_vk_shaders by name.
+    add_custom_target(aphrodite_vk_shaders ALL DEPENDS ${VK_SPV_OUTPUTS})
+  endif()
+
   # Only build Marlin kernels if we are building for at least some compatible archs.
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
@@ -765,6 +834,7 @@ if (APHRODITE_GPU_LANG STREQUAL "HIP")
 endif()
 
 message(STATUS "Enabling C extension.")
+
 define_gpu_extension_target(
   _C
   DESTINATION aphrodite
@@ -777,6 +847,59 @@ define_gpu_extension_target(
   USE_SABI 3
   WITH_SOABI)
 
+if(APHRODITE_ENABLE_VULKAN)
+  # Hint CMake where to look for Vulkan (VULKAN_SDK is read at configure time only)
+  if(DEFINED ENV{VULKAN_SDK})
+    list(APPEND CMAKE_PREFIX_PATH "$ENV{VULKAN_SDK}")
+    list(APPEND CMAKE_LIBRARY_PATH "$ENV{VULKAN_SDK}/lib" "$ENV{VULKAN_SDK}/lib64")
+    list(APPEND CMAKE_INCLUDE_PATH "$ENV{VULKAN_SDK}/include")
+  endif()
+  find_package(Vulkan QUIET)
+  if(Vulkan_FOUND)
+    # The imported target carries its include directories as usage requirements,
+    # so linking it is enough; no separate target_include_directories is needed.
+    target_link_libraries(_C PRIVATE Vulkan::Vulkan)
+  else()
+    # Fallback: manually locate libvulkan and create an imported target
+    set(_vk_lib "")
+    foreach(_vk_candidate IN ITEMS
+        /usr/lib/x86_64-linux-gnu/libvulkan.so
+        /usr/lib64/libvulkan.so
+        /usr/lib/libvulkan.so)
+      if(EXISTS "${_vk_candidate}")
+        set(_vk_lib "${_vk_candidate}")
+        break()
+      endif()
+    endforeach()
+    if(_vk_lib)
+      set(_vk_inc /usr/include CACHE PATH "Vulkan headers path")
+      add_library(Vulkan::Vulkan UNKNOWN IMPORTED)
+      set_target_properties(Vulkan::Vulkan PROPERTIES
+        IMPORTED_LOCATION "${_vk_lib}"
+        INTERFACE_INCLUDE_DIRECTORIES "${_vk_inc}")
+      target_link_libraries(_C PRIVATE Vulkan::Vulkan)
+      message(STATUS "Linked Vulkan via fallback: ${_vk_lib}")
+    else()
+      message(WARNING "Vulkan SDK not fully found; building without linking Vulkan::Vulkan. Make sure runtime has libvulkan.")
+    endif()
+  endif()
+  # Keep headers of an explicitly configured SDK reachable even when no Vulkan target was linked.
+  # /usr/include is deliberately not forced: typical toolchains already search it by default.
+  if(DEFINED ENV{VULKAN_SDK})
+    target_include_directories(_C PRIVATE "$ENV{VULKAN_SDK}/include")
+  endif()
+  target_compile_definitions(_C PRIVATE APHRODITE_ENABLE_VULKAN=1)
+  # Ensure SPIR-V gets built before linking the extension. The shader target is created
+  # in a conditional section earlier in the file, so check that it exists first.
+  if(TARGET aphrodite_vk_shaders)
+    add_dependencies(_C aphrodite_vk_shaders)
+  else()
+    message(WARNING "aphrodite_vk_shaders target is not defined; SPIR-V shaders will not be built together with _C.")
+  endif()
+  # Provide a compile-time default for the runtime to find SPIR-Vs
+  target_compile_definitions(_C PRIVATE APHRODITE_VK_SPV_DIR_DEFAULT="${VK_SPV_DIR}")
+endif()
+
 # If CUTLASS is compiled on NVCC >= 12.5, it by default uses
 # cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
 # driver API. This causes problems when linking with earlier versions of CUDA.

diff --git a/aphrodite/common/envs.py b/aphrodite/common/envs.py
@@ -254,6 +254,11 @@ def get_aphrodite_port() -> Optional[int]:
     "VERBOSE":
     lambda: bool(int(os.getenv('VERBOSE', '0'))),
 
+    # Enable Vulkan backend for selected kernels (activations, layernorm, rope)
+    # APHRODITE_ENABLE_VULKAN=1 to enable
+    "APHRODITE_ENABLE_VULKAN":
+    lambda: bool(int(os.getenv("APHRODITE_ENABLE_VULKAN", "0"))),
+
     # Root directory for Aphrodite configuration files
     # Defaults to `~/.config/aphrodite` unless `XDG_CONFIG_HOME` is set
     # Note that this not only affects how aphrodite finds its configuration files

diff --git a/docs/dev/vulkan_kernels.md b/docs/dev/vulkan_kernels.md
@@ -0,0 +1,149 @@
+## Aphrodite Vulkan kernels reference (internal)
+
+
+### Conventions
+- Dtypes: use fp16 or fp32 variants as needed.
+- Shapes use row-major indexing consistent with our CUDA kernels. All buffers are raw device buffers; strides are passed via push constants when required by the shader.
+- For fused operations not present as a single shader, compose multiple dispatches in the same command buffer.
+
+---
+
+## LayerNorm / RMSNorm
+
+CUDA entrypoints:
+- `void rms_norm(Tensor& out, Tensor& input, Tensor& weight, double epsilon)`
+- `void fused_add_rms_norm(Tensor& input, Tensor& residual, Tensor& weight, double epsilon)`
+
+Vulkan shaders:
+- RMSNorm: `rms_norm.comp` → SPIR-V name: `rms_norm_f32`/`rms_norm_f16`
+- LayerNorm (mean-variance): `norm.comp` → SPIR-V name: `norm_f32`
+- GroupNorm helper: `group_norm.comp` (not required here)
+
+Bindings and parameters (rms_norm.comp):
+- Includes `generic_binary_head.comp` and `types.comp`.
+- local_size: 512 x 1 x 1
+- Push constants via `generic_binary_head.comp` (`p`):
+  - `ne00` (ncols), `nb01/nb02/nb03` (strides), `param1` (epsilon)
+  - `get_aoffset()`, `get_boffset()`, `get_doffset()` are header helpers
+- Buffers:
+  - binding 0: `A_TYPE data_a[]`  (input)
+  - binding 1: `B_TYPE data_b[]`  (weight) [only if `do_multiply` true]
+  - binding 3: `D_TYPE data_d[]`  (output)
+- Specialization constant `do_multiply` (const_id 1):
+  - false: write `scale * x`
+  - true:  write `scale * x * weight[col]` (handles wrap when `ncols > p.ne10`)
+
+Dispatch shape:
+- Workgroups: `[nrows, nchannels, samples]`, as defined by the Vulkan head headers
+- For simple 2D: set `gl_NumWorkGroups = [num_tokens, 1, 1]`, `BLOCK_SIZE=512` threads
+
+Wrapper mapping:
+- `rms_norm`: single dispatch of `rms_norm.comp` with `do_multiply=true`, `data_b=weight`, `p.param1=epsilon`.
+- `fused_add_rms_norm`: compose:
+  1) `add.comp` into `residual` or a temp: `residual = input + residual` (vectorized add)
+  2) `rms_norm.comp` with `do_multiply=true` using `weight`, `epsilon` over the result, writing the normalized output back into `input`
+
+Notes:
+- CUDA `rms_norm` writes to `out` shaped `[num_tokens, hidden]`. The Vulkan shader flattens and uses strides; map `input_stride` to `p.nb01` and contiguous output to linear `d_offset`.
+
+---
+
+## Rotary embeddings (RoPE)
+
+CUDA entrypoints:
+- `void rotary_embedding(Tensor& positions, Tensor& query, optional<Tensor> key, int64_t head_size, Tensor& cos_sin_cache, bool is_neox)`
+- `void batched_rotary_embedding(Tensor& positions, Tensor& query, optional<Tensor> key, int64_t head_size, Tensor& cos_sin_cache, bool is_neox, int64_t rot_dim, Tensor& cos_sin_cache_offsets)`
+
+Vulkan shaders:
+- Shared header: `rope_head.comp` (bindings, push constants, YARN helpers)
+- GPT-NeoX layout (split halves): `rope_neox.comp` → SPV: `rope_neox_f32`/`_f16`
+- GPT-J/interleaved pairs: `rope_norm.comp` → SPV: `rope_norm_f32`/`_f16`
+- Batched/multi sections: `rope_multi.comp` → SPV: `rope_multi_f32`/`_f16`
+- Vision variant: `rope_vision.comp` (not required for LLM)
+
+Bindings (`rope_head.comp`):
+- binding 0: `A_TYPE data_a[]` (input Q/K flattened; we dispatch separately per tensor or pack)
+- binding 1: `int data_pos[]` (positions per channel/token)
+- binding 2: `float data_ff[]` (optional frequency factors; set `has_ff=0` if unused)
+- binding 3: `D_TYPE data_d[]` (output)
+
+Push constants struct `p` (rope_head.comp):
+- `uint ncols`          // columns (full hidden or pair count, depending on the shader)
+- `uint n_dims`         // rotational dims applied (<= hidden)
+- `float freq_scale`    // YARN scaling
+- `uint p_delta_rows`   // rows per channel (seq length per batch slice)
+- `float freq_base`     // unused in current rope_* kernels
+- `float ext_factor`    // YARN interpolation factor
+- `float attn_factor`   // magnitude scale factor
+- `float corr_dims[2]`  // YARN correction dims
+- `float theta_scale`   // scale factor of the form base^(i/2)
+- `uint has_ff`         // 1 if `data_ff` used
+- `uint ne02`           // auxiliary dim for multi/vision
+- `uint s1, s2`         // strides to locate row/channel in `data_a`
+- `int sections[4]`     // per-section sizes for `rope_multi`/`rope_vision`
+- `uint is_back`        // 1 for backward (negate theta)
+
+Dispatch shape:
+- `rope_*` kernels use `layout(local_size_x = 1, local_size_y = 256, local_size_z = 1)` and index by `gl_GlobalInvocationID.x` as row and `gl_GlobalInvocationID.y` stepping col pairs.
+- For `[num_tokens, heads, head_size]`, set:
+  - `p.p_delta_rows = seq_len` or rows per channel in the flattening scheme
+  - `p.s1` = stride between rows (e.g., head_size or head_size/2, depending on the kernel)
+  - `p.s2` = stride between channels (heads * stride per head)
+  - `p.n_dims` = `rot_dim` (<= hidden), `p.ncols` = total columns per row
+- Choose shader:
+  - `is_neox=true` → `rope_neox.comp` (split halves)
+  - else → `rope_norm.comp` (interleaved pairs)
+  - batched LoRA offsets → `rope_multi.comp` and populate `sections[]`, `ne02`
+
+Wrapper mapping:
+- `rotary_embedding`: one dispatch over Q (and over K if passed, or pack Q/K into a larger buffer and dispatch once by setting shapes appropriately). Feed `data_pos` from `positions` view and set YARN params to no-op if not used: `freq_scale=1`, `ext_factor=0`, `attn_factor=1`, `corr_dims={0,0}`, `theta_scale=base`.
+- `batched_rotary_embedding`: prefer `rope_multi.comp` to model per-section offsets; otherwise loop over tokens and dispatch `rope_norm/neox` with position offsets applied in `data_pos` and sections.
+
+Note on cos/sin cache:
+- CUDA uses a precomputed `cos_sin_cache`. The Vulkan shaders compute cos/sin from `positions` and YARN params on-the-fly. To match numerics with cache-based paths, set `freq_scale=1`, `ext_factor=0`, `attn_factor=1`, and precompute `theta_scale` consistent with cache generation. If we must consume a cache directly, a tiny adapter shader would be needed.
+
+---
+
+## Activations (elementwise and gated)
+
+CUDA entrypoints:
+- Gated GLU-style: `silu_and_mul`, `mul_and_silu`, `gelu_and_mul`, `gelu_tanh_and_mul`, `fatrelu_and_mul`
+- Elementwise: `gelu_new`, `gelu_fast`, `gelu_quick`
+
+Vulkan shaders (elementwise):
+- `silu.comp` → SPV: `silu_f32`/`_f16` (x / (1 + exp(-x)))
+- `gelu.comp` → SPV: `gelu_f32`/`_f16` (tanh-style GELU)
+- `gelu_erf.comp` → SPV: `gelu_erf_f32`/`_f16` (erf-style GELU)
+- `gelu_quick.comp` → SPV: `gelu_quick_f32`/`_f16` (x*sigmoid(1.702x))
+- Also available: `relu.comp`, `leaky_relu.comp`, `tanh.comp`, `sigmoid.comp`
+
+Vulkan shaders (GLU gating):
+- Shared infra: `glu_head.comp`, `glu_main.comp`
+- SWiGLU (SiLU gate): `swiglu.comp` → SPV: `swiglu_f32`/`_f16`
+- GEGLU (tanh GELU gate): `geglu.comp` → SPV: `geglu_f32`/`_f16`
+- GEGLU erf: `geglu_erf.comp` → SPV: `geglu_erf_f32`/`_f16`
+- GEGLU quick: `geglu_quick.comp` → SPV: `geglu_quick_f32`/`_f16`
+
+Bindings (`glu_head.comp`):
+- binding 0: `A_TYPE data_a[]`  // primary input [N, 2*d] for default/swap modes
+- binding 1: `A_TYPE data_b[]`  // optional second input for split mode
+- binding 2: `D_TYPE data_d[]`  // output [N, d]
+
+Push constants (`glu_head.comp`):
+- `uint N`     // number of rows (tokens)
+- `uint ne00`  // columns in `data_a` (2*d when default/swap)
+- `uint ne20`  // output width `d`
+- `uint mode`  // 0 default (act on first half), 1 swapped (act on second half), 2 split (a and b separate)
+- `float alpha` // unused here
+- `float limit` // unused here
+
+Dispatch shape:
+- local_size: 512 x 1 x 1. Global size: `[ceil_div(N*ne20, 512), 1, 1]` set via xyz decomposition used in `glu_main.comp` (`i` linearizes row,col).
+
+Wrapper mapping:
+- `silu_and_mul`: dispatch `swiglu.comp` with `mode=0`, `N=num_tokens`, `ne00=2*d`, `ne20=d`. Input packs [x, y]; output is `silu(x) * y`.
+- `mul_and_silu`: same as above with `mode=1` (act on second half): `x * silu(y)`.
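+- `gelu_and_mul`: dispatch `geglu.comp` with `mode=0` (tanh GELU gate). NOTE(review): if CUDA `gelu_and_mul` uses the exact erf-based GELU, `geglu_erf.comp` would be the closer numerical match — confirm against the CUDA kernel.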
+- `gelu_tanh_and_mul`: same as above (matches CUDA tanh approximation).
+- Elementwise GELUs use `generic_head.comp`-style launchers with `p.KX = d` and `N = num_tokens`: `gelu_new` (elementwise tanh-based) → `gelu.comp`; `gelu_fast` (elementwise tanh fast) → `gelu.comp` or `gelu_erf.comp` as needed; `gelu_quick` → `gelu_quick.comp`.
+- `fatrelu_and_mul`: not present as GLU shader; implement via a custom variant modeled on `glu_main.comp` with op `fatrelu(a, threshold) * b`, or approximate with `relu.comp` plus custom gate. If needed later, clone `swiglu.comp` pattern.