27 commits
4623fe7
Fix CoreML backend batched evaluation bug
ChinChangYang Dec 31, 2025
6a67e80
Add Core ML backend support to CMake configuration
ChinChangYang Dec 31, 2025
1b1d246
Add CoreML backend entry to .gitignore
ChinChangYang Dec 31, 2025
7e4ef7b
Fix CoreML backend MLMultiArray stride handling bug
ChinChangYang Dec 31, 2025
2c77de1
Add KataGoCoreML-swift.h to .gitignore
ChinChangYang Dec 31, 2025
927af3b
Fix CoreML backend pass policy output name mismatch
ChinChangYang Dec 31, 2025
402e8d9
Add configurable FP16/FP32 precision to CoreML backend
ChinChangYang Jan 1, 2026
65c27b1
Replace Python CoreML converter with native katagocoreml library
ChinChangYang Jan 5, 2026
e6fd901
Add hybrid CoreML + MPSGraph backend for improved throughput
ChinChangYang Jan 5, 2026
8f24d1a
Optimize MPSGraph mask operations when requireExactNNLen is true
ChinChangYang Jan 5, 2026
270651b
Remove unused maskSize variable in HybridComputeHandle
ChinChangYang Jan 5, 2026
31a36a3
Add CoreML backend build instructions to Compiling.md
ChinChangYang Jan 5, 2026
f952519
Add CoreML backend CI job to GitHub Actions workflow
ChinChangYang Jan 6, 2026
bb69d0d
Simplify CoreML model loading with dynamic batch size support
ChinChangYang Jan 20, 2026
e914062
Add FP32 GPU-only mode using MPSGraph to bypass CoreML converter
ChinChangYang Jan 20, 2026
20ab597
Use MPSGraph-only mode when batch size is too small for hybrid split
ChinChangYang Jan 21, 2026
dad0dac
Improve ThroughputTracker for selfplay workloads
ChinChangYang Jan 21, 2026
46eb2c5
Unify CoreML and Metal into single Metal backend
ChinChangYang Jan 24, 2026
fc050b6
Replace hybrid batch splitting with per-thread GPU/ANE multiplexer
ChinChangYang Feb 23, 2026
ce6eb9b
Add gpuIdx validation, FP32+ANE warning, and fix startup message
ChinChangYang Feb 23, 2026
1f8daa8
Fix outdated comments and add metalUseFP16 config documentation
ChinChangYang Feb 23, 2026
8cfeae5
Include gpuIdx in ComputeHandle error message for easier diagnosis
ChinChangYang Feb 23, 2026
76283d9
Replace bool arithmetic with explicit XOR logic in ComputeHandle check
ChinChangYang Feb 23, 2026
18c4642
Refactor MPSGraph inference to use explicit command buffer
ChinChangYang Feb 26, 2026
60760f1
Remove dead USE_COREML_BACKEND ifdef and fix gitignore comment
ChinChangYang Feb 27, 2026
308ad19
Adjusted the parameter alignment in the getMetalOutput function
ChinChangYang Feb 27, 2026
ea9c81f
Remove COREML backend condition
ChinChangYang Feb 27, 2026
50 changes: 50 additions & 0 deletions .github/workflows/build.yml
@@ -97,6 +97,56 @@ jobs:
        name: katago-macos-opencl
        path: cpp/katago

  build-macos-metal:
    runs-on: macos-latest
    permissions:
      contents: read

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          brew install ninja zlib libzip
          brew tap chinchangyang/katagocoreml-cpp
          brew install katagocoreml

      - name: Cache CMake build
        uses: actions/cache@v4
        with:
          path: |
            cpp/CMakeCache.txt
            cpp/CMakeFiles
            cpp/build.ninja
            cpp/.ninja_deps
            cpp/.ninja_log
          key: ${{ runner.os }}-cmake-metal-${{ hashFiles('**/CMakeLists.txt') }}
          restore-keys: |
            ${{ runner.os }}-cmake-metal-

      - name: Configure CMake
        working-directory: cpp
        run: |
          cmake . -G Ninja -DUSE_BACKEND=METAL -DCMAKE_BUILD_TYPE=Release

      - name: Build
        working-directory: cpp
        run: |
          ninja

      - name: Run tests
        working-directory: cpp
        run: |
          ./katago runtests

      - name: Upload artifact
        if: github.event_name == 'push' && github.ref == 'refs/heads/master'
        uses: actions/upload-artifact@v4
        with:
          name: katago-macos-metal
          path: cpp/katago

  build-windows:
    runs-on: windows-latest
    permissions:
7 changes: 6 additions & 1 deletion Compiling.md
@@ -118,8 +118,12 @@ As also mentioned in the instructions below but repeated here for visibility
* If using OpenCL, you will want to verify that KataGo is picking up the correct device (e.g. some systems may have both an Intel CPU OpenCL and GPU OpenCL, if KataGo appears to pick the wrong one, you can correct this by specifying `openclGpuToUse` in `configs/gtp_example.cfg`).

## MacOS
* TLDR (Metal backend - recommended for most users; hybrid CPU+GPU+Neural Engine for maximum throughput):
```
# First, install the katagocoreml library via Homebrew
brew tap chinchangyang/katagocoreml-cpp
brew install katagocoreml

git clone https://github.com/lightvector/KataGo.git
cd KataGo/cpp
# If you get missing library errors, install the appropriate packages using your system package manager and try again.
@@ -132,6 +136,7 @@
* CMake with a minimum version of 3.18.2: `brew install cmake`.
* AppleClang and Swift compilers: `xcode-select --install`.
* If using the Metal backend, [Ninja](https://ninja-build.org): `brew install ninja`
* If using the Metal backend, katagocoreml library: `brew tap chinchangyang/katagocoreml-cpp && brew install katagocoreml`
* libzip: `brew install libzip`.
* If you want to do self-play training and research, probably Google perftools `brew install gperftools` for TCMalloc or some other better malloc implementation. For unknown reasons, the allocation pattern in self-play with large numbers of threads and parallel games causes a lot of memory fragmentation under glibc malloc that will eventually run your machine out of memory, but better mallocs handle it fine.
* If compiling to contribute to public distributed training runs, OpenSSL is required (`brew install openssl`).
21 changes: 14 additions & 7 deletions cpp/CMakeLists.txt
@@ -32,7 +32,7 @@ endif()
set(BUILD_DISTRIBUTED 0 CACHE BOOL "Build with http support for contributing to distributed training")
set(USE_BACKEND CACHE STRING "Neural net backend")
string(TOUPPER "${USE_BACKEND}" USE_BACKEND)
set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN METAL)

set(USE_TCMALLOC 0 CACHE BOOL "Use TCMalloc")
set(NO_GIT_REVISION 0 CACHE BOOL "Disable embedding the git revision into the compiled exe")
@@ -97,7 +97,7 @@ elseif(USE_BACKEND STREQUAL "TENSORRT")
message(FATAL_ERROR "Combining USE_CACHE_TENSORRT_PLAN with BUILD_DISTRIBUTED is not supported - it would consume excessive disk space and might worsen performance every time models are updated. Use only one at a time in a given build of KataGo.")
endif()
elseif(USE_BACKEND STREQUAL "METAL")
message(STATUS "-DUSE_BACKEND=METAL, using Metal backend with hybrid MPSGraph + CoreML execution.")
if(NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja")
message(FATAL_ERROR "Bidirectional C++ Interop requires Ninja generator. Have ${CMAKE_GENERATOR}")
endif()
@@ -107,6 +107,8 @@ elseif(USE_BACKEND STREQUAL "METAL")
if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
message(FATAL_ERROR "Project requires building with AppleClang. Have ${CMAKE_CXX_COMPILER_ID}")
endif()
find_package(PkgConfig REQUIRED)
pkg_check_modules(KATAGOCOREML REQUIRED katagocoreml)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/external/macos/cmake/modules")
include(InitializeSwift)
include(AddSwift)
@@ -115,11 +117,11 @@ elseif(USE_BACKEND STREQUAL "METAL")
neuralnet/metalbackend.cpp
)
add_library(KataGoSwift STATIC
  neuralnet/metalbackend.swift
  neuralnet/metallayers.swift)
_swift_generate_cxx_header(
  KataGoSwift
  "${CMAKE_CURRENT_BINARY_DIR}/include/KataGoSwift/KataGoSwift-swift.h")
target_include_directories(KataGoSwift PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/include")
set_target_properties(KataGoSwift PROPERTIES Swift_MODULE_NAME "KataGoSwift")
target_compile_options(KataGoSwift PUBLIC
@@ -399,9 +401,14 @@ elseif(USE_BACKEND STREQUAL "TENSORRT")
target_link_libraries(katago CUDA::cudart_static ${TENSORRT_LIBRARY})
elseif(USE_BACKEND STREQUAL "METAL")
target_compile_definitions(katago PRIVATE USE_METAL_BACKEND)
target_include_directories(katago PRIVATE ${KATAGOCOREML_INCLUDE_DIRS})
find_library(KATAGOCOREML_LIB katagocoreml HINTS /usr/local/lib REQUIRED)
target_link_directories(katago PRIVATE ${KATAGOCOREML_LIBRARY_DIRS})
target_link_libraries(katago KataGoSwift ${KATAGOCOREML_LIB} ${KATAGOCOREML_LDFLAGS}
  "-framework MetalPerformanceShaders"
  "-framework MetalPerformanceShadersGraph")
if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
message(WARNING "Metal backend may not work optimally on Intel. ARM64 architecture is recommended.")
endif()
elseif(USE_BACKEND STREQUAL "OPENCL")
target_compile_definitions(katago PRIVATE USE_OPENCL_BACKEND)
32 changes: 25 additions & 7 deletions cpp/configs/analysis_example.cfg
@@ -224,15 +224,33 @@ nnRandomize = true
# ------------------------------
# These only apply when using the METAL version of KataGo.

# Metal backend dispatch is configured via numNNServerThreadsPerModel and metalDeviceToUseThread<N>.
# Device index values:
#   0   = GPU only (MPSGraph) - default
#   100 = ANE only (CoreML, runs on CPU + Apple Neural Engine)
#
# Mux mode (recommended): 4 pipelined server threads (2x GPU + 2x ANE).
# Set nnMaxBatchSize to half of numSearchThreads for optimal pipelining.
#
# Example: mux mode (best throughput)
# numNNServerThreadsPerModel = 4
# metalDeviceToUseThread0 = 0
# metalDeviceToUseThread1 = 0
# metalDeviceToUseThread2 = 100
# metalDeviceToUseThread3 = 100
#
# Example: GPU-only mode (default)
# numNNServerThreadsPerModel = 1
# metalDeviceToUseThread0 = 0
#
# Example: ANE-only mode
# numNNServerThreadsPerModel = 1
# metalDeviceToUseThread0 = 100
#
# Default (no config): 1 server thread, GPU-only mode (gpuIdx = 0).

# FP16 precision (default true). Set to false for exact FP32 inference (slower).
# metalUseFP16 = true
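
Putting the precision and device settings above together, a minimal exact-inference setup might look like the following. This is an illustrative sketch, not a fragment from the PR; the values are hypothetical.

```
# Hypothetical example: single GPU server thread with exact FP32 inference.
numNNServerThreadsPerModel = 1
metalDeviceToUseThread0 = 0
metalUseFP16 = false   # exact FP32 per the note above; expect slower inference
```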


# OpenCL-specific GPU settings--------------------------------------
32 changes: 25 additions & 7 deletions cpp/configs/gtp_example.cfg
@@ -460,15 +460,33 @@ searchFactorWhenWinningThreshold = 0.95
# ------------------------------
# These only apply when using the METAL version of KataGo.

# Metal backend dispatch is configured via numNNServerThreadsPerModel and metalDeviceToUseThread<N>.
# Device index values:
#   0   = GPU only (MPSGraph) - default
#   100 = ANE only (CoreML, runs on CPU + Apple Neural Engine)
#
# Mux mode (recommended): 4 pipelined server threads (2x GPU + 2x ANE).
# Set nnMaxBatchSize to half of numSearchThreads for optimal pipelining.
#
# Example: mux mode (best throughput)
# numNNServerThreadsPerModel = 4
# metalDeviceToUseThread0 = 0
# metalDeviceToUseThread1 = 0
# metalDeviceToUseThread2 = 100
# metalDeviceToUseThread3 = 100
#
# Example: GPU-only mode (default)
# numNNServerThreadsPerModel = 1
# metalDeviceToUseThread0 = 0
#
# Example: ANE-only mode
# numNNServerThreadsPerModel = 1
# metalDeviceToUseThread0 = 100
#
# Default (no config): 1 server thread, GPU-only mode (gpuIdx = 0).

# FP16 precision (default true). Set to false for exact FP32 inference (slower).
# metalUseFP16 = true
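
As a concrete illustration of the mux-mode guidance above (nnMaxBatchSize at half of numSearchThreads, 2x GPU + 2x ANE server threads), a sketch with hypothetical thread counts, not values taken from the PR:

```
# Hypothetical mux-mode configuration for a search using 32 threads.
numSearchThreads = 32
nnMaxBatchSize = 16            # half of numSearchThreads, per the pipelining note above
numNNServerThreadsPerModel = 4
metalDeviceToUseThread0 = 0    # GPU (MPSGraph)
metalDeviceToUseThread1 = 0    # GPU (MPSGraph)
metalDeviceToUseThread2 = 100  # ANE (CoreML)
metalDeviceToUseThread3 = 100  # ANE (CoreML)
```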

# ------------------------------
# OpenCL GPU settings