From 928275061459a9a93336cc6b931d8e1be8374293 Mon Sep 17 00:00:00 2001 From: bung87 Date: Mon, 9 Feb 2026 00:32:23 +0800 Subject: [PATCH] apple metal support --- arraymancer.nimble | 5 + src/arraymancer/tensor.nim | 4 + .../tensor/backend/global_config.nim | 4 + src/arraymancer/tensor/backend/metal.nim | 29 + .../tensor/backend/metal/metal_backend.nim | 576 ++++++++++++++++++ .../tensor/backend/metal/metal_buffer.nim | 96 +++ .../tensor/backend/metal/metal_kernels.metal | 369 +++++++++++ src/arraymancer/tensor/data_structure.nim | 46 +- src/arraymancer/tensor/init_copy_metal.nim | 28 + src/arraymancer/tensor/init_metal.nim | 65 ++ .../tensor/operators_blas_l1_metal.nim | 103 ++++ .../tensor/operators_blas_l2l3_metal.nim | 86 +++ .../tensor/operators_broadcasted_metal.nim | 194 ++++++ .../tensor/private/p_checks_metal.nim | 45 ++ .../tensor/private/p_init_metal.nim | 59 ++ .../tensor/shapeshifting_metal.nim | 137 +++++ src/arraymancer/tensor/tensor_metal.nim | 32 + tests/tensor/test_accessors_slicer_metal.nim | 69 +++ tests/tensor/test_broadcasting_metal.nim | 54 ++ tests/tensor/test_init_metal.nim | 33 + tests/tensor/test_operators_blas_metal.nim | 93 +++ tests/tensor/test_shapeshifting_metal.nim | 67 ++ tests/tests_metal.nim | 23 + 23 files changed, 2216 insertions(+), 1 deletion(-) create mode 100644 src/arraymancer/tensor/backend/metal.nim create mode 100644 src/arraymancer/tensor/backend/metal/metal_backend.nim create mode 100644 src/arraymancer/tensor/backend/metal/metal_buffer.nim create mode 100644 src/arraymancer/tensor/backend/metal/metal_kernels.metal create mode 100644 src/arraymancer/tensor/init_copy_metal.nim create mode 100644 src/arraymancer/tensor/init_metal.nim create mode 100644 src/arraymancer/tensor/operators_blas_l1_metal.nim create mode 100644 src/arraymancer/tensor/operators_blas_l2l3_metal.nim create mode 100644 src/arraymancer/tensor/operators_broadcasted_metal.nim create mode 100644 src/arraymancer/tensor/private/p_checks_metal.nim create 
mode 100644 src/arraymancer/tensor/private/p_init_metal.nim create mode 100644 src/arraymancer/tensor/shapeshifting_metal.nim create mode 100644 src/arraymancer/tensor/tensor_metal.nim create mode 100644 tests/tensor/test_accessors_slicer_metal.nim create mode 100644 tests/tensor/test_broadcasting_metal.nim create mode 100644 tests/tensor/test_init_metal.nim create mode 100644 tests/tensor/test_operators_blas_metal.nim create mode 100644 tests/tensor/test_shapeshifting_metal.nim create mode 100644 tests/tests_metal.nim diff --git a/arraymancer.nimble b/arraymancer.nimble index 3622d298d..206fefac3 100644 --- a/arraymancer.nimble +++ b/arraymancer.nimble @@ -219,6 +219,11 @@ task test_opencl, "Run all OpenCL backend tests": switches.add " -d:blas=cblas" # Archlinux, comment out on Debian/Ubuntu test "tests_opencl", switches, split = false, "cpp" +task test_metal, "Run all Metal backend tests": + var switches = " -d:metal" + switches.add " -d:blas=cblas" # Archlinux, comment out on Debian/Ubuntu + test "tests_metal", switches, split = false + # task test_deprecated, "Run all tests on deprecated procs": # test "tests_cpu_deprecated" diff --git a/src/arraymancer/tensor.nim b/src/arraymancer/tensor.nim index 25aeb0233..2f9ed7a9f 100644 --- a/src/arraymancer/tensor.nim +++ b/src/arraymancer/tensor.nim @@ -81,3 +81,7 @@ when defined(cuda) or defined(nimdoc) or defined(nimsuggest): when defined(opencl) or defined(nimdoc) or defined(nimsuggest): import ./tensor/tensor_opencl export tensor_opencl + +when defined(metal) or defined(nimdoc) or defined(nimsuggest): + import ./tensor/tensor_metal + export tensor_metal diff --git a/src/arraymancer/tensor/backend/global_config.nim b/src/arraymancer/tensor/backend/global_config.nim index 35fa15bfa..df1303c9c 100644 --- a/src/arraymancer/tensor/backend/global_config.nim +++ b/src/arraymancer/tensor/backend/global_config.nim @@ -32,6 +32,10 @@ const CUDA_HOF_BPG*: cint = 256 # should be (grid-stride+threadsPerBlock-1) const 
OMP_FOR_THRESHOLD* = 1000 # Tensor number of elements threshold before using OpenMP multithreading const OMP_MAX_REDUCE_BLOCKS* = 8 # Max number of expected OpenMP threads (used in reduce) +# Metal-specific configuration +const METAL_THREADGROUP_SIZE* = 256 # Default threadgroup size for Metal kernels +const METAL_BUFFER_POOL_MAX_SIZE* = 1024 * 1024 * 1024 # 1GB max buffer pool size + # Full procesor optimization (AVX, AVX2, ARM neon, ... if applicable) when defined(native): {.passC: "-march=native".} diff --git a/src/arraymancer/tensor/backend/metal.nim b/src/arraymancer/tensor/backend/metal.nim new file mode 100644 index 000000000..ef8c24062 --- /dev/null +++ b/src/arraymancer/tensor/backend/metal.nim @@ -0,0 +1,29 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +## Metal backend for Arraymancer +## Provides GPU acceleration on macOS using Apple's Metal framework + +when defined(metal): + import ../data_structure, + ./metal/metal_buffer, + ./metal/metal_backend + + export metal_buffer, + metal_backend + + # Re-export types for convenience + export MetalBuffer, MetalBufferPool + export MTLDevice, MTLCommandQueue, MTLCommandBuffer, MTLBuffer + export MetalContext, metalContext, initMetalContext, isMetalAvailable diff --git a/src/arraymancer/tensor/backend/metal/metal_backend.nim b/src/arraymancer/tensor/backend/metal/metal_backend.nim new file mode 100644 index 000000000..e24fdbba2 --- /dev/null +++ b/src/arraymancer/tensor/backend/metal/metal_backend.nim @@ -0,0 +1,576 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import ./metal_buffer, std/tables
+
+{.passC: "-x objective-c".}
+{.passL: "-framework Metal".}
+{.passL: "-framework MetalPerformanceShaders".}
+{.passL: "-framework Foundation".}
+
+{.emit: """
+#import <Metal/Metal.h>
+#import <Foundation/Foundation.h>
+
+id<MTLDevice> arraymancer_getDefaultDevice() {
+    NSArray<id<MTLDevice>> *devices = MTLCopyAllDevices();
+    if (devices && [devices count] > 0) {
+        id<MTLDevice> device = [devices objectAtIndex:0];
+        [devices release];
+        return device;
+    }
+    return nil;
+}
+
+id<MTLCommandQueue> arraymancer_createCommandQueue(id<MTLDevice> device) {
+    return [device newCommandQueue];
+}
+
+id<MTLBuffer> arraymancer_createBuffer(id<MTLDevice> device, size_t length) {
+    return [device newBufferWithLength:length options:MTLResourceStorageModeShared];
+}
+
+void* arraymancer_getBufferContents(id<MTLBuffer> buffer) {
+    return [buffer contents];
+}
+
+void arraymancer_copyToBuffer(id<MTLBuffer> buffer, const void* data, size_t length) {
+    memcpy([buffer contents], data, length);
+}
+
+void arraymancer_copyFromBuffer(id<MTLBuffer> buffer, void* data, size_t length) {
+    memcpy(data, [buffer contents], length);
+}
+
+id<MTLLibrary> arraymancer_createLibraryFromSource(id<MTLDevice> device, const char* source) {
+    NSError* error = nil;
+    NSString* sourceString = [NSString stringWithUTF8String:source];
+    id<MTLLibrary> library = [device newLibraryWithSource:sourceString options:nil error:&error];
+    if (error) {
+        NSLog(@"Error creating library: %@", error);
+        return nil;
+    }
+    return library;
+}
+
+id<MTLFunction> arraymancer_getFunction(id<MTLLibrary> library, const char* name) {
+    NSString* funcName = [NSString stringWithUTF8String:name];
+    return [library newFunctionWithName:funcName];
+}
+
+id<MTLComputePipelineState> arraymancer_createComputePipeline(id<MTLDevice> device, id<MTLFunction> function) {
+    NSError* error = nil;
+    id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:function error:&error];
+    if (error) {
+        NSLog(@"Error creating pipeline: %@", error);
+        return nil;
+    }
+    return pipeline;
+}
+
+id<MTLCommandBuffer> arraymancer_createCommandBuffer(id<MTLCommandQueue> queue) {
+    return [queue commandBuffer];
+}
+
+id<MTLComputeCommandEncoder> arraymancer_createComputeEncoder(id<MTLCommandBuffer> buffer) {
+    return [buffer computeCommandEncoder];
+}
+
+void
arraymancer_setComputePipeline(id<MTLComputeCommandEncoder> encoder, id<MTLComputePipelineState> pipeline) {
+    [encoder setComputePipelineState:pipeline];
+}
+
+void arraymancer_setBuffer(id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> buffer, uint index) {
+    [encoder setBuffer:buffer offset:0 atIndex:index];
+}
+
+void arraymancer_setBytes(id<MTLComputeCommandEncoder> encoder, const void* bytes, size_t length, uint index) {
+    [encoder setBytes:bytes length:length atIndex:index];
+}
+
+void arraymancer_dispatchThreads1D(id<MTLComputeCommandEncoder> encoder, size_t size, size_t threadgroupSize) {
+    MTLSize gridSize = MTLSizeMake(size, 1, 1);
+    MTLSize tgSize = MTLSizeMake(threadgroupSize, 1, 1);
+    [encoder dispatchThreads:gridSize threadsPerThreadgroup:tgSize];
+}
+
+void arraymancer_dispatchThreads2D(id<MTLComputeCommandEncoder> encoder, size_t width, size_t height, size_t threadgroupWidth, size_t threadgroupHeight) {
+    MTLSize gridSize = MTLSizeMake(width, height, 1);
+    MTLSize tgSize = MTLSizeMake(threadgroupWidth, threadgroupHeight, 1);
+    [encoder dispatchThreads:gridSize threadsPerThreadgroup:tgSize];
+}
+
+void arraymancer_endEncoding(id<MTLComputeCommandEncoder> encoder) {
+    [encoder endEncoding];
+}
+
+void arraymancer_commitCommandBuffer(id<MTLCommandBuffer> buffer) {
+    [buffer commit];
+}
+
+void arraymancer_waitForCompletion(id<MTLCommandBuffer> buffer) {
+    [buffer waitUntilCompleted];
+}
+
+id<MTLBlitCommandEncoder> arraymancer_createBlitEncoder(id<MTLCommandBuffer> buffer) {
+    return [buffer blitCommandEncoder];
+}
+
+void arraymancer_copyBuffer(id<MTLBlitCommandEncoder> encoder, id<MTLBuffer> src, id<MTLBuffer> dst, size_t length) {
+    [encoder copyFromBuffer:src sourceOffset:0 toBuffer:dst destinationOffset:0 size:length];
+}
+
+void arraymancer_endBlitEncoding(id<MTLBlitCommandEncoder> encoder) {
+    [encoder endEncoding];
+}
+""".}
+
+proc getDefaultDevice(): MTLDevice {.importc: "arraymancer_getDefaultDevice", nodecl.}
+proc createCommandQueue(device: MTLDevice): MTLCommandQueue {.importc: "arraymancer_createCommandQueue", nodecl.}
+proc createBuffer(device: MTLDevice, length: csize_t): MTLBuffer {.importc: "arraymancer_createBuffer", nodecl.}
+proc getBufferContents(buffer: MTLBuffer): pointer {.importc: "arraymancer_getBufferContents", nodecl.}
+proc copyToBuffer(buffer:
MTLBuffer, data: pointer, length: csize_t) {.importc: "arraymancer_copyToBuffer", nodecl.} +proc copyFromBuffer(buffer: MTLBuffer, data: pointer, length: csize_t) {.importc: "arraymancer_copyFromBuffer", nodecl.} +proc createLibraryFromSource(device: MTLDevice, source: cstring): MTLLibrary {.importc: "arraymancer_createLibraryFromSource", nodecl.} +proc getFunction(library: MTLLibrary, name: cstring): MTLFunction {.importc: "arraymancer_getFunction", nodecl.} +proc createComputePipeline(device: MTLDevice, function: MTLFunction): MTLComputePipelineState {.importc: "arraymancer_createComputePipeline", nodecl.} +proc createCommandBuffer(queue: MTLCommandQueue): MTLCommandBuffer {.importc: "arraymancer_createCommandBuffer", nodecl.} +proc createComputeEncoder(buffer: MTLCommandBuffer): MTLComputeCommandEncoder {.importc: "arraymancer_createComputeEncoder", nodecl.} +proc setComputePipeline(encoder: MTLComputeCommandEncoder, pipeline: MTLComputePipelineState) {.importc: "arraymancer_setComputePipeline", nodecl.} +proc setBuffer(encoder: MTLComputeCommandEncoder, buffer: MTLBuffer, index: cuint) {.importc: "arraymancer_setBuffer", nodecl.} +proc setBytes(encoder: MTLComputeCommandEncoder, bytes: pointer, length: csize_t, index: cuint) {.importc: "arraymancer_setBytes", nodecl.} +proc dispatchThreads1D(encoder: MTLComputeCommandEncoder, size, threadgroupSize: csize_t) {.importc: "arraymancer_dispatchThreads1D", nodecl.} +proc dispatchThreads2D(encoder: MTLComputeCommandEncoder, width, height, threadgroupWidth, threadgroupHeight: csize_t) {.importc: "arraymancer_dispatchThreads2D", nodecl.} +proc endEncoding(encoder: MTLComputeCommandEncoder) {.importc: "arraymancer_endEncoding", nodecl.} +proc commitCommandBuffer(buffer: MTLCommandBuffer) {.importc: "arraymancer_commitCommandBuffer", nodecl.} +proc waitForCompletion(buffer: MTLCommandBuffer) {.importc: "arraymancer_waitForCompletion", nodecl.} +proc createBlitEncoder(buffer: MTLCommandBuffer): MTLBlitCommandEncoder 
{.importc: "arraymancer_createBlitEncoder", nodecl.} +proc copyBuffer(encoder: MTLBlitCommandEncoder, src, dst: MTLBuffer, length: csize_t) {.importc: "arraymancer_copyBuffer", nodecl.} +proc endBlitEncoding(encoder: MTLBlitCommandEncoder) {.importc: "arraymancer_endBlitEncoding", nodecl.} + +type + MetalContext* = object + device*: MTLDevice + commandQueue*: MTLCommandQueue + bufferPool*: MetalBufferPool + library*: MTLLibrary + kernelCache*: Table[string, MTLComputePipelineState] + initialized*: bool + +var metalContext*: MetalContext + +# Embedded Metal kernel source code +const metalKernelSource = slurp("./metal_kernels.metal") + +proc initMetalContext*() = + if metalContext.initialized: + return + metalContext.device = getDefaultDevice() + if metalContext.device == nil: + raise newException(IOError, "Failed to get default Metal device. " & + "This can happen if:\n" & + " - You're running in a sandboxed/headless environment\n" & + " - You don't have an Apple Silicon Mac (M1/M2/M3)\n" & + " - The Metal framework is not accessible") + metalContext.commandQueue = createCommandQueue(metalContext.device) + metalContext.bufferPool = initMetalBufferPool() + metalContext.library = createLibraryFromSource(metalContext.device, metalKernelSource.cstring) + if metalContext.library == nil: + raise newException(IOError, "Failed to create Metal library from kernel source") + metalContext.kernelCache = initTable[string, MTLComputePipelineState]() + metalContext.initialized = true + +proc tryInitMetalContext*(): bool = + ## Try to initialize Metal context, return true if successful + if metalContext.initialized: + return true + try: + initMetalContext() + return true + except: + return false + +proc isMetalAvailable*(): bool = + if not metalContext.initialized: + return tryInitMetalContext() + return metalContext.initialized and metalContext.device != nil + +proc getKernel*(kernelName: string): MTLComputePipelineState = + ## Get or create a compute pipeline for a kernel 
function + if not metalContext.initialized: + initMetalContext() + + if metalContext.kernelCache.hasKey(kernelName): + return metalContext.kernelCache[kernelName] + + let function = getFunction(metalContext.library, kernelName.cstring) + if function == nil: + raise newException(ValueError, "Kernel function not found: " & kernelName) + + let pipeline = createComputePipeline(metalContext.device, function) + if pipeline == nil: + raise newException(IOError, "Failed to create compute pipeline for: " & kernelName) + + metalContext.kernelCache[kernelName] = pipeline + return pipeline + +proc createMetalBuffer*(length: int): MetalBuffer = + if not metalContext.initialized: + initMetalContext() + result = MetalBuffer() + result.buffer = createBuffer(metalContext.device, csize_t(length)) + result.length = length + result.devicePtr = getBufferContents(result.buffer) + +proc uploadToBuffer*(mb: MetalBuffer, data: pointer, length: int) = + copyToBuffer(mb.buffer, data, csize_t(length)) + +proc downloadFromBuffer*(mb: MetalBuffer, data: pointer, length: int) = + copyFromBuffer(mb.buffer, data, csize_t(length)) + +proc copyBuffer*(src, dst: MetalBuffer, length: int) = + ## Copy data from one Metal buffer to another + if not metalContext.initialized: + initMetalContext() + + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let blitEncoder = createBlitEncoder(commandBuffer) + + copyBuffer(blitEncoder, src.buffer, dst.buffer, csize_t(length)) + endBlitEncoding(blitEncoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + +proc executeElementwiseKernel*( + kernelName: string, + buffers: seq[MetalBuffer], + constants: seq[pointer], + constantSizes: seq[int], + gridSize: int +) = + ## Execute a 1D elementwise kernel + if not metalContext.initialized: + initMetalContext() + + let pipeline = getKernel(kernelName) + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let encoder = createComputeEncoder(commandBuffer) + + 
setComputePipeline(encoder, pipeline) + + # Set buffers + for i, buf in buffers: + setBuffer(encoder, buf.buffer, cuint(i)) + + # Set constants + for i, constPtr in constants: + setBytes(encoder, constPtr, csize_t(constantSizes[i]), cuint(buffers.len + i)) + + # Dispatch threads + let threadgroupSize = 256 + dispatchThreads1D(encoder, csize_t(gridSize), csize_t(threadgroupSize)) + + endEncoding(encoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + +proc executeGemmKernel*( + kernelName: string, + A, B, C: MetalBuffer, + M, N, K: int, + alpha, beta: float32 +) = + ## Execute a GEMM kernel + if not metalContext.initialized: + initMetalContext() + + let pipeline = getKernel(kernelName) + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let encoder = createComputeEncoder(commandBuffer) + + setComputePipeline(encoder, pipeline) + + # Set buffers + setBuffer(encoder, A.buffer, 0) + setBuffer(encoder, B.buffer, 1) + setBuffer(encoder, C.buffer, 2) + + # Set constants + var m = M + var n = N + var k = K + var a = alpha + var b = beta + setBytes(encoder, addr m, csize_t(sizeof(int)), 3) + setBytes(encoder, addr n, csize_t(sizeof(int)), 4) + setBytes(encoder, addr k, csize_t(sizeof(int)), 5) + setBytes(encoder, addr a, csize_t(sizeof(float32)), 6) + setBytes(encoder, addr b, csize_t(sizeof(float32)), 7) + + # Dispatch threads + let threadgroupWidth = 8 + let threadgroupHeight = 8 + dispatchThreads2D(encoder, csize_t(N), csize_t(M), csize_t(threadgroupWidth), csize_t(threadgroupHeight)) + + endEncoding(encoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + +proc metalGemm*[T: SomeFloat]( + transA, transB: bool, + M, N, K: int, + alpha: T, + A: MetalBuffer, lda: int, + B: MetalBuffer, ldb: int, + beta: T, + C: MetalBuffer, ldc: int +) = + if not metalContext.initialized: + initMetalContext() + + # Metal only supports float32, not float64 + when T is float32: + var kernelName: string + if transA and 
not transB: + kernelName = "gemm_a_transpose_f32" + elif not transA and transB: + kernelName = "gemm_b_transpose_f32" + else: + kernelName = "gemm_naive_f32" + + executeGemmKernel(kernelName, A, B, C, M, N, K, alpha, beta) + else: + raise newException(ValueError, "Metal only supports float32, not float64") + +proc metalGemv*[T: SomeFloat]( + trans: bool, + M, N: int, + alpha: T, + A: MetalBuffer, lda: int, + x: MetalBuffer, incx: int, + beta: T, + y: MetalBuffer, incy: int +) = + ## Matrix-Vector multiplication: y = alpha * A * x + beta * y + ## Column-major layout (Fortran style) + if not metalContext.initialized: + initMetalContext() + + # Metal only supports float32, not float64 + when T is float32: + var m = M + var n = N + var a = float32(alpha) + var b = float32(beta) + + let pipeline = getKernel("gemv_f32") + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let encoder = createComputeEncoder(commandBuffer) + + setComputePipeline(encoder, pipeline) + setBuffer(encoder, A.buffer, 0) + setBuffer(encoder, x.buffer, 1) + setBuffer(encoder, y.buffer, 2) + setBytes(encoder, addr m, csize_t(sizeof(int)), 3) + setBytes(encoder, addr n, csize_t(sizeof(int)), 4) + setBytes(encoder, addr a, csize_t(sizeof(float32)), 5) + setBytes(encoder, addr b, csize_t(sizeof(float32)), 6) + + # Dispatch threads - one thread per output element (M rows) + let threadgroupSize = 256 + dispatchThreads1D(encoder, csize_t(M), csize_t(threadgroupSize)) + + endEncoding(encoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + else: + raise newException(ValueError, "Metal only supports float32, not float64") + +proc metalElementwise*[T: SomeFloat]( + op: string, + A, B, C: MetalBuffer, + size: int +) = + if not metalContext.initialized: + initMetalContext() + + # Metal only supports float32, not float64 + when T is float32: + var kernelName: string + case op + of "add": kernelName = "elementwise_add_f32" + of "sub": kernelName = 
"elementwise_sub_f32" + of "mul": kernelName = "elementwise_mul_f32" + of "div": kernelName = "elementwise_div_f32" + else: raise newException(ValueError, "Unknown elementwise operation: " & op) + + var sizeVar = size + executeElementwiseKernel(kernelName, @[A, B, C], @[cast[pointer](addr sizeVar)], @[sizeof(int)], size) + else: + raise newException(ValueError, "Metal only supports float32, not float64") + +proc metalScalarMul*[T: SomeFloat]( + A, C: MetalBuffer, + scalar: T, + size: int +) = + if not metalContext.initialized: + initMetalContext() + + var kernelName = "scalar_mul_f32" + var sizeVar = size + var scalarVar = float32(scalar) + + let pipeline = getKernel(kernelName) + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let encoder = createComputeEncoder(commandBuffer) + + setComputePipeline(encoder, pipeline) + setBuffer(encoder, A.buffer, 0) + setBuffer(encoder, C.buffer, 1) + setBytes(encoder, addr scalarVar, csize_t(sizeof(float32)), 2) + setBytes(encoder, addr sizeVar, csize_t(sizeof(int)), 3) + + let threadgroupSize = 256 + dispatchThreads1D(encoder, csize_t(size), csize_t(threadgroupSize)) + + endEncoding(encoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + +proc metalScalarAdd*[T: SomeFloat]( + A, C: MetalBuffer, + scalar: T, + size: int +) = + ## Element-wise addition of a scalar to a tensor + if not metalContext.initialized: + initMetalContext() + + var kernelName = "scalar_add_f32" + var sizeVar = size + var scalarVar = float32(scalar) + + let pipeline = getKernel(kernelName) + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let encoder = createComputeEncoder(commandBuffer) + + setComputePipeline(encoder, pipeline) + setBuffer(encoder, A.buffer, 0) + setBuffer(encoder, C.buffer, 1) + setBytes(encoder, addr scalarVar, csize_t(sizeof(float32)), 2) + setBytes(encoder, addr sizeVar, csize_t(sizeof(int)), 3) + + let threadgroupSize = 256 + dispatchThreads1D(encoder, csize_t(size), 
csize_t(threadgroupSize)) + + endEncoding(encoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + +proc metalContiguousCopy*[T: SomeFloat]( + src, dst: MetalBuffer, + shape, strides: seq[int], + totalSize: int +) = + ## Copy data from a strided tensor to a contiguous buffer + if not metalContext.initialized: + initMetalContext() + + when T is float32: + let rank = shape.len + + # Create buffers for shape and strides + let shapeBuffer = createMetalBuffer(rank * sizeof(int32)) + let stridesBuffer = createMetalBuffer(rank * sizeof(int32)) + + # Upload shape and strides + var shapeData = newSeq[int32](rank) + var stridesData = newSeq[int32](rank) + for i in 0 ..< rank: + shapeData[i] = int32(shape[i]) + stridesData[i] = int32(strides[i]) + + uploadToBuffer(shapeBuffer, addr shapeData[0], rank * sizeof(int32)) + uploadToBuffer(stridesBuffer, addr stridesData[0], rank * sizeof(int32)) + + var totalSizeVar = int32(totalSize) + var rankVar = int32(rank) + + let pipeline = getKernel("contiguous_copy_f32") + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let encoder = createComputeEncoder(commandBuffer) + + setComputePipeline(encoder, pipeline) + setBuffer(encoder, src.buffer, 0) + setBuffer(encoder, dst.buffer, 1) + setBuffer(encoder, shapeBuffer.buffer, 2) + setBuffer(encoder, stridesBuffer.buffer, 3) + setBytes(encoder, addr rankVar, csize_t(sizeof(int32)), 4) + setBytes(encoder, addr totalSizeVar, csize_t(sizeof(int32)), 5) + + let threadgroupSize = 256 + dispatchThreads1D(encoder, csize_t(totalSize), csize_t(threadgroupSize)) + + endEncoding(encoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + else: + raise newException(ValueError, "Metal only supports float32, not float64") + +proc metalDot*[T: SomeFloat](A, B: MetalBuffer, size: int): T = + ## Compute dot product of two vectors using GPU reduction + if not metalContext.initialized: + initMetalContext() + + when T is float32: + let 
kernelName = "vector_dot_f32" + var sizeVar = size + + # Create a buffer for partial results (one per threadgroup) + let threadgroupSize = 256 + let numGroups = (size + threadgroupSize - 1) div threadgroupSize + var partialResult = newSeq[float32](numGroups) + let partialBuffer = createMetalBuffer(numGroups * sizeof(float32)) + + let pipeline = getKernel(kernelName) + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let encoder = createComputeEncoder(commandBuffer) + + setComputePipeline(encoder, pipeline) + setBuffer(encoder, A.buffer, 0) + setBuffer(encoder, B.buffer, 1) + setBuffer(encoder, partialBuffer.buffer, 2) + setBytes(encoder, addr sizeVar, csize_t(sizeof(int)), 3) + + dispatchThreads1D(encoder, csize_t(size), csize_t(threadgroupSize)) + + endEncoding(encoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + + # Download partial results and sum them on CPU + downloadFromBuffer(partialBuffer, addr partialResult[0], numGroups * sizeof(float32)) + + var sum: float32 = 0.0 + for val in partialResult: + sum += val + + return T(sum) + else: + raise newException(ValueError, "Metal only supports float32, not float64") + +proc preferCpuGemm*(m, n, k: int): bool = + const THRESHOLD = 128 * 128 * 128 + return m * n * k < THRESHOLD diff --git a/src/arraymancer/tensor/backend/metal/metal_buffer.nim b/src/arraymancer/tensor/backend/metal/metal_buffer.nim new file mode 100644 index 000000000..8f60381bb --- /dev/null +++ b/src/arraymancer/tensor/backend/metal/metal_buffer.nim @@ -0,0 +1,96 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +type + MTLDevice* = pointer + MTLCommandQueue* = pointer + MTLCommandBuffer* = pointer + MTLComputeCommandEncoder* = pointer + MTLBlitCommandEncoder* = pointer + MTLBuffer* = pointer + MTLComputePipelineState* = pointer + MTLFunction* = pointer + MTLLibrary* = pointer + + MetalBufferObj* = object + buffer*: MTLBuffer + length*: int + devicePtr*: pointer + + MetalBuffer* = ref MetalBufferObj + + BufferPoolEntry* = object + buffer*: MetalBuffer + size*: int + lastUsed*: int64 + + MetalBufferPool* = object + entries*: seq[BufferPoolEntry] + maxSize*: int + currentSize*: int + accessCounter*: int64 + +const + DEFAULT_BUFFER_POOL_MAX_SIZE* = 1024 * 1024 * 1024 + BUFFER_POOL_ENTRY_MAX_AGE* = 1000 + +proc initMetalBufferPool*(maxSize: int = DEFAULT_BUFFER_POOL_MAX_SIZE): MetalBufferPool = + result.maxSize = maxSize + result.currentSize = 0 + result.accessCounter = 0 + +proc acquireBuffer*(pool: var MetalBufferPool, size: int, device: MTLDevice): MetalBuffer = + pool.accessCounter.inc + + for i in 0 ..< pool.entries.len: + if pool.entries[i].size >= size and pool.entries[i].buffer != nil: + result = pool.entries[i].buffer + pool.entries[i].lastUsed = pool.accessCounter + return + + result = nil + +proc releaseBuffer*(pool: var MetalBufferPool, buffer: MetalBuffer) = + if buffer == nil: + return + + pool.accessCounter.inc + + if pool.currentSize + buffer.length > pool.maxSize: + var oldestIdx = -1 + var oldestAccess = int64.high + for i in 0 ..< pool.entries.len: + if pool.entries[i].lastUsed < oldestAccess: + oldestAccess = pool.entries[i].lastUsed + oldestIdx = 
i + + if oldestIdx >= 0: + pool.currentSize -= pool.entries[oldestIdx].size + pool.entries.del(oldestIdx) + + pool.entries.add(BufferPoolEntry( + buffer: buffer, + size: buffer.length, + lastUsed: pool.accessCounter + )) + pool.currentSize += buffer.length + +proc cleanupPool*(pool: var MetalBufferPool) = + var i = 0 + while i < pool.entries.len: + if pool.accessCounter - pool.entries[i].lastUsed > BUFFER_POOL_ENTRY_MAX_AGE: + pool.currentSize -= pool.entries[i].size + pool.entries.del(i) + else: + i.inc diff --git a/src/arraymancer/tensor/backend/metal/metal_kernels.metal b/src/arraymancer/tensor/backend/metal/metal_kernels.metal new file mode 100644 index 000000000..cb97c0e3f --- /dev/null +++ b/src/arraymancer/tensor/backend/metal/metal_kernels.metal @@ -0,0 +1,369 @@ +// Copyright 2017 the Arraymancer contributors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <metal_stdlib>
+using namespace metal;
+
+// ============================================================================
+// Element-wise Operations (float32 only - Metal doesn't support float64)
+// ============================================================================
+
+kernel void elementwise_add_f32(
+    device const float* A [[buffer(0)]],
+    device const float* B [[buffer(1)]],
+    device float* C [[buffer(2)]],
+    constant int& size [[buffer(3)]],
+    uint gid [[thread_position_in_grid]])
+{
+    if (gid < (uint)size) {
+        C[gid] = A[gid] + B[gid];
+    }
+}
+
+kernel void elementwise_sub_f32(
+    device const float* A [[buffer(0)]],
+    device const float* B [[buffer(1)]],
+    device float* C [[buffer(2)]],
+    constant int& size [[buffer(3)]],
+    uint gid [[thread_position_in_grid]])
+{
+    if (gid < (uint)size) {
+        C[gid] = A[gid] - B[gid];
+    }
+}
+
+kernel void elementwise_mul_f32(
+    device const float* A [[buffer(0)]],
+    device const float* B [[buffer(1)]],
+    device float* C [[buffer(2)]],
+    constant int& size [[buffer(3)]],
+    uint gid [[thread_position_in_grid]])
+{
+    if (gid < (uint)size) {
+        C[gid] = A[gid] * B[gid];
+    }
+}
+
+kernel void elementwise_div_f32(
+    device const float* A [[buffer(0)]],
+    device const float* B [[buffer(1)]],
+    device float* C [[buffer(2)]],
+    constant int& size [[buffer(3)]],
+    uint gid [[thread_position_in_grid]])
+{
+    if (gid < (uint)size) {
+        C[gid] = A[gid] / B[gid];
+    }
+}
+
+// ============================================================================
+// Scalar Operations
+// ============================================================================
+
+kernel void scalar_mul_f32(
+    device const float* A [[buffer(0)]],
+    device float* C [[buffer(1)]],
+    constant float& scalar [[buffer(2)]],
+    constant int& size [[buffer(3)]],
+    uint gid [[thread_position_in_grid]])
+{
+    if (gid < (uint)size) {
+        C[gid] = A[gid] * scalar;
+    }
+}
+
+kernel void scalar_add_f32(
+    device const float* A [[buffer(0)]],
+    device float* C [[buffer(1)]],
+
constant float& scalar [[buffer(2)]], + constant int& size [[buffer(3)]], + uint gid [[thread_position_in_grid]]) +{ + if (gid < (uint)size) { + C[gid] = A[gid] + scalar; + } +} + +// ============================================================================ +// Simple GEMM (naive implementation for reference) +// Column-major layout (Fortran style) +// ============================================================================ + +kernel void gemm_naive_f32( + device const float* A [[buffer(0)]], + device const float* B [[buffer(1)]], + device float* C [[buffer(2)]], + constant int& M [[buffer(3)]], + constant int& N [[buffer(4)]], + constant int& K [[buffer(5)]], + constant float& alpha [[buffer(6)]], + constant float& beta [[buffer(7)]], + uint2 gid [[thread_position_in_grid]]) +{ + uint row = gid.y; // 0 to M-1 + uint col = gid.x; // 0 to N-1 + + if (row < (uint)M && col < (uint)N) { + float sum = 0.0; + // Column-major indexing: + // A[i,k] is at A[k*M + i] + // B[k,j] is at B[j*K + k] + // C[i,j] is at C[j*M + i] + for (int k = 0; k < K; k++) { + sum += A[k * M + row] * B[col * K + k]; + } + C[col * M + row] = alpha * sum + beta * C[col * M + row]; + } +} + +// ============================================================================ +// Transpose GEMM variants +// Column-major layout (Fortran style) +// ============================================================================ + +kernel void gemm_a_transpose_f32( + device const float* A [[buffer(0)]], + device const float* B [[buffer(1)]], + device float* C [[buffer(2)]], + constant int& M [[buffer(3)]], + constant int& N [[buffer(4)]], + constant int& K [[buffer(5)]], + constant float& alpha [[buffer(6)]], + constant float& beta [[buffer(7)]], + uint2 gid [[thread_position_in_grid]]) +{ + uint row = gid.y; + uint col = gid.x; + + if (row < (uint)M && col < (uint)N) { + float sum = 0.0; + // A^T[i,k] = A[k,i], in column-major: A[k,i] is at A[i*K + k] + for (int k = 0; k < K; k++) { + sum += A[row * K 
+ k] * B[col * K + k]; + } + C[col * M + row] = alpha * sum + beta * C[col * M + row]; + } +} + +kernel void gemm_b_transpose_f32( + device const float* A [[buffer(0)]], + device const float* B [[buffer(1)]], + device float* C [[buffer(2)]], + constant int& M [[buffer(3)]], + constant int& N [[buffer(4)]], + constant int& K [[buffer(5)]], + constant float& alpha [[buffer(6)]], + constant float& beta [[buffer(7)]], + uint2 gid [[thread_position_in_grid]]) +{ + uint row = gid.y; + uint col = gid.x; + + if (row < (uint)M && col < (uint)N) { + float sum = 0.0; + // B^T[k,j] = B[j,k], in column-major: B[j,k] is at B[k*N + j] + for (int k = 0; k < K; k++) { + sum += A[k * M + row] * B[k * N + col]; + } + C[col * M + row] = alpha * sum + beta * C[col * M + row]; + } +} + +// ============================================================================ +// Batched GEMM (for future use with 3D+ tensors) +// ============================================================================ + +kernel void batched_gemm_f32( + device const float* A [[buffer(0)]], + device const float* B [[buffer(1)]], + device float* C [[buffer(2)]], + constant int& batchSize [[buffer(3)]], + constant int& M [[buffer(4)]], + constant int& N [[buffer(5)]], + constant int& K [[buffer(6)]], + constant float& alpha [[buffer(7)]], + constant float& beta [[buffer(8)]], + uint3 gid [[thread_position_in_grid]]) +{ + uint batch = gid.z; + uint row = gid.y; + uint col = gid.x; + + if (batch < (uint)batchSize && row < (uint)M && col < (uint)N) { + int batchOffsetA = batch * M * K; + int batchOffsetB = batch * K * N; + int batchOffsetC = batch * M * N; + + float sum = 0.0; + for (int k = 0; k < K; k++) { + sum += A[batchOffsetA + row * K + k] * B[batchOffsetB + k * N + col]; + } + C[batchOffsetC + row * N + col] = alpha * sum + beta * C[batchOffsetC + row * N + col]; + } +} + +// ============================================================================ +// Vector Operations (BLAS Level 1) +// 
============================================================================ + +kernel void vector_dot_f32( + device const float* A [[buffer(0)]], + device const float* B [[buffer(1)]], + device float* result [[buffer(2)]], + constant int& size [[buffer(3)]], + uint gid [[thread_position_in_grid]], + uint lid [[thread_position_in_threadgroup]], + uint groupSize [[threads_per_threadgroup]]) +{ + threadgroup float localSum[256]; + + float sum = 0.0; + for (uint i = gid; i < (uint)size; i += groupSize) { + sum += A[i] * B[i]; + } + + localSum[lid] = sum; + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint s = groupSize / 2; s > 0; s >>= 1) { + if (lid < s) { + localSum[lid] += localSum[lid + s]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + if (lid == 0) { + result[gid / groupSize] = localSum[0]; + } +} + +kernel void vector_scale_f32( + device float* X [[buffer(0)]], + constant float& alpha [[buffer(1)]], + constant int& size [[buffer(2)]], + uint gid [[thread_position_in_grid]]) +{ + if (gid < (uint)size) { + X[gid] *= alpha; + } +} + +kernel void vector_axpy_f32( + device const float* X [[buffer(0)]], + device float* Y [[buffer(1)]], + constant float& alpha [[buffer(2)]], + constant int& size [[buffer(3)]], + uint gid [[thread_position_in_grid]]) +{ + if (gid < (uint)size) { + Y[gid] += alpha * X[gid]; + } +} + +// ============================================================================ +// Activation Functions (for ML workloads) +// ============================================================================ + +kernel void relu_f32( + device const float* X [[buffer(0)]], + device float* Y [[buffer(1)]], + constant int& size [[buffer(2)]], + uint gid [[thread_position_in_grid]]) +{ + if (gid < (uint)size) { + Y[gid] = max(0.0f, X[gid]); + } +} + +kernel void sigmoid_f32( + device const float* X [[buffer(0)]], + device float* Y [[buffer(1)]], + constant int& size [[buffer(2)]], + uint gid [[thread_position_in_grid]]) +{ + if (gid < 
(uint)size) { + Y[gid] = 1.0f / (1.0f + exp(-X[gid])); + } +} + +kernel void tanh_f32( + device const float* X [[buffer(0)]], + device float* Y [[buffer(1)]], + constant int& size [[buffer(2)]], + uint gid [[thread_position_in_grid]]) +{ + if (gid < (uint)size) { + Y[gid] = tanh(X[gid]); + } +} + +// ============================================================================ +// Contiguous Copy Kernel +// Copies data from a strided tensor to a contiguous buffer +// ============================================================================ + +kernel void contiguous_copy_f32( + device const float* src [[buffer(0)]], + device float* dst [[buffer(1)]], + constant int* src_shape [[buffer(2)]], + constant int* src_strides [[buffer(3)]], + constant int& rank [[buffer(4)]], + constant int& total_size [[buffer(5)]], + uint gid [[thread_position_in_grid]]) +{ + if (gid >= (uint)total_size) return; + + // Convert flat index to multi-dimensional indices + int idx = (int)gid; + int src_offset = 0; + + for (int dim = rank - 1; dim >= 0; dim--) { + int coord = idx % src_shape[dim]; + src_offset += coord * src_strides[dim]; + idx /= src_shape[dim]; + } + + dst[gid] = src[src_offset]; +} + +// ============================================================================ +// Matrix-Vector Multiplication +// Column-major layout (Fortran style) +// y = alpha * A * x + beta * y +// ============================================================================ + +kernel void gemv_f32( + device const float* A [[buffer(0)]], + device const float* x [[buffer(1)]], + device float* y [[buffer(2)]], + constant int& M [[buffer(3)]], + constant int& N [[buffer(4)]], + constant float& alpha [[buffer(5)]], + constant float& beta [[buffer(6)]], + uint gid [[thread_position_in_grid]]) +{ + // Each thread computes one element of the result vector y + // gid is the row index (0 to M-1) + if (gid < (uint)M) { + float sum = 0.0; + // Column-major indexing: + // A[i,j] is at A[j*M + i] + // x[j] is at 
x[j] + // y[i] is at y[i] + for (int j = 0; j < N; j++) { + sum += A[j * M + gid] * x[j]; + } + y[gid] = alpha * sum + beta * y[gid]; + } +} diff --git a/src/arraymancer/tensor/data_structure.nim b/src/arraymancer/tensor/data_structure.nim index 7a282a920..c590640c3 100644 --- a/src/arraymancer/tensor/data_structure.nim +++ b/src/arraymancer/tensor/data_structure.nim @@ -90,12 +90,55 @@ when defined(opencl): offset*: int storage*: ClStorage[T] -when defined(cuda) and defined(opencl): +when defined(metal): + import ./backend/metal/metal_buffer + + type + MetalTensorRefTrackerObj*[T: SomeFloat] = object + value*: MetalBuffer + + MetalTensorRefTracker*[T] = ref MetalTensorRefTrackerObj[T] + + MetalStorage*[T: SomeFloat] = object + ## Opaque seq-like structure for storage on the Metal backend. + Flen*: int + Fbuffer*: MetalBuffer + Fref_tracking*: MetalTensorRefTracker[T] + + MetalTensor*[T: SomeFloat] = object + ## Tensor data structure stored on Apple Metal GPU + ## - ``shape``: Dimensions of the MetalTensor + ## - ``strides``: Numbers of items to skip to get the next item along a dimension. + ## - ``offset``: Offset to get the first item of the MetalTensor. Note: offset can be negative, in particular for slices. + ## - ``storage``: An opaque data storage for the MetalTensor + ## - ``cpuData``: Optional CPU copy for lazy transfer + ## + ## Warning ⚠: + ## Assignment ``var a = b`` does not copy the data. Data modification on one MetalTensor will be reflected on the other. + ## However modification on metadata (shape, strides or offset) will not affect the other tensor. 
+ ## Explicit copies can be made with ``clone``: ``var a = b.clone`` + shape*: Metadata + strides*: Metadata + offset*: int + storage*: MetalStorage[T] + cpuData*: seq[T] ## Optional CPU copy for lazy transfer + + + +when defined(cuda) and defined(opencl) and defined(metal): + type AnyTensor*[T] = Tensor[T] or CudaTensor[T] or ClTensor[T] or MetalTensor[T] +elif defined(cuda) and defined(opencl): type AnyTensor*[T] = Tensor[T] or CudaTensor[T] or ClTensor[T] +elif defined(cuda) and defined(metal): + type AnyTensor*[T] = Tensor[T] or CudaTensor[T] or MetalTensor[T] +elif defined(opencl) and defined(metal): + type AnyTensor*[T] = Tensor[T] or ClTensor[T] or MetalTensor[T] elif defined(cuda): type AnyTensor*[T] = Tensor[T] or CudaTensor[T] elif defined(opencl): type AnyTensor*[T] = Tensor[T] or ClTensor[T] +elif defined(metal): + type AnyTensor*[T] = Tensor[T] or MetalTensor[T] else: type AnyTensor*[T] = Tensor[T] @@ -224,3 +267,4 @@ proc dataArray*[T: KnownSupportsCopyMem](t: Tensor[T]): ptr UncheckedArray[T] {. proc dataArray*[T: not KnownSupportsCopyMem](t: Tensor[T]): ptr UncheckedArray[T] {.error: "`dataArray` " & " is deprecated for mem copyable types and not supported for GC'ed types!".} + diff --git a/src/arraymancer/tensor/init_copy_metal.nim b/src/arraymancer/tensor/init_copy_metal.nim new file mode 100644 index 000000000..a34783a6a --- /dev/null +++ b/src/arraymancer/tensor/init_copy_metal.nim @@ -0,0 +1,28 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import ./private/p_init_metal, + ./data_structure, + ./backend/metal/metal_backend, + ./init_metal, + ./init_copy_cpu + +proc clone*[T: SomeFloat](t: MetalTensor[T]): MetalTensor[T] {.noinit.} = + ## Clone (deep copy) a MetalTensor. + ## Copy will not share its data with the original. + ## Data is copied via CPU to ensure proper handling of strides and layout. + + let cpuTensor = t.toCpu() + let clonedCpu = init_copy_cpu.clone(cpuTensor) + result = clonedCpu.metal() diff --git a/src/arraymancer/tensor/init_metal.nim b/src/arraymancer/tensor/init_metal.nim new file mode 100644 index 000000000..b8470d05f --- /dev/null +++ b/src/arraymancer/tensor/init_metal.nim @@ -0,0 +1,65 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ../private/sequninit, + ./private/p_init_metal, + ./backend/metal/metal_backend, + ./data_structure, + ./init_cpu + +proc metal*[T: SomeFloat](t: Tensor[T]): MetalTensor[T] {.noinit.} = + ## Convert a tensor on CPU to a tensor on a Metal device. + ## This performs a synchronous copy to the GPU. 
+ + result = newMetalTensor[T](t.shape) + + let contig_t = t.asContiguous(colMajor, force = true) + let size = result.size * sizeof(T) + + uploadToBuffer(result.storage.Fbuffer, contig_t.get_data_ptr, size) + + # Store a CPU copy for lazy transfer (optional optimization) + # For now, we leave it empty to avoid unnecessary memory duplication + result.cpuData = @[] + +proc cpu*[T: SomeFloat](t: MetalTensor[T]): Tensor[T] {.noinit.} = + ## Convert a tensor on a Metal device to a tensor on CPU. + ## This performs a synchronous copy from the GPU. + + result.shape = t.shape + result.strides = t.strides + result.offset = t.offset + + allocCpuStorage result.storage, t.storage.Flen + + let size = t.storage.Flen * sizeof(T) + downloadFromBuffer(t.storage.Fbuffer, result.get_data_ptr, size) + +proc toMetal*[T: SomeFloat](t: Tensor[T]): MetalTensor[T] {.noinit.} = + ## Convert a tensor on CPU to a tensor on a Metal device (lazy transfer). + ## The data is copied immediately for now; future versions may defer transfer. + metal(t) + +proc toCpu*[T: SomeFloat](t: MetalTensor[T]): Tensor[T] {.noinit.} = + ## Convert a tensor on a Metal device to a tensor on CPU (lazy transfer). + ## The data is copied immediately for now; future versions may defer transfer. 
+ cpu(t) + +proc zeros_like*[T: SomeFloat](t: MetalTensor[T]): MetalTensor[T] {.noinit, inline.} = + ## Creates a new MetalTensor filled with 0 with the same shape as the input + result = zeros[T](t.shape).metal + +proc ones_like*[T: SomeFloat](t: MetalTensor[T]): MetalTensor[T] {.noinit, inline.} = + ## Creates a new MetalTensor filled with 1 with the same shape as the input + result = ones[T](t.shape).metal diff --git a/src/arraymancer/tensor/operators_blas_l1_metal.nim b/src/arraymancer/tensor/operators_blas_l1_metal.nim new file mode 100644 index 000000000..83dd5fad8 --- /dev/null +++ b/src/arraymancer/tensor/operators_blas_l1_metal.nim @@ -0,0 +1,103 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import ./backend/metal/metal_backend,
+  ./private/p_init_metal,
+  ./private/p_checks_metal,
+  ./data_structure
+
+# ####################################################################
+# BLAS Level 1 (Vector dot product, Addition, Scalar to Vector/Matrix)
+
+proc dot*[T: SomeFloat](a, b: MetalTensor[T]): T {.inline.} =
+  ## Vector to Vector dot (scalar) product using GPU reduction kernel
+  when compileOption("boundChecks"):
+    check_dot_prod(a, b)
+
+  result = metalDot[T](a.storage.Fbuffer, b.storage.Fbuffer, a.storage.Flen)
+
+proc `+=`*[T: SomeFloat](a: var MetalTensor[T], b: MetalTensor[T]) =
+  ## MetalTensor in-place addition
+
+  when compileOption("boundChecks"):
+    check_elementwise(a, b)
+
+  metalElementwise[T]("add",
+    a.storage.Fbuffer,
+    b.storage.Fbuffer,
+    a.storage.Fbuffer,
+    a.storage.Flen
+  )
+
+proc `+`*[T: SomeFloat](a, b: MetalTensor[T]): MetalTensor[T] {.noinit.} =
+  ## MetalTensor addition
+
+  when compileOption("boundChecks"):
+    check_elementwise(a, b)
+
+  result = newMetalTensor[T](a.shape)
+  metalElementwise[T]("add",
+    a.storage.Fbuffer,
+    b.storage.Fbuffer,
+    result.storage.Fbuffer,
+    a.storage.Flen
+  )
+
+proc `-=`*[T: SomeFloat](a: var MetalTensor[T], b: MetalTensor[T]) =
+  ## MetalTensor in-place subtraction
+
+  when compileOption("boundChecks"):
+    check_elementwise(a, b)
+
+  metalElementwise[T]("sub",
+    a.storage.Fbuffer,
+    b.storage.Fbuffer,
+    a.storage.Fbuffer,
+    a.storage.Flen
+  )
+
+proc `-`*[T: SomeFloat](a, b: MetalTensor[T]): MetalTensor[T] {.noinit.} =
+  ## MetalTensor subtraction
+
+  when compileOption("boundChecks"):
+    check_elementwise(a, b)
+
+  result = newMetalTensor[T](a.shape)
+  metalElementwise[T]("sub",
+    a.storage.Fbuffer,
+    b.storage.Fbuffer,
+    result.storage.Fbuffer,
+    a.storage.Flen
+  )
+
+proc `*=`*[T: SomeFloat](t: var MetalTensor[T]; a: T) {.inline.} =
+  ## MetalTensor in-place multiplication by a scalar
+
+  metalScalarMul[T](
+    t.storage.Fbuffer,
+    t.storage.Fbuffer,
+    a,
+    t.storage.Flen
+  )
+
+proc `*`*[T: SomeFloat](a: T, t: MetalTensor[T]): MetalTensor[T] {.noinit, inline.} =
+  ## MetalTensor multiplication by a scalar
+
+  result = newMetalTensor[T](t.shape)
+  metalScalarMul[T](
+    t.storage.Fbuffer,
+    result.storage.Fbuffer,
+    a,
+    t.storage.Flen
+  )
diff --git a/src/arraymancer/tensor/operators_blas_l2l3_metal.nim b/src/arraymancer/tensor/operators_blas_l2l3_metal.nim
new file mode 100644
index 000000000..92c837c39
--- /dev/null
+++ b/src/arraymancer/tensor/operators_blas_l2l3_metal.nim
@@ -0,0 +1,86 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import ./backend/metal/metal_backend, + ./private/p_init_metal, + ./private/p_checks_metal, + ./data_structure, + ./shapeshifting_metal + +proc metalMV_y_eq_aAx_p_by[T: SomeFloat]( + alpha: T, a, x: MetalTensor[T], + beta: T, y: var MetalTensor[T]) = + # Matrix-Vector: y = alpha A matvecmul x + beta y + + # TODO: remove this contiguous layout constraint + if not a.isContiguous: + raise newException(ValueError, "NotImplemented: for now matrix should be contiguous") + + let M = a.shape[0] + let N = a.shape[1] + + # Use GPU kernel for matrix-vector multiplication + metalGemv[T]( + false, # trans + M, N, + alpha, + a.storage.Fbuffer, M, # lda = rows of A + x.storage.Fbuffer, 1, # incx = 1 + beta, + y.storage.Fbuffer, 1 # incy = 1 + ) + +proc metalMM_C_eq_aAB_p_bC[T: SomeFloat]( + alpha: T, a, b: MetalTensor[T], + beta: T, c: var MetalTensor[T]) = + # Matrix: C = alpha A matmul B + beta C + # All tensors are in column-major order (Fortran style) + + # TODO: remove this contiguous layout constraint + if not (a.isContiguous and b.isContiguous): + raise newException(ValueError, "NotImplemented: for now both tensors should be contiguous") + + let M = a.shape[0] # rows of A and C + let N = b.shape[1] # cols of B and C + let K = a.shape[1] # cols of A, rows of B + + # For column-major matrices: + # lda = number of rows in A (M) + # ldb = number of rows in B (K) + # ldc = number of rows in C (M) + metalGemm[T]( + false, false, # transA, transB + M, N, K, + alpha, + a.storage.Fbuffer, M, # lda = rows of A + b.storage.Fbuffer, K, # ldb = rows of B + beta, + c.storage.Fbuffer, M # ldc = rows of C + ) + +proc `*`*[T: SomeFloat](a, b: MetalTensor[T]): MetalTensor[T] = + ## Matrix multiplication (Matrix-Matrix and Matrix-Vector) on Metal + + if a.rank == 2 and b.rank == 2: + when compileOption("boundChecks"): + check_matmat(a, b) + result = newMetalTensor[T]([a.shape[0], b.shape[1]]) + metalMM_C_eq_aAB_p_bC(1.T, a, b, 0.T, result) + elif a.rank == 2 and b.rank == 1: + when 
compileOption("boundChecks"): + check_matvec(a, b) + result = newMetalTensor[T]([a.shape[0]]) + metalMV_y_eq_aAx_p_by(1.T, a, b, 0.T, result) + else: + raise newException(ValueError, "Matrix-Matrix or Matrix-Vector multiplication valid only if first Tensor is a Matrix and second is a Matrix or Vector") diff --git a/src/arraymancer/tensor/operators_broadcasted_metal.nim b/src/arraymancer/tensor/operators_broadcasted_metal.nim new file mode 100644 index 000000000..bae7781d8 --- /dev/null +++ b/src/arraymancer/tensor/operators_broadcasted_metal.nim @@ -0,0 +1,194 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ./backend/metal/metal_backend, + ./private/p_init_metal, + ./data_structure, + ./operators_blas_l1_metal, + ./shapeshifting_metal + +import ../private/deprecate + +# ######################################################### +# # Broadcasting Tensor-Tensor +# # And element-wise multiplication (Hadamard) and division + +proc `+.`*[T: SomeFloat](a, b: MetalTensor[T]): MetalTensor[T] {.noinit,inline.} = + ## Broadcasted addition for tensors of incompatible but broadcastable shape. + let (tmp_a, tmp_b) = broadcast2(a, b) + result = tmp_a + tmp_b + +proc `-.`*[T: SomeFloat](a, b: MetalTensor[T]): MetalTensor[T] {.noinit,inline.} = + ## Broadcasted subtraction for tensors of incompatible but broadcastable shape. 
+ let (tmp_a, tmp_b) = broadcast2(a, b) + result = tmp_a - tmp_b + +proc `*.`*[T: SomeFloat](a,b: MetalTensor[T]): MetalTensor[T] {.noinit.} = + ## Element-wise multiplication (Hadamard product). + ## + ## And broadcasted element-wise multiplication. + let (tmp_a, tmp_b) = broadcast2(a, b) + + result = newMetalTensor[T](tmp_a.shape) + metalElementwise[T]("mul", + tmp_a.storage.Fbuffer, + tmp_b.storage.Fbuffer, + result.storage.Fbuffer, + result.storage.Flen + ) + +proc `/.`*[T: SomeFloat](a, b: MetalTensor[T]): MetalTensor[T] {.noinit.} = + ## Element-wise division. + ## + ## And broadcasted element-wise division. + let (tmp_a, tmp_b) = broadcast2(a, b) + + result = newMetalTensor[T](tmp_a.shape) + metalElementwise[T]("div", + tmp_a.storage.Fbuffer, + tmp_b.storage.Fbuffer, + result.storage.Fbuffer, + result.storage.Flen + ) + +# ############################################## +# # Broadcasting in-place Tensor-Tensor + +proc `+.=`*[T: SomeFloat](a: var MetalTensor[T], b: MetalTensor[T]) = + ## Tensor broadcasted in-place addition. + ## + ## Only the right hand side tensor can be broadcasted. + let tmp_b = b.broadcast(a.shape) + a += tmp_b + +proc `-.=`*[T: SomeFloat](a: var MetalTensor[T], b: MetalTensor[T]) = + ## Tensor broadcasted in-place subtraction. + ## + ## Only the right hand side tensor can be broadcasted. + let tmp_b = b.broadcast(a.shape) + a -= tmp_b + +proc `*.=`*[T: SomeFloat](a: var MetalTensor[T], b: MetalTensor[T]) = + ## Tensor broadcasted in-place multiplication (Hadamard product) + ## + ## Only the right hand side tensor can be broadcasted. + let tmp_b = b.broadcast(a.shape) + metalElementwise[T]("mul", + a.storage.Fbuffer, + tmp_b.storage.Fbuffer, + a.storage.Fbuffer, + a.storage.Flen + ) + +proc `/.=`*[T: SomeFloat](a: var MetalTensor[T], b: MetalTensor[T]) = + ## Tensor broadcasted in-place float division. + ## + ## Only the right hand side tensor can be broadcasted. 
+ let tmp_b = b.broadcast(a.shape) + metalElementwise[T]("div", + a.storage.Fbuffer, + tmp_b.storage.Fbuffer, + a.storage.Fbuffer, + a.storage.Flen + ) + +# ############################################## +# # Broadcasting Tensor-Scalar and Scalar-Tensor + +proc `+.`*[T: SomeFloat](t: MetalTensor[T], val: T): MetalTensor[T] {.noinit.} = + ## Broadcasted addition for tensor + scalar. + ## Uses GPU scalar_add kernel + result = newMetalTensor[T](t.shape) + metalScalarAdd[T]( + t.storage.Fbuffer, + result.storage.Fbuffer, + val, + t.storage.Flen + ) + +proc `-.`*[T: SomeFloat](t: MetalTensor[T], val: T): MetalTensor[T] {.noinit.} = + ## Broadcasted subtraction for tensor - scalar. + ## Implemented as tensor + (-scalar) + result = newMetalTensor[T](t.shape) + metalScalarAdd[T]( + t.storage.Fbuffer, + result.storage.Fbuffer, + -val, + t.storage.Flen + ) + +proc `+.`*[T: SomeFloat](val: T, t: MetalTensor[T]): MetalTensor[T] {.noinit.} = + ## Broadcasted addition for scalar + tensor. + result = t +. val + +proc `-.`*[T: SomeFloat](val: T, t: MetalTensor[T]): MetalTensor[T] {.noinit.} = + ## Broadcasted subtraction for scalar - tensor. + ## Implemented as -(tensor - scalar) + result = newMetalTensor[T](t.shape) + metalScalarAdd[T]( + t.storage.Fbuffer, + result.storage.Fbuffer, + -val, + t.storage.Flen + ) + # Negate the result + metalScalarMul[T]( + result.storage.Fbuffer, + result.storage.Fbuffer, + T(-1), + t.storage.Flen + ) + +proc `/.`*[T: SomeFloat](val: T, t: MetalTensor[T]): MetalTensor[T] {.noinit.} = + ## Broadcasted division of a float by a tensor of floats. + ## For now, fall back to CPU + let cpuT = t.toCpu() + let cpuResult = val /. cpuT + result = cpuResult.metal() + +# ############################################## +# # Broadcasting in-place Tensor-Scalar + +proc `+.=`*[T: SomeFloat](t: var MetalTensor[T], val: T) = + ## Broadcasted in-place addition for tensor += scalar. 
+ ## Uses GPU scalar_add kernel in-place + metalScalarAdd[T]( + t.storage.Fbuffer, + t.storage.Fbuffer, + val, + t.storage.Flen + ) + +proc `-.=`*[T: SomeFloat](t: var MetalTensor[T], val: T) = + ## Broadcasted in-place subtraction for tensor -= scalar. + ## Implemented as tensor += (-scalar) + metalScalarAdd[T]( + t.storage.Fbuffer, + t.storage.Fbuffer, + -val, + t.storage.Flen + ) + +# ############################################## +# Deprecated syntax + +implDeprecatedBy(`.+`, `+.`, exported = true) +implDeprecatedBy(`.-`, `-.`, exported = true) +implDeprecatedBy(`.*`, `*.`, exported = true) +implDeprecatedBy(`./`, `/.`, exported = true) + +implDeprecatedBy(`.=+`, `+.=`, exported = true) +implDeprecatedBy(`.=-`, `-.=`, exported = true) +implDeprecatedBy(`.=*`, `*.=`, exported = true) +implDeprecatedBy(`.=/`, `/.=`, exported = true) diff --git a/src/arraymancer/tensor/private/p_checks_metal.nim b/src/arraymancer/tensor/private/p_checks_metal.nim new file mode 100644 index 000000000..ad425586c --- /dev/null +++ b/src/arraymancer/tensor/private/p_checks_metal.nim @@ -0,0 +1,45 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +when defined(metal): + import ../data_structure + + proc check_elementwise*[T,U](a: MetalTensor[T], b: MetalTensor[U]) {.inline.} = + ## Check if element-wise operations can be applied to 2 MetalTensors + if unlikely(a.shape != b.shape): + raise newException(ValueError, "Both Tensors should have the same shape.\n Left-hand side has shape " & + $a.shape & " while right-hand side has shape " & $b.shape) + + proc check_dot_prod*[T,U](a: MetalTensor[T], b: MetalTensor[U]) {.inline.} = + ## Check if dot product can be applied to 2 MetalTensors + if unlikely(a.rank != 1 or b.rank != 1): + raise newException(ValueError, "Dot product is only supported for vectors (1D tensors)") + if unlikely(a.shape[0] != b.shape[0]): + raise newException(ValueError, "Both vectors should have the same length") + + proc check_matmat*[T,U](a: MetalTensor[T], b: MetalTensor[U]) {.inline.} = + ## Check if matrix-matrix multiplication can be applied + if unlikely(a.rank != 2 or b.rank != 2): + raise newException(ValueError, "Matrix multiplication requires 2D tensors") + if unlikely(a.shape[1] != b.shape[0]): + raise newException(ValueError, "Incompatible shapes for matrix multiplication: " & + $a.shape & " and " & $b.shape) + + proc check_matvec*[T,U](a: MetalTensor[T], b: MetalTensor[U]) {.inline.} = + ## Check if matrix-vector multiplication can be applied + if unlikely(a.rank != 2 or b.rank != 1): + raise newException(ValueError, "Matrix-vector multiplication requires a 2D matrix and 1D vector") + if unlikely(a.shape[1] != b.shape[0]): + raise newException(ValueError, "Incompatible shapes for matrix-vector multiplication: " & + $a.shape & " and " & $b.shape) diff --git a/src/arraymancer/tensor/private/p_init_metal.nim b/src/arraymancer/tensor/private/p_init_metal.nim new file mode 100644 index 000000000..b90971929 --- /dev/null +++ b/src/arraymancer/tensor/private/p_init_metal.nim @@ -0,0 +1,59 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, 
Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import ../backend/metal/metal_backend, + ../data_structure + +proc newMetalStorage*[T: SomeFloat](length: int): MetalStorage[T] = + result.Flen = length + result.Fbuffer = createMetalBuffer(length * sizeof(T)) + new result.Fref_tracking + result.Fref_tracking.value = result.Fbuffer + +template tensorMetal*[T: SomeFloat]( + shape: typed, + layout: OrderType = colMajor, + result: var MetalTensor[T]) = + + result.shape.copyFrom(shape) + shape_to_strides(result.shape, layout, result.strides) + result.offset = 0 + result.storage = newMetalStorage[T](result.size) + result.cpuData = @[] + +proc newMetalTensor*[T: SomeFloat]( + shape: varargs[int], + layout: OrderType = colMajor): MetalTensor[T] {.noinit.} = + ## Internal proc + ## Allocate a MetalTensor + ## WARNING: The Metal memory is not initialized to 0 + + tensorMetal(shape, layout, result) + +proc newMetalTensor*[T: SomeFloat]( + shape: Metadata, + layout: OrderType = colMajor): MetalTensor[T] {.noinit.} = + + tensorMetal(shape, layout, result) + +proc get_data_ptr*[T: SomeFloat](t: MetalTensor[T]): ptr T {.inline.} = + ## Get access to the data pointer of a MetalTensor + ## This is unsafe and should only be used internally + cast[ptr T](t.storage.Fbuffer.devicePtr) + +proc get_data_ptr*[T: SomeFloat](t: var MetalTensor[T]): ptr T {.inline.} = + ## Get access to the data pointer of a MetalTensor + ## This is unsafe and should only be used internally + cast[ptr T](t.storage.Fbuffer.devicePtr) diff --git 
a/src/arraymancer/tensor/shapeshifting_metal.nim b/src/arraymancer/tensor/shapeshifting_metal.nim
new file mode 100644
index 000000000..63a7fcdc5
--- /dev/null
+++ b/src/arraymancer/tensor/shapeshifting_metal.nim
@@ -0,0 +1,137 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ./data_structure,
+       ./private/p_init_metal,
+       ./private/p_shapeshifting,
+       ./init_metal,
+       ./backend/metal/metal_backend
+
+proc transpose*(t: MetalTensor): MetalTensor {.noSideEffect.}=
+  ## Transpose a MetalTensor.
+  ##
+  ## For N-d Tensor with shape (0, 1, 2 ... n-1) the resulting tensor will have shape (n-1, ... 2, 1, 0)
+  t.shape.reversed(result.shape)
+  t.strides.reversed(result.strides)
+  result.offset = t.offset
+  result.storage = t.storage
+  result.cpuData = t.cpuData
+
+proc isContiguous*(t: MetalTensor): bool {.inline.} =
+  ## Check if the MetalTensor is contiguous in memory
+  ## A tensor is contiguous if its strides match either row-major or col-major layout
+  if t.rank == 0:
+    return true
+
+  # Check for row-major (C-style) contiguous
+  var expectedStride = 1
+  var isRowMajor = true
+  for i in countdown(t.rank - 1, 0):
+    if t.strides[i] != expectedStride:
+      isRowMajor = false
+      break
+    expectedStride *= t.shape[i]
+
+  if isRowMajor:
+    return true
+
+  # Check for col-major (Fortran-style) contiguous
+  expectedStride = 1
+  var isColMajor = true
+  for i in 0 ..< t.rank:
+    if t.strides[i] != expectedStride:
+      isColMajor = false
+      break
+    expectedStride *= t.shape[i]
+
+  return isColMajor
+
+proc asContiguous*[T: SomeFloat](t: MetalTensor[T], layout: OrderType = rowMajor, force: bool = false): MetalTensor[T] {.noinit.} =
+  ## Return a contiguous copy of the MetalTensor using GPU kernel
+  if not force and t.isContiguous:
+    return t
+
+  # Create new contiguous tensor
+  result = newMetalTensor[T](t.shape)
+
+  # Convert shape and strides to seq for GPU kernel
+  var shapeSeq = newSeq[int](t.rank)
+  var stridesSeq = newSeq[int](t.rank)
+  for i in 0 ..< t.rank:
+    shapeSeq[i] = t.shape[i]
+    stridesSeq[i] = t.strides[i]
+
+  # Use GPU kernel to copy data with arbitrary strides to contiguous buffer
+  metalContiguousCopy[T](
+    t.storage.Fbuffer,
+    result.storage.Fbuffer,
+    shapeSeq,
+    stridesSeq,
+    t.storage.Flen
+  )
+
+proc reshape*(t: MetalTensor, new_shape: varargs[int]): MetalTensor =
+  ## Reshape a MetalTensor without copy.
+  ##
+  ## ⚠ Reshaping without copy is only possible on contiguous rowMajor Tensors
+
+  reshape_no_copy(t, new_shape, result, rowMajor)
+  result.storage = t.storage
+  result.cpuData = t.cpuData
+
+proc broadcast*(t: MetalTensor, shape: varargs[int]): MetalTensor {.noSideEffect.}=
+  ## Broadcast a MetalTensor to a new shape.
+  ##
+  ## Dimension(s) of size 1 can be expanded to arbitrary size by replicating
+  ## values along that dimension.
+  ##
+  ## Warning ⚠:
+  ##   This is a no-copy operation, data is shared with the input.
+  ##   This proc does not guarantee that a ``let`` value is immutable.
+  ##   A broadcasted tensor should not be modified and only used for computation.
+  result = t
+  result.broadcastImpl(shape)
+
+proc broadcast*(t: MetalTensor, shape: Metadata): MetalTensor {.noSideEffect.}=
+  ## Broadcast a MetalTensor to a new shape.
+  ##
+  ## Dimension(s) of size 1 can be expanded to arbitrary size by replicating
+  ## values along that dimension.
+  ##
+  ## Warning ⚠:
+  ##   This is a no-copy operation, data is shared with the input.
+  ##   This proc does not guarantee that a ``let`` value is immutable.
+  ##   A broadcasted tensor should not be modified and only used for computation.
+  result = t
+  result.broadcastImpl(shape)
+
+proc broadcast2*[T](a, b: MetalTensor[T]): tuple[a, b: MetalTensor[T]] {.noSideEffect.}=
+  ## Broadcast 2 tensors so they have compatible shapes for element-wise computations.
+  ##
+  ## Tensors in the tuple can be accessed with output.a and output.b
+  ##
+  ## The returned broadcasted Tensors share the underlying data with the input.
+  ##
+  ## Dimension(s) of size 1 can be expanded to arbitrary size by replicating
+  ## values along that dimension.
+  ##
+  ## Warning ⚠:
+  ##   This is a no-copy operation, data is shared with the input.
+  ##   This proc does not guarantee that a ``let`` value is immutable.
+  ##   A broadcasted tensor should not be modified and only used for computation.
+
+  broadcast2Impl(a,b, result)
+
+  result.a.storage = a.storage
+  result.b.storage = b.storage
diff --git a/src/arraymancer/tensor/tensor_metal.nim b/src/arraymancer/tensor/tensor_metal.nim
new file mode 100644
index 000000000..ccaf15c89
--- /dev/null
+++ b/src/arraymancer/tensor/tensor_metal.nim
@@ -0,0 +1,32 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import ./backend/metal/metal_backend,
+       ./backend/metal/metal_buffer,
+       ./init_metal,
+       ./init_copy_metal,
+       ./operators_blas_l1_metal,
+       ./operators_blas_l2l3_metal,
+       ./operators_broadcasted_metal,
+       ./shapeshifting_metal
+
+export metal_backend,
+       metal_buffer,
+       init_metal,
+       init_copy_metal,
+       operators_blas_l1_metal,
+       operators_blas_l2l3_metal,
+       operators_broadcasted_metal,
+       shapeshifting_metal
diff --git a/tests/tensor/test_accessors_slicer_metal.nim b/tests/tensor/test_accessors_slicer_metal.nim
new file mode 100644
index 000000000..045afe332
--- /dev/null
+++ b/tests/tensor/test_accessors_slicer_metal.nim
@@ -0,0 +1,69 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Please compile with -d:metal switch
+import ../../src/arraymancer
+import std / unittest
+
+suite "Metal Accessors and Slicing":
+  test "Basic indexing":
+    let cpu_tensor = [[1.0, 2.0, 3.0],
+                      [4.0, 5.0, 6.0],
+                      [7.0, 8.0, 9.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    # Shape should be preserved
+    check: metal_tensor.shape == [3, 3]
+
+  test "Slice with range":
+    let cpu_tensor = [[1.0, 2.0, 3.0, 4.0],
+                      [5.0, 6.0, 7.0, 8.0],
+                      [9.0, 10.0, 11.0, 12.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    let sliced = metal_tensor[0..1, 1..2]
+    let back_to_cpu = sliced.toCpu()
+
+    check: back_to_cpu == cpu_tensor[0..1, 1..2]
+
+  test "Slice with step":
+    let cpu_tensor = [[1.0, 2.0, 3.0, 4.0],
+                      [5.0, 6.0, 7.0, 8.0],
+                      [9.0, 10.0, 11.0, 12.0],
+                      [13.0, 14.0, 15.0, 16.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    let sliced = metal_tensor[0..3|2, 0..3|2]
+    let back_to_cpu = sliced.toCpu()
+
+    check: back_to_cpu == cpu_tensor[0..3|2, 0..3|2]
+
+  test "Slice with underscore":
+    let cpu_tensor = [[1.0, 2.0, 3.0],
+                      [4.0, 5.0, 6.0],
+                      [7.0, 8.0, 9.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    let row = metal_tensor[1, _]
+    let back_to_cpu = row.toCpu()
+
+    check: back_to_cpu == cpu_tensor[1, _]
+
+  test "Metadata preservation":
+    let cpu_tensor = [[1.0, 2.0, 3.0],
+                      [4.0, 5.0, 6.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    check: metal_tensor.rank == cpu_tensor.rank
+    check: metal_tensor.size == cpu_tensor.size
diff --git a/tests/tensor/test_broadcasting_metal.nim b/tests/tensor/test_broadcasting_metal.nim
new file mode 100644
index 000000000..5603811e7
--- /dev/null
+++ b/tests/tensor/test_broadcasting_metal.nim
@@ -0,0 +1,54 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Please compile with -d:metal switch
+import ../../src/arraymancer
+import std / [unittest, sugar, sequtils]
+
+suite "Metal: Shapeshifting - broadcasting and non linear algebra elementwise operations":
+  test "Tensor element-wise multiplication (Hadamard product) and division":
+    block:
+      let u = @[-4, 0, 9].toTensor().asType(float32).toMetal()
+      let v = @[2, 10, 3].toTensor().asType(float32).toMetal()
+      let expected_mul = @[-8, 0, 27].toTensor().asType(float32)
+      let expected_div = @[-2, 0, 3].toTensor().asType(float32)
+
+      check: (u *. v).toCpu() == expected_mul
+      check: (u /. v).toCpu() == expected_div
+
+    block:
+      let u = @[1.0, 8.0, -3.0].toTensor().asType(float32).toMetal()
+      let v = @[4.0, 2.0, 10.0].toTensor().asType(float32).toMetal()
+      let expected_mul = @[4.0, 16.0, -30.0].toTensor().asType(float32)
+      let expected_div = @[0.25, 4.0, -0.3].toTensor().asType(float32)
+
+      check: (u *. v).toCpu() == expected_mul
+      check: (u /. v).toCpu() == expected_div
+
+  test "Tensor element-wise in-place multiplication (Hadamard product) and division":
+    block:
+      var u = @[-4.0, 0.0, 9.0].toTensor().asType(float32).toMetal()
+      let v = @[2.0, 10.0, 3.0].toTensor().asType(float32).toMetal()
+      let expected_mul = @[-8.0, 0.0, 27.0].toTensor().asType(float32)
+
+      u *.= v
+      check: u.toCpu() == expected_mul
+
+    block:
+      var u = @[100.0, 10.0, 30.0].toTensor().asType(float32).toMetal()
+      let v = @[2.0, 5.0, 10.0].toTensor().asType(float32).toMetal()
+      let expected_div = @[50.0, 2.0, 3.0].toTensor().asType(float32)
+
+      u /.= v
+      check: u.toCpu() == expected_div
diff --git a/tests/tensor/test_init_metal.nim b/tests/tensor/test_init_metal.nim
new file mode 100644
index 000000000..d246122b5
--- /dev/null
+++ b/tests/tensor/test_init_metal.nim
@@ -0,0 +1,33 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ../../src/arraymancer
+import std / unittest
+
+
+suite "Metal init":
+  test "Clone function":
+    let a = [ 7, 4, 3, 1, 8, 6,
+              8, 1, 6, 2, 6, 6,
+              2, 0, 4, 3, 2, 0].toTensor.reshape([3,6]).asType(float).toMetal
+
+    # Tensor of shape 3x6 of type "int" on backend "Cpu"
+    # |7 4 3 1 8 6|
+    # |8 1 6 2 6 6|
+    # |2 0 4 3 2 0|
+
+    let b = a[2..0|-2, 2..0|-2].clone
+
+    check: b.toCpu == [ [4.0, 2.0],
+                        [3.0, 7.0]].toTensor
diff --git a/tests/tensor/test_operators_blas_metal.nim b/tests/tensor/test_operators_blas_metal.nim
new file mode 100644
index 000000000..246245838
--- /dev/null
+++ b/tests/tensor/test_operators_blas_metal.nim
@@ -0,0 +1,93 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Please compile with -d:metal switch
+import ../../src/arraymancer
+import std / [unittest, sugar]
+
+suite "Metal BLAS backend (Basic Linear Algebra Subprograms)":
+  test "GEMM - General Matrix to Matrix Multiplication":
+    let a = [[1.0, 2, 3],
+             [4.0, 5, 6]].toTensor().asType(float32).toMetal()
+
+    let b = [[7.0, 8],
+             [9.0, 10],
+             [11.0, 12]].toTensor().asType(float32).toMetal()
+
+    let expected = [[58.0, 64],
+                    [139.0, 154]].toTensor().asType(float32)
+
+    let result = (a * b).toCpu()
+    check: result == expected
+
+  test "Matrix-Vector multiplication":
+    let a = [[1.0, 2.0, 3.0],
+             [4.0, 5.0, 6.0]].toTensor().asType(float32).toMetal()
+    let x = [1.0, 2.0, 3.0].toTensor().asType(float32).toMetal()
+
+    let expected = [14.0, 32.0].toTensor().asType(float32)
+
+    let result = (a * x).toCpu()
+    check: result == expected
+
+  test "Element-wise addition":
+    let a = [[1.0, 2.0], [3.0, 4.0]].toTensor().asType(float32).toMetal()
+    let b = [[5.0, 6.0], [7.0, 8.0]].toTensor().asType(float32).toMetal()
+
+    let expected = [[6.0, 8.0], [10.0, 12.0]].toTensor().asType(float32)
+
+    let result = (a + b).toCpu()
+    check: result == expected
+
+  test "Element-wise subtraction":
+    let a = [[5.0, 6.0], [7.0, 8.0]].toTensor().asType(float32).toMetal()
+    let b = [[1.0, 2.0], [3.0, 4.0]].toTensor().asType(float32).toMetal()
+
+    let expected = [[4.0, 4.0], [4.0, 4.0]].toTensor().asType(float32)
+
+    let result = (a - b).toCpu()
+    check: result == expected
+
+  test "Scalar multiplication":
+    let a = [[1.0, 2.0], [3.0, 4.0]].toTensor().asType(float32).toMetal()
+
+    let expected = [[2.0, 4.0], [6.0, 8.0]].toTensor().asType(float32)
+
+    let result = (2.0'f32 * a).toCpu()
+    check: result == expected
+
+  test "Dot product":
+    let a = [1.0, 2.0, 3.0, 4.0].toTensor().asType(float32).toMetal()
+    let b = [5.0, 6.0, 7.0, 8.0].toTensor().asType(float32).toMetal()
+
+    let result = dot(a, b)
+    check: result == 70.0'f32
+
+  test "In-place addition":
+    var a = [[1.0, 2.0], [3.0, 4.0]].toTensor().asType(float32).toMetal()
+    let b = [[5.0, 6.0], [7.0, 8.0]].toTensor().asType(float32).toMetal()
+
+    let expected = [[6.0, 8.0], [10.0, 12.0]].toTensor().asType(float32)
+
+    a += b
+    check: a.toCpu() == expected
+
+  test "Large matrix multiplication":
+    # Test with matrices above the GEMM threshold
+    let a = randomTensor([200, 200], 1.0'f32).toMetal()
+    let b = randomTensor([200, 200], 1.0'f32).toMetal()
+
+    # Just verify it doesn't crash and returns correct shape
+    let result = a * b
+    check: result.shape == [200, 200]
diff --git a/tests/tensor/test_shapeshifting_metal.nim b/tests/tensor/test_shapeshifting_metal.nim
new file mode 100644
index 000000000..fe9fa39ba
--- /dev/null
+++ b/tests/tensor/test_shapeshifting_metal.nim
@@ -0,0 +1,67 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Please compile with -d:metal switch
+import ../../src/arraymancer
+import std / unittest
+
+suite "Metal Shapeshifting":
+  test "Transpose":
+    let cpu_tensor = [[1.0, 2.0, 3.0],
+                      [4.0, 5.0, 6.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    # Transpose should work by manipulating metadata
+    let transposed = metal_tensor.transpose()
+    let back_to_cpu = transposed.toCpu()
+
+    check: back_to_cpu == cpu_tensor.transpose()
+
+  test "Reshape":
+    let cpu_tensor = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    let reshaped = metal_tensor.reshape([2, 3])
+    let back_to_cpu = reshaped.toCpu()
+
+    check: back_to_cpu == cpu_tensor.reshape([2, 3])
+
+  test "Slice":
+    let cpu_tensor = [[1.0, 2.0, 3.0],
+                      [4.0, 5.0, 6.0],
+                      [7.0, 8.0, 9.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    let sliced = metal_tensor[1..2, 0..1]
+    let back_to_cpu = sliced.toCpu()
+
+    check: back_to_cpu == cpu_tensor[1..2, 0..1]
+
+  test "Contiguous check":
+    let cpu_tensor = [[1.0, 2.0, 3.0],
+                      [4.0, 5.0, 6.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    check: metal_tensor.isContiguous == cpu_tensor.isContiguous
+
+  test "As contiguous":
+    let cpu_tensor = [[1.0, 2.0, 3.0],
+                      [4.0, 5.0, 6.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+    let transposed = metal_tensor.transpose()
+
+    let contiguous = transposed.asContiguous(rowMajor)
+    let back_to_cpu = contiguous.toCpu()
+
+    check: back_to_cpu == cpu_tensor.transpose().asContiguous(rowMajor)
diff --git a/tests/tests_metal.nim b/tests/tests_metal.nim
new file mode 100644
index 000000000..234b8695b
--- /dev/null
+++ b/tests/tests_metal.nim
@@ -0,0 +1,23 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Please compile with -d:metal switch
+{.push warning[Spacing]: off.}
+import ../src/arraymancer,
+       ./tensor/test_init_metal,
+       ./tensor/test_operators_blas_metal,
+       ./tensor/test_accessors_slicer_metal,
+       ./tensor/test_shapeshifting_metal,
+       ./tensor/test_broadcasting_metal
+{.pop.}