From 928275061459a9a93336cc6b931d8e1be8374293 Mon Sep 17 00:00:00 2001 From: bung87 Date: Mon, 9 Feb 2026 00:32:23 +0800 Subject: [PATCH] apple metal support --- arraymancer.nimble | 5 + src/arraymancer/tensor.nim | 4 + .../tensor/backend/global_config.nim | 4 + src/arraymancer/tensor/backend/metal.nim | 29 + .../tensor/backend/metal/metal_backend.nim | 576 ++++++++++++++++++ .../tensor/backend/metal/metal_buffer.nim | 96 +++ .../tensor/backend/metal/metal_kernels.metal | 369 +++++++++++ src/arraymancer/tensor/data_structure.nim | 46 +- src/arraymancer/tensor/init_copy_metal.nim | 28 + src/arraymancer/tensor/init_metal.nim | 65 ++ .../tensor/operators_blas_l1_metal.nim | 103 ++++ .../tensor/operators_blas_l2l3_metal.nim | 86 +++ .../tensor/operators_broadcasted_metal.nim | 194 ++++++ .../tensor/private/p_checks_metal.nim | 45 ++ .../tensor/private/p_init_metal.nim | 59 ++ .../tensor/shapeshifting_metal.nim | 137 +++++ src/arraymancer/tensor/tensor_metal.nim | 32 + tests/tensor/test_accessors_slicer_metal.nim | 69 +++ tests/tensor/test_broadcasting_metal.nim | 54 ++ tests/tensor/test_init_metal.nim | 33 + tests/tensor/test_operators_blas_metal.nim | 93 +++ tests/tensor/test_shapeshifting_metal.nim | 67 ++ tests/tests_metal.nim | 23 + 23 files changed, 2216 insertions(+), 1 deletion(-) create mode 100644 src/arraymancer/tensor/backend/metal.nim create mode 100644 src/arraymancer/tensor/backend/metal/metal_backend.nim create mode 100644 src/arraymancer/tensor/backend/metal/metal_buffer.nim create mode 100644 src/arraymancer/tensor/backend/metal/metal_kernels.metal create mode 100644 src/arraymancer/tensor/init_copy_metal.nim create mode 100644 src/arraymancer/tensor/init_metal.nim create mode 100644 src/arraymancer/tensor/operators_blas_l1_metal.nim create mode 100644 src/arraymancer/tensor/operators_blas_l2l3_metal.nim create mode 100644 src/arraymancer/tensor/operators_broadcasted_metal.nim create mode 100644 src/arraymancer/tensor/private/p_checks_metal.nim create 
mode 100644 src/arraymancer/tensor/private/p_init_metal.nim create mode 100644 src/arraymancer/tensor/shapeshifting_metal.nim create mode 100644 src/arraymancer/tensor/tensor_metal.nim create mode 100644 tests/tensor/test_accessors_slicer_metal.nim create mode 100644 tests/tensor/test_broadcasting_metal.nim create mode 100644 tests/tensor/test_init_metal.nim create mode 100644 tests/tensor/test_operators_blas_metal.nim create mode 100644 tests/tensor/test_shapeshifting_metal.nim create mode 100644 tests/tests_metal.nim diff --git a/arraymancer.nimble b/arraymancer.nimble index 3622d298d..206fefac3 100644 --- a/arraymancer.nimble +++ b/arraymancer.nimble @@ -219,6 +219,11 @@ task test_opencl, "Run all OpenCL backend tests": switches.add " -d:blas=cblas" # Archlinux, comment out on Debian/Ubuntu test "tests_opencl", switches, split = false, "cpp" +task test_metal, "Run all Metal backend tests": + var switches = " -d:metal" + switches.add " -d:blas=cblas" # Archlinux, comment out on Debian/Ubuntu + test "tests_metal", switches, split = false + # task test_deprecated, "Run all tests on deprecated procs": # test "tests_cpu_deprecated" diff --git a/src/arraymancer/tensor.nim b/src/arraymancer/tensor.nim index 25aeb0233..2f9ed7a9f 100644 --- a/src/arraymancer/tensor.nim +++ b/src/arraymancer/tensor.nim @@ -81,3 +81,7 @@ when defined(cuda) or defined(nimdoc) or defined(nimsuggest): when defined(opencl) or defined(nimdoc) or defined(nimsuggest): import ./tensor/tensor_opencl export tensor_opencl + +when defined(metal) or defined(nimdoc) or defined(nimsuggest): + import ./tensor/tensor_metal + export tensor_metal diff --git a/src/arraymancer/tensor/backend/global_config.nim b/src/arraymancer/tensor/backend/global_config.nim index 35fa15bfa..df1303c9c 100644 --- a/src/arraymancer/tensor/backend/global_config.nim +++ b/src/arraymancer/tensor/backend/global_config.nim @@ -32,6 +32,10 @@ const CUDA_HOF_BPG*: cint = 256 # should be (grid-stride+threadsPerBlock-1) const 
OMP_FOR_THRESHOLD* = 1000 # Tensor number of elements threshold before using OpenMP multithreading const OMP_MAX_REDUCE_BLOCKS* = 8 # Max number of expected OpenMP threads (used in reduce) +# Metal-specific configuration +const METAL_THREADGROUP_SIZE* = 256 # Default threadgroup size for Metal kernels +const METAL_BUFFER_POOL_MAX_SIZE* = 1024 * 1024 * 1024 # 1GB max buffer pool size + # Full procesor optimization (AVX, AVX2, ARM neon, ... if applicable) when defined(native): {.passC: "-march=native".} diff --git a/src/arraymancer/tensor/backend/metal.nim b/src/arraymancer/tensor/backend/metal.nim new file mode 100644 index 000000000..ef8c24062 --- /dev/null +++ b/src/arraymancer/tensor/backend/metal.nim @@ -0,0 +1,29 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +## Metal backend for Arraymancer +## Provides GPU acceleration on macOS using Apple's Metal framework + +when defined(metal): + import ../data_structure, + ./metal/metal_buffer, + ./metal/metal_backend + + export metal_buffer, + metal_backend + + # Re-export types for convenience + export MetalBuffer, MetalBufferPool + export MTLDevice, MTLCommandQueue, MTLCommandBuffer, MTLBuffer + export MetalContext, metalContext, initMetalContext, isMetalAvailable diff --git a/src/arraymancer/tensor/backend/metal/metal_backend.nim b/src/arraymancer/tensor/backend/metal/metal_backend.nim new file mode 100644 index 000000000..e24fdbba2 --- /dev/null +++ b/src/arraymancer/tensor/backend/metal/metal_backend.nim @@ -0,0 +1,576 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import ./metal_buffer, std/tables
+
+{.passC: "-x objective-c".}
+{.passL: "-framework Metal".}
+{.passL: "-framework MetalPerformanceShaders".}
+{.passL: "-framework Foundation".}
+
+{.emit: """
+#import <Metal/Metal.h>
+#import <Foundation/Foundation.h>
+
+id<MTLDevice> arraymancer_getDefaultDevice() {
+    NSArray<id<MTLDevice>> *devices = MTLCopyAllDevices();
+    if (devices && [devices count] > 0) {
+        id<MTLDevice> device = [devices objectAtIndex:0];
+        [devices release];
+        return device;
+    }
+    return nil;
+}
+
+id<MTLCommandQueue> arraymancer_createCommandQueue(id<MTLDevice> device) {
+    return [device newCommandQueue];
+}
+
+id<MTLBuffer> arraymancer_createBuffer(id<MTLDevice> device, size_t length) {
+    return [device newBufferWithLength:length options:MTLResourceStorageModeShared];
+}
+
+void* arraymancer_getBufferContents(id<MTLBuffer> buffer) {
+    return [buffer contents];
+}
+
+void arraymancer_copyToBuffer(id<MTLBuffer> buffer, const void* data, size_t length) {
+    memcpy([buffer contents], data, length);
+}
+
+void arraymancer_copyFromBuffer(id<MTLBuffer> buffer, void* data, size_t length) {
+    memcpy(data, [buffer contents], length);
+}
+
+id<MTLLibrary> arraymancer_createLibraryFromSource(id<MTLDevice> device, const char* source) {
+    NSError* error = nil;
+    NSString* sourceString = [NSString stringWithUTF8String:source];
+    id<MTLLibrary> library = [device newLibraryWithSource:sourceString options:nil error:&error];
+    if (error) {
+        NSLog(@"Error creating library: %@", error);
+        return nil;
+    }
+    return library;
+}
+
+id<MTLFunction> arraymancer_getFunction(id<MTLLibrary> library, const char* name) {
+    NSString* funcName = [NSString stringWithUTF8String:name];
+    return [library newFunctionWithName:funcName];
+}
+
+id<MTLComputePipelineState> arraymancer_createComputePipeline(id<MTLDevice> device, id<MTLFunction> function) {
+    NSError* error = nil;
+    id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:function error:&error];
+    if (error) {
+        NSLog(@"Error creating pipeline: %@", error);
+        return nil;
+    }
+    return pipeline;
+}
+
+id<MTLCommandBuffer> arraymancer_createCommandBuffer(id<MTLCommandQueue> queue) {
+    return [queue commandBuffer];
+}
+
+id<MTLComputeCommandEncoder> arraymancer_createComputeEncoder(id<MTLCommandBuffer> buffer) {
+    return [buffer computeCommandEncoder];
+}
+
+void
arraymancer_setComputePipeline(id<MTLComputeCommandEncoder> encoder, id<MTLComputePipelineState> pipeline) {
+    [encoder setComputePipelineState:pipeline];
+}
+
+void arraymancer_setBuffer(id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> buffer, uint index) {
+    [encoder setBuffer:buffer offset:0 atIndex:index];
+}
+
+void arraymancer_setBytes(id<MTLComputeCommandEncoder> encoder, const void* bytes, size_t length, uint index) {
+    [encoder setBytes:bytes length:length atIndex:index];
+}
+
+void arraymancer_dispatchThreads1D(id<MTLComputeCommandEncoder> encoder, size_t size, size_t threadgroupSize) {
+    MTLSize gridSize = MTLSizeMake(size, 1, 1);
+    MTLSize tgSize = MTLSizeMake(threadgroupSize, 1, 1);
+    [encoder dispatchThreads:gridSize threadsPerThreadgroup:tgSize];
+}
+
+void arraymancer_dispatchThreads2D(id<MTLComputeCommandEncoder> encoder, size_t width, size_t height, size_t threadgroupWidth, size_t threadgroupHeight) {
+    MTLSize gridSize = MTLSizeMake(width, height, 1);
+    MTLSize tgSize = MTLSizeMake(threadgroupWidth, threadgroupHeight, 1);
+    [encoder dispatchThreads:gridSize threadsPerThreadgroup:tgSize];
+}
+
+void arraymancer_endEncoding(id<MTLComputeCommandEncoder> encoder) {
+    [encoder endEncoding];
+}
+
+void arraymancer_commitCommandBuffer(id<MTLCommandBuffer> buffer) {
+    [buffer commit];
+}
+
+void arraymancer_waitForCompletion(id<MTLCommandBuffer> buffer) {
+    [buffer waitUntilCompleted];
+}
+
+id<MTLBlitCommandEncoder> arraymancer_createBlitEncoder(id<MTLCommandBuffer> buffer) {
+    return [buffer blitCommandEncoder];
+}
+
+void arraymancer_copyBuffer(id<MTLBlitCommandEncoder> encoder, id<MTLBuffer> src, id<MTLBuffer> dst, size_t length) {
+    [encoder copyFromBuffer:src sourceOffset:0 toBuffer:dst destinationOffset:0 size:length];
+}
+
+void arraymancer_endBlitEncoding(id<MTLBlitCommandEncoder> encoder) {
+    [encoder endEncoding];
+}
+""".}
+
+proc getDefaultDevice(): MTLDevice {.importc: "arraymancer_getDefaultDevice", nodecl.}
+proc createCommandQueue(device: MTLDevice): MTLCommandQueue {.importc: "arraymancer_createCommandQueue", nodecl.}
+proc createBuffer(device: MTLDevice, length: csize_t): MTLBuffer {.importc: "arraymancer_createBuffer", nodecl.}
+proc getBufferContents(buffer: MTLBuffer): pointer {.importc: "arraymancer_getBufferContents", nodecl.}
+proc copyToBuffer(buffer:
MTLBuffer, data: pointer, length: csize_t) {.importc: "arraymancer_copyToBuffer", nodecl.} +proc copyFromBuffer(buffer: MTLBuffer, data: pointer, length: csize_t) {.importc: "arraymancer_copyFromBuffer", nodecl.} +proc createLibraryFromSource(device: MTLDevice, source: cstring): MTLLibrary {.importc: "arraymancer_createLibraryFromSource", nodecl.} +proc getFunction(library: MTLLibrary, name: cstring): MTLFunction {.importc: "arraymancer_getFunction", nodecl.} +proc createComputePipeline(device: MTLDevice, function: MTLFunction): MTLComputePipelineState {.importc: "arraymancer_createComputePipeline", nodecl.} +proc createCommandBuffer(queue: MTLCommandQueue): MTLCommandBuffer {.importc: "arraymancer_createCommandBuffer", nodecl.} +proc createComputeEncoder(buffer: MTLCommandBuffer): MTLComputeCommandEncoder {.importc: "arraymancer_createComputeEncoder", nodecl.} +proc setComputePipeline(encoder: MTLComputeCommandEncoder, pipeline: MTLComputePipelineState) {.importc: "arraymancer_setComputePipeline", nodecl.} +proc setBuffer(encoder: MTLComputeCommandEncoder, buffer: MTLBuffer, index: cuint) {.importc: "arraymancer_setBuffer", nodecl.} +proc setBytes(encoder: MTLComputeCommandEncoder, bytes: pointer, length: csize_t, index: cuint) {.importc: "arraymancer_setBytes", nodecl.} +proc dispatchThreads1D(encoder: MTLComputeCommandEncoder, size, threadgroupSize: csize_t) {.importc: "arraymancer_dispatchThreads1D", nodecl.} +proc dispatchThreads2D(encoder: MTLComputeCommandEncoder, width, height, threadgroupWidth, threadgroupHeight: csize_t) {.importc: "arraymancer_dispatchThreads2D", nodecl.} +proc endEncoding(encoder: MTLComputeCommandEncoder) {.importc: "arraymancer_endEncoding", nodecl.} +proc commitCommandBuffer(buffer: MTLCommandBuffer) {.importc: "arraymancer_commitCommandBuffer", nodecl.} +proc waitForCompletion(buffer: MTLCommandBuffer) {.importc: "arraymancer_waitForCompletion", nodecl.} +proc createBlitEncoder(buffer: MTLCommandBuffer): MTLBlitCommandEncoder 
{.importc: "arraymancer_createBlitEncoder", nodecl.} +proc copyBuffer(encoder: MTLBlitCommandEncoder, src, dst: MTLBuffer, length: csize_t) {.importc: "arraymancer_copyBuffer", nodecl.} +proc endBlitEncoding(encoder: MTLBlitCommandEncoder) {.importc: "arraymancer_endBlitEncoding", nodecl.} + +type + MetalContext* = object + device*: MTLDevice + commandQueue*: MTLCommandQueue + bufferPool*: MetalBufferPool + library*: MTLLibrary + kernelCache*: Table[string, MTLComputePipelineState] + initialized*: bool + +var metalContext*: MetalContext + +# Embedded Metal kernel source code +const metalKernelSource = slurp("./metal_kernels.metal") + +proc initMetalContext*() = + if metalContext.initialized: + return + metalContext.device = getDefaultDevice() + if metalContext.device == nil: + raise newException(IOError, "Failed to get default Metal device. " & + "This can happen if:\n" & + " - You're running in a sandboxed/headless environment\n" & + " - You don't have an Apple Silicon Mac (M1/M2/M3)\n" & + " - The Metal framework is not accessible") + metalContext.commandQueue = createCommandQueue(metalContext.device) + metalContext.bufferPool = initMetalBufferPool() + metalContext.library = createLibraryFromSource(metalContext.device, metalKernelSource.cstring) + if metalContext.library == nil: + raise newException(IOError, "Failed to create Metal library from kernel source") + metalContext.kernelCache = initTable[string, MTLComputePipelineState]() + metalContext.initialized = true + +proc tryInitMetalContext*(): bool = + ## Try to initialize Metal context, return true if successful + if metalContext.initialized: + return true + try: + initMetalContext() + return true + except: + return false + +proc isMetalAvailable*(): bool = + if not metalContext.initialized: + return tryInitMetalContext() + return metalContext.initialized and metalContext.device != nil + +proc getKernel*(kernelName: string): MTLComputePipelineState = + ## Get or create a compute pipeline for a kernel 
function + if not metalContext.initialized: + initMetalContext() + + if metalContext.kernelCache.hasKey(kernelName): + return metalContext.kernelCache[kernelName] + + let function = getFunction(metalContext.library, kernelName.cstring) + if function == nil: + raise newException(ValueError, "Kernel function not found: " & kernelName) + + let pipeline = createComputePipeline(metalContext.device, function) + if pipeline == nil: + raise newException(IOError, "Failed to create compute pipeline for: " & kernelName) + + metalContext.kernelCache[kernelName] = pipeline + return pipeline + +proc createMetalBuffer*(length: int): MetalBuffer = + if not metalContext.initialized: + initMetalContext() + result = MetalBuffer() + result.buffer = createBuffer(metalContext.device, csize_t(length)) + result.length = length + result.devicePtr = getBufferContents(result.buffer) + +proc uploadToBuffer*(mb: MetalBuffer, data: pointer, length: int) = + copyToBuffer(mb.buffer, data, csize_t(length)) + +proc downloadFromBuffer*(mb: MetalBuffer, data: pointer, length: int) = + copyFromBuffer(mb.buffer, data, csize_t(length)) + +proc copyBuffer*(src, dst: MetalBuffer, length: int) = + ## Copy data from one Metal buffer to another + if not metalContext.initialized: + initMetalContext() + + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let blitEncoder = createBlitEncoder(commandBuffer) + + copyBuffer(blitEncoder, src.buffer, dst.buffer, csize_t(length)) + endBlitEncoding(blitEncoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + +proc executeElementwiseKernel*( + kernelName: string, + buffers: seq[MetalBuffer], + constants: seq[pointer], + constantSizes: seq[int], + gridSize: int +) = + ## Execute a 1D elementwise kernel + if not metalContext.initialized: + initMetalContext() + + let pipeline = getKernel(kernelName) + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let encoder = createComputeEncoder(commandBuffer) + + 
setComputePipeline(encoder, pipeline) + + # Set buffers + for i, buf in buffers: + setBuffer(encoder, buf.buffer, cuint(i)) + + # Set constants + for i, constPtr in constants: + setBytes(encoder, constPtr, csize_t(constantSizes[i]), cuint(buffers.len + i)) + + # Dispatch threads + let threadgroupSize = 256 + dispatchThreads1D(encoder, csize_t(gridSize), csize_t(threadgroupSize)) + + endEncoding(encoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + +proc executeGemmKernel*( + kernelName: string, + A, B, C: MetalBuffer, + M, N, K: int, + alpha, beta: float32 +) = + ## Execute a GEMM kernel + if not metalContext.initialized: + initMetalContext() + + let pipeline = getKernel(kernelName) + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let encoder = createComputeEncoder(commandBuffer) + + setComputePipeline(encoder, pipeline) + + # Set buffers + setBuffer(encoder, A.buffer, 0) + setBuffer(encoder, B.buffer, 1) + setBuffer(encoder, C.buffer, 2) + + # Set constants + var m = M + var n = N + var k = K + var a = alpha + var b = beta + setBytes(encoder, addr m, csize_t(sizeof(int)), 3) + setBytes(encoder, addr n, csize_t(sizeof(int)), 4) + setBytes(encoder, addr k, csize_t(sizeof(int)), 5) + setBytes(encoder, addr a, csize_t(sizeof(float32)), 6) + setBytes(encoder, addr b, csize_t(sizeof(float32)), 7) + + # Dispatch threads + let threadgroupWidth = 8 + let threadgroupHeight = 8 + dispatchThreads2D(encoder, csize_t(N), csize_t(M), csize_t(threadgroupWidth), csize_t(threadgroupHeight)) + + endEncoding(encoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + +proc metalGemm*[T: SomeFloat]( + transA, transB: bool, + M, N, K: int, + alpha: T, + A: MetalBuffer, lda: int, + B: MetalBuffer, ldb: int, + beta: T, + C: MetalBuffer, ldc: int +) = + if not metalContext.initialized: + initMetalContext() + + # Metal only supports float32, not float64 + when T is float32: + var kernelName: string + if transA and 
not transB: + kernelName = "gemm_a_transpose_f32" + elif not transA and transB: + kernelName = "gemm_b_transpose_f32" + else: + kernelName = "gemm_naive_f32" + + executeGemmKernel(kernelName, A, B, C, M, N, K, alpha, beta) + else: + raise newException(ValueError, "Metal only supports float32, not float64") + +proc metalGemv*[T: SomeFloat]( + trans: bool, + M, N: int, + alpha: T, + A: MetalBuffer, lda: int, + x: MetalBuffer, incx: int, + beta: T, + y: MetalBuffer, incy: int +) = + ## Matrix-Vector multiplication: y = alpha * A * x + beta * y + ## Column-major layout (Fortran style) + if not metalContext.initialized: + initMetalContext() + + # Metal only supports float32, not float64 + when T is float32: + var m = M + var n = N + var a = float32(alpha) + var b = float32(beta) + + let pipeline = getKernel("gemv_f32") + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let encoder = createComputeEncoder(commandBuffer) + + setComputePipeline(encoder, pipeline) + setBuffer(encoder, A.buffer, 0) + setBuffer(encoder, x.buffer, 1) + setBuffer(encoder, y.buffer, 2) + setBytes(encoder, addr m, csize_t(sizeof(int)), 3) + setBytes(encoder, addr n, csize_t(sizeof(int)), 4) + setBytes(encoder, addr a, csize_t(sizeof(float32)), 5) + setBytes(encoder, addr b, csize_t(sizeof(float32)), 6) + + # Dispatch threads - one thread per output element (M rows) + let threadgroupSize = 256 + dispatchThreads1D(encoder, csize_t(M), csize_t(threadgroupSize)) + + endEncoding(encoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + else: + raise newException(ValueError, "Metal only supports float32, not float64") + +proc metalElementwise*[T: SomeFloat]( + op: string, + A, B, C: MetalBuffer, + size: int +) = + if not metalContext.initialized: + initMetalContext() + + # Metal only supports float32, not float64 + when T is float32: + var kernelName: string + case op + of "add": kernelName = "elementwise_add_f32" + of "sub": kernelName = 
"elementwise_sub_f32" + of "mul": kernelName = "elementwise_mul_f32" + of "div": kernelName = "elementwise_div_f32" + else: raise newException(ValueError, "Unknown elementwise operation: " & op) + + var sizeVar = size + executeElementwiseKernel(kernelName, @[A, B, C], @[cast[pointer](addr sizeVar)], @[sizeof(int)], size) + else: + raise newException(ValueError, "Metal only supports float32, not float64") + +proc metalScalarMul*[T: SomeFloat]( + A, C: MetalBuffer, + scalar: T, + size: int +) = + if not metalContext.initialized: + initMetalContext() + + var kernelName = "scalar_mul_f32" + var sizeVar = size + var scalarVar = float32(scalar) + + let pipeline = getKernel(kernelName) + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let encoder = createComputeEncoder(commandBuffer) + + setComputePipeline(encoder, pipeline) + setBuffer(encoder, A.buffer, 0) + setBuffer(encoder, C.buffer, 1) + setBytes(encoder, addr scalarVar, csize_t(sizeof(float32)), 2) + setBytes(encoder, addr sizeVar, csize_t(sizeof(int)), 3) + + let threadgroupSize = 256 + dispatchThreads1D(encoder, csize_t(size), csize_t(threadgroupSize)) + + endEncoding(encoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + +proc metalScalarAdd*[T: SomeFloat]( + A, C: MetalBuffer, + scalar: T, + size: int +) = + ## Element-wise addition of a scalar to a tensor + if not metalContext.initialized: + initMetalContext() + + var kernelName = "scalar_add_f32" + var sizeVar = size + var scalarVar = float32(scalar) + + let pipeline = getKernel(kernelName) + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let encoder = createComputeEncoder(commandBuffer) + + setComputePipeline(encoder, pipeline) + setBuffer(encoder, A.buffer, 0) + setBuffer(encoder, C.buffer, 1) + setBytes(encoder, addr scalarVar, csize_t(sizeof(float32)), 2) + setBytes(encoder, addr sizeVar, csize_t(sizeof(int)), 3) + + let threadgroupSize = 256 + dispatchThreads1D(encoder, csize_t(size), 
csize_t(threadgroupSize)) + + endEncoding(encoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + +proc metalContiguousCopy*[T: SomeFloat]( + src, dst: MetalBuffer, + shape, strides: seq[int], + totalSize: int +) = + ## Copy data from a strided tensor to a contiguous buffer + if not metalContext.initialized: + initMetalContext() + + when T is float32: + let rank = shape.len + + # Create buffers for shape and strides + let shapeBuffer = createMetalBuffer(rank * sizeof(int32)) + let stridesBuffer = createMetalBuffer(rank * sizeof(int32)) + + # Upload shape and strides + var shapeData = newSeq[int32](rank) + var stridesData = newSeq[int32](rank) + for i in 0 ..< rank: + shapeData[i] = int32(shape[i]) + stridesData[i] = int32(strides[i]) + + uploadToBuffer(shapeBuffer, addr shapeData[0], rank * sizeof(int32)) + uploadToBuffer(stridesBuffer, addr stridesData[0], rank * sizeof(int32)) + + var totalSizeVar = int32(totalSize) + var rankVar = int32(rank) + + let pipeline = getKernel("contiguous_copy_f32") + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let encoder = createComputeEncoder(commandBuffer) + + setComputePipeline(encoder, pipeline) + setBuffer(encoder, src.buffer, 0) + setBuffer(encoder, dst.buffer, 1) + setBuffer(encoder, shapeBuffer.buffer, 2) + setBuffer(encoder, stridesBuffer.buffer, 3) + setBytes(encoder, addr rankVar, csize_t(sizeof(int32)), 4) + setBytes(encoder, addr totalSizeVar, csize_t(sizeof(int32)), 5) + + let threadgroupSize = 256 + dispatchThreads1D(encoder, csize_t(totalSize), csize_t(threadgroupSize)) + + endEncoding(encoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + else: + raise newException(ValueError, "Metal only supports float32, not float64") + +proc metalDot*[T: SomeFloat](A, B: MetalBuffer, size: int): T = + ## Compute dot product of two vectors using GPU reduction + if not metalContext.initialized: + initMetalContext() + + when T is float32: + let 
kernelName = "vector_dot_f32" + var sizeVar = size + + # Create a buffer for partial results (one per threadgroup) + let threadgroupSize = 256 + let numGroups = (size + threadgroupSize - 1) div threadgroupSize + var partialResult = newSeq[float32](numGroups) + let partialBuffer = createMetalBuffer(numGroups * sizeof(float32)) + + let pipeline = getKernel(kernelName) + let commandBuffer = createCommandBuffer(metalContext.commandQueue) + let encoder = createComputeEncoder(commandBuffer) + + setComputePipeline(encoder, pipeline) + setBuffer(encoder, A.buffer, 0) + setBuffer(encoder, B.buffer, 1) + setBuffer(encoder, partialBuffer.buffer, 2) + setBytes(encoder, addr sizeVar, csize_t(sizeof(int)), 3) + + dispatchThreads1D(encoder, csize_t(size), csize_t(threadgroupSize)) + + endEncoding(encoder) + commitCommandBuffer(commandBuffer) + waitForCompletion(commandBuffer) + + # Download partial results and sum them on CPU + downloadFromBuffer(partialBuffer, addr partialResult[0], numGroups * sizeof(float32)) + + var sum: float32 = 0.0 + for val in partialResult: + sum += val + + return T(sum) + else: + raise newException(ValueError, "Metal only supports float32, not float64") + +proc preferCpuGemm*(m, n, k: int): bool = + const THRESHOLD = 128 * 128 * 128 + return m * n * k < THRESHOLD diff --git a/src/arraymancer/tensor/backend/metal/metal_buffer.nim b/src/arraymancer/tensor/backend/metal/metal_buffer.nim new file mode 100644 index 000000000..8f60381bb --- /dev/null +++ b/src/arraymancer/tensor/backend/metal/metal_buffer.nim @@ -0,0 +1,96 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +type + MTLDevice* = pointer + MTLCommandQueue* = pointer + MTLCommandBuffer* = pointer + MTLComputeCommandEncoder* = pointer + MTLBlitCommandEncoder* = pointer + MTLBuffer* = pointer + MTLComputePipelineState* = pointer + MTLFunction* = pointer + MTLLibrary* = pointer + + MetalBufferObj* = object + buffer*: MTLBuffer + length*: int + devicePtr*: pointer + + MetalBuffer* = ref MetalBufferObj + + BufferPoolEntry* = object + buffer*: MetalBuffer + size*: int + lastUsed*: int64 + + MetalBufferPool* = object + entries*: seq[BufferPoolEntry] + maxSize*: int + currentSize*: int + accessCounter*: int64 + +const + DEFAULT_BUFFER_POOL_MAX_SIZE* = 1024 * 1024 * 1024 + BUFFER_POOL_ENTRY_MAX_AGE* = 1000 + +proc initMetalBufferPool*(maxSize: int = DEFAULT_BUFFER_POOL_MAX_SIZE): MetalBufferPool = + result.maxSize = maxSize + result.currentSize = 0 + result.accessCounter = 0 + +proc acquireBuffer*(pool: var MetalBufferPool, size: int, device: MTLDevice): MetalBuffer = + pool.accessCounter.inc + + for i in 0 ..< pool.entries.len: + if pool.entries[i].size >= size and pool.entries[i].buffer != nil: + result = pool.entries[i].buffer + pool.entries[i].lastUsed = pool.accessCounter + return + + result = nil + +proc releaseBuffer*(pool: var MetalBufferPool, buffer: MetalBuffer) = + if buffer == nil: + return + + pool.accessCounter.inc + + if pool.currentSize + buffer.length > pool.maxSize: + var oldestIdx = -1 + var oldestAccess = int64.high + for i in 0 ..< pool.entries.len: + if pool.entries[i].lastUsed < oldestAccess: + oldestAccess = pool.entries[i].lastUsed + oldestIdx = 
i + + if oldestIdx >= 0: + pool.currentSize -= pool.entries[oldestIdx].size + pool.entries.del(oldestIdx) + + pool.entries.add(BufferPoolEntry( + buffer: buffer, + size: buffer.length, + lastUsed: pool.accessCounter + )) + pool.currentSize += buffer.length + +proc cleanupPool*(pool: var MetalBufferPool) = + var i = 0 + while i < pool.entries.len: + if pool.accessCounter - pool.entries[i].lastUsed > BUFFER_POOL_ENTRY_MAX_AGE: + pool.currentSize -= pool.entries[i].size + pool.entries.del(i) + else: + i.inc diff --git a/src/arraymancer/tensor/backend/metal/metal_kernels.metal b/src/arraymancer/tensor/backend/metal/metal_kernels.metal new file mode 100644 index 000000000..cb97c0e3f --- /dev/null +++ b/src/arraymancer/tensor/backend/metal/metal_kernels.metal @@ -0,0 +1,369 @@ +// Copyright 2017 the Arraymancer contributors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <metal_stdlib>
+using namespace metal;
+
+// ============================================================================
+// Element-wise Operations (float32 only - Metal doesn't support float64)
+// ============================================================================
+
+kernel void elementwise_add_f32(
+    device const float* A [[buffer(0)]],
+    device const float* B [[buffer(1)]],
+    device float* C [[buffer(2)]],
+    constant int& size [[buffer(3)]],
+    uint gid [[thread_position_in_grid]])
+{
+    if (gid < (uint)size) {
+        C[gid] = A[gid] + B[gid];
+    }
+}
+
+kernel void elementwise_sub_f32(
+    device const float* A [[buffer(0)]],
+    device const float* B [[buffer(1)]],
+    device float* C [[buffer(2)]],
+    constant int& size [[buffer(3)]],
+    uint gid [[thread_position_in_grid]])
+{
+    if (gid < (uint)size) {
+        C[gid] = A[gid] - B[gid];
+    }
+}
+
+kernel void elementwise_mul_f32(
+    device const float* A [[buffer(0)]],
+    device const float* B [[buffer(1)]],
+    device float* C [[buffer(2)]],
+    constant int& size [[buffer(3)]],
+    uint gid [[thread_position_in_grid]])
+{
+    if (gid < (uint)size) {
+        C[gid] = A[gid] * B[gid];
+    }
+}
+
+kernel void elementwise_div_f32(
+    device const float* A [[buffer(0)]],
+    device const float* B [[buffer(1)]],
+    device float* C [[buffer(2)]],
+    constant int& size [[buffer(3)]],
+    uint gid [[thread_position_in_grid]])
+{
+    if (gid < (uint)size) {
+        C[gid] = A[gid] / B[gid];
+    }
+}
+
+// ============================================================================
+// Scalar Operations
+// ============================================================================
+
+kernel void scalar_mul_f32(
+    device const float* A [[buffer(0)]],
+    device float* C [[buffer(1)]],
+    constant float& scalar [[buffer(2)]],
+    constant int& size [[buffer(3)]],
+    uint gid [[thread_position_in_grid]])
+{
+    if (gid < (uint)size) {
+        C[gid] = A[gid] * scalar;
+    }
+}
+
+kernel void scalar_add_f32(
+    device const float* A [[buffer(0)]],
+    device float* C [[buffer(1)]],
+
constant float& scalar [[buffer(2)]], + constant int& size [[buffer(3)]], + uint gid [[thread_position_in_grid]]) +{ + if (gid < (uint)size) { + C[gid] = A[gid] + scalar; + } +} + +// ============================================================================ +// Simple GEMM (naive implementation for reference) +// Column-major layout (Fortran style) +// ============================================================================ + +kernel void gemm_naive_f32( + device const float* A [[buffer(0)]], + device const float* B [[buffer(1)]], + device float* C [[buffer(2)]], + constant int& M [[buffer(3)]], + constant int& N [[buffer(4)]], + constant int& K [[buffer(5)]], + constant float& alpha [[buffer(6)]], + constant float& beta [[buffer(7)]], + uint2 gid [[thread_position_in_grid]]) +{ + uint row = gid.y; // 0 to M-1 + uint col = gid.x; // 0 to N-1 + + if (row < (uint)M && col < (uint)N) { + float sum = 0.0; + // Column-major indexing: + // A[i,k] is at A[k*M + i] + // B[k,j] is at B[j*K + k] + // C[i,j] is at C[j*M + i] + for (int k = 0; k < K; k++) { + sum += A[k * M + row] * B[col * K + k]; + } + C[col * M + row] = alpha * sum + beta * C[col * M + row]; + } +} + +// ============================================================================ +// Transpose GEMM variants +// Column-major layout (Fortran style) +// ============================================================================ + +kernel void gemm_a_transpose_f32( + device const float* A [[buffer(0)]], + device const float* B [[buffer(1)]], + device float* C [[buffer(2)]], + constant int& M [[buffer(3)]], + constant int& N [[buffer(4)]], + constant int& K [[buffer(5)]], + constant float& alpha [[buffer(6)]], + constant float& beta [[buffer(7)]], + uint2 gid [[thread_position_in_grid]]) +{ + uint row = gid.y; + uint col = gid.x; + + if (row < (uint)M && col < (uint)N) { + float sum = 0.0; + // A^T[i,k] = A[k,i], in column-major: A[k,i] is at A[i*K + k] + for (int k = 0; k < K; k++) { + sum += A[row * K 
+ k] * B[col * K + k]; + } + C[col * M + row] = alpha * sum + beta * C[col * M + row]; + } +} + +kernel void gemm_b_transpose_f32( + device const float* A [[buffer(0)]], + device const float* B [[buffer(1)]], + device float* C [[buffer(2)]], + constant int& M [[buffer(3)]], + constant int& N [[buffer(4)]], + constant int& K [[buffer(5)]], + constant float& alpha [[buffer(6)]], + constant float& beta [[buffer(7)]], + uint2 gid [[thread_position_in_grid]]) +{ + uint row = gid.y; + uint col = gid.x; + + if (row < (uint)M && col < (uint)N) { + float sum = 0.0; + // B^T[k,j] = B[j,k], in column-major: B[j,k] is at B[k*N + j] + for (int k = 0; k < K; k++) { + sum += A[k * M + row] * B[k * N + col]; + } + C[col * M + row] = alpha * sum + beta * C[col * M + row]; + } +} + +// ============================================================================ +// Batched GEMM (for future use with 3D+ tensors) +// ============================================================================ + +kernel void batched_gemm_f32( + device const float* A [[buffer(0)]], + device const float* B [[buffer(1)]], + device float* C [[buffer(2)]], + constant int& batchSize [[buffer(3)]], + constant int& M [[buffer(4)]], + constant int& N [[buffer(5)]], + constant int& K [[buffer(6)]], + constant float& alpha [[buffer(7)]], + constant float& beta [[buffer(8)]], + uint3 gid [[thread_position_in_grid]]) +{ + uint batch = gid.z; + uint row = gid.y; + uint col = gid.x; + + if (batch < (uint)batchSize && row < (uint)M && col < (uint)N) { + int batchOffsetA = batch * M * K; + int batchOffsetB = batch * K * N; + int batchOffsetC = batch * M * N; + + float sum = 0.0; + for (int k = 0; k < K; k++) { + sum += A[batchOffsetA + row * K + k] * B[batchOffsetB + k * N + col]; + } + C[batchOffsetC + row * N + col] = alpha * sum + beta * C[batchOffsetC + row * N + col]; + } +} + +// ============================================================================ +// Vector Operations (BLAS Level 1) +// 
============================================================================ + +kernel void vector_dot_f32( + device const float* A [[buffer(0)]], + device const float* B [[buffer(1)]], + device float* result [[buffer(2)]], + constant int& size [[buffer(3)]], + uint gid [[thread_position_in_grid]], + uint lid [[thread_position_in_threadgroup]], + uint groupSize [[threads_per_threadgroup]]) +{ + threadgroup float localSum[256]; + + float sum = 0.0; + for (uint i = gid; i < (uint)size; i += groupSize) { + sum += A[i] * B[i]; + } + + localSum[lid] = sum; + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint s = groupSize / 2; s > 0; s >>= 1) { + if (lid < s) { + localSum[lid] += localSum[lid + s]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + if (lid == 0) { + result[gid / groupSize] = localSum[0]; + } +} + +kernel void vector_scale_f32( + device float* X [[buffer(0)]], + constant float& alpha [[buffer(1)]], + constant int& size [[buffer(2)]], + uint gid [[thread_position_in_grid]]) +{ + if (gid < (uint)size) { + X[gid] *= alpha; + } +} + +kernel void vector_axpy_f32( + device const float* X [[buffer(0)]], + device float* Y [[buffer(1)]], + constant float& alpha [[buffer(2)]], + constant int& size [[buffer(3)]], + uint gid [[thread_position_in_grid]]) +{ + if (gid < (uint)size) { + Y[gid] += alpha * X[gid]; + } +} + +// ============================================================================ +// Activation Functions (for ML workloads) +// ============================================================================ + +kernel void relu_f32( + device const float* X [[buffer(0)]], + device float* Y [[buffer(1)]], + constant int& size [[buffer(2)]], + uint gid [[thread_position_in_grid]]) +{ + if (gid < (uint)size) { + Y[gid] = max(0.0f, X[gid]); + } +} + +kernel void sigmoid_f32( + device const float* X [[buffer(0)]], + device float* Y [[buffer(1)]], + constant int& size [[buffer(2)]], + uint gid [[thread_position_in_grid]]) +{ + if (gid < 
(uint)size) { + Y[gid] = 1.0f / (1.0f + exp(-X[gid])); + } +} + +kernel void tanh_f32( + device const float* X [[buffer(0)]], + device float* Y [[buffer(1)]], + constant int& size [[buffer(2)]], + uint gid [[thread_position_in_grid]]) +{ + if (gid < (uint)size) { + Y[gid] = tanh(X[gid]); + } +} + +// ============================================================================ +// Contiguous Copy Kernel +// Copies data from a strided tensor to a contiguous buffer +// ============================================================================ + +kernel void contiguous_copy_f32( + device const float* src [[buffer(0)]], + device float* dst [[buffer(1)]], + constant int* src_shape [[buffer(2)]], + constant int* src_strides [[buffer(3)]], + constant int& rank [[buffer(4)]], + constant int& total_size [[buffer(5)]], + uint gid [[thread_position_in_grid]]) +{ + if (gid >= (uint)total_size) return; + + // Convert flat index to multi-dimensional indices + int idx = (int)gid; + int src_offset = 0; + + for (int dim = rank - 1; dim >= 0; dim--) { + int coord = idx % src_shape[dim]; + src_offset += coord * src_strides[dim]; + idx /= src_shape[dim]; + } + + dst[gid] = src[src_offset]; +} + +// ============================================================================ +// Matrix-Vector Multiplication +// Column-major layout (Fortran style) +// y = alpha * A * x + beta * y +// ============================================================================ + +kernel void gemv_f32( + device const float* A [[buffer(0)]], + device const float* x [[buffer(1)]], + device float* y [[buffer(2)]], + constant int& M [[buffer(3)]], + constant int& N [[buffer(4)]], + constant float& alpha [[buffer(5)]], + constant float& beta [[buffer(6)]], + uint gid [[thread_position_in_grid]]) +{ + // Each thread computes one element of the result vector y + // gid is the row index (0 to M-1) + if (gid < (uint)M) { + float sum = 0.0; + // Column-major indexing: + // A[i,j] is at A[j*M + i] + // x[j] is at 
x[j] + // y[i] is at y[i] + for (int j = 0; j < N; j++) { + sum += A[j * M + gid] * x[j]; + } + y[gid] = alpha * sum + beta * y[gid]; + } +} diff --git a/src/arraymancer/tensor/data_structure.nim b/src/arraymancer/tensor/data_structure.nim index 7a282a920..c590640c3 100644 --- a/src/arraymancer/tensor/data_structure.nim +++ b/src/arraymancer/tensor/data_structure.nim @@ -90,12 +90,55 @@ when defined(opencl): offset*: int storage*: ClStorage[T] -when defined(cuda) and defined(opencl): +when defined(metal): + import ./backend/metal/metal_buffer + + type + MetalTensorRefTrackerObj*[T: SomeFloat] = object + value*: MetalBuffer + + MetalTensorRefTracker*[T] = ref MetalTensorRefTrackerObj[T] + + MetalStorage*[T: SomeFloat] = object + ## Opaque seq-like structure for storage on the Metal backend. + Flen*: int + Fbuffer*: MetalBuffer + Fref_tracking*: MetalTensorRefTracker[T] + + MetalTensor*[T: SomeFloat] = object + ## Tensor data structure stored on Apple Metal GPU + ## - ``shape``: Dimensions of the MetalTensor + ## - ``strides``: Numbers of items to skip to get the next item along a dimension. + ## - ``offset``: Offset to get the first item of the MetalTensor. Note: offset can be negative, in particular for slices. + ## - ``storage``: An opaque data storage for the MetalTensor + ## - ``cpuData``: Optional CPU copy for lazy transfer + ## + ## Warning ⚠: + ## Assignment ``var a = b`` does not copy the data. Data modification on one MetalTensor will be reflected on the other. + ## However modification on metadata (shape, strides or offset) will not affect the other tensor. 
+ ## Explicit copies can be made with ``clone``: ``var a = b.clone`` + shape*: Metadata + strides*: Metadata + offset*: int + storage*: MetalStorage[T] + cpuData*: seq[T] ## Optional CPU copy for lazy transfer + + + +when defined(cuda) and defined(opencl) and defined(metal): + type AnyTensor*[T] = Tensor[T] or CudaTensor[T] or ClTensor[T] or MetalTensor[T] +elif defined(cuda) and defined(opencl): type AnyTensor*[T] = Tensor[T] or CudaTensor[T] or ClTensor[T] +elif defined(cuda) and defined(metal): + type AnyTensor*[T] = Tensor[T] or CudaTensor[T] or MetalTensor[T] +elif defined(opencl) and defined(metal): + type AnyTensor*[T] = Tensor[T] or ClTensor[T] or MetalTensor[T] elif defined(cuda): type AnyTensor*[T] = Tensor[T] or CudaTensor[T] elif defined(opencl): type AnyTensor*[T] = Tensor[T] or ClTensor[T] +elif defined(metal): + type AnyTensor*[T] = Tensor[T] or MetalTensor[T] else: type AnyTensor*[T] = Tensor[T] @@ -224,3 +267,4 @@ proc dataArray*[T: KnownSupportsCopyMem](t: Tensor[T]): ptr UncheckedArray[T] {. proc dataArray*[T: not KnownSupportsCopyMem](t: Tensor[T]): ptr UncheckedArray[T] {.error: "`dataArray` " & " is deprecated for mem copyable types and not supported for GC'ed types!".} + diff --git a/src/arraymancer/tensor/init_copy_metal.nim b/src/arraymancer/tensor/init_copy_metal.nim new file mode 100644 index 000000000..a34783a6a --- /dev/null +++ b/src/arraymancer/tensor/init_copy_metal.nim @@ -0,0 +1,28 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import ./private/p_init_metal, + ./data_structure, + ./backend/metal/metal_backend, + ./init_metal, + ./init_copy_cpu + +proc clone*[T: SomeFloat](t: MetalTensor[T]): MetalTensor[T] {.noinit.} = + ## Clone (deep copy) a MetalTensor. + ## Copy will not share its data with the original. + ## Data is copied via CPU to ensure proper handling of strides and layout. + + let cpuTensor = t.toCpu() + let clonedCpu = init_copy_cpu.clone(cpuTensor) + result = clonedCpu.metal() diff --git a/src/arraymancer/tensor/init_metal.nim b/src/arraymancer/tensor/init_metal.nim new file mode 100644 index 000000000..b8470d05f --- /dev/null +++ b/src/arraymancer/tensor/init_metal.nim @@ -0,0 +1,65 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ../private/sequninit, + ./private/p_init_metal, + ./backend/metal/metal_backend, + ./data_structure, + ./init_cpu + +proc metal*[T: SomeFloat](t: Tensor[T]): MetalTensor[T] {.noinit.} = + ## Convert a tensor on CPU to a tensor on a Metal device. + ## This performs a synchronous copy to the GPU. 
+ + result = newMetalTensor[T](t.shape) + + let contig_t = t.asContiguous(colMajor, force = true) + let size = result.size * sizeof(T) + + uploadToBuffer(result.storage.Fbuffer, contig_t.get_data_ptr, size) + + # Store a CPU copy for lazy transfer (optional optimization) + # For now, we leave it empty to avoid unnecessary memory duplication + result.cpuData = @[] + +proc cpu*[T: SomeFloat](t: MetalTensor[T]): Tensor[T] {.noinit.} = + ## Convert a tensor on a Metal device to a tensor on CPU. + ## This performs a synchronous copy from the GPU. + + result.shape = t.shape + result.strides = t.strides + result.offset = t.offset + + allocCpuStorage result.storage, t.storage.Flen + + let size = t.storage.Flen * sizeof(T) + downloadFromBuffer(t.storage.Fbuffer, result.get_data_ptr, size) + +proc toMetal*[T: SomeFloat](t: Tensor[T]): MetalTensor[T] {.noinit.} = + ## Convert a tensor on CPU to a tensor on a Metal device (lazy transfer). + ## The data is copied immediately for now; future versions may defer transfer. + metal(t) + +proc toCpu*[T: SomeFloat](t: MetalTensor[T]): Tensor[T] {.noinit.} = + ## Convert a tensor on a Metal device to a tensor on CPU (lazy transfer). + ## The data is copied immediately for now; future versions may defer transfer. 
+ cpu(t) + +proc zeros_like*[T: SomeFloat](t: MetalTensor[T]): MetalTensor[T] {.noinit, inline.} = + ## Creates a new MetalTensor filled with 0 with the same shape as the input + result = zeros[T](t.shape).metal + +proc ones_like*[T: SomeFloat](t: MetalTensor[T]): MetalTensor[T] {.noinit, inline.} = + ## Creates a new MetalTensor filled with 1 with the same shape as the input + result = ones[T](t.shape).metal diff --git a/src/arraymancer/tensor/operators_blas_l1_metal.nim b/src/arraymancer/tensor/operators_blas_l1_metal.nim new file mode 100644 index 000000000..83dd5fad8 --- /dev/null +++ b/src/arraymancer/tensor/operators_blas_l1_metal.nim @@ -0,0 +1,103 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import ./backend/metal/metal_backend,
+  ./private/p_init_metal,
+  ./private/p_checks_metal,
+  ./data_structure
+
+# ####################################################################
+# BLAS Level 1 (Vector dot product, Addition, Scalar to Vector/Matrix)
+
+proc dot*[T: SomeFloat](a, b: MetalTensor[T]): T {.inline.} =
+  ## Vector to Vector dot (scalar) product using GPU reduction kernel
+  when compileOption("boundChecks"):
+    check_dot_prod(a, b)
+
+  result = metalDot[T](a.storage.Fbuffer, b.storage.Fbuffer, a.storage.Flen)
+
+proc `+=`*[T: SomeFloat](a: var MetalTensor[T], b: MetalTensor[T]) =
+  ## MetalTensor in-place addition
+
+  when compileOption("boundChecks"):
+    check_elementwise(a, b)
+
+  metalElementwise[T]("add",
+    a.storage.Fbuffer,
+    b.storage.Fbuffer,
+    a.storage.Fbuffer,
+    a.storage.Flen
+  )
+
+proc `+`*[T: SomeFloat](a, b: MetalTensor[T]): MetalTensor[T] {.noinit.} =
+  ## MetalTensor addition
+
+  when compileOption("boundChecks"):
+    check_elementwise(a, b)
+
+  result = newMetalTensor[T](a.shape)
+  metalElementwise[T]("add",
+    a.storage.Fbuffer,
+    b.storage.Fbuffer,
+    result.storage.Fbuffer,
+    a.storage.Flen
+  )
+
+proc `-=`*[T: SomeFloat](a: var MetalTensor[T], b: MetalTensor[T]) =
+  ## MetalTensor in-place subtraction
+
+  when compileOption("boundChecks"):
+    check_elementwise(a, b)
+
+  metalElementwise[T]("sub",
+    a.storage.Fbuffer,
+    b.storage.Fbuffer,
+    a.storage.Fbuffer,
+    a.storage.Flen
+  )
+
+proc `-`*[T: SomeFloat](a, b: MetalTensor[T]): MetalTensor[T] {.noinit.} =
+  ## MetalTensor subtraction
+
+  when compileOption("boundChecks"):
+    check_elementwise(a, b)
+
+  result = newMetalTensor[T](a.shape)
+  metalElementwise[T]("sub",
+    a.storage.Fbuffer,
+    b.storage.Fbuffer,
+    result.storage.Fbuffer,
+    a.storage.Flen
+  )
+
+proc `*=`*[T: SomeFloat](t: var MetalTensor[T]; a: T) {.inline.} =
+  ## MetalTensor in-place multiplication by a scalar
+
+  metalScalarMul[T](
+    t.storage.Fbuffer,
+    t.storage.Fbuffer,
+    a,
+    t.storage.Flen
+  )
+
+proc `*`*[T: SomeFloat](a: T, t: MetalTensor[T]): MetalTensor[T] {.noinit, inline.} =
+  ## MetalTensor multiplication by a scalar
+
+  result = newMetalTensor[T](t.shape)
+  metalScalarMul[T](
+    t.storage.Fbuffer,
+    result.storage.Fbuffer,
+    a,
+    t.storage.Flen
+  )
diff --git a/src/arraymancer/tensor/operators_blas_l2l3_metal.nim b/src/arraymancer/tensor/operators_blas_l2l3_metal.nim
new file mode 100644
index 000000000..92c837c39
--- /dev/null
+++ b/src/arraymancer/tensor/operators_blas_l2l3_metal.nim
@@ -0,0 +1,86 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import ./backend/metal/metal_backend, + ./private/p_init_metal, + ./private/p_checks_metal, + ./data_structure, + ./shapeshifting_metal + +proc metalMV_y_eq_aAx_p_by[T: SomeFloat]( + alpha: T, a, x: MetalTensor[T], + beta: T, y: var MetalTensor[T]) = + # Matrix-Vector: y = alpha A matvecmul x + beta y + + # TODO: remove this contiguous layout constraint + if not a.isContiguous: + raise newException(ValueError, "NotImplemented: for now matrix should be contiguous") + + let M = a.shape[0] + let N = a.shape[1] + + # Use GPU kernel for matrix-vector multiplication + metalGemv[T]( + false, # trans + M, N, + alpha, + a.storage.Fbuffer, M, # lda = rows of A + x.storage.Fbuffer, 1, # incx = 1 + beta, + y.storage.Fbuffer, 1 # incy = 1 + ) + +proc metalMM_C_eq_aAB_p_bC[T: SomeFloat]( + alpha: T, a, b: MetalTensor[T], + beta: T, c: var MetalTensor[T]) = + # Matrix: C = alpha A matmul B + beta C + # All tensors are in column-major order (Fortran style) + + # TODO: remove this contiguous layout constraint + if not (a.isContiguous and b.isContiguous): + raise newException(ValueError, "NotImplemented: for now both tensors should be contiguous") + + let M = a.shape[0] # rows of A and C + let N = b.shape[1] # cols of B and C + let K = a.shape[1] # cols of A, rows of B + + # For column-major matrices: + # lda = number of rows in A (M) + # ldb = number of rows in B (K) + # ldc = number of rows in C (M) + metalGemm[T]( + false, false, # transA, transB + M, N, K, + alpha, + a.storage.Fbuffer, M, # lda = rows of A + b.storage.Fbuffer, K, # ldb = rows of B + beta, + c.storage.Fbuffer, M # ldc = rows of C + ) + +proc `*`*[T: SomeFloat](a, b: MetalTensor[T]): MetalTensor[T] = + ## Matrix multiplication (Matrix-Matrix and Matrix-Vector) on Metal + + if a.rank == 2 and b.rank == 2: + when compileOption("boundChecks"): + check_matmat(a, b) + result = newMetalTensor[T]([a.shape[0], b.shape[1]]) + metalMM_C_eq_aAB_p_bC(1.T, a, b, 0.T, result) + elif a.rank == 2 and b.rank == 1: + when 
compileOption("boundChecks"): + check_matvec(a, b) + result = newMetalTensor[T]([a.shape[0]]) + metalMV_y_eq_aAx_p_by(1.T, a, b, 0.T, result) + else: + raise newException(ValueError, "Matrix-Matrix or Matrix-Vector multiplication valid only if first Tensor is a Matrix and second is a Matrix or Vector") diff --git a/src/arraymancer/tensor/operators_broadcasted_metal.nim b/src/arraymancer/tensor/operators_broadcasted_metal.nim new file mode 100644 index 000000000..bae7781d8 --- /dev/null +++ b/src/arraymancer/tensor/operators_broadcasted_metal.nim @@ -0,0 +1,194 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ./backend/metal/metal_backend, + ./private/p_init_metal, + ./data_structure, + ./operators_blas_l1_metal, + ./shapeshifting_metal + +import ../private/deprecate + +# ######################################################### +# # Broadcasting Tensor-Tensor +# # And element-wise multiplication (Hadamard) and division + +proc `+.`*[T: SomeFloat](a, b: MetalTensor[T]): MetalTensor[T] {.noinit,inline.} = + ## Broadcasted addition for tensors of incompatible but broadcastable shape. + let (tmp_a, tmp_b) = broadcast2(a, b) + result = tmp_a + tmp_b + +proc `-.`*[T: SomeFloat](a, b: MetalTensor[T]): MetalTensor[T] {.noinit,inline.} = + ## Broadcasted subtraction for tensors of incompatible but broadcastable shape. 
+ let (tmp_a, tmp_b) = broadcast2(a, b) + result = tmp_a - tmp_b + +proc `*.`*[T: SomeFloat](a,b: MetalTensor[T]): MetalTensor[T] {.noinit.} = + ## Element-wise multiplication (Hadamard product). + ## + ## And broadcasted element-wise multiplication. + let (tmp_a, tmp_b) = broadcast2(a, b) + + result = newMetalTensor[T](tmp_a.shape) + metalElementwise[T]("mul", + tmp_a.storage.Fbuffer, + tmp_b.storage.Fbuffer, + result.storage.Fbuffer, + result.storage.Flen + ) + +proc `/.`*[T: SomeFloat](a, b: MetalTensor[T]): MetalTensor[T] {.noinit.} = + ## Element-wise division. + ## + ## And broadcasted element-wise division. + let (tmp_a, tmp_b) = broadcast2(a, b) + + result = newMetalTensor[T](tmp_a.shape) + metalElementwise[T]("div", + tmp_a.storage.Fbuffer, + tmp_b.storage.Fbuffer, + result.storage.Fbuffer, + result.storage.Flen + ) + +# ############################################## +# # Broadcasting in-place Tensor-Tensor + +proc `+.=`*[T: SomeFloat](a: var MetalTensor[T], b: MetalTensor[T]) = + ## Tensor broadcasted in-place addition. + ## + ## Only the right hand side tensor can be broadcasted. + let tmp_b = b.broadcast(a.shape) + a += tmp_b + +proc `-.=`*[T: SomeFloat](a: var MetalTensor[T], b: MetalTensor[T]) = + ## Tensor broadcasted in-place subtraction. + ## + ## Only the right hand side tensor can be broadcasted. + let tmp_b = b.broadcast(a.shape) + a -= tmp_b + +proc `*.=`*[T: SomeFloat](a: var MetalTensor[T], b: MetalTensor[T]) = + ## Tensor broadcasted in-place multiplication (Hadamard product) + ## + ## Only the right hand side tensor can be broadcasted. + let tmp_b = b.broadcast(a.shape) + metalElementwise[T]("mul", + a.storage.Fbuffer, + tmp_b.storage.Fbuffer, + a.storage.Fbuffer, + a.storage.Flen + ) + +proc `/.=`*[T: SomeFloat](a: var MetalTensor[T], b: MetalTensor[T]) = + ## Tensor broadcasted in-place float division. + ## + ## Only the right hand side tensor can be broadcasted. 
+ let tmp_b = b.broadcast(a.shape) + metalElementwise[T]("div", + a.storage.Fbuffer, + tmp_b.storage.Fbuffer, + a.storage.Fbuffer, + a.storage.Flen + ) + +# ############################################## +# # Broadcasting Tensor-Scalar and Scalar-Tensor + +proc `+.`*[T: SomeFloat](t: MetalTensor[T], val: T): MetalTensor[T] {.noinit.} = + ## Broadcasted addition for tensor + scalar. + ## Uses GPU scalar_add kernel + result = newMetalTensor[T](t.shape) + metalScalarAdd[T]( + t.storage.Fbuffer, + result.storage.Fbuffer, + val, + t.storage.Flen + ) + +proc `-.`*[T: SomeFloat](t: MetalTensor[T], val: T): MetalTensor[T] {.noinit.} = + ## Broadcasted subtraction for tensor - scalar. + ## Implemented as tensor + (-scalar) + result = newMetalTensor[T](t.shape) + metalScalarAdd[T]( + t.storage.Fbuffer, + result.storage.Fbuffer, + -val, + t.storage.Flen + ) + +proc `+.`*[T: SomeFloat](val: T, t: MetalTensor[T]): MetalTensor[T] {.noinit.} = + ## Broadcasted addition for scalar + tensor. + result = t +. val + +proc `-.`*[T: SomeFloat](val: T, t: MetalTensor[T]): MetalTensor[T] {.noinit.} = + ## Broadcasted subtraction for scalar - tensor. + ## Implemented as -(tensor - scalar) + result = newMetalTensor[T](t.shape) + metalScalarAdd[T]( + t.storage.Fbuffer, + result.storage.Fbuffer, + -val, + t.storage.Flen + ) + # Negate the result + metalScalarMul[T]( + result.storage.Fbuffer, + result.storage.Fbuffer, + T(-1), + t.storage.Flen + ) + +proc `/.`*[T: SomeFloat](val: T, t: MetalTensor[T]): MetalTensor[T] {.noinit.} = + ## Broadcasted division of a float by a tensor of floats. + ## For now, fall back to CPU + let cpuT = t.toCpu() + let cpuResult = val /. cpuT + result = cpuResult.metal() + +# ############################################## +# # Broadcasting in-place Tensor-Scalar + +proc `+.=`*[T: SomeFloat](t: var MetalTensor[T], val: T) = + ## Broadcasted in-place addition for tensor += scalar. 
+ ## Uses GPU scalar_add kernel in-place + metalScalarAdd[T]( + t.storage.Fbuffer, + t.storage.Fbuffer, + val, + t.storage.Flen + ) + +proc `-.=`*[T: SomeFloat](t: var MetalTensor[T], val: T) = + ## Broadcasted in-place subtraction for tensor -= scalar. + ## Implemented as tensor += (-scalar) + metalScalarAdd[T]( + t.storage.Fbuffer, + t.storage.Fbuffer, + -val, + t.storage.Flen + ) + +# ############################################## +# Deprecated syntax + +implDeprecatedBy(`.+`, `+.`, exported = true) +implDeprecatedBy(`.-`, `-.`, exported = true) +implDeprecatedBy(`.*`, `*.`, exported = true) +implDeprecatedBy(`./`, `/.`, exported = true) + +implDeprecatedBy(`.=+`, `+.=`, exported = true) +implDeprecatedBy(`.=-`, `-.=`, exported = true) +implDeprecatedBy(`.=*`, `*.=`, exported = true) +implDeprecatedBy(`.=/`, `/.=`, exported = true) diff --git a/src/arraymancer/tensor/private/p_checks_metal.nim b/src/arraymancer/tensor/private/p_checks_metal.nim new file mode 100644 index 000000000..ad425586c --- /dev/null +++ b/src/arraymancer/tensor/private/p_checks_metal.nim @@ -0,0 +1,45 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +when defined(metal): + import ../data_structure + + proc check_elementwise*[T,U](a: MetalTensor[T], b: MetalTensor[U]) {.inline.} = + ## Check if element-wise operations can be applied to 2 MetalTensors + if unlikely(a.shape != b.shape): + raise newException(ValueError, "Both Tensors should have the same shape.\n Left-hand side has shape " & + $a.shape & " while right-hand side has shape " & $b.shape) + + proc check_dot_prod*[T,U](a: MetalTensor[T], b: MetalTensor[U]) {.inline.} = + ## Check if dot product can be applied to 2 MetalTensors + if unlikely(a.rank != 1 or b.rank != 1): + raise newException(ValueError, "Dot product is only supported for vectors (1D tensors)") + if unlikely(a.shape[0] != b.shape[0]): + raise newException(ValueError, "Both vectors should have the same length") + + proc check_matmat*[T,U](a: MetalTensor[T], b: MetalTensor[U]) {.inline.} = + ## Check if matrix-matrix multiplication can be applied + if unlikely(a.rank != 2 or b.rank != 2): + raise newException(ValueError, "Matrix multiplication requires 2D tensors") + if unlikely(a.shape[1] != b.shape[0]): + raise newException(ValueError, "Incompatible shapes for matrix multiplication: " & + $a.shape & " and " & $b.shape) + + proc check_matvec*[T,U](a: MetalTensor[T], b: MetalTensor[U]) {.inline.} = + ## Check if matrix-vector multiplication can be applied + if unlikely(a.rank != 2 or b.rank != 1): + raise newException(ValueError, "Matrix-vector multiplication requires a 2D matrix and 1D vector") + if unlikely(a.shape[1] != b.shape[0]): + raise newException(ValueError, "Incompatible shapes for matrix-vector multiplication: " & + $a.shape & " and " & $b.shape) diff --git a/src/arraymancer/tensor/private/p_init_metal.nim b/src/arraymancer/tensor/private/p_init_metal.nim new file mode 100644 index 000000000..b90971929 --- /dev/null +++ b/src/arraymancer/tensor/private/p_init_metal.nim @@ -0,0 +1,59 @@ +# Copyright 2017 the Arraymancer contributors +# +# Licensed under the Apache License, 
Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import ../backend/metal/metal_backend, + ../data_structure + +proc newMetalStorage*[T: SomeFloat](length: int): MetalStorage[T] = + result.Flen = length + result.Fbuffer = createMetalBuffer(length * sizeof(T)) + new result.Fref_tracking + result.Fref_tracking.value = result.Fbuffer + +template tensorMetal*[T: SomeFloat]( + shape: typed, + layout: OrderType = colMajor, + result: var MetalTensor[T]) = + + result.shape.copyFrom(shape) + shape_to_strides(result.shape, layout, result.strides) + result.offset = 0 + result.storage = newMetalStorage[T](result.size) + result.cpuData = @[] + +proc newMetalTensor*[T: SomeFloat]( + shape: varargs[int], + layout: OrderType = colMajor): MetalTensor[T] {.noinit.} = + ## Internal proc + ## Allocate a MetalTensor + ## WARNING: The Metal memory is not initialized to 0 + + tensorMetal(shape, layout, result) + +proc newMetalTensor*[T: SomeFloat]( + shape: Metadata, + layout: OrderType = colMajor): MetalTensor[T] {.noinit.} = + + tensorMetal(shape, layout, result) + +proc get_data_ptr*[T: SomeFloat](t: MetalTensor[T]): ptr T {.inline.} = + ## Get access to the data pointer of a MetalTensor + ## This is unsafe and should only be used internally + cast[ptr T](t.storage.Fbuffer.devicePtr) + +proc get_data_ptr*[T: SomeFloat](t: var MetalTensor[T]): ptr T {.inline.} = + ## Get access to the data pointer of a MetalTensor + ## This is unsafe and should only be used internally + cast[ptr T](t.storage.Fbuffer.devicePtr) diff --git 
a/src/arraymancer/tensor/shapeshifting_metal.nim b/src/arraymancer/tensor/shapeshifting_metal.nim
new file mode 100644
index 000000000..63a7fcdc5
--- /dev/null
+++ b/src/arraymancer/tensor/shapeshifting_metal.nim
@@ -0,0 +1,137 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ./data_structure,
+       ./private/p_init_metal,
+       ./private/p_shapeshifting,
+       ./init_metal,
+       ./backend/metal/metal_backend
+
+proc transpose*(t: MetalTensor): MetalTensor {.noSideEffect.}=
+  ## Transpose a MetalTensor.
+  ##
+  ## For N-d Tensor with shape (0, 1, 2 ... n-1) the resulting tensor will have shape (n-1, ... 2, 1, 0)
+  t.shape.reversed(result.shape)
+  t.strides.reversed(result.strides)
+  result.offset = t.offset
+  result.storage = t.storage
+  result.cpuData = t.cpuData
+
+proc isContiguous*(t: MetalTensor): bool {.inline.} =
+  ## Check if the MetalTensor is contiguous in memory
+  ## A tensor is contiguous if its strides match either row-major or col-major layout
+  if t.rank == 0:
+    return true
+
+  # Check for row-major (C-style) contiguous
+  var expectedStride = 1
+  var isRowMajor = true
+  for i in countdown(t.rank - 1, 0):
+    if t.strides[i] != expectedStride:
+      isRowMajor = false
+      break
+    expectedStride *= t.shape[i]
+
+  if isRowMajor:
+    return true
+
+  # Check for col-major (Fortran-style) contiguous
+  expectedStride = 1
+  var isColMajor = true
+  for i in 0 ..< t.rank:
+    if t.strides[i] != expectedStride:
+      isColMajor = false
+      break
+    expectedStride *= t.shape[i]
+
+  return isColMajor
+
+proc asContiguous*[T: SomeFloat](t: MetalTensor[T], layout: OrderType = rowMajor, force: bool = false): MetalTensor[T] {.noinit.} =
+  ## Return a contiguous copy of the MetalTensor using GPU kernel
+  if not force and t.isContiguous:
+    return t
+
+  # Create new contiguous tensor
+  result = newMetalTensor[T](t.shape)
+
+  # Convert shape and strides to seq for GPU kernel
+  var shapeSeq = newSeq[int](t.rank)
+  var stridesSeq = newSeq[int](t.rank)
+  for i in 0 ..< t.rank:
+    shapeSeq[i] = t.shape[i]
+    stridesSeq[i] = t.strides[i]
+
+  # Use GPU kernel to copy data with arbitrary strides to contiguous buffer
+  metalContiguousCopy[T](
+    t.storage.Fbuffer,
+    result.storage.Fbuffer,
+    shapeSeq,
+    stridesSeq,
+    t.storage.Flen
+  )
+
+proc reshape*(t: MetalTensor, new_shape: varargs[int]): MetalTensor =
+  ## Reshape a MetalTensor without copy.
+  ##
+  ## ⚠ Reshaping without copy is only possible on contiguous rowMajor Tensors
+
+  reshape_no_copy(t, new_shape, result, rowMajor)
+  result.storage = t.storage
+  result.cpuData = t.cpuData
+
+proc broadcast*(t: MetalTensor, shape: varargs[int]): MetalTensor {.noSideEffect.}=
+  ## Broadcast a MetalTensor to a new shape.
+  ##
+  ## Dimension(s) of size 1 can be expanded to arbitrary size by replicating
+  ## values along that dimension.
+  ##
+  ## Warning ⚠:
+  ##   This is a no-copy operation, data is shared with the input.
+  ##   This proc does not guarantee that a ``let`` value is immutable.
+  ##   A broadcasted tensor should not be modified and only used for computation.
+  result = t
+  result.broadcastImpl(shape)
+
+proc broadcast*(t: MetalTensor, shape: Metadata): MetalTensor {.noSideEffect.}=
+  ## Broadcast a MetalTensor to a new shape.
+  ##
+  ## Dimension(s) of size 1 can be expanded to arbitrary size by replicating
+  ## values along that dimension.
+  ##
+  ## Warning ⚠:
+  ##   This is a no-copy operation, data is shared with the input.
+  ##   This proc does not guarantee that a ``let`` value is immutable.
+  ##   A broadcasted tensor should not be modified and only used for computation.
+  result = t
+  result.broadcastImpl(shape)
+
+proc broadcast2*[T](a, b: MetalTensor[T]): tuple[a, b: MetalTensor[T]] {.noSideEffect.}=
+  ## Broadcast 2 tensors so they have compatible shapes for element-wise computations.
+  ##
+  ## Tensors in the tuple can be accessed with output.a and output.b
+  ##
+  ## The returned broadcasted Tensors share the underlying data with the input.
+  ##
+  ## Dimension(s) of size 1 can be expanded to arbitrary size by replicating
+  ## values along that dimension.
+  ##
+  ## Warning ⚠:
+  ##   This is a no-copy operation, data is shared with the input.
+  ##   This proc does not guarantee that a ``let`` value is immutable.
+  ##   A broadcasted tensor should not be modified and only used for computation.
+
+  broadcast2Impl(a,b, result)
+
+  result.a.storage = a.storage
+  result.b.storage = b.storage
diff --git a/src/arraymancer/tensor/tensor_metal.nim b/src/arraymancer/tensor/tensor_metal.nim
new file mode 100644
index 000000000..ccaf15c89
--- /dev/null
+++ b/src/arraymancer/tensor/tensor_metal.nim
@@ -0,0 +1,32 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import ./backend/metal/metal_backend,
+       ./backend/metal/metal_buffer,
+       ./init_metal,
+       ./init_copy_metal,
+       ./operators_blas_l1_metal,
+       ./operators_blas_l2l3_metal,
+       ./operators_broadcasted_metal,
+       ./shapeshifting_metal
+
+export metal_backend,
+       metal_buffer,
+       init_metal,
+       init_copy_metal,
+       operators_blas_l1_metal,
+       operators_blas_l2l3_metal,
+       operators_broadcasted_metal,
+       shapeshifting_metal
diff --git a/tests/tensor/test_accessors_slicer_metal.nim b/tests/tensor/test_accessors_slicer_metal.nim
new file mode 100644
index 000000000..045afe332
--- /dev/null
+++ b/tests/tensor/test_accessors_slicer_metal.nim
@@ -0,0 +1,69 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Please compile with -d:metal switch
+import ../../src/arraymancer
+import std / unittest
+
+suite "Metal Accessors and Slicing":
+  test "Basic indexing":
+    let cpu_tensor = [[1.0, 2.0, 3.0],
+                      [4.0, 5.0, 6.0],
+                      [7.0, 8.0, 9.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    # Shape should be preserved
+    check: metal_tensor.shape == [3, 3]
+
+  test "Slice with range":
+    let cpu_tensor = [[1.0, 2.0, 3.0, 4.0],
+                      [5.0, 6.0, 7.0, 8.0],
+                      [9.0, 10.0, 11.0, 12.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    let sliced = metal_tensor[0..1, 1..2]
+    let back_to_cpu = sliced.toCpu()
+
+    check: back_to_cpu == cpu_tensor[0..1, 1..2]
+
+  test "Slice with step":
+    let cpu_tensor = [[1.0, 2.0, 3.0, 4.0],
+                      [5.0, 6.0, 7.0, 8.0],
+                      [9.0, 10.0, 11.0, 12.0],
+                      [13.0, 14.0, 15.0, 16.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    let sliced = metal_tensor[0..3|2, 0..3|2]
+    let back_to_cpu = sliced.toCpu()
+
+    check: back_to_cpu == cpu_tensor[0..3|2, 0..3|2]
+
+  test "Slice with underscore":
+    let cpu_tensor = [[1.0, 2.0, 3.0],
+                      [4.0, 5.0, 6.0],
+                      [7.0, 8.0, 9.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    let row = metal_tensor[1, _]
+    let back_to_cpu = row.toCpu()
+
+    check: back_to_cpu == cpu_tensor[1, _]
+
+  test "Metadata preservation":
+    let cpu_tensor = [[1.0, 2.0, 3.0],
+                      [4.0, 5.0, 6.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    check: metal_tensor.rank == cpu_tensor.rank
+    check: metal_tensor.size == cpu_tensor.size
diff --git a/tests/tensor/test_broadcasting_metal.nim b/tests/tensor/test_broadcasting_metal.nim
new file mode 100644
index 000000000..5603811e7
--- /dev/null
+++ b/tests/tensor/test_broadcasting_metal.nim
@@ -0,0 +1,54 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Please compile with -d:metal switch
+import ../../src/arraymancer
+import std / [unittest, sugar, sequtils]
+
+suite "Metal: Shapeshifting - broadcasting and non linear algebra elementwise operations":
+  test "Tensor element-wise multiplication (Hadamard product) and division":
+    block:
+      let u = @[-4, 0, 9].toTensor().asType(float32).toMetal()
+      let v = @[2, 10, 3].toTensor().asType(float32).toMetal()
+      let expected_mul = @[-8, 0, 27].toTensor().asType(float32)
+      let expected_div = @[-2, 0, 3].toTensor().asType(float32)
+
+      check: (u *. v).toCpu() == expected_mul
+      check: (u /. v).toCpu() == expected_div
+
+    block:
+      let u = @[1.0, 8.0, -3.0].toTensor().asType(float32).toMetal()
+      let v = @[4.0, 2.0, 10.0].toTensor().asType(float32).toMetal()
+      let expected_mul = @[4.0, 16.0, -30.0].toTensor().asType(float32)
+      let expected_div = @[0.25, 4.0, -0.3].toTensor().asType(float32)
+
+      check: (u *. v).toCpu() == expected_mul
+      check: (u /. v).toCpu() == expected_div
+
+  test "Tensor element-wise in-place multiplication (Hadamard product) and division":
+    block:
+      var u = @[-4.0, 0.0, 9.0].toTensor().asType(float32).toMetal()
+      let v = @[2.0, 10.0, 3.0].toTensor().asType(float32).toMetal()
+      let expected_mul = @[-8.0, 0.0, 27.0].toTensor().asType(float32)
+
+      u *.= v
+      check: u.toCpu() == expected_mul
+
+    block:
+      var u = @[100.0, 10.0, 30.0].toTensor().asType(float32).toMetal()
+      let v = @[2.0, 5.0, 10.0].toTensor().asType(float32).toMetal()
+      let expected_div = @[50.0, 2.0, 3.0].toTensor().asType(float32)
+
+      u /.= v
+      check: u.toCpu() == expected_div
diff --git a/tests/tensor/test_init_metal.nim b/tests/tensor/test_init_metal.nim
new file mode 100644
index 000000000..d246122b5
--- /dev/null
+++ b/tests/tensor/test_init_metal.nim
@@ -0,0 +1,33 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ../../src/arraymancer
+import std / unittest
+
+
+suite "Metal init":
+  test "Clone function":
+    let a = [ 7, 4, 3, 1, 8, 6,
+              8, 1, 6, 2, 6, 6,
+              2, 0, 4, 3, 2, 0].toTensor.reshape([3,6]).asType(float).toMetal
+
+    # Tensor of shape 3x6 of type "int" on backend "Cpu"
+    # |7 4 3 1 8 6|
+    # |8 1 6 2 6 6|
+    # |2 0 4 3 2 0|
+
+    let b = a[2..0|-2, 2..0|-2].clone
+
+    check: b.toCpu == [ [4.0, 2.0],
+                        [3.0, 7.0]].toTensor
diff --git a/tests/tensor/test_operators_blas_metal.nim b/tests/tensor/test_operators_blas_metal.nim
new file mode 100644
index 000000000..246245838
--- /dev/null
+++ b/tests/tensor/test_operators_blas_metal.nim
@@ -0,0 +1,93 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Please compile with -d:metal switch
+import ../../src/arraymancer
+import std / [unittest, sugar]
+
+suite "Metal BLAS backend (Basic Linear Algebra Subprograms)":
+  test "GEMM - General Matrix to Matrix Multiplication":
+    let a = [[1.0, 2, 3],
+             [4.0, 5, 6]].toTensor().asType(float32).toMetal()
+
+    let b = [[7.0, 8],
+             [9.0, 10],
+             [11.0, 12]].toTensor().asType(float32).toMetal()
+
+    let expected = [[58.0, 64],
+                    [139.0, 154]].toTensor().asType(float32)
+
+    let result = (a * b).toCpu()
+    check: result == expected
+
+  test "Matrix-Vector multiplication":
+    let a = [[1.0, 2.0, 3.0],
+             [4.0, 5.0, 6.0]].toTensor().asType(float32).toMetal()
+    let x = [1.0, 2.0, 3.0].toTensor().asType(float32).toMetal()
+
+    let expected = [14.0, 32.0].toTensor().asType(float32)
+
+    let result = (a * x).toCpu()
+    check: result == expected
+
+  test "Element-wise addition":
+    let a = [[1.0, 2.0], [3.0, 4.0]].toTensor().asType(float32).toMetal()
+    let b = [[5.0, 6.0], [7.0, 8.0]].toTensor().asType(float32).toMetal()
+
+    let expected = [[6.0, 8.0], [10.0, 12.0]].toTensor().asType(float32)
+
+    let result = (a + b).toCpu()
+    check: result == expected
+
+  test "Element-wise subtraction":
+    let a = [[5.0, 6.0], [7.0, 8.0]].toTensor().asType(float32).toMetal()
+    let b = [[1.0, 2.0], [3.0, 4.0]].toTensor().asType(float32).toMetal()
+
+    let expected = [[4.0, 4.0], [4.0, 4.0]].toTensor().asType(float32)
+
+    let result = (a - b).toCpu()
+    check: result == expected
+
+  test "Scalar multiplication":
+    let a = [[1.0, 2.0], [3.0, 4.0]].toTensor().asType(float32).toMetal()
+
+    let expected = [[2.0, 4.0], [6.0, 8.0]].toTensor().asType(float32)
+
+    let result = (2.0'f32 * a).toCpu()
+    check: result == expected
+
+  test "Dot product":
+    let a = [1.0, 2.0, 3.0, 4.0].toTensor().asType(float32).toMetal()
+    let b = [5.0, 6.0, 7.0, 8.0].toTensor().asType(float32).toMetal()
+
+    let result = dot(a, b)
+    check: result == 70.0'f32
+
+  test "In-place addition":
+    var a = [[1.0, 2.0], [3.0, 4.0]].toTensor().asType(float32).toMetal()
+    let b = [[5.0, 6.0], [7.0, 8.0]].toTensor().asType(float32).toMetal()
+
+    let expected = [[6.0, 8.0], [10.0, 12.0]].toTensor().asType(float32)
+
+    a += b
+    check: a.toCpu() == expected
+
+  test "Large matrix multiplication":
+    # Test with matrices above the GEMM threshold
+    let a = randomTensor([200, 200], 1.0'f32).toMetal()
+    let b = randomTensor([200, 200], 1.0'f32).toMetal()
+
+    # Just verify it doesn't crash and returns correct shape
+    let result = a * b
+    check: result.shape == [200, 200]
diff --git a/tests/tensor/test_shapeshifting_metal.nim b/tests/tensor/test_shapeshifting_metal.nim
new file mode 100644
index 000000000..fe9fa39ba
--- /dev/null
+++ b/tests/tensor/test_shapeshifting_metal.nim
@@ -0,0 +1,67 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Please compile with -d:metal switch
+import ../../src/arraymancer
+import std / unittest
+
+suite "Metal Shapeshifting":
+  test "Transpose":
+    let cpu_tensor = [[1.0, 2.0, 3.0],
+                      [4.0, 5.0, 6.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    # Transpose should work by manipulating metadata
+    let transposed = metal_tensor.transpose()
+    let back_to_cpu = transposed.toCpu()
+
+    check: back_to_cpu == cpu_tensor.transpose()
+
+  test "Reshape":
+    let cpu_tensor = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    let reshaped = metal_tensor.reshape([2, 3])
+    let back_to_cpu = reshaped.toCpu()
+
+    check: back_to_cpu == cpu_tensor.reshape([2, 3])
+
+  test "Slice":
+    let cpu_tensor = [[1.0, 2.0, 3.0],
+                      [4.0, 5.0, 6.0],
+                      [7.0, 8.0, 9.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    let sliced = metal_tensor[1..2, 0..1]
+    let back_to_cpu = sliced.toCpu()
+
+    check: back_to_cpu == cpu_tensor[1..2, 0..1]
+
+  test "Contiguous check":
+    let cpu_tensor = [[1.0, 2.0, 3.0],
+                      [4.0, 5.0, 6.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+
+    check: metal_tensor.isContiguous == cpu_tensor.isContiguous
+
+  test "As contiguous":
+    let cpu_tensor = [[1.0, 2.0, 3.0],
+                      [4.0, 5.0, 6.0]].toTensor
+    let metal_tensor = cpu_tensor.toMetal()
+    let transposed = metal_tensor.transpose()
+
+    let contiguous = transposed.asContiguous(rowMajor)
+    let back_to_cpu = contiguous.toCpu()
+
+    check: back_to_cpu == cpu_tensor.transpose().asContiguous(rowMajor)
diff --git a/tests/tests_metal.nim b/tests/tests_metal.nim
new file mode 100644
index 000000000..234b8695b
--- /dev/null
+++ b/tests/tests_metal.nim
@@ -0,0 +1,23 @@
+# Copyright 2017 the Arraymancer contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Please compile with -d:metal switch
+{.push warning[Spacing]: off.}
+import ../src/arraymancer,
+       ./tensor/test_init_metal,
+       ./tensor/test_operators_blas_metal,
+       ./tensor/test_accessors_slicer_metal,
+       ./tensor/test_shapeshifting_metal,
+       ./tensor/test_broadcasting_metal
+{.pop.}