apple · kevchengcodes · Jun 11, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/models/yolo/README.md b/models/yolo/README.md
@@ -44,22 +44,37 @@ uv run export.py --help
 ### In your iOS and macOS applications
 
 ```swift
-import ObjectDetector
-
-// Detection parameters
-let params = DetectionParameters()
+import CoreAIObjectDetector
 
 // Load directly from an exported .aimodel directory.
 let detector = try await ObjectDetector(resourcesAt: "coreai-models/exports/yolos-base_float32_static.aimodel")
 
-// Run inference
-let detections = try await detector.detect(image: cgImage, parameters: params)
+// Single image, default parameters.
+let detections = try await detector.detect(image: cgImage)
+
+// Batched detection (a static-batch export must match images.count). For dynamic-shape exports,
+// optionally set the spatial dims on DetectionParameters; static dims ignore these values.
+var params = DetectionParameters()
+params.inputHeight = 800
+params.inputWidth = 1024
+let batchDetections = try await detector.detect(images: [imageA, imageB], parameters: params)
+
+// Optional: warm up the kernel for the exact (B, H, W) you'll run with.
+try await detector.warmup(imageCount: 2, parameters: params)
 ```
 
 ### On your Mac using built-in Command Line Tool
 
 ```bash
+# Single image, static-shape model.
 swift run -c release object-detector --model path/to/exported_model.aimodel --image path/to/image.jpg
+
+# Batched detection on a dynamic-shape export, with optional, explicit input dims and warmup.
+swift run -c release object-detector \
+  --model path/to/dynamic.aimodel \
+  --image a.jpg --image b.jpg \
+  --input-height 800 --input-width 1024 \
+  --warmup
 ```
 
 [^1]: [Paper](https://arxiv.org/abs/2106.00666) · [HuggingFace](https://huggingface.co/hustvl/yolos-tiny)
diff --git a/swift/Sources/CoreAIObjectDetector/DetectionOutputs.swift b/swift/Sources/CoreAIObjectDetector/DetectionOutputs.swift
@@ -73,18 +73,32 @@ public struct DetectionParameters: Sendable {
     /// When empty, labels default to "class_N".
     public var classLabels: [Int: String]
 
+    /// Model input height. Only consulted when the model's height dimension
+    /// is dynamic; ignored when the height is static. Defaults to 800
+    /// (matches the YOLOS export's reference input and the training-time
+    /// canvas geometry).
+    public var inputHeight: Int
+
+    /// Model input width. Only consulted when the model's width dimension
+    /// is dynamic; ignored when the width is static. Defaults to 800.
+    public var inputWidth: Int
+
     public init(
         threshold: Float = 0.3,
         maxDetections: Int = 100,
         normalizationMeans: (CGFloat, CGFloat, CGFloat) = (0.485, 0.456, 0.406),
         normalizationStds: (CGFloat, CGFloat, CGFloat) = (0.229, 0.224, 0.225),
-        classLabels: [Int: String] = ObjectDetectionLabels.coco
+        classLabels: [Int: String] = ObjectDetectionLabels.coco,
+        inputHeight: Int = 800,
+        inputWidth: Int = 800
     ) {
         self.threshold = threshold
         self.maxDetections = maxDetections
         self.normalizationMeans = normalizationMeans
         self.normalizationStds = normalizationStds
         self.classLabels = classLabels
+        self.inputHeight = inputHeight
+        self.inputWidth = inputWidth
     }
 
     public static let `default` = DetectionParameters()

diff --git a/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift b/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift
@@ -80,75 +80,218 @@ public struct ObjectDetector {
 
     // MARK: - Inference
 
-    /// Warm up the backend (e.g. trigger Metal kernel compilation) with a dummy pass.
-    public func warmup() async throws {
+    /// Warm up the backend (e.g. trigger Metal kernel compilation) with a dummy
+    /// pass at the same `(B, H, W)` that subsequent `detect()` calls will use.
+    /// Static spatial dims always come from the descriptor (`parameters` is
+    /// ignored for them); a static batch dim must equal `imageCount` or this throws.
+    public func warmup(imageCount: Int = 1, parameters: DetectionParameters = .default) async throws {
         guard case .ndArray(let imageDescriptor) = functionDescriptor.inputDescriptor(of: imageInputName) else {
             throw DetectionRuntimeError.invalidConfiguration(
                 "No array descriptor for image input '\(imageInputName)'"
             )
         }
-        let imageArray = NDArray(descriptor: imageDescriptor)
-        _ = try await function.run(inputs: [imageInputName: imageArray])
+        let expectedShape = imageDescriptor.shape
+        guard expectedShape.count == 4 else {
+            throw DetectionRuntimeError.invalidConfiguration(
+                "Expected 4-dimensional input shape, got \(expectedShape.count)"
+            )
+        }
+        let plan = try Self.planBatch(
+            expectedShape: expectedShape,
+            imageCount: imageCount,
+            parameters: parameters
+        )
+        let resolved = imageDescriptor.resolvingDynamicDimensions(
+            [plan.batch, 3, plan.height, plan.width])
+        _ = try await function.run(inputs: [imageInputName: NDArray(descriptor: resolved)])
     }
 
     /// Detect objects in `image` using `.default` parameters.
     public func detect(image: CGImage) async throws -> [DetectedObject] {
         try await detect(image: image, parameters: .default)
     }
 
-    /// Detect objects in `image`.
+    /// Detect objects in `image` — convenience wrapper over the batched API.
     public func detect(image: CGImage, parameters: DetectionParameters) async throws -> [DetectedObject] {
-        // Build image NDArray
+        let results = try await detect(images: [image], parameters: parameters)
+        return results.first ?? []
+    }
+
+    /// Detect objects in each of `images` using `.default` parameters.
+    public func detect(images: [CGImage]) async throws -> [[DetectedObject]] {
+        try await detect(images: images, parameters: .default)
+    }
+
+    /// Detect objects across `images` in a single batched forward pass.
+    ///
+    /// Pipeline:
+    /// 1. Resolve a batch plan `(B, H, W)` from the model descriptor and
+    ///    parameters. Batch is always `images.count`. Dynamic spatial dims
+    ///    are filled from `parameters.inputHeight` / `inputWidth` (which
+    ///    have struct-level defaults).
+    /// 2. Preprocess each image sequentially into a `[3, H, W]` Float buffer.
+    /// 3. Concatenate the per-image buffers into the `[B, 3, H, W]` input
+    ///    NDArray and run a single forward pass.
+    /// 4. Slice each batch slot from the outputs and decode independently,
+    ///    returning `images.count` detection lists in input order.
+    public func detect(images: [CGImage], parameters: DetectionParameters) async throws
+        -> [[DetectedObject]]
+    {
+        guard !images.isEmpty else {
+            throw DetectionRuntimeError.invalidConfiguration("detect requires at least one image")
+        }
         guard case .ndArray(let imageDescriptor) = functionDescriptor.inputDescriptor(of: imageInputName) else {
             throw DetectionRuntimeError.invalidConfiguration(
                 "No array descriptor for image input '\(imageInputName)'"
             )
         }
-
         let expectedShape = imageDescriptor.shape
         guard expectedShape.count == 4 else {
             throw DetectionRuntimeError.invalidConfiguration(
                 "Expected 4-dimensional input shape, got \(expectedShape.count)"
             )
         }
-        let height = expectedShape[2]
-        let width = expectedShape[3]
-        let floatPixels = try ImagePreprocessor(
-            targetSize: CGSize(width: width, height: height),
+
+        let plan = try Self.planBatch(
+            expectedShape: expectedShape,
+            imageCount: images.count,
+            parameters: parameters
+        )
+
+        // 1. Preprocess each input image (sequential).
+        let perImagePixels = try preprocessImages(images, plan: plan, parameters: parameters)
+
+        // 2. Build batched NDArray and run inference once.
+        let resolvedDescriptor = imageDescriptor.resolvingDynamicDimensions(
+            [plan.batch, 3, plan.height, plan.width])
+        let imageArray = try buildInputNDArray(descriptor: resolvedDescriptor, perImagePixels: perImagePixels)
+
+        var outputs = try await function.run(inputs: [imageInputName: imageArray])
+        guard let logitsArray = outputs.remove(logitsOutputName)?.ndArray,
+            let boxesArray = outputs.remove(boxesOutputName)?.ndArray
+        else {
+            throw DetectionRuntimeError.invalidConfiguration(
+                "Missing one or more outputs after run."
+            )
+        }
+
+        // 3. Decode each input image's batch slot.
+        return Self.decodePerImage(
+            logitsArray: logitsArray,
+            boxesArray: boxesArray,
+            images: images,
+            parameters: parameters
+        )
+    }
+
+    // MARK: - Preprocessing
+
+    /// Sequentially preprocess each image to a `[3 * H * W]` Float buffer at
+    /// the plan's target spatial dimensions.
+    private func preprocessImages(
+        _ images: [CGImage], plan: BatchPlan, parameters: DetectionParameters
+    ) throws -> [[Float]] {
+        let preprocessor = ImagePreprocessor(
+            targetSize: CGSize(width: plan.width, height: plan.height),
             mean: parameters.normalizationMeans,
             std: parameters.normalizationStds,
             rescaleFactor: 1.0
-        ).preprocessCHW(cgImage: image)
-
-        var imageArray = NDArray(descriptor: imageDescriptor)
+        )
+        return try images.map { try preprocessor.preprocessCHW(cgImage: $0) }
+    }
 
-        if imageDescriptor.scalarType == .float16 {
+    /// Build the input NDArray for a `[B, 3, H, W]` resolved descriptor by
+    /// concatenating per-image CHW buffers in batch order. Each per-image
+    /// entry is `3*H*W` floats; the buffers are written contiguously to match
+    /// row-major batch-leading layout.
+    private func buildInputNDArray(
+        descriptor: NDArrayDescriptor, perImagePixels: [[Float]]
+    ) throws -> NDArray {
+        var imageArray = NDArray(descriptor: descriptor)
+        let flat = Array(perImagePixels.joined())
+        if descriptor.scalarType == .float16 {
             #if !((os(macOS) || targetEnvironment(macCatalyst)) && arch(x86_64))
-            fillNDArray(&imageArray, as: Float16.self, with: floatPixels.map(Float16.init))
+            fillNDArray(&imageArray, as: Float16.self, with: flat.map(Float16.init))
             #else
             fatalError("Float16 is not supported on this platform")
             #endif
         } else {
-            fillNDArray(&imageArray, as: Float.self, with: floatPixels)
+            fillNDArray(&imageArray, as: Float.self, with: flat)
         }
+        return imageArray
+    }
 
-        // Run inference and extract outputs
-        var outputs = try await function.run(inputs: [imageInputName: imageArray])
-        guard let logitsArray = outputs.remove(logitsOutputName)?.ndArray,
-            let boxesArray = outputs.remove(boxesOutputName)?.ndArray
-        else {
+    // MARK: - Output decoding
+
+    private static func decodePerImage(
+        logitsArray: NDArray,
+        boxesArray: NDArray,
+        images: [CGImage],
+        parameters: DetectionParameters
+    ) -> [[DetectedObject]] {
+        let logitsShape = logitsArray.shape  // [B, Q, C]
+        let boxesShape = boxesArray.shape  // [B, Q, 4]
+        let logitsAll = flattenAsFloat(logitsArray)
+        let boxesAll = flattenAsFloat(boxesArray)
+        let perBatchLog = logitsShape.dropFirst().reduce(1, *)
+        let perBatchBox = boxesShape.dropFirst().reduce(1, *)
+        let singleBatchLogitsShape = [1] + logitsShape.dropFirst()
+
+        return images.enumerated().map { i, image in
+            let raw = DetectionOutput(
+                logits: Array(logitsAll[i * perBatchLog..<(i + 1) * perBatchLog]),
+                logitsShape: singleBatchLogitsShape,
+                predictedBoxes: Array(boxesAll[i * perBatchBox..<(i + 1) * perBatchBox])
+            )
+            return DetectionPostprocessor.decode(
+                output: raw,
+                inputSize: CGSize(width: image.width, height: image.height),
+                parameters: parameters
+            )
+        }
+    }
+
+    // MARK: - Batch planning
+
+    struct BatchPlan: Equatable {
+        let batch: Int
+        let height: Int
+        let width: Int
+    }
+
+    /// Resolve the concrete `(B, H, W)` to bind the model with, given the
+    /// model's expected shape (which may contain `-1` for dynamic dims), the
+    /// number of input images, and the user's parameter overrides.
+    ///
+    /// Resolution rules:
+    /// - **Batch**: always `imageCount`. A static-batch model must match.
+    /// - **Spatial dims**: a dynamic `-1` dim is filled from
+    ///   `parameters.inputHeight` / `inputWidth`. A static dim is taken
+    ///   from the model descriptor (the parameters' values are ignored for
+    ///   that axis).
+    static func planBatch(
+        expectedShape: [Int],
+        imageCount: Int,
+        parameters: DetectionParameters
+    ) throws -> BatchPlan {
+        guard imageCount >= 1 else {
+            throw DetectionRuntimeError.invalidConfiguration("planBatch requires imageCount >= 1")
+        }
+
+        // Verify image count matches a static batch dim.
+        let batchExpected = expectedShape[0]
+        if batchExpected >= 0 && batchExpected != imageCount {
             throw DetectionRuntimeError.invalidConfiguration(
-                "Missing one or more outputs after run."
+                "Model expects fixed batch=\(batchExpected) but caller supplied \(imageCount) image(s)"
             )
         }
 
-        let rawOutput = DetectionOutput(
-            logits: flattenAsFloat(logitsArray),
-            logitsShape: logitsArray.shape,
-            predictedBoxes: flattenAsFloat(boxesArray)
-        )
-        let inputSize = CGSize(width: image.width, height: image.height)
-        return DetectionPostprocessor.decode(output: rawOutput, inputSize: inputSize, parameters: parameters)
+        let heightExpected = expectedShape[2]
+        let widthExpected = expectedShape[3]
+        let height = heightExpected < 0 ? parameters.inputHeight : heightExpected
+        let width = widthExpected < 0 ? parameters.inputWidth : widthExpected
+
+        return BatchPlan(batch: imageCount, height: height, width: width)
     }
 
     // MARK: - Name Discovery