From 587537a00e111035c4bca53ffbeaeeb7f9f74c38 Mon Sep 17 00:00:00 2001
From: Kevin Cheng <59463423+kevchengcodes@users.noreply.github.com>
Date: Thu, 11 Jun 2026 17:09:10 -0400
Subject: [PATCH 1/3] Add dynamic batch size and image H/W to object detector;
 tested with YOLOS

---
 .../DetectionOutputs.swift                    |  16 +-
 .../CoreAIObjectDetector/ObjectDetector.swift | 188 +++++++++++++++---
 .../object-detector/ObjectDetectionMain.swift | 178 +++++++++++++----
 .../ObjectDetectorTests.swift                 |  73 +++++++
 4 files changed, 388 insertions(+), 67 deletions(-)

diff --git a/swift/Sources/CoreAIObjectDetector/DetectionOutputs.swift b/swift/Sources/CoreAIObjectDetector/DetectionOutputs.swift
index 25f3a31..7cdf8d7 100644
--- a/swift/Sources/CoreAIObjectDetector/DetectionOutputs.swift
+++ b/swift/Sources/CoreAIObjectDetector/DetectionOutputs.swift
@@ -73,18 +73,32 @@ public struct DetectionParameters: Sendable {
     /// When empty, labels default to "class_N".
     public var classLabels: [Int: String]
 
+    /// Model input height. Only consulted when the model declares a dynamic
+    /// spatial dimension; ignored for static-shape models. Defaults to 800
+    /// (matches the YOLOS export's reference input and the training-time
+    /// canvas geometry).
+    public var inputHeight: Int
+
+    /// Model input width. Only consulted when the model declares a dynamic
+    /// spatial dimension; ignored for static-shape models. Defaults to 800.
+    public var inputWidth: Int
+
     public init(
         threshold: Float = 0.3,
         maxDetections: Int = 100,
         normalizationMeans: (CGFloat, CGFloat, CGFloat) = (0.485, 0.456, 0.406),
         normalizationStds: (CGFloat, CGFloat, CGFloat) = (0.229, 0.224, 0.225),
-        classLabels: [Int: String] = ObjectDetectionLabels.coco
+        classLabels: [Int: String] = ObjectDetectionLabels.coco,
+        inputHeight: Int = 800,
+        inputWidth: Int = 800
     ) {
         self.threshold = threshold
         self.maxDetections = maxDetections
         self.normalizationMeans = normalizationMeans
         self.normalizationStds = normalizationStds
         self.classLabels = classLabels
+        self.inputHeight = inputHeight
+        self.inputWidth = inputWidth
     }
 
     public static let `default` = DetectionParameters()
diff --git a/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift b/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift
index b218336..045531b 100644
--- a/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift
+++ b/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift
@@ -87,8 +87,11 @@ public struct ObjectDetector {
                 "No array descriptor for image input '\(imageInputName)'"
             )
         }
-        let imageArray = NDArray(descriptor: imageDescriptor)
-        _ = try await function.run(inputs: [imageInputName: imageArray])
+        let defaults = DetectionParameters()
+        let warmupShape = zip(imageDescriptor.shape, [1, 3, defaults.inputHeight, defaults.inputWidth])
+            .map { actual, fallback in actual >= 0 ? actual : fallback }
+        let resolved = imageDescriptor.resolvingDynamicDimensions(warmupShape)
+        _ = try await function.run(inputs: [imageInputName: NDArray(descriptor: resolved)])
     }
 
     /// Detect objects in `image` using `.default` parameters.
@@ -96,59 +99,188 @@ public struct ObjectDetector {
         try await detect(image: image, parameters: .default)
     }
 
-    /// Detect objects in `image`.
+    /// Detect objects in `image` — convenience wrapper over the batched API.
     public func detect(image: CGImage, parameters: DetectionParameters) async throws -> [DetectedObject] {
-        // Build image NDArray
+        let results = try await detect(images: [image], parameters: parameters)
+        return results.first ?? []
+    }
+
+    /// Detect objects in each of `images` using `.default` parameters.
+    public func detect(images: [CGImage]) async throws -> [[DetectedObject]] {
+        try await detect(images: images, parameters: .default)
+    }
+
+    /// Detect objects across `images` in a single batched forward pass.
+    ///
+    /// Pipeline:
+    /// 1. Resolve a batch plan `(B, H, W)` from the model descriptor and
+    ///    parameters. Batch is always `images.count`. Dynamic spatial dims
+    ///    are filled from `parameters.inputHeight` / `inputWidth` (which
+    ///    have struct-level defaults).
+    /// 2. Preprocess each image sequentially into a `[3, H, W]` Float buffer.
+    /// 3. Concatenate the per-image buffers into the `[B, 3, H, W]` input
+    ///    NDArray and run a single forward pass.
+    /// 4. Slice each batch slot from the outputs and decode independently,
+    ///    returning `images.count` detection lists in input order.
+    public func detect(images: [CGImage], parameters: DetectionParameters) async throws
+        -> [[DetectedObject]]
+    {
+        guard !images.isEmpty else {
+            throw DetectionRuntimeError.invalidConfiguration("detect requires at least one image")
+        }
         guard case .ndArray(let imageDescriptor) = functionDescriptor.inputDescriptor(of: imageInputName) else {
             throw DetectionRuntimeError.invalidConfiguration(
                 "No array descriptor for image input '\(imageInputName)'"
             )
         }
-
         let expectedShape = imageDescriptor.shape
         guard expectedShape.count == 4 else {
             throw DetectionRuntimeError.invalidConfiguration(
                 "Expected 4-dimensional input shape, got \(expectedShape.count)"
             )
         }
-        let height = expectedShape[2]
-        let width = expectedShape[3]
-        let floatPixels = try ImagePreprocessor(
-            targetSize: CGSize(width: width, height: height),
+
+        let plan = try Self.planBatch(
+            expectedShape: expectedShape,
+            imageSizes: images.map { CGSize(width: $0.width, height: $0.height) },
+            parameters: parameters
+        )
+
+        // 1. Preprocess each input image (sequential).
+        let perImagePixels = try preprocessImages(images, plan: plan, parameters: parameters)
+
+        // 2. Build batched NDArray and run inference once.
+        let resolvedDescriptor = imageDescriptor.resolvingDynamicDimensions(
+            [plan.batch, 3, plan.height, plan.width])
+        let imageArray = try buildInputNDArray(descriptor: resolvedDescriptor, perImagePixels: perImagePixels)
+
+        var outputs = try await function.run(inputs: [imageInputName: imageArray])
+        guard let logitsArray = outputs.remove(logitsOutputName)?.ndArray,
+            let boxesArray = outputs.remove(boxesOutputName)?.ndArray
+        else {
+            throw DetectionRuntimeError.invalidConfiguration(
+                "Missing one or more outputs after run."
+            )
+        }
+
+        // 3. Decode each input image's batch slot.
+        return Self.decodePerImage(
+            logitsArray: logitsArray,
+            boxesArray: boxesArray,
+            images: images,
+            parameters: parameters
+        )
+    }
+
+    // MARK: - Preprocessing
+
+    /// Sequentially preprocess each image to a `[3 * H * W]` Float buffer at
+    /// the plan's target spatial dimensions.
+    private func preprocessImages(
+        _ images: [CGImage], plan: BatchPlan, parameters: DetectionParameters
+    ) throws -> [[Float]] {
+        let preprocessor = ImagePreprocessor(
+            targetSize: CGSize(width: plan.width, height: plan.height),
             mean: parameters.normalizationMeans,
             std: parameters.normalizationStds,
             rescaleFactor: 1.0
-        ).preprocessCHW(cgImage: image)
-
-        var imageArray = NDArray(descriptor: imageDescriptor)
+        )
+        return try images.map { try preprocessor.preprocessCHW(cgImage: $0) }
+    }
 
-        if imageDescriptor.scalarType == .float16 {
+    /// Build the input NDArray for a `[B, 3, H, W]` resolved descriptor by
+    /// concatenating per-image CHW buffers in batch order. Each per-image
+    /// entry is `3*H*W` floats; the buffers are written contiguously to match
+    /// row-major batch-leading layout.
+    private func buildInputNDArray(
+        descriptor: NDArrayDescriptor, perImagePixels: [[Float]]
+    ) throws -> NDArray {
+        var imageArray = NDArray(descriptor: descriptor)
+        let flat = Array(perImagePixels.joined())
+        if descriptor.scalarType == .float16 {
             #if !((os(macOS) || targetEnvironment(macCatalyst)) && arch(x86_64))
-            fillNDArray(&imageArray, as: Float16.self, with: floatPixels.map(Float16.init))
+            fillNDArray(&imageArray, as: Float16.self, with: flat.map(Float16.init))
             #else
             fatalError("Float16 is not supported on this platform")
             #endif
         } else {
-            fillNDArray(&imageArray, as: Float.self, with: floatPixels)
+            fillNDArray(&imageArray, as: Float.self, with: flat)
         }
+        return imageArray
+    }
 
-        // Run inference and extract outputs
-        var outputs = try await function.run(inputs: [imageInputName: imageArray])
-        guard let logitsArray = outputs.remove(logitsOutputName)?.ndArray,
-            let boxesArray = outputs.remove(boxesOutputName)?.ndArray
-        else {
+    // MARK: - Output decoding
+
+    private static func decodePerImage(
+        logitsArray: NDArray,
+        boxesArray: NDArray,
+        images: [CGImage],
+        parameters: DetectionParameters
+    ) -> [[DetectedObject]] {
+        let logitsShape = logitsArray.shape  // [B, Q, C]
+        let boxesShape = boxesArray.shape  // [B, Q, 4]
+        let logitsAll = flattenAsFloat(logitsArray)
+        let boxesAll = flattenAsFloat(boxesArray)
+        let perBatchLog = logitsShape.dropFirst().reduce(1, *)
+        let perBatchBox = boxesShape.dropFirst().reduce(1, *)
+        let singleBatchLogitsShape = [1] + logitsShape.dropFirst()
+
+        return images.enumerated().map { i, image in
+            let raw = DetectionOutput(
+                logits: Array(logitsAll[i * perBatchLog..<(i + 1) * perBatchLog]),
+                logitsShape: singleBatchLogitsShape,
+                predictedBoxes: Array(boxesAll[i * perBatchBox..<(i + 1) * perBatchBox])
+            )
+            return DetectionPostprocessor.decode(
+                output: raw,
+                inputSize: CGSize(width: image.width, height: image.height),
+                parameters: parameters
+            )
+        }
+    }
+
+    // MARK: - Batch planning
+
+    struct BatchPlan: Equatable {
+        let batch: Int
+        let height: Int
+        let width: Int
+    }
+
+    /// Resolve the concrete `(B, H, W)` to bind the model with, given the
+    /// model's expected shape (which may contain `-1` for dynamic dims), the
+    /// list of input image sizes, and the user's parameter overrides.
+    ///
+    /// Resolution rules:
+    /// - **Batch**: always `images.count`. A static-batch model must match.
+    /// - **Spatial dims**: a dynamic `-1` dim is filled from
+    ///   `parameters.inputHeight` / `inputWidth`. A static dim is taken
+    ///   from the model descriptor (the parameters' values are ignored for
+    ///   that axis).
+    static func planBatch(
+        expectedShape: [Int],
+        imageSizes: [CGSize],
+        parameters: DetectionParameters
+    ) throws -> BatchPlan {
+        guard !imageSizes.isEmpty else {
+            throw DetectionRuntimeError.invalidConfiguration("planBatch requires at least one image")
+        }
+
+        // Resolve batch from image count; verify it matches a static batch dim.
+        let targetBatch = imageSizes.count
+        let batchExpected = expectedShape[0]
+        if batchExpected >= 0 && batchExpected != targetBatch {
             throw DetectionRuntimeError.invalidConfiguration(
-                "Missing one or more outputs after run."
+                "Model expects fixed batch=\(batchExpected) but caller supplied \(targetBatch) image(s)"
             )
         }
 
-        let rawOutput = DetectionOutput(
-            logits: flattenAsFloat(logitsArray),
-            logitsShape: logitsArray.shape,
-            predictedBoxes: flattenAsFloat(boxesArray)
-        )
-        let inputSize = CGSize(width: image.width, height: image.height)
-        return DetectionPostprocessor.decode(output: rawOutput, inputSize: inputSize, parameters: parameters)
+        let heightExpected = expectedShape[2]
+        let widthExpected = expectedShape[3]
+        let height = heightExpected < 0 ? parameters.inputHeight : heightExpected
+        let width = widthExpected < 0 ? parameters.inputWidth : widthExpected
+
+        return BatchPlan(batch: targetBatch, height: height, width: width)
     }
 
     // MARK: - Name Discovery
diff --git a/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift b/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift
index 747f69e..07ce518 100644
--- a/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift
+++ b/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift
@@ -22,8 +22,12 @@ struct ObjectDetectorCLI: AsyncParsableCommand {
     @Option(name: .long, help: "Path to the .aimodel directory.")
     var model: String
 
-    @Option(name: .long, help: "Path to the input image.")
-    var image: String
+    @Option(
+        name: .long,
+        help:
+            "Path to an input image. Pass --image multiple times to run detection on a batch of images (one --image per source file)."
+    )
+    var image: [String] = []
 
     @Option(name: .long, help: "Confidence threshold (0–1).")
     var threshold: Float = 0.3
@@ -31,78 +35,171 @@ struct ObjectDetectorCLI: AsyncParsableCommand {
     @Option(name: .long, help: "Maximum number of detections to return.")
     var maxDetections: Int = 100
 
+    @Option(
+        name: .long,
+        help:
+            "Override model input height (only used for dynamic models). Defaults to DetectionParameters.inputHeight if not set."
+    )
+    var inputHeight: Int?
+
+    @Option(
+        name: .long,
+        help:
+            "Override model input width (only used for dynamic models). Defaults to DetectionParameters.inputWidth if not set."
+    )
+    var inputWidth: Int?
+
     @Flag(name: .long, help: "Run a warmup pass before timed inference.")
     var warmup: Bool = false
 
-    @Option(name: .long, help: "Save output image with rendered boxes to this path.")
+    @Option(
+        name: .long,
+        help:
+            "Render detections onto the input image(s). For a single --image, this is a file path. For multiple --image inputs, this is a directory; the CLI writes <source-stem>_detections.png into it."
+    )
     var outputImage: String?
 
-    @Option(name: .long, help: "Write JSON results to this path instead of stdout.")
+    @Option(
+        name: .long,
+        help:
+            "Write JSON results to this path instead of stdout. With one --image, the JSON is an array of detections; with multiple, it's an array of {image, detections} objects."
+    )
     var outputJson: String?
 
     @Flag(name: .long, help: "Print verbose progress information.")
     var verbose: Bool = false
 
+    // MARK: - Validation
+
+    func validate() throws {
+        if image.isEmpty {
+            throw ValidationError("At least one --image must be provided.")
+        }
+    }
+
     // MARK: - Run
 
     func run() async throws {
         if verbose { print("Loading model from \(model)...") }
-        let params = DetectionParameters(threshold: threshold, maxDetections: maxDetections)
+        var params = DetectionParameters(
+            threshold: threshold,
+            maxDetections: maxDetections
+        )
+        if let h = inputHeight { params.inputHeight = h }
+        if let w = inputWidth { params.inputWidth = w }
         let detector = try await ObjectDetector(resourcesAt: model)
 
-        let cgImage = try loadCGImage(from: image)
-        if verbose { print("Loaded image: \(cgImage.width)×\(cgImage.height)") }
+        let loaded: [(path: String, image: CGImage)] = try image.map { path in
+            let cgImage = try loadCGImage(from: path)
+            if verbose { print("Loaded image \(path): \(cgImage.width)×\(cgImage.height)") }
+            return (path, cgImage)
+        }
 
         if warmup {
             if verbose { print("Running warmup...") }
             try await detector.warmup()
         }
 
-        if verbose { print("Running detection...") }
+        if verbose { print("Running detection on \(loaded.count) image(s)...") }
         let start = SuspendingClock().now
-        let detections = try await detector.detect(image: cgImage, parameters: params)
+        let allDetections = try await detector.detect(
+            images: loaded.map { $0.image },
+            parameters: params
+        )
         let elapsed = SuspendingClock().now - start
 
         if verbose {
-            print("Inference time: \(elapsed)")
+            print("Inference time (total): \(elapsed)")
         }
 
-        // Format results
-        let results = detections.map { d -> JSONDetection in
-            JSONDetection(
-                label: d.label,
-                labelIndex: d.labelIndex,
-                score: d.confidence,
-                box: JSONDetection.Box(
-                    x: d.boundingBox.origin.x,
-                    y: d.boundingBox.origin.y,
-                    width: d.boundingBox.size.width,
-                    height: d.boundingBox.size.height
-                )
+        // Build per-image JSON entries
+        let entries: [JSONImageResult] = zip(loaded, allDetections).map { (entry, detections) in
+            JSONImageResult(
+                image: entry.path,
+                detections: detections.map { d in
+                    JSONDetection(
+                        label: d.label,
+                        labelIndex: d.labelIndex,
+                        score: d.confidence,
+                        box: JSONDetection.Box(
+                            x: d.boundingBox.origin.x,
+                            y: d.boundingBox.origin.y,
+                            width: d.boundingBox.size.width,
+                            height: d.boundingBox.size.height
+                        )
+                    )
+                }
             )
         }
 
-        if let jsonPath = outputJson {
-            let encoder = JSONEncoder()
-            encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
-            let data = try encoder.encode(results)
-            try data.write(to: URL(fileURLWithPath: NSString(string: jsonPath).expandingTildeInPath))
-            print("Results written to \(jsonPath)")
-        } else {
-            print("\nDetections (\(detections.count)):")
-            for (i, d) in detections.enumerated() {
-                print(
-                    "  [\(i)] \(d.label) score=\(String(format: "%.3f", d.confidence))"
-                        + "  box=(\(Int(d.boundingBox.origin.x)),\(Int(d.boundingBox.origin.y)),\(Int(d.boundingBox.width))×\(Int(d.boundingBox.height)))"
-                )
+        try writeJsonOutput(entries: entries, multiImage: loaded.count > 1)
+
+        // Stdout summary (suppressed when --output-json is set, matching prior behavior)
+        if outputJson == nil {
+            for (i, entry) in entries.enumerated() {
+                if loaded.count > 1 {
+                    print("\nImage \(i + 1)/\(loaded.count): \(entry.image)")
+                }
+                print("Detections (\(entry.detections.count)):")
+                for (idx, d) in entry.detections.enumerated() {
+                    print(
+                        "  [\(idx)] \(d.label) score=\(String(format: "%.3f", d.score))"
+                            + "  box=(\(Int(d.box.x)),\(Int(d.box.y)),\(Int(d.box.width))×\(Int(d.box.height)))"
+                    )
+                }
             }
         }
 
-        // Render output image with bounding boxes
+        // Render output image(s)
         if let imagePath = outputImage {
-            let outputURL = URL(fileURLWithPath: NSString(string: imagePath).expandingTildeInPath)
-            try renderDetections(onto: cgImage, detections: detections, saveTo: outputURL)
+            try renderOutputImages(
+                imagePath: imagePath,
+                loaded: loaded,
+                allDetections: allDetections
+            )
+        }
+    }
+
+    // MARK: - Output helpers
+
+    private func writeJsonOutput(entries: [JSONImageResult], multiImage: Bool) throws {
+        guard let jsonPath = outputJson else { return }
+        let encoder = JSONEncoder()
+        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
+        let url = URL(fileURLWithPath: NSString(string: jsonPath).expandingTildeInPath)
+        let data: Data
+        if multiImage {
+            data = try encoder.encode(entries)
+        } else {
+            // Single-image: keep the prior schema (top-level array of detections).
+            data = try encoder.encode(entries.first?.detections ?? [])
+        }
+        try data.write(to: url)
+        print("Results written to \(jsonPath)")
+    }
+
+    private func renderOutputImages(
+        imagePath: String,
+        loaded: [(path: String, image: CGImage)],
+        allDetections: [[DetectedObject]]
+    ) throws {
+        let expanded = NSString(string: imagePath).expandingTildeInPath
+        if loaded.count == 1 {
+            let outputURL = URL(fileURLWithPath: expanded)
+            try renderDetections(onto: loaded[0].image, detections: allDetections[0], saveTo: outputURL)
             print("Output image written to \(imagePath)")
+            return
+        }
+
+        // Multi-image: treat --output-image as a directory.
+        let dirURL = URL(fileURLWithPath: expanded)
+        try FileManager.default.createDirectory(at: dirURL, withIntermediateDirectories: true)
+        for (entry, detections) in zip(loaded, allDetections) {
+            let stem = (entry.path as NSString).lastPathComponent
+            let stemNoExt = (stem as NSString).deletingPathExtension
+            let outURL = dirURL.appendingPathComponent("\(stemNoExt)_detections.png")
+            try renderDetections(onto: entry.image, detections: detections, saveTo: outURL)
+            print("Output image written to \(outURL.path)")
         }
     }
 }
@@ -241,3 +338,8 @@ private struct JSONDetection: Codable {
         let x, y, width, height: Double
     }
 }
+
+private struct JSONImageResult: Codable {
+    let image: String
+    let detections: [JSONDetection]
+}
diff --git a/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift b/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift
index 374b70f..10d41a5 100644
--- a/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift
+++ b/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift
@@ -212,4 +212,77 @@ struct ObjectDetectorTests {
         #expect(result.allSatisfy { $0.isFinite })
         #expect(abs(result[0] - 0.5) < 1e-5)
     }
+
+    // MARK: - Batch planning
+
+    @Test("planBatch: single image, dynamic dims, no overrides → parameter defaults")
+    func planBatchSingleDefault() throws {
+        let p = DetectionParameters()
+        let plan = try ObjectDetector.planBatch(
+            expectedShape: [-1, 3, -1, -1],
+            imageSizes: [CGSize(width: 728, height: 408)],
+            parameters: .default
+        )
+        #expect(plan == ObjectDetector.BatchPlan(batch: 1, height: p.inputHeight, width: p.inputWidth))
+    }
+
+    @Test("planBatch: multi-image, dynamic dims, no overrides → parameter defaults")
+    func planBatchMultiDefault() throws {
+        let p = DetectionParameters()
+        let plan = try ObjectDetector.planBatch(
+            expectedShape: [-1, 3, -1, -1],
+            imageSizes: [
+                CGSize(width: 1280, height: 720),
+                CGSize(width: 600, height: 800),
+                CGSize(width: 1024, height: 768),
+            ],
+            parameters: .default
+        )
+        #expect(plan == ObjectDetector.BatchPlan(batch: 3, height: p.inputHeight, width: p.inputWidth))
+    }
+
+    @Test("planBatch: multi-image, dynamic dims, explicit overrides win")
+    func planBatchMultiOverride() throws {
+        var params = DetectionParameters.default
+        params.inputHeight = 512
+        params.inputWidth = 512
+        let plan = try ObjectDetector.planBatch(
+            expectedShape: [-1, 3, -1, -1],
+            imageSizes: [
+                CGSize(width: 1280, height: 720),
+                CGSize(width: 600, height: 800),
+            ],
+            parameters: params
+        )
+        #expect(plan == ObjectDetector.BatchPlan(batch: 2, height: 512, width: 512))
+    }
+
+    @Test("planBatch: static spatial dims override parameter values silently")
+    func planBatchStaticSpatialIgnoresParams() throws {
+        // Static [1, 3, 800, 800] with mismatching params → planBatch uses
+        // the static dims; parameter values are silently ignored for fixed axes.
+        var params = DetectionParameters.default
+        params.inputHeight = 512
+        params.inputWidth = 512
+        let plan = try ObjectDetector.planBatch(
+            expectedShape: [1, 3, 800, 800],
+            imageSizes: [CGSize(width: 640, height: 480)],
+            parameters: params
+        )
+        #expect(plan == ObjectDetector.BatchPlan(batch: 1, height: 800, width: 800))
+    }
+
+    @Test("planBatch: static batch mismatch throws (multi-image into batch=1 model)")
+    func planBatchStaticBatchMismatchThrows() {
+        #expect(throws: DetectionRuntimeError.self) {
+            try ObjectDetector.planBatch(
+                expectedShape: [1, 3, -1, -1],
+                imageSizes: [
+                    CGSize(width: 1000, height: 1000),
+                    CGSize(width: 1000, height: 1000),
+                ],
+                parameters: .default
+            )
+        }
+    }
 }

From c87c09977ab3efb7b370532bd603b8acf69315b0 Mon Sep 17 00:00:00 2001
From: Kevin Cheng <59463423+kevchengcodes@users.noreply.github.com>
Date: Fri, 12 Jun 2026 14:36:29 -0400
Subject: [PATCH 2/3] make warmup use same shape and update planBatch signature

---
 .../CoreAIObjectDetector/ObjectDetector.swift | 45 ++++++++++++-------
 .../object-detector/ObjectDetectionMain.swift |  2 +-
 .../ObjectDetectorTests.swift                 | 20 +++------
 3 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift b/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift
index 045531b..acaacc0 100644
--- a/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift
+++ b/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift
@@ -80,17 +80,29 @@ public struct ObjectDetector {
 
     // MARK: - Inference
 
-    /// Warm up the backend (e.g. trigger Metal kernel compilation) with a dummy pass.
-    public func warmup() async throws {
+    /// Warm up the backend (e.g. trigger Metal kernel compilation) with a dummy
+    /// pass at the same `(B, H, W)` that subsequent `detect()` calls will use.
+    /// A static spatial dim overrides `parameters.inputHeight` / `inputWidth`; a
+    /// static batch dim must equal `imageCount`, otherwise `planBatch` throws.
+    public func warmup(imageCount: Int = 1, parameters: DetectionParameters = .default) async throws {
         guard case .ndArray(let imageDescriptor) = functionDescriptor.inputDescriptor(of: imageInputName) else {
             throw DetectionRuntimeError.invalidConfiguration(
                 "No array descriptor for image input '\(imageInputName)'"
             )
         }
-        let defaults = DetectionParameters()
-        let warmupShape = zip(imageDescriptor.shape, [1, 3, defaults.inputHeight, defaults.inputWidth])
-            .map { actual, fallback in actual >= 0 ? actual : fallback }
-        let resolved = imageDescriptor.resolvingDynamicDimensions(warmupShape)
+        let expectedShape = imageDescriptor.shape
+        guard expectedShape.count == 4 else {
+            throw DetectionRuntimeError.invalidConfiguration(
+                "Expected 4-dimensional input shape, got \(expectedShape.count)"
+            )
+        }
+        let plan = try Self.planBatch(
+            expectedShape: expectedShape,
+            imageCount: imageCount,
+            parameters: parameters
+        )
+        let resolved = imageDescriptor.resolvingDynamicDimensions(
+            [plan.batch, 3, plan.height, plan.width])
         _ = try await function.run(inputs: [imageInputName: NDArray(descriptor: resolved)])
     }
 
@@ -142,7 +154,7 @@ public struct ObjectDetector {
 
         let plan = try Self.planBatch(
             expectedShape: expectedShape,
-            imageSizes: images.map { CGSize(width: $0.width, height: $0.height) },
+            imageCount: images.count,
             parameters: parameters
         )
 
@@ -249,29 +261,28 @@ public struct ObjectDetector {
 
     /// Resolve the concrete `(B, H, W)` to bind the model with, given the
     /// model's expected shape (which may contain `-1` for dynamic dims), the
-    /// list of input image sizes, and the user's parameter overrides.
+    /// number of input images, and the user's parameter overrides.
     ///
     /// Resolution rules:
-    /// - **Batch**: always `images.count`. A static-batch model must match.
+    /// - **Batch**: always `imageCount`. A static-batch model must match.
     /// - **Spatial dims**: a dynamic `-1` dim is filled from
     ///   `parameters.inputHeight` / `inputWidth`. A static dim is taken
     ///   from the model descriptor (the parameters' values are ignored for
     ///   that axis).
     static func planBatch(
         expectedShape: [Int],
-        imageSizes: [CGSize],
+        imageCount: Int,
         parameters: DetectionParameters
     ) throws -> BatchPlan {
-        guard !imageSizes.isEmpty else {
-            throw DetectionRuntimeError.invalidConfiguration("planBatch requires at least one image")
+        guard imageCount >= 1 else {
+            throw DetectionRuntimeError.invalidConfiguration("planBatch requires imageCount >= 1")
         }
 
-        // Resolve batch from image count; verify it matches a static batch dim.
-        let targetBatch = imageSizes.count
+        // Verify image count matches a static batch dim.
         let batchExpected = expectedShape[0]
-        if batchExpected >= 0 && batchExpected != targetBatch {
+        if batchExpected >= 0 && batchExpected != imageCount {
             throw DetectionRuntimeError.invalidConfiguration(
-                "Model expects fixed batch=\(batchExpected) but caller supplied \(targetBatch) image(s)"
+                "Model expects fixed batch=\(batchExpected) but caller supplied \(imageCount) image(s)"
             )
         }
 
@@ -280,7 +291,7 @@ public struct ObjectDetector {
         let height = heightExpected < 0 ? parameters.inputHeight : heightExpected
         let width = widthExpected < 0 ? parameters.inputWidth : widthExpected
 
-        return BatchPlan(batch: targetBatch, height: height, width: width)
+        return BatchPlan(batch: imageCount, height: height, width: width)
     }
 
     // MARK: - Name Discovery
diff --git a/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift b/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift
index 07ce518..2f897d9 100644
--- a/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift
+++ b/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift
@@ -97,7 +97,7 @@ struct ObjectDetectorCLI: AsyncParsableCommand {
 
         if warmup {
             if verbose { print("Running warmup...") }
-            try await detector.warmup()
+            try await detector.warmup(imageCount: loaded.count, parameters: params)
         }
 
         if verbose { print("Running detection on \(loaded.count) image(s)...") }
diff --git a/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift b/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift
index 10d41a5..2c65a8d 100644
--- a/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift
+++ b/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift
@@ -220,7 +220,7 @@ struct ObjectDetectorTests {
         let p = DetectionParameters()
         let plan = try ObjectDetector.planBatch(
             expectedShape: [-1, 3, -1, -1],
-            imageSizes: [CGSize(width: 728, height: 408)],
+            imageCount: 1,
             parameters: .default
         )
         #expect(plan == ObjectDetector.BatchPlan(batch: 1, height: p.inputHeight, width: p.inputWidth))
@@ -231,11 +231,7 @@ struct ObjectDetectorTests {
         let p = DetectionParameters()
         let plan = try ObjectDetector.planBatch(
             expectedShape: [-1, 3, -1, -1],
-            imageSizes: [
-                CGSize(width: 1280, height: 720),
-                CGSize(width: 600, height: 800),
-                CGSize(width: 1024, height: 768),
-            ],
+            imageCount: 3,
             parameters: .default
         )
         #expect(plan == ObjectDetector.BatchPlan(batch: 3, height: p.inputHeight, width: p.inputWidth))
@@ -248,10 +244,7 @@ struct ObjectDetectorTests {
         params.inputWidth = 512
         let plan = try ObjectDetector.planBatch(
             expectedShape: [-1, 3, -1, -1],
-            imageSizes: [
-                CGSize(width: 1280, height: 720),
-                CGSize(width: 600, height: 800),
-            ],
+            imageCount: 2,
             parameters: params
         )
         #expect(plan == ObjectDetector.BatchPlan(batch: 2, height: 512, width: 512))
@@ -266,7 +259,7 @@ struct ObjectDetectorTests {
         params.inputWidth = 512
         let plan = try ObjectDetector.planBatch(
             expectedShape: [1, 3, 800, 800],
-            imageSizes: [CGSize(width: 640, height: 480)],
+            imageCount: 1,
             parameters: params
         )
         #expect(plan == ObjectDetector.BatchPlan(batch: 1, height: 800, width: 800))
@@ -277,10 +270,7 @@ struct ObjectDetectorTests {
         #expect(throws: DetectionRuntimeError.self) {
             try ObjectDetector.planBatch(
                 expectedShape: [1, 3, -1, -1],
-                imageSizes: [
-                    CGSize(width: 1000, height: 1000),
-                    CGSize(width: 1000, height: 1000),
-                ],
+                imageCount: 2,
                 parameters: .default
             )
         }

From c06360f66adb23ee0cdc20e103e84e59691f0d44 Mon Sep 17 00:00:00 2001
From: Kevin Cheng <59463423+kevchengcodes@users.noreply.github.com>
Date: Fri, 12 Jun 2026 16:01:56 -0400
Subject: [PATCH 3/3] update README

---
 models/yolo/README.md | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/models/yolo/README.md b/models/yolo/README.md
index 5bc2108..97a5791 100644
--- a/models/yolo/README.md
+++ b/models/yolo/README.md
@@ -44,22 +44,37 @@ uv run export.py --help
 ### In your iOS and macOS applications
 
 ```swift
-import ObjectDetector
-
-// Detection parameters
-let params = DetectionParameters()
+import CoreAIObjectDetector
 
 // Load directly from an exported .aimodel directory.
 let detector = try await ObjectDetector(resourcesAt: "coreai-models/exports/yolos-base_float32_static.aimodel")
 
-// Run inference
-let detections = try await detector.detect(image: cgImage, parameters: params)
+// Single image, default parameters.
+let detections = try await detector.detect(image: cgImage)
+
+// Batched detection. For dynamic-shape exports, optionally override the spatial dims
+// on DetectionParameters; for static exports the values are ignored.
+var params = DetectionParameters()
+params.inputHeight = 800
+params.inputWidth = 1024
+let batchDetections = try await detector.detect(images: [imageA, imageB], parameters: params)
+
+// Optional: warm up the kernel for the exact (B, H, W) you'll run with.
+try await detector.warmup(imageCount: 2, parameters: params)
 ```
 
 ### On your Mac using built-in Command Line Tool
 
 ```bash
+# Single image, static-shape model.
 swift run -c release object-detector --model path/to/exported_model.aimodel --image path/to/image.jpg
+
+# Batched detection on a dynamic-shape export, with optional, explicit input dims and warmup.
+swift run -c release object-detector \
+  --model path/to/dynamic.aimodel \
+  --image a.jpg --image b.jpg \
+  --input-height 800 --input-width 1024 \
+  --warmup
 ```
 
 [^1]: [Paper](https://arxiv.org/abs/2106.00666) · [HuggingFace](https://huggingface.co/hustvl/yolos-tiny)