From 587537a00e111035c4bca53ffbeaeeb7f9f74c38 Mon Sep 17 00:00:00 2001 From: Kevin Cheng <59463423+kevchengcodes@users.noreply.github.com> Date: Thu, 11 Jun 2026 17:09:10 -0400 Subject: [PATCH 1/3] dynamic batch size, image H and W for object detector. tested with YoloS --- .../DetectionOutputs.swift | 16 +- .../CoreAIObjectDetector/ObjectDetector.swift | 188 +++++++++++++++--- .../object-detector/ObjectDetectionMain.swift | 178 +++++++++++++---- .../ObjectDetectorTests.swift | 73 +++++++ 4 files changed, 388 insertions(+), 67 deletions(-) diff --git a/swift/Sources/CoreAIObjectDetector/DetectionOutputs.swift b/swift/Sources/CoreAIObjectDetector/DetectionOutputs.swift index 25f3a31..7cdf8d7 100644 --- a/swift/Sources/CoreAIObjectDetector/DetectionOutputs.swift +++ b/swift/Sources/CoreAIObjectDetector/DetectionOutputs.swift @@ -73,18 +73,32 @@ public struct DetectionParameters: Sendable { /// When empty, labels default to "class_N". public var classLabels: [Int: String] + /// Model input height. Only consulted when the model declares a dynamic + /// spatial dimension; ignored for static-shape models. Defaults to 800 + /// (matches the YOLOS export's reference input and the training-time + /// canvas geometry). + public var inputHeight: Int + + /// Model input width. Only consulted when the model declares a dynamic + /// spatial dimension; ignored for static-shape models. Defaults to 800. + public var inputWidth: Int + public init( threshold: Float = 0.3, maxDetections: Int = 100, normalizationMeans: (CGFloat, CGFloat, CGFloat) = (0.485, 0.456, 0.406), normalizationStds: (CGFloat, CGFloat, CGFloat) = (0.229, 0.224, 0.225), - classLabels: [Int: String] = ObjectDetectionLabels.coco + classLabels: [Int: String] = ObjectDetectionLabels.coco, + inputHeight: Int = 800, + inputWidth: Int = 800 ) { self.threshold = threshold self.maxDetections = maxDetections self.normalizationMeans = normalizationMeans self.normalizationStds = normalizationStds self.classLabels = classLabels + self.inputHeight = inputHeight + self.inputWidth = inputWidth } public static let `default` = DetectionParameters() diff --git a/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift b/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift index b218336..045531b 100644 --- a/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift +++ b/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift @@ -87,8 +87,11 @@ public struct ObjectDetector { "No array descriptor for image input '\(imageInputName)'" ) } - let imageArray = NDArray(descriptor: imageDescriptor) - _ = try await function.run(inputs: [imageInputName: imageArray]) + let defaults = DetectionParameters() + let warmupShape = zip(imageDescriptor.shape, [1, 3, defaults.inputHeight, defaults.inputWidth]) + .map { actual, fallback in actual >= 0 ? actual : fallback } + let resolved = imageDescriptor.resolvingDynamicDimensions(warmupShape) + _ = try await function.run(inputs: [imageInputName: NDArray(descriptor: resolved)]) } /// Detect objects in `image` using `.default` parameters. @@ -96,59 +99,188 @@ public struct ObjectDetector { try await detect(image: image, parameters: .default) } - /// Detect objects in `image`. + /// Detect objects in `image` — convenience wrapper over the batched API. public func detect(image: CGImage, parameters: DetectionParameters) async throws -> [DetectedObject] { - // Build image NDArray + let results = try await detect(images: [image], parameters: parameters) + return results.first ?? [] + } + + /// Detect objects in each of `images` using `.default` parameters. + public func detect(images: [CGImage]) async throws -> [[DetectedObject]] { + try await detect(images: images, parameters: .default) + } + + /// Detect objects across `images` in a single batched forward pass. + /// + /// Pipeline: + /// 1. Resolve a batch plan `(B, H, W)` from the model descriptor and + /// parameters. Batch is always `images.count`. Dynamic spatial dims + /// are filled from `parameters.inputHeight` / `inputWidth` (which + /// have struct-level defaults). + /// 2. Preprocess each image sequentially into a `[3, H, W]` Float buffer. + /// 3. Concatenate the per-image buffers into the `[B, 3, H, W]` input + /// NDArray and run a single forward pass. + /// 4. Slice each batch slot from the outputs and decode independently, + /// returning `images.count` detection lists in input order. + public func detect(images: [CGImage], parameters: DetectionParameters) async throws + -> [[DetectedObject]] + { + guard !images.isEmpty else { + throw DetectionRuntimeError.invalidConfiguration("detect requires at least one image") + } guard case .ndArray(let imageDescriptor) = functionDescriptor.inputDescriptor(of: imageInputName) else { throw DetectionRuntimeError.invalidConfiguration( "No array descriptor for image input '\(imageInputName)'" ) } - let expectedShape = imageDescriptor.shape guard expectedShape.count == 4 else { throw DetectionRuntimeError.invalidConfiguration( "Expected 4-dimensional input shape, got \(expectedShape.count)" ) } - let height = expectedShape[2] - let width = expectedShape[3] - let floatPixels = try ImagePreprocessor( - targetSize: CGSize(width: width, height: height), + + let plan = try Self.planBatch( + expectedShape: expectedShape, + imageSizes: images.map { CGSize(width: $0.width, height: $0.height) }, + parameters: parameters + ) + + // 1. Preprocess each input image (sequential). + let perImagePixels = try preprocessImages(images, plan: plan, parameters: parameters) + + // 2. Build batched NDArray and run inference once. + let resolvedDescriptor = imageDescriptor.resolvingDynamicDimensions( + [plan.batch, 3, plan.height, plan.width]) + let imageArray = try buildInputNDArray(descriptor: resolvedDescriptor, perImagePixels: perImagePixels) + + var outputs = try await function.run(inputs: [imageInputName: imageArray]) + guard let logitsArray = outputs.remove(logitsOutputName)?.ndArray, + let boxesArray = outputs.remove(boxesOutputName)?.ndArray + else { + throw DetectionRuntimeError.invalidConfiguration( + "Missing one or more outputs after run." + ) + } + + // 3. Decode each input image's batch slot. + return Self.decodePerImage( + logitsArray: logitsArray, + boxesArray: boxesArray, + images: images, + parameters: parameters + ) + } + + // MARK: - Preprocessing + + /// Sequentially preprocess each image to a `[3 * H * W]` Float buffer at + /// the plan's target spatial dimensions. + private func preprocessImages( + _ images: [CGImage], plan: BatchPlan, parameters: DetectionParameters + ) throws -> [[Float]] { + let preprocessor = ImagePreprocessor( + targetSize: CGSize(width: plan.width, height: plan.height), mean: parameters.normalizationMeans, std: parameters.normalizationStds, rescaleFactor: 1.0 - ).preprocessCHW(cgImage: image) - - var imageArray = NDArray(descriptor: imageDescriptor) + ) + return try images.map { try preprocessor.preprocessCHW(cgImage: $0) } + } - if imageDescriptor.scalarType == .float16 { + /// Build the input NDArray for a `[B, 3, H, W]` resolved descriptor by + /// concatenating per-image CHW buffers in batch order. Each per-image + /// entry is `3*H*W` floats; the buffers are written contiguously to match + /// row-major batch-leading layout. + private func buildInputNDArray( + descriptor: NDArrayDescriptor, perImagePixels: [[Float]] + ) throws -> NDArray { + var imageArray = NDArray(descriptor: descriptor) + let flat = Array(perImagePixels.joined()) + if descriptor.scalarType == .float16 { #if !((os(macOS) || targetEnvironment(macCatalyst)) && arch(x86_64)) - fillNDArray(&imageArray, as: Float16.self, with: floatPixels.map(Float16.init)) + fillNDArray(&imageArray, as: Float16.self, with: flat.map(Float16.init)) #else fatalError("Float16 is not supported on this platform") #endif } else { - fillNDArray(&imageArray, as: Float.self, with: floatPixels) + fillNDArray(&imageArray, as: Float.self, with: flat) } + return imageArray + } - // Run inference and extract outputs - var outputs = try await function.run(inputs: [imageInputName: imageArray]) - guard let logitsArray = outputs.remove(logitsOutputName)?.ndArray, - let boxesArray = outputs.remove(boxesOutputName)?.ndArray - else { + // MARK: - Output decoding + + private static func decodePerImage( + logitsArray: NDArray, + boxesArray: NDArray, + images: [CGImage], + parameters: DetectionParameters + ) -> [[DetectedObject]] { + let logitsShape = logitsArray.shape // [B, Q, C] + let boxesShape = boxesArray.shape // [B, Q, 4] + let logitsAll = flattenAsFloat(logitsArray) + let boxesAll = flattenAsFloat(boxesArray) + let perBatchLog = logitsShape.dropFirst().reduce(1, *) + let perBatchBox = boxesShape.dropFirst().reduce(1, *) + let singleBatchLogitsShape = [1] + logitsShape.dropFirst() + + return images.enumerated().map { i, image in + let raw = DetectionOutput( + logits: Array(logitsAll[i * perBatchLog..<(i + 1) * perBatchLog]), + logitsShape: singleBatchLogitsShape, + predictedBoxes: Array(boxesAll[i * perBatchBox..<(i + 1) * perBatchBox]) + ) + return DetectionPostprocessor.decode( + output: raw, + inputSize: CGSize(width: image.width, height: image.height), + parameters: parameters + ) + } + } + + // MARK: - Batch planning + + struct BatchPlan: Equatable { + let batch: Int + let height: Int + let width: Int + } + + /// Resolve the concrete `(B, H, W)` to bind the model with, given the + /// model's expected shape (which may contain `-1` for dynamic dims), the + /// list of input image sizes, and the user's parameter overrides. + /// + /// Resolution rules: + /// - **Batch**: always `images.count`. A static-batch model must match. + /// - **Spatial dims**: a dynamic `-1` dim is filled from + /// `parameters.inputHeight` / `inputWidth`. A static dim is taken + /// from the model descriptor (the parameters' values are ignored for + /// that axis). + static func planBatch( + expectedShape: [Int], + imageSizes: [CGSize], + parameters: DetectionParameters + ) throws -> BatchPlan { + guard !imageSizes.isEmpty else { + throw DetectionRuntimeError.invalidConfiguration("planBatch requires at least one image") + } + + // Resolve batch from image count; verify it matches a static batch dim. + let targetBatch = imageSizes.count + let batchExpected = expectedShape[0] + if batchExpected >= 0 && batchExpected != targetBatch { throw DetectionRuntimeError.invalidConfiguration( - "Missing one or more outputs after run." + "Model expects fixed batch=\(batchExpected) but caller supplied \(targetBatch) image(s)" ) } - let rawOutput = DetectionOutput( - logits: flattenAsFloat(logitsArray), - logitsShape: logitsArray.shape, - predictedBoxes: flattenAsFloat(boxesArray) - ) - let inputSize = CGSize(width: image.width, height: image.height) - return DetectionPostprocessor.decode(output: rawOutput, inputSize: inputSize, parameters: parameters) + let heightExpected = expectedShape[2] + let widthExpected = expectedShape[3] + let height = heightExpected < 0 ? parameters.inputHeight : heightExpected + let width = widthExpected < 0 ? parameters.inputWidth : widthExpected + + return BatchPlan(batch: targetBatch, height: height, width: width) } // MARK: - Name Discovery diff --git a/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift b/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift index 747f69e..07ce518 100644 --- a/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift +++ b/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift @@ -22,8 +22,12 @@ struct ObjectDetectorCLI: AsyncParsableCommand { @Option(name: .long, help: "Path to the .aimodel directory.") var model: String - @Option(name: .long, help: "Path to the input image.") - var image: String + @Option( + name: .long, + help: + "Path to an input image. Pass --image multiple times to run detection on a batch of images (one --image per source file)." + ) + var image: [String] = [] @Option(name: .long, help: "Confidence threshold (0–1).") var threshold: Float = 0.3 @@ -31,78 +35,171 @@ struct ObjectDetectorCLI: AsyncParsableCommand { @Option(name: .long, help: "Maximum number of detections to return.") var maxDetections: Int = 100 + @Option( + name: .long, + help: + "Override model input height (only used for dynamic models). Defaults to DetectionParameters.inputHeight if not set." + ) + var inputHeight: Int? + + @Option( + name: .long, + help: + "Override model input width (only used for dynamic models). Defaults to DetectionParameters.inputWidth if not set." + ) + var inputWidth: Int? + @Flag(name: .long, help: "Run a warmup pass before timed inference.") var warmup: Bool = false - @Option(name: .long, help: "Save output image with rendered boxes to this path.") + @Option( + name: .long, + help: + "Render detections onto the input image(s). For a single --image, this is a file path. For multiple --image inputs, this is a directory; the CLI writes _detections.png into it." + ) var outputImage: String? - @Option(name: .long, help: "Write JSON results to this path instead of stdout.") + @Option( + name: .long, + help: + "Write JSON results to this path instead of stdout. With one --image, the JSON is an array of detections; with multiple, it's an array of {image, detections} objects." + ) var outputJson: String? @Flag(name: .long, help: "Print verbose progress information.") var verbose: Bool = false + // MARK: - Validation + + func validate() throws { + if image.isEmpty { + throw ValidationError("At least one --image must be provided.") + } + } + // MARK: - Run func run() async throws { if verbose { print("Loading model from \(model)...") } - let params = DetectionParameters(threshold: threshold, maxDetections: maxDetections) + var params = DetectionParameters( + threshold: threshold, + maxDetections: maxDetections + ) + if let h = inputHeight { params.inputHeight = h } + if let w = inputWidth { params.inputWidth = w } let detector = try await ObjectDetector(resourcesAt: model) - let cgImage = try loadCGImage(from: image) - if verbose { print("Loaded image: \(cgImage.width)×\(cgImage.height)") } + let loaded: [(path: String, image: CGImage)] = try image.map { path in + let cgImage = try loadCGImage(from: path) + if verbose { print("Loaded image \(path): \(cgImage.width)×\(cgImage.height)") } + return (path, cgImage) + } if warmup { if verbose { print("Running warmup...") } try await detector.warmup() } - if verbose { print("Running detection...") } + if verbose { print("Running detection on \(loaded.count) image(s)...") } let start = SuspendingClock().now - let detections = try await detector.detect(image: cgImage, parameters: params) + let allDetections = try await detector.detect( + images: loaded.map { $0.image }, + parameters: params + ) let elapsed = SuspendingClock().now - start if verbose { - print("Inference time: \(elapsed)") + print("Inference time (total): \(elapsed)") } - // Format results - let results = detections.map { d -> JSONDetection in - JSONDetection( - label: d.label, - labelIndex: d.labelIndex, - score: d.confidence, - box: JSONDetection.Box( - x: d.boundingBox.origin.x, - y: d.boundingBox.origin.y, - width: d.boundingBox.size.width, - height: d.boundingBox.size.height - ) + // Build per-image JSON entries + let entries: [JSONImageResult] = zip(loaded, allDetections).map { (entry, detections) in + JSONImageResult( + image: entry.path, + detections: detections.map { d in + JSONDetection( + label: d.label, + labelIndex: d.labelIndex, + score: d.confidence, + box: JSONDetection.Box( + x: d.boundingBox.origin.x, + y: d.boundingBox.origin.y, + width: d.boundingBox.size.width, + height: d.boundingBox.size.height + ) + ) + } ) } - if let jsonPath = outputJson { - let encoder = JSONEncoder() - encoder.outputFormatting = [.prettyPrinted, .sortedKeys] - let data = try encoder.encode(results) - try data.write(to: URL(fileURLWithPath: NSString(string: jsonPath).expandingTildeInPath)) - print("Results written to \(jsonPath)") - } else { - print("\nDetections (\(detections.count)):") - for (i, d) in detections.enumerated() { - print( - " [\(i)] \(d.label) score=\(String(format: "%.3f", d.confidence))" - + " box=(\(Int(d.boundingBox.origin.x)),\(Int(d.boundingBox.origin.y)),\(Int(d.boundingBox.width))×\(Int(d.boundingBox.height)))" - ) + try writeJsonOutput(entries: entries, multiImage: loaded.count > 1) + + // Stdout summary (suppressed when --output-json is set, matching prior behavior) + if outputJson == nil { + for (i, entry) in entries.enumerated() { + if loaded.count > 1 { + print("\nImage \(i + 1)/\(loaded.count): \(entry.image)") + } + print("Detections (\(entry.detections.count)):") + for (idx, d) in entry.detections.enumerated() { + print( + " [\(idx)] \(d.label) score=\(String(format: "%.3f", d.score))" + + " box=(\(Int(d.box.x)),\(Int(d.box.y)),\(Int(d.box.width))×\(Int(d.box.height)))" + ) + } } } - // Render output image with bounding boxes + // Render output image(s) if let imagePath = outputImage { - let outputURL = URL(fileURLWithPath: NSString(string: imagePath).expandingTildeInPath) - try renderDetections(onto: cgImage, detections: detections, saveTo: outputURL) + try renderOutputImages( + imagePath: imagePath, + loaded: loaded, + allDetections: allDetections + ) + } + } + + // MARK: - Output helpers + + private func writeJsonOutput(entries: [JSONImageResult], multiImage: Bool) throws { + guard let jsonPath = outputJson else { return } + let encoder = JSONEncoder() + encoder.outputFormatting = [.prettyPrinted, .sortedKeys] + let url = URL(fileURLWithPath: NSString(string: jsonPath).expandingTildeInPath) + let data: Data + if multiImage { + data = try encoder.encode(entries) + } else { + // Single-image: keep the prior schema (top-level array of detections). + data = try encoder.encode(entries.first?.detections ?? []) + } + try data.write(to: url) + print("Results written to \(jsonPath)") + } + + private func renderOutputImages( + imagePath: String, + loaded: [(path: String, image: CGImage)], + allDetections: [[DetectedObject]] + ) throws { + let expanded = NSString(string: imagePath).expandingTildeInPath + if loaded.count == 1 { + let outputURL = URL(fileURLWithPath: expanded) + try renderDetections(onto: loaded[0].image, detections: allDetections[0], saveTo: outputURL) print("Output image written to \(imagePath)") + return + } + + // Multi-image: treat --output-image as a directory. + let dirURL = URL(fileURLWithPath: expanded) + try FileManager.default.createDirectory(at: dirURL, withIntermediateDirectories: true) + for (entry, detections) in zip(loaded, allDetections) { + let stem = (entry.path as NSString).lastPathComponent + let stemNoExt = (stem as NSString).deletingPathExtension + let outURL = dirURL.appendingPathComponent("\(stemNoExt)_detections.png") + try renderDetections(onto: entry.image, detections: detections, saveTo: outURL) + print("Output image written to \(outURL.path)") } } } @@ -241,3 +338,8 @@ private struct JSONDetection: Codable { let x, y, width, height: Double } } + +private struct JSONImageResult: Codable { + let image: String + let detections: [JSONDetection] +} diff --git a/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift b/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift index 374b70f..10d41a5 100644 --- a/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift +++ b/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift @@ -212,4 +212,77 @@ struct ObjectDetectorTests { #expect(result.allSatisfy { $0.isFinite }) #expect(abs(result[0] - 0.5) < 1e-5) } + + // MARK: - Batch planning + + @Test("planBatch: single image, dynamic dims, no overrides → parameter defaults") + func planBatchSingleDefault() throws { + let p = DetectionParameters() + let plan = try ObjectDetector.planBatch( + expectedShape: [-1, 3, -1, -1], + imageSizes: [CGSize(width: 728, height: 408)], + parameters: .default + ) + #expect(plan == ObjectDetector.BatchPlan(batch: 1, height: p.inputHeight, width: p.inputWidth)) + } + + @Test("planBatch: multi-image, dynamic dims, no overrides → parameter defaults") + func planBatchMultiDefault() throws { + let p = DetectionParameters() + let plan = try ObjectDetector.planBatch( + expectedShape: [-1, 3, -1, -1], + imageSizes: [ + CGSize(width: 1280, height: 720), + CGSize(width: 600, height: 800), + CGSize(width: 1024, height: 768), + ], + parameters: .default + ) + #expect(plan == ObjectDetector.BatchPlan(batch: 3, height: p.inputHeight, width: p.inputWidth)) + } + + @Test("planBatch: multi-image, dynamic dims, explicit overrides win") + func planBatchMultiOverride() throws { + var params = DetectionParameters.default + params.inputHeight = 512 + params.inputWidth = 512 + let plan = try ObjectDetector.planBatch( + expectedShape: [-1, 3, -1, -1], + imageSizes: [ + CGSize(width: 1280, height: 720), + CGSize(width: 600, height: 800), + ], + parameters: params + ) + #expect(plan == ObjectDetector.BatchPlan(batch: 2, height: 512, width: 512)) + } + + @Test("planBatch: static spatial dims override parameter values silently") + func planBatchStaticSpatialIgnoresParams() throws { + // Static [1, 3, 800, 800] with mismatching params → planBatch uses + // the static dims; parameter values are silently ignored for fixed axes. + var params = DetectionParameters.default + params.inputHeight = 512 + params.inputWidth = 512 + let plan = try ObjectDetector.planBatch( + expectedShape: [1, 3, 800, 800], + imageSizes: [CGSize(width: 640, height: 480)], + parameters: params + ) + #expect(plan == ObjectDetector.BatchPlan(batch: 1, height: 800, width: 800)) + } + + @Test("planBatch: static batch mismatch throws (multi-image into batch=1 model)") + func planBatchStaticBatchMismatchThrows() { + #expect(throws: DetectionRuntimeError.self) { + try ObjectDetector.planBatch( + expectedShape: [1, 3, -1, -1], + imageSizes: [ + CGSize(width: 1000, height: 1000), + CGSize(width: 1000, height: 1000), + ], + parameters: .default + ) + } + } } From c87c09977ab3efb7b370532bd603b8acf69315b0 Mon Sep 17 00:00:00 2001 From: Kevin Cheng <59463423+kevchengcodes@users.noreply.github.com> Date: Fri, 12 Jun 2026 14:36:29 -0400 Subject: [PATCH 2/3] make warmup use same shape and update planBatch signature --- .../CoreAIObjectDetector/ObjectDetector.swift | 45 ++++++++++++------- .../object-detector/ObjectDetectionMain.swift | 2 +- .../ObjectDetectorTests.swift | 20 +++------ 3 files changed, 34 insertions(+), 33 deletions(-) diff --git a/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift b/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift index 045531b..acaacc0 100644 --- a/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift +++ b/swift/Sources/CoreAIObjectDetector/ObjectDetector.swift @@ -80,17 +80,29 @@ public struct ObjectDetector { // MARK: - Inference - /// Warm up the backend (e.g. trigger Metal kernel compilation) with a dummy pass. - public func warmup() async throws { + /// Warm up the backend (e.g. trigger Metal kernel compilation) with a dummy + /// pass at the same `(B, H, W)` that subsequent `detect()` calls will use. + /// For static-shape models the arguments are ignored — `planBatch` falls + /// back to the descriptor's fixed dims. + public func warmup(imageCount: Int = 1, parameters: DetectionParameters = .default) async throws { guard case .ndArray(let imageDescriptor) = functionDescriptor.inputDescriptor(of: imageInputName) else { throw DetectionRuntimeError.invalidConfiguration( "No array descriptor for image input '\(imageInputName)'" ) } - let defaults = DetectionParameters() - let warmupShape = zip(imageDescriptor.shape, [1, 3, defaults.inputHeight, defaults.inputWidth]) - .map { actual, fallback in actual >= 0 ? actual : fallback } - let resolved = imageDescriptor.resolvingDynamicDimensions(warmupShape) + let expectedShape = imageDescriptor.shape + guard expectedShape.count == 4 else { + throw DetectionRuntimeError.invalidConfiguration( + "Expected 4-dimensional input shape, got \(expectedShape.count)" + ) + } + let plan = try Self.planBatch( + expectedShape: expectedShape, + imageCount: imageCount, + parameters: parameters + ) + let resolved = imageDescriptor.resolvingDynamicDimensions( + [plan.batch, 3, plan.height, plan.width]) _ = try await function.run(inputs: [imageInputName: NDArray(descriptor: resolved)]) } @@ -142,7 +154,7 @@ public struct ObjectDetector { let plan = try Self.planBatch( expectedShape: expectedShape, - imageSizes: images.map { CGSize(width: $0.width, height: $0.height) }, + imageCount: images.count, parameters: parameters ) @@ -249,29 +261,28 @@ public struct ObjectDetector { /// Resolve the concrete `(B, H, W)` to bind the model with, given the /// model's expected shape (which may contain `-1` for dynamic dims), the - /// list of input image sizes, and the user's parameter overrides. + /// number of input images, and the user's parameter overrides. /// /// Resolution rules: - /// - **Batch**: always `images.count`. A static-batch model must match. + /// - **Batch**: always `imageCount`. A static-batch model must match. /// - **Spatial dims**: a dynamic `-1` dim is filled from /// `parameters.inputHeight` / `inputWidth`. A static dim is taken /// from the model descriptor (the parameters' values are ignored for /// that axis). static func planBatch( expectedShape: [Int], - imageSizes: [CGSize], + imageCount: Int, parameters: DetectionParameters ) throws -> BatchPlan { - guard !imageSizes.isEmpty else { - throw DetectionRuntimeError.invalidConfiguration("planBatch requires at least one image") + guard imageCount >= 1 else { + throw DetectionRuntimeError.invalidConfiguration("planBatch requires imageCount >= 1") } - // Resolve batch from image count; verify it matches a static batch dim. - let targetBatch = imageSizes.count + // Verify image count matches a static batch dim. let batchExpected = expectedShape[0] - if batchExpected >= 0 && batchExpected != targetBatch { + if batchExpected >= 0 && batchExpected != imageCount { throw DetectionRuntimeError.invalidConfiguration( - "Model expects fixed batch=\(batchExpected) but caller supplied \(targetBatch) image(s)" + "Model expects fixed batch=\(batchExpected) but caller supplied \(imageCount) image(s)" ) } @@ -280,7 +291,7 @@ public struct ObjectDetector { let height = heightExpected < 0 ? parameters.inputHeight : heightExpected let width = widthExpected < 0 ? parameters.inputWidth : widthExpected - return BatchPlan(batch: targetBatch, height: height, width: width) + return BatchPlan(batch: imageCount, height: height, width: width) } // MARK: - Name Discovery diff --git a/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift b/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift index 07ce518..2f897d9 100644 --- a/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift +++ b/swift/Sources/Tools/object-detector/ObjectDetectionMain.swift @@ -97,7 +97,7 @@ struct ObjectDetectorCLI: AsyncParsableCommand { if warmup { if verbose { print("Running warmup...") } - try await detector.warmup() + try await detector.warmup(imageCount: loaded.count, parameters: params) } if verbose { print("Running detection on \(loaded.count) image(s)...") } diff --git a/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift b/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift index 10d41a5..2c65a8d 100644 --- a/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift +++ b/swift/Tests/ObjectDetectorTests/ObjectDetectorTests.swift @@ -220,7 +220,7 @@ struct ObjectDetectorTests { let p = DetectionParameters() let plan = try ObjectDetector.planBatch( expectedShape: [-1, 3, -1, -1], - imageSizes: [CGSize(width: 728, height: 408)], + imageCount: 1, parameters: .default ) #expect(plan == ObjectDetector.BatchPlan(batch: 1, height: p.inputHeight, width: p.inputWidth)) @@ -231,11 +231,7 @@ struct ObjectDetectorTests { let p = DetectionParameters() let plan = try ObjectDetector.planBatch( expectedShape: [-1, 3, -1, -1], - imageSizes: [ - CGSize(width: 1280, height: 720), - CGSize(width: 600, height: 800), - CGSize(width: 1024, height: 768), - ], + imageCount: 3, parameters: .default ) #expect(plan == ObjectDetector.BatchPlan(batch: 3, height: p.inputHeight, width: p.inputWidth)) @@ -248,10 +244,7 @@ struct ObjectDetectorTests { params.inputWidth = 512 let plan = try ObjectDetector.planBatch( expectedShape: [-1, 3, -1, -1], - imageSizes: [ - CGSize(width: 1280, height: 720), - CGSize(width: 600, height: 800), - ], + imageCount: 2, parameters: params ) #expect(plan == ObjectDetector.BatchPlan(batch: 2, height: 512, width: 512)) @@ -266,7 +259,7 @@ struct ObjectDetectorTests { params.inputWidth = 512 let plan = try ObjectDetector.planBatch( expectedShape: [1, 3, 800, 800], - imageSizes: [CGSize(width: 640, height: 480)], + imageCount: 1, parameters: params ) #expect(plan == ObjectDetector.BatchPlan(batch: 1, height: 800, width: 800)) @@ -277,10 +270,7 @@ struct ObjectDetectorTests { #expect(throws: DetectionRuntimeError.self) { try ObjectDetector.planBatch( expectedShape: [1, 3, -1, -1], - imageSizes: [ - CGSize(width: 1000, height: 1000), - CGSize(width: 1000, height: 1000), - ], + imageCount: 2, parameters: .default ) } From c06360f66adb23ee0cdc20e103e84e59691f0d44 Mon Sep 17 00:00:00 2001 From: Kevin Cheng <59463423+kevchengcodes@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:01:56 -0400 Subject: [PATCH 3/3] update README --- models/yolo/README.md | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/models/yolo/README.md b/models/yolo/README.md index 5bc2108..97a5791 100644 --- a/models/yolo/README.md +++ b/models/yolo/README.md @@ -44,22 +44,37 @@ uv run export.py --help ### In your iOS and macOS applications ```swift -import ObjectDetector - -// Detection parameters -let params = DetectionParameters() +import CoreAIObjectDetector // Load directly from an exported .aimodel directory. let detector = try await ObjectDetector(resourcesAt: "coreai-models/exports/yolos-base_float32_static.aimodel") -// Run inference -let detections = try await detector.detect(image: cgImage, parameters: params) +// Single image, default parameters. +let detections = try await detector.detect(image: cgImage) + +// Batched detection. For dynamic-shape exports, optionally override the spatial dims +// on DetectionParameters; for static exports the values are ignored. +var params = DetectionParameters() +params.inputHeight = 800 +params.inputWidth = 1024 +let batchDetections = try await detector.detect(images: [imageA, imageB], parameters: params) + +// Optional: warm up the kernel for the exact (B, H, W) you'll run with. +try await detector.warmup(imageCount: 2, parameters: params) ``` ### On your Mac using built-in Command Line Tool ```bash +# Single image, static-shape model. swift run -c release object-detector --model path/to/exported_model.aimodel --image path/to/image.jpg + +# Batched detection on a dynamic-shape export, with optional, explicit input dims and warmup. +swift run -c release object-detector \ + --model path/to/dynamic.aimodel \ + --image a.jpg --image b.jpg \ + --input-height 800 --input-width 1024 \ + --warmup ``` [^1]: [Paper](https://arxiv.org/abs/2106.00666) · [HuggingFace](https://huggingface.co/hustvl/yolos-tiny)