Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions models/yolo/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,37 @@ uv run export.py --help
### In your iOS and macOS applications

```swift
import ObjectDetector

// Detection parameters
let params = DetectionParameters()
import CoreAIObjectDetector

// Load directly from an exported .aimodel directory.
let detector = try await ObjectDetector(resourcesAt: "coreai-models/exports/yolos-base_float32_static.aimodel")

// Run inference
let detections = try await detector.detect(image: cgImage, parameters: params)
// Single image, default parameters.
let detections = try await detector.detect(image: cgImage)

// Batched detection. For dynamic-shape exports, optionally override the spatial dims
// on DetectionParameters; for static exports the values are ignored.
var params = DetectionParameters()
params.inputHeight = 800
params.inputWidth = 1024
let batchDetections = try await detector.detect(images: [imageA, imageB], parameters: params)

// Optional: call once before the first detect() to warm up the kernel for the exact (B, H, W) you'll run with.
try await detector.warmup(imageCount: 2, parameters: params)
```

### On your Mac using the built-in command-line tool

```bash
# Single image, static-shape model.
swift run -c release object-detector --model path/to/exported_model.aimodel --image path/to/image.jpg

# Batched detection on a dynamic-shape export, with optional explicit input dims and warmup.
swift run -c release object-detector \
--model path/to/dynamic.aimodel \
--image a.jpg --image b.jpg \
--input-height 800 --input-width 1024 \
--warmup
```

[^1]: [Paper](https://arxiv.org/abs/2106.00666) · [HuggingFace](https://huggingface.co/hustvl/yolos-tiny)
16 changes: 15 additions & 1 deletion swift/Sources/CoreAIObjectDetector/DetectionOutputs.swift
Original file line number Diff line number Diff line change
Expand Up @@ -73,18 +73,32 @@ public struct DetectionParameters: Sendable {
/// When empty, labels default to "class_N".
public var classLabels: [Int: String]

/// Model input height. Only consulted when the model declares a dynamic
/// spatial dimension; ignored for static-shape models. Defaults to 800
/// (matches the YOLOS export's reference input and the training-time
/// canvas geometry).
public var inputHeight: Int

/// Model input width. Only consulted when the model declares a dynamic
/// spatial dimension; ignored for static-shape models. Defaults to 800.
public var inputWidth: Int

/// Creates a parameter set. Every argument has a default, so
/// `DetectionParameters()` yields the stock configuration.
///
/// - Parameters:
///   - threshold: Stored as `threshold`. Defaults to `0.3`.
///   - maxDetections: Stored as `maxDetections`. Defaults to `100`.
///   - normalizationMeans: Per-channel means handed to image preprocessing.
///   - normalizationStds: Per-channel standard deviations handed to image preprocessing.
///   - classLabels: Class-index to label mapping. Defaults to `ObjectDetectionLabels.coco`.
///   - inputHeight: Height used only when the model's height dimension is dynamic.
///     Defaults to `800`.
///   - inputWidth: Width used only when the model's width dimension is dynamic.
///     Defaults to `800`.
public init(
    threshold: Float = 0.3,
    maxDetections: Int = 100,
    normalizationMeans: (CGFloat, CGFloat, CGFloat) = (0.485, 0.456, 0.406),
    normalizationStds: (CGFloat, CGFloat, CGFloat) = (0.229, 0.224, 0.225),
    classLabels: [Int: String] = ObjectDetectionLabels.coco,
    inputHeight: Int = 800,
    inputWidth: Int = 800
) {
    self.threshold = threshold
    self.maxDetections = maxDetections
    self.normalizationMeans = normalizationMeans
    self.normalizationStds = normalizationStds
    self.classLabels = classLabels
    self.inputHeight = inputHeight
    self.inputWidth = inputWidth
}

public static let `default` = DetectionParameters()
Expand Down
203 changes: 173 additions & 30 deletions swift/Sources/CoreAIObjectDetector/ObjectDetector.swift
Original file line number Diff line number Diff line change
Expand Up @@ -80,75 +80,218 @@ public struct ObjectDetector {

// MARK: - Inference

/// Warm up the backend (e.g. trigger Metal kernel compilation) with a dummy
/// pass at the same `(B, H, W)` that subsequent `detect()` calls will use.
///
/// Shape resolution is delegated to `planBatch`, so the same rules apply as
/// for detection: static spatial dimensions come from the model descriptor
/// (the parameters' `inputHeight` / `inputWidth` are ignored for them), while
/// a static batch dimension is *not* ignored — it must equal `imageCount`.
///
/// - Parameters:
///   - imageCount: Batch size to warm up with. Must be at least 1 and must
///     match the model's batch dimension when that dimension is fixed.
///   - parameters: Supplies `inputHeight` / `inputWidth` for dynamic spatial
///     dimensions.
/// - Throws: `DetectionRuntimeError.invalidConfiguration` when the image
///   input has no array descriptor, is not 4-dimensional, or the batch plan
///   is rejected; rethrows any error from the forward pass.
public func warmup(imageCount: Int = 1, parameters: DetectionParameters = .default) async throws {
    guard case .ndArray(let imageDescriptor) = functionDescriptor.inputDescriptor(of: imageInputName) else {
        throw DetectionRuntimeError.invalidConfiguration(
            "No array descriptor for image input '\(imageInputName)'"
        )
    }
    let expectedShape = imageDescriptor.shape
    guard expectedShape.count == 4 else {
        throw DetectionRuntimeError.invalidConfiguration(
            "Expected 4-dimensional input shape, got \(expectedShape.count)"
        )
    }
    let plan = try Self.planBatch(
        expectedShape: expectedShape,
        imageCount: imageCount,
        parameters: parameters
    )
    let resolved = imageDescriptor.resolvingDynamicDimensions(
        [plan.batch, 3, plan.height, plan.width])
    // Only the side effect of running matters here; the outputs are discarded.
    _ = try await function.run(inputs: [imageInputName: NDArray(descriptor: resolved)])
}

/// Single-image detection with `DetectionParameters.default`.
///
/// Forwards to `detect(image:parameters:)`.
public func detect(image: CGImage) async throws -> [DetectedObject] {
    let defaults = DetectionParameters.default
    return try await detect(image: image, parameters: defaults)
}

/// Single-image detection with explicit parameters.
///
/// Wraps `image` in a one-element batch, runs the batched
/// `detect(images:parameters:)`, and returns the first result list
/// (or an empty list if none came back).
public func detect(image: CGImage, parameters: DetectionParameters) async throws -> [DetectedObject] {
    try await detect(images: [image], parameters: parameters).first ?? []
}

/// Batched detection with `DetectionParameters.default`.
///
/// Forwards to `detect(images:parameters:)`; one result list is returned per
/// input image.
public func detect(images: [CGImage]) async throws -> [[DetectedObject]] {
    let defaults = DetectionParameters.default
    return try await detect(images: images, parameters: defaults)
}

/// Detect objects across `images` in a single batched forward pass.
///
/// Pipeline:
/// 1. Resolve a batch plan `(B, H, W)` from the model descriptor and
///    parameters. Batch is always `images.count`. Dynamic spatial dims
///    are filled from `parameters.inputHeight` / `inputWidth` (which
///    have struct-level defaults).
/// 2. Preprocess each image sequentially into a `[3, H, W]` Float buffer.
/// 3. Concatenate the per-image buffers into the `[B, 3, H, W]` input
///    NDArray and run a single forward pass.
/// 4. Slice each batch slot from the outputs and decode independently,
///    returning `images.count` detection lists in input order.
///
/// - Parameters:
///   - images: Images to run through the model; must not be empty.
///   - parameters: Normalization, decoding and dynamic-shape settings.
/// - Returns: One detection list per input image, in input order.
/// - Throws: `DetectionRuntimeError.invalidConfiguration` when `images` is
///   empty, the image input has no 4-dimensional array descriptor, the batch
///   plan is rejected, an expected output is missing, or an output carries
///   fewer batch entries than there are images; rethrows preprocessing and
///   inference errors.
public func detect(images: [CGImage], parameters: DetectionParameters) async throws
    -> [[DetectedObject]]
{
    guard !images.isEmpty else {
        throw DetectionRuntimeError.invalidConfiguration("detect requires at least one image")
    }
    guard case .ndArray(let imageDescriptor) = functionDescriptor.inputDescriptor(of: imageInputName) else {
        throw DetectionRuntimeError.invalidConfiguration(
            "No array descriptor for image input '\(imageInputName)'"
        )
    }

    let expectedShape = imageDescriptor.shape
    guard expectedShape.count == 4 else {
        throw DetectionRuntimeError.invalidConfiguration(
            "Expected 4-dimensional input shape, got \(expectedShape.count)"
        )
    }

    // 1. Resolve the concrete (B, H, W) for this call.
    let plan = try Self.planBatch(
        expectedShape: expectedShape,
        imageCount: images.count,
        parameters: parameters
    )

    // 2. Preprocess each input image (sequential).
    let perImagePixels = try preprocessImages(images, plan: plan, parameters: parameters)

    // 3. Build the batched NDArray and run inference once.
    let resolvedDescriptor = imageDescriptor.resolvingDynamicDimensions(
        [plan.batch, 3, plan.height, plan.width])
    let imageArray = try buildInputNDArray(descriptor: resolvedDescriptor, perImagePixels: perImagePixels)

    var outputs = try await function.run(inputs: [imageInputName: imageArray])
    guard let logitsArray = outputs.remove(logitsOutputName)?.ndArray,
        let boxesArray = outputs.remove(boxesOutputName)?.ndArray
    else {
        throw DetectionRuntimeError.invalidConfiguration(
            "Missing one or more outputs after run."
        )
    }

    // Decoding slices the flattened outputs by batch slot; a leading axis
    // shorter than the image count would make those slices run out of bounds,
    // so surface it as an error instead of trapping.
    let logitsBatch = logitsArray.shape.first ?? 0
    let boxesBatch = boxesArray.shape.first ?? 0
    guard logitsBatch >= images.count, boxesBatch >= images.count else {
        throw DetectionRuntimeError.invalidConfiguration(
            "Model outputs cover fewer batch entries (logits: \(logitsBatch), boxes: \(boxesBatch)) "
                + "than the \(images.count) input image(s)"
        )
    }

    // 4. Decode each input image's batch slot.
    return Self.decodePerImage(
        logitsArray: logitsArray,
        boxesArray: boxesArray,
        images: images,
        parameters: parameters
    )
}

// MARK: - Preprocessing

/// Sequentially preprocess each image to a `[3 * H * W]` Float buffer at
/// the plan's target spatial dimensions.
///
/// A single `ImagePreprocessor` is configured once from the plan's size and
/// the parameters' normalization values, then reused for every image.
///
/// - Parameters:
///   - images: Images to convert, in batch order.
///   - plan: Supplies the target `width` / `height`.
///   - parameters: Supplies `normalizationMeans` / `normalizationStds`.
/// - Returns: One CHW Float buffer per image, in input order.
/// - Throws: Rethrows the first error raised by `preprocessCHW(cgImage:)`.
private func preprocessImages(
    _ images: [CGImage], plan: BatchPlan, parameters: DetectionParameters
) throws -> [[Float]] {
    let preprocessor = ImagePreprocessor(
        targetSize: CGSize(width: plan.width, height: plan.height),
        mean: parameters.normalizationMeans,
        std: parameters.normalizationStds,
        rescaleFactor: 1.0
    )
    return try images.map { try preprocessor.preprocessCHW(cgImage: $0) }
}

if imageDescriptor.scalarType == .float16 {
/// Build the input NDArray for a `[B, 3, H, W]` resolved descriptor by
/// concatenating per-image CHW buffers in batch order. Each per-image
/// entry is `3*H*W` floats; the buffers are written contiguously to match
/// row-major batch-leading layout.
///
/// - Parameters:
///   - descriptor: Fully resolved input descriptor; its `scalarType` selects
///     whether the data is written as `Float16` or `Float`.
///   - perImagePixels: One CHW Float buffer per image, in batch order.
/// - Returns: An NDArray created from `descriptor` and filled with the
///   concatenated pixel data.
/// - Throws: `DetectionRuntimeError.invalidConfiguration` when the model
///   needs `Float16` input on a platform where this build excludes `Float16`
///   (Intel macOS / Mac Catalyst).
private func buildInputNDArray(
    descriptor: NDArrayDescriptor, perImagePixels: [[Float]]
) throws -> NDArray {
    var imageArray = NDArray(descriptor: descriptor)
    if descriptor.scalarType == .float16 {
        #if !((os(macOS) || targetEnvironment(macCatalyst)) && arch(x86_64))
        // Convert while flattening so no intermediate [Float] copy is made.
        fillNDArray(&imageArray, as: Float16.self, with: perImagePixels.joined().map(Float16.init))
        #else
        // This method already throws, so report the unsupported configuration
        // to the caller instead of terminating the process.
        throw DetectionRuntimeError.invalidConfiguration("Float16 is not supported on this platform")
        #endif
    } else {
        fillNDArray(&imageArray, as: Float.self, with: Array(perImagePixels.joined()))
    }
    return imageArray
}

// Run inference and extract outputs
var outputs = try await function.run(inputs: [imageInputName: imageArray])
guard let logitsArray = outputs.remove(logitsOutputName)?.ndArray,
let boxesArray = outputs.remove(boxesOutputName)?.ndArray
else {
// MARK: - Output decoding

/// Split the batched model outputs into per-image slices and decode each one.
///
/// Both arrays are flattened to Float, and the leading axis of each shape is
/// treated as the batch axis: slot `i` owns the `i`-th run of
/// `product(shape.dropFirst())` elements. Every slot is decoded against the
/// pixel size of the image at the same index.
///
/// - Parameters:
///   - logitsArray: Batched logits output.
///   - boxesArray: Batched boxes output.
///   - images: The input images, in the order they were batched.
///   - parameters: Forwarded unchanged to `DetectionPostprocessor.decode`.
/// - Returns: One detection list per element of `images`, in the same order.
private static func decodePerImage(
    logitsArray: NDArray,
    boxesArray: NDArray,
    images: [CGImage],
    parameters: DetectionParameters
) -> [[DetectedObject]] {
    let trailingLogitsDims = logitsArray.shape.dropFirst()
    let trailingBoxesDims = boxesArray.shape.dropFirst()
    let flatLogits = flattenAsFloat(logitsArray)
    let flatBoxes = flattenAsFloat(boxesArray)

    // Number of scalars that belong to a single batch slot in each output.
    let logitsStride = trailingLogitsDims.reduce(1, *)
    let boxesStride = trailingBoxesDims.reduce(1, *)
    // Each slot is presented to the decoder as a batch of one.
    let slotLogitsShape = [1] + trailingLogitsDims

    var decoded: [[DetectedObject]] = []
    decoded.reserveCapacity(images.count)
    for (slot, image) in images.enumerated() {
        let logitsStart = slot * logitsStride
        let boxesStart = slot * boxesStride
        let slotOutput = DetectionOutput(
            logits: Array(flatLogits[logitsStart..<(logitsStart + logitsStride)]),
            logitsShape: slotLogitsShape,
            predictedBoxes: Array(flatBoxes[boxesStart..<(boxesStart + boxesStride)])
        )
        let sourceSize = CGSize(width: image.width, height: image.height)
        decoded.append(
            DetectionPostprocessor.decode(
                output: slotOutput,
                inputSize: sourceSize,
                parameters: parameters
            )
        )
    }
    return decoded
}

// MARK: - Batch planning

/// Concrete `(batch, height, width)` triple used to bind the model input for
/// one call. `Equatable` so two plans can be compared directly.
struct BatchPlan: Equatable {
    let batch, height, width: Int
}

/// Resolve the concrete `(B, H, W)` to bind the model with, given the
/// model's expected shape (which may contain `-1` for dynamic dims), the
/// number of input images, and the user's parameter overrides.
///
/// Resolution rules:
/// - **Batch**: always `imageCount`. A static-batch model must match.
/// - **Spatial dims**: a dynamic `-1` dim is filled from
///   `parameters.inputHeight` / `inputWidth`. A static dim is taken
///   from the model descriptor (the parameters' values are ignored for
///   that axis).
///
/// - Parameters:
///   - expectedShape: The model's 4-element input shape; indices 0, 2 and 3
///     are read as batch, height and width. Negative entries mark dynamic
///     dimensions.
///   - imageCount: Number of images in the batch; must be at least 1.
///   - parameters: Source of `inputHeight` / `inputWidth` for dynamic dims.
/// - Returns: The resolved plan.
/// - Throws: `DetectionRuntimeError.invalidConfiguration` when `imageCount`
///   is below 1, `expectedShape` is not 4-dimensional, a fixed batch
///   dimension differs from `imageCount`, or a resolved spatial dimension is
///   not positive.
static func planBatch(
    expectedShape: [Int],
    imageCount: Int,
    parameters: DetectionParameters
) throws -> BatchPlan {
    guard imageCount >= 1 else {
        throw DetectionRuntimeError.invalidConfiguration("planBatch requires imageCount >= 1")
    }
    // Checked here as well as by callers so the subscripts below cannot trap.
    guard expectedShape.count == 4 else {
        throw DetectionRuntimeError.invalidConfiguration(
            "Expected 4-dimensional input shape, got \(expectedShape.count)"
        )
    }

    // Verify image count matches a static batch dim.
    let batchExpected = expectedShape[0]
    if batchExpected >= 0 && batchExpected != imageCount {
        throw DetectionRuntimeError.invalidConfiguration(
            "Model expects fixed batch=\(batchExpected) but caller supplied \(imageCount) image(s)"
        )
    }

    let heightExpected = expectedShape[2]
    let widthExpected = expectedShape[3]
    let height = heightExpected < 0 ? parameters.inputHeight : heightExpected
    let width = widthExpected < 0 ? parameters.inputWidth : widthExpected

    // Reject zero or negative sizes (e.g. a bad caller override) before they
    // are used to size buffers and bind the model.
    guard height > 0, width > 0 else {
        throw DetectionRuntimeError.invalidConfiguration(
            "Resolved input size must be positive, got height=\(height), width=\(width)"
        )
    }

    return BatchPlan(batch: imageCount, height: height, width: width)
}

// MARK: - Name Discovery
Expand Down
Loading