From 1048e74a91603d49460a9ff35b1220643b5ee05b Mon Sep 17 00:00:00 2001 From: "M. Samil Atesoglu" Date: Sun, 19 Apr 2026 22:09:52 +0300 Subject: [PATCH 1/4] 1.4 macOS port --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index a9f8d80..a9e6c80 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ __pycache__ *_generated.h *.ptx -*.egg-info \ No newline at end of file +*.egg-info +.DS_Store From 16efc3fa3c22442fca37e53aa746e4dda6c16a4f Mon Sep 17 00:00:00 2001 From: "M. Samil Atesoglu" Date: Tue, 21 Apr 2026 16:15:55 +0300 Subject: [PATCH 2/4] System audio input: split into platform backends, add macOS ScreenCaptureKit backend, stabilize node status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Splits SystemAudioInput's monolithic WASAPI-in-one-cpp into a platform- agnostic interface (ISystemAudioCapture) with shared ring buffer / int24 packing in SystemAudioCaptureBase, and per-OS concrete backends: - Windows: WASAPI loopback (moved out of SystemAudioInput.cpp) - macOS 13+: ScreenCaptureKit via SCStream with capturesAudio = YES The macOS backend runs all SCShareableContent / SCStream setup inside the getShareableContentWithCompletionHandler block — smuggling the returned SCShareableContent out across the block boundary crashed objc_msgSend in the caller's frame on macOS 26, and the framework guarantees the object is alive only inside the completion. Waits on ScreenCaptureKit completions pump the main CFRunLoop instead of blocking on dispatch_semaphore; that fixes both the lifetime crash and the main-queue deadlock that occurred when libxpc delivered completions on dispatch_get_main_queue() while the main thread was parked on the semaphore. SystemAudioInput no longer pushes status messages from the Active pin watcher or OnPathStart, and ExecuteNode no longer flips "Capturing audio" vs "Audio capture is ready" every frame based on transient buffer state. 
Three writers competing over one LastStatusMessage slot caused a visible flap on the editor status line; ExecuteNode now owns the status and only publishes on real state transitions. OnPathStart also stopped forcing a NeedsReinitialize = true that destroyed and re-created the backend on every scheduler-driven path restart; the SampleRate / ChannelCount watchers cover actual format changes. --- CMakeLists.txt | 29 +- Source/SystemAudioCapture.cpp | 116 +++++++ Source/SystemAudioCapture.h | 79 +++++ Source/SystemAudioCaptureMac.mm | 438 ++++++++++++++++++++++++++ Source/SystemAudioCaptureWindows.cpp | 194 ++++++++++++ Source/SystemAudioInput.cpp | 441 +++++---------------------- 6 files changed, 935 insertions(+), 362 deletions(-) create mode 100644 Source/SystemAudioCapture.cpp create mode 100644 Source/SystemAudioCapture.h create mode 100644 Source/SystemAudioCaptureMac.mm create mode 100644 Source/SystemAudioCaptureWindows.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index f214161..a978fc4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,4 +3,31 @@ target_compile_definitions(${NOS_PLUGIN_TARGET} PRIVATE NOS_DISABLE_DEPRECATED) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/External/AudioFile) -target_link_libraries(${NOS_PLUGIN_TARGET} PRIVATE AudioFile) \ No newline at end of file +target_link_libraries(${NOS_PLUGIN_TARGET} PRIVATE AudioFile) + +# SystemAudioCapture has one backend per OS. The Source/ glob picks up every +# .cpp unconditionally, so exclude the platform files that don't match the +# current target to keep the build clean, then add the .mm file back on +# macOS (the glob doesn't match .mm extensions). +if (NOT WIN32) + set_source_files_properties( + ${CMAKE_CURRENT_SOURCE_DIR}/Source/SystemAudioCaptureWindows.cpp + PROPERTIES HEADER_FILE_ONLY ON) +endif() + +if (APPLE) + target_sources(${NOS_PLUGIN_TARGET} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/Source/SystemAudioCaptureMac.mm) + # ARC keeps retain/release of SCStream/SCContentFilter/etc. 
out of our + # hair; the other .mm files in the engine deliberately avoid ARC, but + # here we hold Obj-C objects across method boundaries so it's worth it. + set_source_files_properties( + ${CMAKE_CURRENT_SOURCE_DIR}/Source/SystemAudioCaptureMac.mm + PROPERTIES COMPILE_FLAGS "-fobjc-arc") + target_link_libraries(${NOS_PLUGIN_TARGET} PRIVATE + "-framework Foundation" + "-framework CoreMedia" + "-framework CoreAudio" + "-framework AudioToolbox" + "-framework ScreenCaptureKit") +endif() diff --git a/Source/SystemAudioCapture.cpp b/Source/SystemAudioCapture.cpp new file mode 100644 index 0000000..3edcb5e --- /dev/null +++ b/Source/SystemAudioCapture.cpp @@ -0,0 +1,116 @@ +// Copyright MediaZ Teknoloji A.S. All Rights Reserved. + +#include "SystemAudioCapture.h" + +#include "nosAudio/AudioConversions.hpp" + +#include + +namespace nos::audio +{ +namespace +{ +// Keep at most this many seconds of buffered source samples to stop a stalled +// consumer (or a hung execution graph) from growing memory unbounded. The +// existing WASAPI backend used 5 seconds; we keep the same budget for parity. +constexpr uint32_t kMaxBufferedSeconds = 5; +} // namespace + +void SystemAudioCaptureBase::ResetBuffer() +{ + std::lock_guard lock(BufferMutex); + CapturedSamples.clear(); +} + +void SystemAudioCaptureBase::PushInterleavedSamples(const float* samples, + uint32_t frameCount, + uint32_t sourceSampleRate, + uint8_t sourceChannelCount) +{ + if (!samples || frameCount == 0 || sourceChannelCount == 0 || sourceSampleRate == 0) + return; + + std::lock_guard lock(BufferMutex); + + // Format renegotiated mid-stream — drop stale samples so the resampler + // doesn't mix two layouts into a single read. 
+ if (sourceSampleRate != SourceSampleRate || sourceChannelCount != SourceChannelCount) + { + CapturedSamples.clear(); + SourceSampleRate = sourceSampleRate; + SourceChannelCount = sourceChannelCount; + } + + const size_t sampleCount = static_cast(frameCount) * sourceChannelCount; + CapturedSamples.insert(CapturedSamples.end(), samples, samples + sampleCount); + + const size_t maxSamples = static_cast(SourceSampleRate) * SourceChannelCount * kMaxBufferedSeconds; + if (CapturedSamples.size() > maxSamples) + { + const size_t overflow = CapturedSamples.size() - maxSamples; + CapturedSamples.erase(CapturedSamples.begin(), CapturedSamples.begin() + overflow); + } +} + +bool SystemAudioCaptureBase::ReadSamples(int32_t* outBuffer, uint32_t numSamples, uint8_t targetChannels, float gain) +{ + std::unique_lock lock(BufferMutex); // same mutex PushInterleavedSamples takes, so the buffer and source format are read consistently + + // Nothing has been pushed yet, or no target rate is configured (the ratio below would divide by zero): emit silence. + if (SourceSampleRate == 0 || SourceChannelCount == 0 || TargetSampleRate == 0) + { + for (uint32_t i = 0; i < numSamples * targetChannels; ++i) + outBuffer[i] = 0; + return false; + } + + const float sampleRateRatio = static_cast<float>(SourceSampleRate) / static_cast<float>(TargetSampleRate); + const uint32_t sourceSamplesNeeded = static_cast<uint32_t>(numSamples * sampleRateRatio); + + if (CapturedSamples.size() < static_cast<size_t>(sourceSamplesNeeded) * SourceChannelCount) + { + for (uint32_t i = 0; i < numSamples * targetChannels; ++i) + outBuffer[i] = 0; + return false; + } + + // Linear interpolation resample + channel map + gain + int24 pack. + for (uint32_t i = 0; i < numSamples; ++i) + { + const float sourceIndex = i * sampleRateRatio; + const uint32_t sourceIndexInt = static_cast<uint32_t>(sourceIndex); + const float frac = sourceIndex - sourceIndexInt; + + for (uint8_t ch = 0; ch < targetChannels; ++ch) + { + // Fold extra target channels onto source channel 0 (mono fallback). + const uint8_t sourceChannel = (ch < SourceChannelCount) ?
ch : 0; + + const size_t idx1 = static_cast<size_t>(sourceIndexInt) * SourceChannelCount + sourceChannel; + // Same channel, next frame; on the last buffered frame reuse idx1, because clamping to size() - 1 would blend in another channel's sample. + const size_t idx2 = (idx1 + SourceChannelCount < CapturedSamples.size()) ? idx1 + SourceChannelCount : idx1; + + if (idx1 < CapturedSamples.size() && idx2 < CapturedSamples.size()) + { + const float sample1 = CapturedSamples[idx1]; + const float sample2 = CapturedSamples[idx2]; + float interpolated = sample1 + (sample2 - sample1) * frac; + interpolated *= gain; + outBuffer[i * targetChannels + ch] = FloatToShiftedInt24(interpolated); + } + else + { + outBuffer[i * targetChannels + ch] = 0; + } + } + } + + const size_t samplesToRemove = static_cast<size_t>(sourceSamplesNeeded) * SourceChannelCount; + if (samplesToRemove < CapturedSamples.size()) + CapturedSamples.erase(CapturedSamples.begin(), CapturedSamples.begin() + samplesToRemove); + else + CapturedSamples.clear(); + + return true; +} +} // namespace nos::audio diff --git a/Source/SystemAudioCapture.h b/Source/SystemAudioCapture.h new file mode 100644 index 0000000..2921b80 --- /dev/null +++ b/Source/SystemAudioCapture.h @@ -0,0 +1,79 @@ +// Copyright MediaZ Teknoloji A.S. All Rights Reserved. + +#pragma once + +#include +#include +#include +#include +#include + +namespace nos::audio +{ +// Platform-agnostic contract for capturing the host's system audio output +// (loopback). One concrete backend is linked per target OS; the node code +// only ever sees this interface. See the *Windows.cpp / *Mac.mm files for +// the actual WASAPI / ScreenCaptureKit implementations. +class ISystemAudioCapture +{ +public: + virtual ~ISystemAudioCapture() = default; + + // Prepare the backend for the node's requested target format. On failure + // returns false and leaves a user-readable reason in GetLastError(). + virtual bool Initialize(uint32_t sampleRate, uint8_t channelCount) = 0; + + // Begin producing samples into the internal ring buffer. Idempotent. + virtual bool Start() = 0; + + // Stop the backend and drain its worker.
Safe to call before Start() or + // more than once. + virtual void Stop() = 0; + + // Pull numSamples interleaved shifted-int24 frames into outBuffer at the + // requested channel layout and gain. Returns true when real audio was + // delivered, false when the buffer was filled with silence because the + // backend has not produced enough data yet. + virtual bool ReadSamples(int32_t* outBuffer, uint32_t numSamples, uint8_t targetChannels, float gain) = 0; + + virtual const std::string& GetDeviceName() const = 0; + virtual const std::string& GetLastError() const = 0; + + // Platform factory — defined once per OS in its own TU. + static std::unique_ptr Create(); +}; + +// Shared ring-buffer + resampler scaffolding. Platform backends only have to +// push interleaved float frames via PushInterleavedSamples; the base handles +// rate conversion, gain, and int24 packing so the WASAPI / ScreenCaptureKit +// files can stay focused on their respective native API dances. +class SystemAudioCaptureBase : public ISystemAudioCapture +{ +public: + bool ReadSamples(int32_t* outBuffer, uint32_t numSamples, uint8_t targetChannels, float gain) override; + const std::string& GetDeviceName() const override { return DeviceName; } + const std::string& GetLastError() const override { return LastError; } + +protected: + // Feed interleaved Float32 samples from the platform capture callback. + // If sourceSampleRate or sourceChannelCount differ from the previous call + // the internal buffer is reset, so format renegotiation mid-stream can't + // produce torn audio. + void PushInterleavedSamples(const float* samples, + uint32_t frameCount, + uint32_t sourceSampleRate, + uint8_t sourceChannelCount); + + // Drop any buffered samples — used when the node reinitializes. 
+ void ResetBuffer(); + + std::mutex BufferMutex; + std::vector CapturedSamples; + uint32_t SourceSampleRate = 0; + uint8_t SourceChannelCount = 0; + uint32_t TargetSampleRate = 0; + uint8_t TargetChannelCount = 0; + std::string DeviceName; + std::string LastError; +}; +} // namespace nos::audio diff --git a/Source/SystemAudioCaptureMac.mm b/Source/SystemAudioCaptureMac.mm new file mode 100644 index 0000000..71e3e5b --- /dev/null +++ b/Source/SystemAudioCaptureMac.mm @@ -0,0 +1,438 @@ +// Copyright MediaZ Teknoloji A.S. All Rights Reserved. + +#ifdef __APPLE__ + +#include "SystemAudioCapture.h" + +#import +#import +#import +#import +#import + +#include + +#include + +#include +#include +#include + +// ScreenCaptureKit is the only first-party macOS API for capturing the system +// audio output without installing a virtual device. It requires macOS 13+ and +// the Screen Recording TCC permission — macOS prompts the user automatically +// on the first SCShareableContent request. On denial or pre-13 hosts we fill +// LastError with a human-readable reason so the node surfaces it in the +// editor status area. +// +// Thread model: every public entry point (Initialize/Start/Stop) is invoked +// on an engine runner thread. Apple's ScreenCaptureKit docs DON'T formally +// require main thread, but in practice calling SCShareableContent / SCStream +// cold from a worker is known to hang or crash (FB12114396, FB15779754, +// and community report nonstrict-hq/SCShareableContent-hangs-sample). The +// root cause is that libxpc delivers replies from tccd / replayd / the +// WindowServer to dispatch_get_main_queue() by default, and CFRunLoop on +// main is the canonical place those Mach-port sources get serviced — plus +// Obj-C +initialize for these frameworks assumes the main runloop is live. +// Apple DTS has confirmed on-record that their sample code "just happened +// to be called on main" (developer.apple.com/forums/thread/735651). 
We +// funnel the Obj-C work through nosEngine.RunOnMainThread so we don't +// depend on that accident — the same pattern nos.display uses for AppKit. + +namespace nos::audio +{ +// Forward-declared here so the Obj-C delegate below can hold a raw pointer +// back to it. The anonymous-namespace pattern doesn't work — the Obj-C +// @property needs a named, externally-addressable type. +class ScreenCaptureKitCapture; +} // namespace nos::audio + +// Obj-C adapter for SCStream callbacks. The SCStream delegate is a separate +// protocol from the SCStreamOutput sample handler, but ScreenCaptureKit lets +// us implement both on the same object, which keeps ownership simple. +API_AVAILABLE(macos(13.0)) +@interface NosAudioStreamOutput : NSObject +@property (nonatomic, assign) nos::audio::ScreenCaptureKitCapture* backend; +@end + +namespace nos::audio +{ +class ScreenCaptureKitCapture : public SystemAudioCaptureBase +{ +public: + ~ScreenCaptureKitCapture() override { Stop(); } + + bool Initialize(uint32_t sampleRate, uint8_t channelCount) override; + bool Start() override; + void Stop() override; + + // Invoked from the SCStreamOutput delegate on the capture queue. + void OnAudioSampleBuffer(CMSampleBufferRef sampleBuffer); + +private: + // Wait this long for the async ScreenCaptureKit handshake / teardown + // before giving up. Initialize / Start / Stop are called from the editor + // execution thread, so we cap the wait to keep a hung system-service + // from stalling the whole graph. + static constexpr uint64_t kAsyncTimeoutSeconds = 5; + + SCStream* Stream API_AVAILABLE(macos(13.0)) = nil; + NosAudioStreamOutput* Delegate API_AVAILABLE(macos(13.0)) = nil; + dispatch_queue_t AudioQueue = nullptr; + std::atomic Running{false}; + + // Reused each callback so we don't allocate in the capture hot path when + // ScreenCaptureKit delivers planar (non-interleaved) float samples. 
+ std::vector InterleaveScratch; +}; + +namespace +{ +// Synchronously bounce fn onto the main thread via the engine's dispatcher. +// If the host engine predates plugin API 41.1 (no RunOnMainThread exposed), +// we log and fall through — running on the worker anyway is the best we can +// offer, and users on old engines will see the same intermittent crash they +// would have seen before this plugin existed. +void RunOnMainThreadSync(std::function fn) +{ + if (!fn) + return; + if ([NSThread isMainThread]) + { + fn(); + return; + } + if (::nosEngine.RunOnMainThread) + { + ::nosEngine.RunOnMainThread( + [](void* p) { (*static_cast*>(p))(); }, + &fn, + NOS_TRUE); + return; + } + static bool warned = false; + if (!warned) + { + ::nosEngine.LogE("nos.audio: host engine has no RunOnMainThread; ScreenCaptureKit calls may crash."); + warned = true; + } + fn(); +} + +// Pump the main runloop up to `timeoutSeconds` while `*done` is still false. +// Two reasons this is preferred over dispatch_semaphore_wait on main: +// 1. ScreenCaptureKit completion handlers are delivered through libxpc, and +// libxpc replies default to dispatch_get_main_queue(). Blocking main +// with a semaphore deadlocks: the main queue can't service the reply +// because we're sitting on it. +// 2. We stay on a runloop pass cadence, so any plugin that queued work on +// the main thread via MainThreadDispatcher (e.g. the display plugin) +// doesn't starve while we wait. +// Returns true if `done` flipped before the deadline. 
+bool PumpMainRunLoopUntil(const bool& done, double timeoutSeconds) +{ + const CFAbsoluteTime deadline = CFAbsoluteTimeGetCurrent() + timeoutSeconds; + while (!done) + { + const CFAbsoluteTime remaining = deadline - CFAbsoluteTimeGetCurrent(); + if (remaining <= 0.0) + return false; + const CFTimeInterval step = std::min(0.05, remaining); + CFRunLoopRunInMode(kCFRunLoopDefaultMode, step, /*returnAfterSourceHandled*/ true); + } + return true; +} +} // namespace + +bool ScreenCaptureKitCapture::Initialize(uint32_t sampleRate, uint8_t channelCount) +{ + if (@available(macOS 13.0, *)) + { + TargetSampleRate = sampleRate; + TargetChannelCount = channelCount; + + bool ok = false; + RunOnMainThreadSync([&] { + @autoreleasepool + { + // Everything that touches SCShareableContent / SCDisplay lives + // inside the completion handler, where the framework guarantees + // those objects are alive. Propagating them out through ARC + + // __block + dispatch_semaphore_wait crashes intermittently on + // macOS 26 (objc_msgSend on an apparently-valid SCShareableContent + // in the caller's frame). Building the SCStream here avoids that. + __block bool done = false; + __block bool fetchOk = false; + __block std::string fetchError; + [SCShareableContent getShareableContentWithCompletionHandler:^(SCShareableContent* content, NSError* error) { + @autoreleasepool + { + if (error) + { + NSString* desc = error.localizedDescription; + const char* utf8 = desc.UTF8String; + fetchError = utf8 ? utf8 : "ScreenCaptureKit returned an unspecified error"; + done = true; + return; + } + if (!content || content.displays.count == 0) + { + fetchError = "Screen Recording permission required. 
Enable Nodos in System Settings → " + "Privacy & Security → Screen & System Audio Recording, then restart the editor."; + done = true; + return; + } + + SCDisplay* display = content.displays.firstObject; + DeviceName = std::string("System Audio (Display ") + std::to_string(display.displayID) + ")"; + + SCContentFilter* filter = [[SCContentFilter alloc] initWithDisplay:display excludingWindows:@[]]; + SCStreamConfiguration* config = [[SCStreamConfiguration alloc] init]; + config.capturesAudio = YES; + config.excludesCurrentProcessAudio = NO; + config.sampleRate = (NSInteger)sampleRate; + config.channelCount = (NSInteger)channelCount; + // ScreenCaptureKit on macOS 13–14 still requires a video track + // to be configured even for audio-only capture. A 2×2, 1 fps + // track is the cheapest legal configuration and we never attach + // a video output, so the frames are dropped by the framework. + config.width = 2; + config.height = 2; + config.minimumFrameInterval = CMTimeMake(1, 1); + config.queueDepth = 6; + + Delegate = [[NosAudioStreamOutput alloc] init]; + Delegate.backend = this; + + Stream = [[SCStream alloc] initWithFilter:filter configuration:config delegate:Delegate]; + + AudioQueue = dispatch_queue_create("dev.nodos.audio.SystemAudioCapture", DISPATCH_QUEUE_SERIAL); + + NSError* attachError = nil; + const BOOL attached = [Stream addStreamOutput:Delegate + type:SCStreamOutputTypeAudio + sampleHandlerQueue:AudioQueue + error:&attachError]; + if (!attached) + { + const char* utf8 = attachError.localizedDescription.UTF8String; + fetchError = utf8 ? 
utf8 : "Failed to attach audio stream output"; + Stream = nil; + Delegate = nil; + AudioQueue = nullptr; + done = true; + return; + } + + fetchOk = true; + done = true; + } + }]; + + if (!PumpMainRunLoopUntil(done, 5.0)) + { + LastError = "Timed out waiting for shareable content"; + return; + } + if (!fetchOk) + { + LastError = std::move(fetchError); + return; + } + LastError.clear(); + ok = true; + } + }); + return ok; + } + + LastError = "System audio capture requires macOS 13 (Ventura) or later"; + return false; +} + +bool ScreenCaptureKitCapture::Start() +{ + if (Running) + return true; + if (@available(macOS 13.0, *)) + { + if (!Stream) + return false; + + bool ok = false; + RunOnMainThreadSync([&] { + @autoreleasepool + { + __block bool done = false; + __block std::string startErrorMessage; + __block bool hasStartError = false; + [Stream startCaptureWithCompletionHandler:^(NSError* error) { + if (error) + { + hasStartError = true; + NSString* desc = error.localizedDescription; + const char* utf8 = desc.UTF8String; + startErrorMessage = utf8 ? 
utf8 : "ScreenCaptureKit returned an unspecified error"; + } + done = true; + }]; + if (!PumpMainRunLoopUntil(done, kAsyncTimeoutSeconds)) + { + LastError = "Timed out starting ScreenCaptureKit stream"; + return; + } + if (hasStartError) + { + LastError = std::move(startErrorMessage); + return; + } + Running = true; + ok = true; + } + }); + return ok; + } + return false; +} + +void ScreenCaptureKitCapture::Stop() +{ + if (!Running && !Stream) + return; + + if (@available(macOS 13.0, *)) + { + RunOnMainThreadSync([&] { + @autoreleasepool + { + if (Stream && Running) + { + __block bool done = false; + [Stream stopCaptureWithCompletionHandler:^(NSError* /*error*/) { + done = true; + }]; + PumpMainRunLoopUntil(done, kAsyncTimeoutSeconds); + } + if (Delegate) + Delegate.backend = nullptr; + Stream = nil; + Delegate = nil; + } + }); + } + + AudioQueue = nullptr; + Running = false; + ResetBuffer(); +} + +void ScreenCaptureKitCapture::OnAudioSampleBuffer(CMSampleBufferRef sampleBuffer) +{ + if (!sampleBuffer || !CMSampleBufferIsValid(sampleBuffer) || !CMSampleBufferDataIsReady(sampleBuffer)) + return; + + CMFormatDescriptionRef desc = CMSampleBufferGetFormatDescription(sampleBuffer); + if (!desc) + return; + const AudioStreamBasicDescription* asbd = CMAudioFormatDescriptionGetStreamBasicDescription(desc); + if (!asbd || asbd->mBitsPerChannel != 32 || !(asbd->mFormatFlags & kAudioFormatFlagIsFloat)) + { + // ScreenCaptureKit always delivers Float32 PCM per the docs; bail + // safely if a future macOS changes that so we don't interpret the + // bytes as the wrong type. + return; + } + + const uint32_t channels = asbd->mChannelsPerFrame; + const uint32_t sourceRate = static_cast<uint32_t>(asbd->mSampleRate); + if (channels == 0 || sourceRate == 0) + return; + + const CMItemCount frameCount = CMSampleBufferGetNumSamples(sampleBuffer); + if (frameCount <= 0) // CMItemCount is signed; a negative count must not reach the size math below + return; + + // Two-phase pull: first call sizes the AudioBufferList, second copies.
+ size_t bufferListSize = 0; + OSStatus status = CMSampleBufferGetAudioBufferListWithRetainedBlockBuffer( + sampleBuffer, + &bufferListSize, + nullptr, + 0, + kCFAllocatorSystemDefault, + kCFAllocatorSystemDefault, + kCMSampleBufferFlag_AudioBufferList_Assure16ByteAlignment, + nullptr); + if (status != noErr || bufferListSize == 0) + return; + + std::vector<uint8_t> storage(bufferListSize); + auto* list = reinterpret_cast<AudioBufferList*>(storage.data()); + CMBlockBufferRef blockBuffer = nullptr; + status = CMSampleBufferGetAudioBufferListWithRetainedBlockBuffer( + sampleBuffer, + nullptr, + list, + bufferListSize, + kCFAllocatorSystemDefault, + kCFAllocatorSystemDefault, + kCMSampleBufferFlag_AudioBufferList_Assure16ByteAlignment, + &blockBuffer); + if (status != noErr || !blockBuffer) + return; + + const bool nonInterleaved = (asbd->mFormatFlags & kAudioFormatFlagIsNonInterleaved) != 0; + if (nonInterleaved && list->mNumberBuffers == channels) + { + InterleaveScratch.assign(static_cast<size_t>(frameCount) * channels, 0.0f); // zero-fill so a skipped channel is silence, not the previous callback's samples + for (uint32_t ch = 0; ch < channels; ++ch) + { + const float* src = reinterpret_cast<const float*>(list->mBuffers[ch].mData); + if (!src || list->mBuffers[ch].mDataByteSize < static_cast<size_t>(frameCount) * sizeof(float)) + continue; + for (CMItemCount f = 0; f < frameCount; ++f) + InterleaveScratch[static_cast<size_t>(f) * channels + ch] = src[f]; + } + PushInterleavedSamples(InterleaveScratch.data(), + static_cast<uint32_t>(frameCount), + sourceRate, + static_cast<uint8_t>(channels)); + } + else if (!nonInterleaved && list->mNumberBuffers >= 1) // planar data with an unexpected buffer count is dropped: buffer 0 then holds a single channel, not frameCount * channels floats + { + const float* src = reinterpret_cast<const float*>(list->mBuffers[0].mData); + if (src && list->mBuffers[0].mDataByteSize >= static_cast<size_t>(frameCount) * channels * sizeof(float)) + PushInterleavedSamples(src, + static_cast<uint32_t>(frameCount), + sourceRate, + static_cast<uint8_t>(channels)); + } + + CFRelease(blockBuffer); +} + +std::unique_ptr ISystemAudioCapture::Create() +{ + return std::make_unique(); +} +} // namespace nos::audio + +@implementation NosAudioStreamOutput +- (void)stream:(SCStream*)stream didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer ofType:(SCStreamOutputType)type +{ + if (type != SCStreamOutputTypeAudio) + return; + auto* backend = self.backend; + if (backend) +
backend->OnAudioSampleBuffer(sampleBuffer); +} + +- (void)stream:(SCStream*)stream didStopWithError:(NSError*)error +{ + // Surface nothing here directly; the node already displays a warning when + // ReadSamples returns silence. Recording the error on the backend would + // race with Stop() tearing everything down, so we keep the hook empty. +} +@end + +#endif // __APPLE__ diff --git a/Source/SystemAudioCaptureWindows.cpp b/Source/SystemAudioCaptureWindows.cpp new file mode 100644 index 0000000..10e38cb --- /dev/null +++ b/Source/SystemAudioCaptureWindows.cpp @@ -0,0 +1,194 @@ +// Copyright MediaZ Teknoloji A.S. All Rights Reserved. + +#ifdef _WIN32 + +#include "SystemAudioCapture.h" + +#include +#include +#include + +#include +#include +#include +#include +#include + +_COM_SMARTPTR_TYPEDEF(IMMDeviceEnumerator, __uuidof(IMMDeviceEnumerator)); +_COM_SMARTPTR_TYPEDEF(IMMDevice, __uuidof(IMMDevice)); +_COM_SMARTPTR_TYPEDEF(IAudioClient, __uuidof(IAudioClient)); +_COM_SMARTPTR_TYPEDEF(IAudioCaptureClient, __uuidof(IAudioCaptureClient)); + +namespace nos::audio +{ +namespace +{ +class WASAPICapture : public SystemAudioCaptureBase +{ +public: + WASAPICapture() { CoInitializeEx(nullptr, COINIT_MULTITHREADED); } + ~WASAPICapture() override + { + Stop(); + CoUninitialize(); + } + + bool Initialize(uint32_t sampleRate, uint8_t channelCount) override + { + IMMDeviceEnumeratorPtr enumerator; + if (FAILED(enumerator.CreateInstance(__uuidof(MMDeviceEnumerator)))) + { + LastError = "Failed to create MMDeviceEnumerator"; + return false; + } + + IMMDevicePtr device; + if (FAILED(enumerator->GetDefaultAudioEndpoint(eRender, eConsole, &device))) + { + LastError = "No default render endpoint"; + return false; + } + + IPropertyStore* props = nullptr; + if (SUCCEEDED(device->OpenPropertyStore(STGM_READ, &props))) + { + PROPVARIANT varName; + PropVariantInit(&varName); + if (SUCCEEDED(props->GetValue(PKEY_Device_FriendlyName, &varName))) + { + DeviceName = 
_com_util::ConvertBSTRToString(varName.bstrVal); + PropVariantClear(&varName); + } + props->Release(); + } + + if (FAILED(device->Activate(__uuidof(IAudioClient), CLSCTX_ALL, nullptr, (void**)&AudioClient))) + { + LastError = "Failed to activate audio client"; + return false; + } + + WAVEFORMATEX* mixFormat = nullptr; + if (FAILED(AudioClient->GetMixFormat(&mixFormat))) + { + LastError = "Failed to get mix format"; + return false; + } + + const HRESULT initHr = AudioClient->Initialize( + AUDCLNT_SHAREMODE_SHARED, + AUDCLNT_STREAMFLAGS_LOOPBACK, + 10'000'000, // 1 second buffer in 100-ns units + 0, + mixFormat, + nullptr); + + // Snapshot the negotiated format before freeing it. + const uint32_t negotiatedRate = mixFormat->nSamplesPerSec; + const uint8_t negotiatedChannels = static_cast(mixFormat->nChannels); + CoTaskMemFree(mixFormat); + + if (FAILED(initHr)) + { + LastError = "Failed to initialize loopback client"; + return false; + } + + SourceSampleRate = negotiatedRate; + SourceChannelCount = negotiatedChannels; + TargetSampleRate = sampleRate; + TargetChannelCount = channelCount; + + if (FAILED(AudioClient->GetService(__uuidof(IAudioCaptureClient), (void**)&CaptureClient))) + { + LastError = "Failed to get capture client"; + return false; + } + + LastError.clear(); + return true; + } + + bool Start() override + { + if (IsCapturing) + return true; + if (!AudioClient) + return false; + if (FAILED(AudioClient->Start())) + { + LastError = "Failed to start audio client"; + return false; + } + IsCapturing = true; + ShouldStop = false; + CaptureThread = std::thread(&WASAPICapture::CaptureThreadFunc, this); + return true; + } + + void Stop() override + { + if (!IsCapturing) + return; + ShouldStop = true; + if (CaptureThread.joinable()) + CaptureThread.join(); + if (AudioClient) + AudioClient->Stop(); + IsCapturing = false; + } + +private: + void CaptureThreadFunc() + { + while (!ShouldStop) + { + if (!CaptureClient) + break; + + UINT32 packetLength = 0; + if 
(FAILED(CaptureClient->GetNextPacketSize(&packetLength))) + break; + + while (packetLength > 0) + { + BYTE* data = nullptr; + UINT32 numFramesAvailable = 0; + DWORD flags = 0; + if (FAILED(CaptureClient->GetBuffer(&data, &numFramesAvailable, &flags, nullptr, nullptr))) + break; + + if (!(flags & AUDCLNT_BUFFERFLAGS_SILENT) && numFramesAvailable > 0) + { + PushInterleavedSamples( + reinterpret_cast(data), + numFramesAvailable, + SourceSampleRate, + SourceChannelCount); + } + + CaptureClient->ReleaseBuffer(numFramesAvailable); + + if (FAILED(CaptureClient->GetNextPacketSize(&packetLength))) + break; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + } + + IAudioClientPtr AudioClient; + IAudioCaptureClientPtr CaptureClient; + std::thread CaptureThread; + std::atomic IsCapturing{false}; + std::atomic ShouldStop{false}; +}; +} // namespace + +std::unique_ptr ISystemAudioCapture::Create() +{ + return std::make_unique(); +} +} // namespace nos::audio + +#endif // _WIN32 diff --git a/Source/SystemAudioInput.cpp b/Source/SystemAudioInput.cpp index dd3e1ed..c351d83 100644 --- a/Source/SystemAudioInput.cpp +++ b/Source/SystemAudioInput.cpp @@ -3,283 +3,19 @@ #include #include -#include -#include -#include -#include -#include + +#include +#include +#include #include "nosAudio/Audio_generated.h" #include "nosAudio/AudioConversions.hpp" -#ifdef _WIN32 -#include -#include -#include -#include -#include - -// COM smart pointer helpers -_COM_SMARTPTR_TYPEDEF(IMMDeviceEnumerator, __uuidof(IMMDeviceEnumerator)); -_COM_SMARTPTR_TYPEDEF(IMMDevice, __uuidof(IMMDevice)); -_COM_SMARTPTR_TYPEDEF(IAudioClient, __uuidof(IAudioClient)); -_COM_SMARTPTR_TYPEDEF(IAudioCaptureClient, __uuidof(IAudioCaptureClient)); -#endif +#include "SystemAudioCapture.h" namespace nos::audio { -#ifdef _WIN32 -class WASAPICapture -{ -public: - WASAPICapture() : IsCapturing(false), ShouldStop(false) - { - CoInitializeEx(nullptr, COINIT_MULTITHREADED); - } - - ~WASAPICapture() - { - Stop(); - 
CoUninitialize(); - } - - bool Initialize(uint32_t sampleRate, uint8_t channelCount) - { - HRESULT hr; - - // Create device enumerator - IMMDeviceEnumeratorPtr enumerator; - hr = enumerator.CreateInstance(__uuidof(MMDeviceEnumerator)); - if (FAILED(hr)) - return false; - - // Get default audio endpoint (for loopback capture) - IMMDevicePtr device; - hr = enumerator->GetDefaultAudioEndpoint(eRender, eConsole, &device); - if (FAILED(hr)) - return false; - - // Get device name - IPropertyStore* props = nullptr; - hr = device->OpenPropertyStore(STGM_READ, &props); - if (SUCCEEDED(hr)) - { - PROPVARIANT varName; - PropVariantInit(&varName); - hr = props->GetValue(PKEY_Device_FriendlyName, &varName); - if (SUCCEEDED(hr)) - { - DeviceName = _com_util::ConvertBSTRToString(varName.bstrVal); - PropVariantClear(&varName); - } - props->Release(); - } - - // Activate audio client - hr = device->Activate(__uuidof(IAudioClient), CLSCTX_ALL, nullptr, (void**)&AudioClient); - if (FAILED(hr)) - return false; - - // Get the mix format - WAVEFORMATEX* mixFormat = nullptr; - hr = AudioClient->GetMixFormat(&mixFormat); - if (FAILED(hr)) - return false; - - // Initialize audio client for loopback capture - hr = AudioClient->Initialize( - AUDCLNT_SHAREMODE_SHARED, - AUDCLNT_STREAMFLAGS_LOOPBACK, - 10000000, // 1 second buffer - 0, - mixFormat, - nullptr); - - if (FAILED(hr)) - { - CoTaskMemFree(mixFormat); - return false; - } - - // Store format info - SourceSampleRate = mixFormat->nSamplesPerSec; - SourceChannelCount = mixFormat->nChannels; - TargetSampleRate = sampleRate; - TargetChannelCount = channelCount; - - CoTaskMemFree(mixFormat); - - // Get capture client - hr = AudioClient->GetService(__uuidof(IAudioCaptureClient), (void**)&CaptureClient); - if (FAILED(hr)) - return false; - - return true; - } - - bool Start() - { - if (IsCapturing) - return true; - - if (!AudioClient) - return false; - - HRESULT hr = AudioClient->Start(); - if (FAILED(hr)) - return false; - - IsCapturing = 
true; - ShouldStop = false; - CaptureThread = std::thread(&WASAPICapture::CaptureThreadFunc, this); - - return true; - } - - void Stop() - { - if (!IsCapturing) - return; - - ShouldStop = true; - if (CaptureThread.joinable()) - CaptureThread.join(); - - if (AudioClient) - AudioClient->Stop(); - - IsCapturing = false; - } - - bool ReadSamples(int32_t* outBuffer, uint32_t numSamples, uint8_t targetChannels, float gain) - { - std::unique_lock lock(BufferMutex); - - // Calculate how many source samples we need - float sampleRateRatio = static_cast(SourceSampleRate) / static_cast(TargetSampleRate); - uint32_t sourceSamplesNeeded = static_cast(numSamples * sampleRateRatio); - - // If we don't have enough samples, fill with silence - if (CapturedSamples.size() < sourceSamplesNeeded * SourceChannelCount) - { - for (uint32_t i = 0; i < numSamples * targetChannels; ++i) - outBuffer[i] = 0; - return false; // No audio available - } - - // Resample and convert - for (uint32_t i = 0; i < numSamples; ++i) - { - float sourceIndex = i * sampleRateRatio; - uint32_t sourceIndexInt = static_cast(sourceIndex); - float frac = sourceIndex - sourceIndexInt; - - for (uint8_t ch = 0; ch < targetChannels; ++ch) - { - // Map target channel to source channel (handle mono/stereo conversions) - uint8_t sourceChannel = (ch < SourceChannelCount) ? 
ch : 0; - - // Get samples for interpolation - uint32_t idx1 = sourceIndexInt * SourceChannelCount + sourceChannel; - uint32_t idx2 = std::min(idx1 + SourceChannelCount, static_cast(CapturedSamples.size() - 1)); - - if (idx1 < CapturedSamples.size() && idx2 < CapturedSamples.size()) - { - float sample1 = CapturedSamples[idx1]; - float sample2 = CapturedSamples[idx2]; - float interpolated = sample1 + (sample2 - sample1) * frac; - - // Apply gain and convert to shifted int24 - interpolated *= gain; - outBuffer[i * targetChannels + ch] = FloatToShiftedInt24(interpolated); - } - else - { - outBuffer[i * targetChannels + ch] = 0; - } - } - } - - // Remove consumed samples - uint32_t samplesToRemove = sourceSamplesNeeded * SourceChannelCount; - if (samplesToRemove < CapturedSamples.size()) - CapturedSamples.erase(CapturedSamples.begin(), CapturedSamples.begin() + samplesToRemove); - - return true; // Audio successfully read - } - - const std::string& GetDeviceName() const { return DeviceName; } - -private: - void CaptureThreadFunc() - { - while (!ShouldStop) - { - if (!CaptureClient) - break; - - UINT32 packetLength = 0; - HRESULT hr = CaptureClient->GetNextPacketSize(&packetLength); - if (FAILED(hr)) - break; - - while (packetLength > 0) - { - BYTE* data = nullptr; - UINT32 numFramesAvailable = 0; - DWORD flags = 0; - - hr = CaptureClient->GetBuffer(&data, &numFramesAvailable, &flags, nullptr, nullptr); - if (FAILED(hr)) - break; - - // Convert samples to float and store - if (!(flags & AUDCLNT_BUFFERFLAGS_SILENT)) - { - float* floatData = reinterpret_cast(data); - std::lock_guard lock(BufferMutex); - - for (UINT32 i = 0; i < numFramesAvailable * SourceChannelCount; ++i) - { - CapturedSamples.push_back(floatData[i]); - } - - // Limit buffer size to prevent unbounded growth (keep max 5 seconds) - size_t maxSamples = SourceSampleRate * SourceChannelCount * 5; - if (CapturedSamples.size() > maxSamples) - { - CapturedSamples.erase(CapturedSamples.begin(), - 
CapturedSamples.begin() + (CapturedSamples.size() - maxSamples)); - } - } - - CaptureClient->ReleaseBuffer(numFramesAvailable); - - hr = CaptureClient->GetNextPacketSize(&packetLength); - if (FAILED(hr)) - break; - } - - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - } - - IAudioClientPtr AudioClient; - IAudioCaptureClientPtr CaptureClient; - std::thread CaptureThread; - std::atomic IsCapturing; - std::atomic ShouldStop; - std::vector CapturedSamples; - std::mutex BufferMutex; - uint32_t SourceSampleRate = 0; - uint8_t SourceChannelCount = 0; - uint32_t TargetSampleRate = 0; - uint8_t TargetChannelCount = 0; - std::string DeviceName; -}; -#endif - struct SystemAudioInputNode : NodeContext { void SetNodeStatusMessageIfChanged(const std::string& message, fb::NodeStatusMessageType type) @@ -294,61 +30,60 @@ struct SystemAudioInputNode : NodeContext nosResult OnCreate(nosFbNodePtr) override { AddPinValueWatcher(NOS_NAME("Active"), - [this](const bool* newVal, std::optional oldVal) { + [this](const bool* newVal, std::optional /*oldVal*/) { Active = *newVal; - if (Active) - SetNodeStatusMessageIfChanged("System audio input active", fb::NodeStatusMessageType::INFO); - else - SetNodeStatusMessageIfChanged("System audio input inactive", fb::NodeStatusMessageType::WARNING); + // Status is owned by ExecuteNode so it stays consistent with the + // capture backend state. Writing here too races with the frame + // loop and flaps the node status between "active" and the + // capture-backend messages on every pin-value update (including + // the one that fires during graph load). 
}); AddPinValueWatcher(NOS_NAME("SampleRate"), - [this](const uint32_t* newVal, std::optional oldVal) { - if (!oldVal || *newVal != **oldVal) - NeedsReinitialize = true; - }); + [this](const uint32_t* newVal, std::optional oldVal) { + if (!oldVal || *newVal != **oldVal) + NeedsReinitialize = true; + }); AddPinValueWatcher(NOS_NAME("ChannelCount"), - [this](const uint8_t* newVal, std::optional oldVal) { - if (!oldVal || *newVal != **oldVal) - NeedsReinitialize = true; - }); + [this](const uint8_t* newVal, std::optional oldVal) { + if (!oldVal || *newVal != **oldVal) + NeedsReinitialize = true; + }); return NOS_RESULT_SUCCESS; } ~SystemAudioInputNode() override { -#ifdef _WIN32 if (Capture) - { Capture->Stop(); - Capture.reset(); - } -#endif } void OnPathStart() override { - ClearNodeStatusMessages(); AccumulatedSampleNumerator = 0; CurrentSampleIndex = 0; - NeedsReinitialize = true; - - if (Active) - SetNodeStatusMessageIfChanged("System audio input active", fb::NodeStatusMessageType::INFO); - else - SetNodeStatusMessageIfChanged("System audio input inactive", fb::NodeStatusMessageType::WARNING); + + // Don't force a re-init here. If the engine restarts paths frequently + // (e.g. downstream scheduler changes, other nodes calling SendPathRestart), + // destroying and recreating Capture every round makes the node post the + // same "Audio capture is ready …" status message over and over. The + // SampleRate / ChannelCount pin watchers already set NeedsReinitialize + // when the capture format actually changes; anything else just needs a + // cheap Start() to resume a backend we paused in OnPathStop. + if (Active && Capture) + Capture->Start(); + + // Don't clear LastStatusMessage either — keeping it means the guard in + // SetNodeStatusMessageIfChanged suppresses a same-string repost from + // the first post-restart frame, which is the source of the flap. 
} void OnPathStop() override { -#ifdef _WIN32 if (Capture) - { Capture->Stop(); - } -#endif ClearNodeStatusMessages(); } @@ -358,22 +93,21 @@ struct SystemAudioInputNode : NodeContext auto& channelCount = *pins.GetPinValue(NOS_NAME("ChannelCount")); auto& gain = *pins.GetPinValue(NOS_NAME("Gain")); - // Only support fixed step timing if (pins.TimingMode != NOS_EXECUTION_TIMING_MODE_FIXED_STEP) { SetNodeStatusMessageIfChanged("Unsupported timing mode", fb::NodeStatusMessageType::FAILURE); return NOS_RESULT_FAILED; } - // Check for invalid timing values if (pins.FixedStepTiming.DeltaSeconds.y == 0) { SetNodeStatusMessageIfChanged("Invalid timing values", fb::NodeStatusMessageType::FAILURE); return NOS_RESULT_FAILED; } -#ifdef _WIN32 - // Initialize or reinitialize capture if needed + // (Re)create the backend whenever Active flips on or the requested + // format changes. A null Capture after this branch means the platform + // has no backend compiled in — we emit silence + a status message. if (Active && (NeedsReinitialize || !Capture)) { if (Capture) @@ -382,10 +116,22 @@ struct SystemAudioInputNode : NodeContext Capture.reset(); } - Capture = std::make_unique(); + Capture = ISystemAudioCapture::Create(); + if (!Capture) + { + SetNodeStatusMessageIfChanged("System audio input is not supported on this platform", + fb::NodeStatusMessageType::FAILURE); + Active = false; + return NOS_RESULT_FAILED; + } + if (!Capture->Initialize(sampleRate, channelCount)) { - SetNodeStatusMessageIfChanged("Failed to initialize system audio capture", fb::NodeStatusMessageType::FAILURE); + const auto& err = Capture->GetLastError(); + SetNodeStatusMessageIfChanged( + err.empty() ? 
std::string("Failed to initialize system audio capture") + : "Failed to initialize system audio capture: " + err, + fb::NodeStatusMessageType::FAILURE); Capture.reset(); Active = false; return NOS_RESULT_FAILED; @@ -393,7 +139,11 @@ struct SystemAudioInputNode : NodeContext if (!Capture->Start()) { - SetNodeStatusMessageIfChanged("Failed to start system audio capture", fb::NodeStatusMessageType::FAILURE); + const auto& err = Capture->GetLastError(); + SetNodeStatusMessageIfChanged( + err.empty() ? std::string("Failed to start system audio capture") + : "Failed to start system audio capture: " + err, + fb::NodeStatusMessageType::FAILURE); Capture.reset(); Active = false; return NOS_RESULT_FAILED; @@ -411,22 +161,17 @@ struct SystemAudioInputNode : NodeContext Capture.reset(); SetNodeStatusMessageIfChanged("System audio input inactive", fb::NodeStatusMessageType::WARNING); } -#else - // System audio input is not supported on non-Windows platforms yet - SetNodeStatusMessageIfChanged("System audio input is not supported on this platform yet", fb::NodeStatusMessageType::FAILURE); - return NOS_RESULT_FAILED; -#endif - uint64_t deltaNumerator = pins.FixedStepTiming.DeltaSeconds.x; - uint64_t deltaDenominator = pins.FixedStepTiming.DeltaSeconds.y; + const uint64_t deltaNumerator = pins.FixedStepTiming.DeltaSeconds.x; + const uint64_t deltaDenominator = pins.FixedStepTiming.DeltaSeconds.y; AccumulatedSampleNumerator += deltaNumerator * static_cast(sampleRate); + const uint32_t numSamples = static_cast(AccumulatedSampleNumerator / deltaDenominator); + AccumulatedSampleNumerator %= deltaDenominator; - uint32_t numSamples = static_cast(AccumulatedSampleNumerator / deltaDenominator); - AccumulatedSampleNumerator %= deltaDenominator; // Keep remainder for next frame - - // Create or resize audio buffer only if needed (with 1.1x headroom to avoid frequent reallocations) - size_t requiredBufferSize = numSamples * sizeof(uint32_t) * channelCount; + // Create or grow the audio 
buffer only when strictly necessary; 1.1x + // headroom amortises reallocations across small timing fluctuations. + const size_t requiredBufferSize = static_cast(numSamples) * sizeof(uint32_t) * channelCount; size_t allocatedBufferSize = 0; if (AudioPacketBuffer) { @@ -439,9 +184,7 @@ struct SystemAudioInputNode : NodeContext if (!AudioPacketBuffer || requiredBufferSize > allocatedBufferSize) { AudioPacketBuffer = {}; - - // Allocate 1.1x the required size to reduce frequency of reallocations - size_t newBufferSize = requiredBufferSize * 1.1f; + const size_t newBufferSize = static_cast(requiredBufferSize * 1.1f); nosBufferInfo audioBufferDesc = {}; audioBufferDesc.Size = static_cast(newBufferSize); @@ -459,57 +202,38 @@ struct SystemAudioInputNode : NodeContext } } - int32_t* audioSamples = reinterpret_cast(nosVulkan->Map(AudioPacketBuffer)); + auto* audioSamples = reinterpret_cast(nosVulkan->Map(AudioPacketBuffer)); if (!audioSamples) { SetNodeStatusMessageIfChanged("Failed to map audio buffer", fb::NodeStatusMessageType::FAILURE); return NOS_RESULT_FAILED; } - if (Active) + if (Active && Capture) { -#ifdef _WIN32 - // Read captured system audio - if (Capture) - { - bool hasAudio = Capture->ReadSamples(audioSamples, numSamples, channelCount, gain); - std::string deviceName = Capture->GetDeviceName(); - std::string deviceSuffix = deviceName.empty() ? 
"" : " - " + deviceName; - - if (hasAudio) - SetNodeStatusMessageIfChanged("Capturing audio" + deviceSuffix, fb::NodeStatusMessageType::INFO); - else - SetNodeStatusMessageIfChanged("Audio capture is ready" + deviceSuffix, fb::NodeStatusMessageType::INFO); - } - else -#endif - { - // Fill with silence if capture failed - for (uint32_t i = 0; i < numSamples * channelCount; ++i) - { - audioSamples[i] = 0; - } - } + // Don't update the status message every frame based on whether + // this single frame delivered audio — ReadSamples flips true/false + // at the rate of buffer fills, which causes the editor's node + // status area to spam updates. The "ready" message posted after + // Initialize/Start stays put; transitions (inactive, failure) are + // the only things that republish. + Capture->ReadSamples(audioSamples, numSamples, channelCount, gain); } else { - // Fill with silence when inactive for (uint32_t i = 0; i < numSamples * channelCount; ++i) - { audioSamples[i] = 0; - } } - // Update current sample index CurrentSampleIndex += numSamples; AudioPacketDescriptor audioPacketDesc( sampleRate, numSamples, BitDepth::AUDIO_BIT_DEPTH_24_BIT, sizeof(int32_t), channelCount); ObjectRef outDesc{}; - nosEngine.ObjectAPI->CreatePrimitiveObject(NOS_NAME(AudioPacketDescriptor::GetFullyQualifiedName()), - nos::Buffer::From(audioPacketDesc), - &outDesc.GetStorage()); + nosEngine.ObjectAPI->CreatePrimitiveObject(NOS_NAME(AudioPacketDescriptor::GetFullyQualifiedName()), + nos::Buffer::From(audioPacketDesc), + &outDesc.GetStorage()); ObjectRef out{}; std::vector fields; @@ -521,14 +245,12 @@ struct SystemAudioInputNode : NodeContext .FieldName = NOS_NAME("buffer"), .FieldObjectId = AudioPacketBuffer, }); - nosEngine.ObjectAPI->CreateCompositeObject(NOS_NAME(AudioPacket::GetFullyQualifiedName()), - fields.data(), - fields.size(), - &out.GetStorage()); + nosEngine.ObjectAPI->CreateCompositeObject(NOS_NAME(AudioPacket::GetFullyQualifiedName()), + fields.data(), + fields.size(), + 
&out.GetStorage()); NOS_SOFT_CHECK(out, "Failed to create output AudioPacket object"); - - // Set output pin values SetPinObject(NOS_NAME("AudioPacket"), out); return NOS_RESULT_SUCCESS; @@ -540,10 +262,7 @@ struct SystemAudioInputNode : NodeContext bool Active = false; bool NeedsReinitialize = false; std::string LastStatusMessage; - -#ifdef _WIN32 - std::unique_ptr Capture; -#endif + std::unique_ptr Capture; }; nosResult RegisterSystemAudioInputNode(nosNodeFunctions* fn) From 7294d6a9fe86340e4ee2dda033b5e66a4bdedc75 Mon Sep 17 00:00:00 2001 From: "M. Samil Atesoglu" Date: Wed, 22 Apr 2026 13:16:53 +0300 Subject: [PATCH 3/4] Audio: fix SystemAudioInput activation, latency, and path-restart handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Read Active pin directly in ExecuteNode instead of a watcher-backed mirror, so the node doesn't sit inert on graph load when the first ExecuteNode fires before the saved pin value has propagated. - Drop OnPathStop's Capture->Stop() call: ScreenCaptureKit's Stop is a full stream teardown (not a pause), so across routine OnPathStop → OnPathStart cycles (graph load, downstream reconfig) the stream was left dead while the "ready" status lingered. Capture now stays alive until Active flips off or the node is destroyed. - Add ISystemAudioCapture::DiscardBufferedSamples() and call it from OnPathStart to drop whatever queued up between Capture->Start() and the first consumer tick, so startup gap doesn't become permanent latency. - Cap the residual ring buffer post-ReadSamples at ~100 ms so historical producer/consumer skew (startup, frame-drop stall, path-restart burst) resyncs to live instead of persisting as lag. - Move the steady-state "Capturing system audio" status out of the init branch so it survives ClearNodeStatusMessages and reposts idempotently. - Failure paths now write Active=false back to the pin via SetPinValue rather than toggling an internal mirror. 
- Add NOS_MEMORY_FLAGS_DOWNLOAD to AudioPacketBuffer so VMA picks HOST_CACHED memory (the buffer is read on the host by the consumer node, which doesn't match the default SEQUENTIAL_WRITE hint). AudioOscilloscope: guard the binning averager's divisor. When numSamples is smaller than scopeTexSize * samplesPerBin, trailing bins have startSample >= numSamples and the 0/0 produced NaN, poisoning FrameHistory for the lifetime of the node context. Rename k-prefixed constants to SNAKE_CASE per CodingConvention.md. --- Source/AudioOscilloscope.cpp | 15 ++++- Source/SystemAudioCapture.cpp | 20 ++++++- Source/SystemAudioCapture.h | 6 ++ Source/SystemAudioCaptureMac.mm | 6 +- Source/SystemAudioInput.cpp | 98 ++++++++++++++++++--------------- 5 files changed, 94 insertions(+), 51 deletions(-) diff --git a/Source/AudioOscilloscope.cpp b/Source/AudioOscilloscope.cpp index 9a06a61..77e59a8 100644 --- a/Source/AudioOscilloscope.cpp +++ b/Source/AudioOscilloscope.cpp @@ -112,14 +112,23 @@ struct AudioOscilloscopeNode : NodeContext { uint32_t startSample = bin * samplesPerBin; uint32_t endSample = std::min(startSample + samplesPerBin, numSamples); + // Guard the averaging: if numSamples is smaller than + // scopeTexSize * samplesPerBin, trailing bins have + // startSample >= numSamples, the inner loop runs zero times, + // and `binValue / 0` produces NaN. That NaN then flows into + // FrameHistory and poisons the bin's moving average for the + // lifetime of this node context. 
+ if (endSample <= startSample) + { + currentFrameData[bin] = 0.0f; + continue; + } float binValue = 0.0f; for (uint32_t i = startSample; i < endSample; ++i) { binValue += monoAudio[i]; } - binValue /= (endSample - startSample); // Average the samples in this bin - - currentFrameData[bin] = binValue; + currentFrameData[bin] = binValue / static_cast(endSample - startSample); } // Store current frame data and calculate moving average diff --git a/Source/SystemAudioCapture.cpp b/Source/SystemAudioCapture.cpp index 3edcb5e..3ffceb1 100644 --- a/Source/SystemAudioCapture.cpp +++ b/Source/SystemAudioCapture.cpp @@ -13,7 +13,7 @@ namespace // Keep at most this many seconds of buffered source samples to stop a stalled // consumer (or a hung execution graph) from growing memory unbounded. The // existing WASAPI backend used 5 seconds; we keep the same budget for parity. -constexpr uint32_t kMaxBufferedSeconds = 5; +constexpr uint32_t MAX_BUFFERED_SECONDS = 5; } // namespace void SystemAudioCaptureBase::ResetBuffer() @@ -44,7 +44,7 @@ void SystemAudioCaptureBase::PushInterleavedSamples(const float* samples, const size_t sampleCount = static_cast(frameCount) * sourceChannelCount; CapturedSamples.insert(CapturedSamples.end(), samples, samples + sampleCount); - const size_t maxSamples = static_cast(SourceSampleRate) * SourceChannelCount * kMaxBufferedSeconds; + const size_t maxSamples = static_cast(SourceSampleRate) * SourceChannelCount * MAX_BUFFERED_SECONDS; if (CapturedSamples.size() > maxSamples) { const size_t overflow = CapturedSamples.size() - maxSamples; @@ -111,6 +111,22 @@ bool SystemAudioCaptureBase::ReadSamples(int32_t* outBuffer, uint32_t numSamples else CapturedSamples.clear(); + // Post-read drift correction: ReadSamples consumes at exactly real-time + // rate, so any historical producer/consumer skew (startup gap, frame-drop + // stall, path-restart burst) would otherwise persist as permanent latency + // — we just pull from the head forever, staying N ms behind 
live. Cap the + // residual buffer at a small smoothing window; anything older gets dropped + // so the next read snaps back toward live. The discontinuity this causes + // is audibly a one-shot click, preferable to sustained lag. + constexpr float MAX_POST_READ_SECONDS = 0.1f; + const size_t maxKeep = static_cast(static_cast(SourceSampleRate) * MAX_POST_READ_SECONDS) * + SourceChannelCount; + if (maxKeep > 0 && CapturedSamples.size() > maxKeep) + { + const size_t drop = CapturedSamples.size() - maxKeep; + CapturedSamples.erase(CapturedSamples.begin(), CapturedSamples.begin() + drop); + } + return true; } } // namespace nos::audio diff --git a/Source/SystemAudioCapture.h b/Source/SystemAudioCapture.h index 2921b80..dc1be8a 100644 --- a/Source/SystemAudioCapture.h +++ b/Source/SystemAudioCapture.h @@ -36,6 +36,11 @@ class ISystemAudioCapture // backend has not produced enough data yet. virtual bool ReadSamples(int32_t* outBuffer, uint32_t numSamples, uint8_t targetChannels, float gain) = 0; + // Drop any audio that has accumulated in the internal ring buffer. Called + // on path start so the consumer doesn't have to pay for latency that built + // up between Start() and the first ReadSamples. 
+ virtual void DiscardBufferedSamples() = 0; + virtual const std::string& GetDeviceName() const = 0; virtual const std::string& GetLastError() const = 0; @@ -51,6 +56,7 @@ class SystemAudioCaptureBase : public ISystemAudioCapture { public: bool ReadSamples(int32_t* outBuffer, uint32_t numSamples, uint8_t targetChannels, float gain) override; + void DiscardBufferedSamples() override { ResetBuffer(); } const std::string& GetDeviceName() const override { return DeviceName; } const std::string& GetLastError() const override { return LastError; } diff --git a/Source/SystemAudioCaptureMac.mm b/Source/SystemAudioCaptureMac.mm index 71e3e5b..e7ae79f 100644 --- a/Source/SystemAudioCaptureMac.mm +++ b/Source/SystemAudioCaptureMac.mm @@ -74,7 +74,7 @@ @interface NosAudioStreamOutput : NSObject // before giving up. Initialize / Start / Stop are called from the editor // execution thread, so we cap the wait to keep a hung system-service // from stalling the whole graph. - static constexpr uint64_t kAsyncTimeoutSeconds = 5; + static constexpr uint64_t ASYNC_TIMEOUT_SECONDS = 5; SCStream* Stream API_AVAILABLE(macos(13.0)) = nil; NosAudioStreamOutput* Delegate API_AVAILABLE(macos(13.0)) = nil; @@ -276,7 +276,7 @@ bool PumpMainRunLoopUntil(const bool& done, double timeoutSeconds) } done = true; }]; - if (!PumpMainRunLoopUntil(done, kAsyncTimeoutSeconds)) + if (!PumpMainRunLoopUntil(done, ASYNC_TIMEOUT_SECONDS)) { LastError = "Timed out starting ScreenCaptureKit stream"; return; @@ -311,7 +311,7 @@ bool PumpMainRunLoopUntil(const bool& done, double timeoutSeconds) [Stream stopCaptureWithCompletionHandler:^(NSError* /*error*/) { done = true; }]; - PumpMainRunLoopUntil(done, kAsyncTimeoutSeconds); + PumpMainRunLoopUntil(done, ASYNC_TIMEOUT_SECONDS); } if (Delegate) Delegate.backend = nullptr; diff --git a/Source/SystemAudioInput.cpp b/Source/SystemAudioInput.cpp index c351d83..1bb52b9 100644 --- a/Source/SystemAudioInput.cpp +++ b/Source/SystemAudioInput.cpp @@ -29,16 +29,6 @@ struct 
SystemAudioInputNode : NodeContext nosResult OnCreate(nosFbNodePtr) override { - AddPinValueWatcher(NOS_NAME("Active"), - [this](const bool* newVal, std::optional /*oldVal*/) { - Active = *newVal; - // Status is owned by ExecuteNode so it stays consistent with the - // capture backend state. Writing here too races with the frame - // loop and flaps the node status between "active" and the - // capture-backend messages on every pin-value update (including - // the one that fires during graph load). - }); - AddPinValueWatcher(NOS_NAME("SampleRate"), [this](const uint32_t* newVal, std::optional oldVal) { if (!oldVal || *newVal != **oldVal) @@ -64,31 +54,31 @@ struct SystemAudioInputNode : NodeContext { AccumulatedSampleNumerator = 0; CurrentSampleIndex = 0; - - // Don't force a re-init here. If the engine restarts paths frequently - // (e.g. downstream scheduler changes, other nodes calling SendPathRestart), - // destroying and recreating Capture every round makes the node post the - // same "Audio capture is ready …" status message over and over. The - // SampleRate / ChannelCount pin watchers already set NeedsReinitialize - // when the capture format actually changes; anything else just needs a - // cheap Start() to resume a backend we paused in OnPathStop. - if (Active && Capture) - Capture->Start(); - - // Don't clear LastStatusMessage either — keeping it means the guard in - // SetNodeStatusMessageIfChanged suppresses a same-string repost from - // the first post-restart frame, which is the source of the flap. - } - - void OnPathStop() override - { + // Drop any audio that queued up between Capture->Start() and this + // first consumer tick. Without this, the consumer would forever play + // from the back of a full ring buffer, running the apparent latency + // ceiling (~100ms after the in-read trim) instead of the floor. 
if (Capture) - Capture->Stop(); - ClearNodeStatusMessages(); + Capture->DiscardBufferedSamples(); } + // Deliberately no OnPathStop override: ScreenCaptureKit's Stop() is a full + // stream teardown (not a pause), so any Stop here would leave the stream + // dead across the routine OnPathStop → OnPathStart cycles that happen on + // graph load and downstream reconfiguration. The backend stays running + // until Active flips off or the node is destroyed, and the status message + // is kept in sync by ExecuteNode below rather than being cleared here — + // clearing with ClearNodeStatusMessages without also resetting the + // LastStatusMessage mirror used to leave the node with no visible status + // after a path restart. + nosResult ExecuteNode(NodeExecuteParams const& pins) override { + // Read Active straight from the pin rather than relying on a watcher- + // backed mirror: on graph load the first ExecuteNode can fire before + // the watcher has propagated the saved `true`, which left the node + // inert until the user toggled the pin. + const bool active = *pins.GetPinValue(NOS_NAME("Active")); auto& sampleRate = *pins.GetPinValue(NOS_NAME("SampleRate")); auto& channelCount = *pins.GetPinValue(NOS_NAME("ChannelCount")); auto& gain = *pins.GetPinValue(NOS_NAME("Gain")); @@ -108,7 +98,7 @@ struct SystemAudioInputNode : NodeContext // (Re)create the backend whenever Active flips on or the requested // format changes. A null Capture after this branch means the platform // has no backend compiled in — we emit silence + a status message. 
- if (Active && (NeedsReinitialize || !Capture)) + if (active && (NeedsReinitialize || !Capture)) { if (Capture) { @@ -121,7 +111,7 @@ struct SystemAudioInputNode : NodeContext { SetNodeStatusMessageIfChanged("System audio input is not supported on this platform", fb::NodeStatusMessageType::FAILURE); - Active = false; + SetPinValue(NOS_NAME("Active"), false); return NOS_RESULT_FAILED; } @@ -133,7 +123,7 @@ struct SystemAudioInputNode : NodeContext : "Failed to initialize system audio capture: " + err, fb::NodeStatusMessageType::FAILURE); Capture.reset(); - Active = false; + SetPinValue(NOS_NAME("Active"), false); return NOS_RESULT_FAILED; } @@ -145,23 +135,33 @@ struct SystemAudioInputNode : NodeContext : "Failed to start system audio capture: " + err, fb::NodeStatusMessageType::FAILURE); Capture.reset(); - Active = false; + SetPinValue(NOS_NAME("Active"), false); return NOS_RESULT_FAILED; } NeedsReinitialize = false; - std::string deviceMsg = "Audio capture is ready"; - if (!Capture->GetDeviceName().empty()) - deviceMsg += " (" + Capture->GetDeviceName() + ")"; - SetNodeStatusMessageIfChanged(deviceMsg, fb::NodeStatusMessageType::INFO); } - else if (!Active && Capture) + else if (!active && Capture) { Capture->Stop(); Capture.reset(); SetNodeStatusMessageIfChanged("System audio input inactive", fb::NodeStatusMessageType::WARNING); } + // Steady-state status, re-posted every frame while capture is live. + // Posting here (instead of once inside the init branch) means the + // message survives path restarts: if OnPathStop or an external clear + // wipes the node status, the very next ExecuteNode repaints it, and + // the SetNodeStatusMessageIfChanged guard suppresses spam in the + // common case where the string hasn't changed. 
+ if (active && Capture) + { + std::string deviceMsg = "Capturing system audio"; + if (!Capture->GetDeviceName().empty()) + deviceMsg += " (" + Capture->GetDeviceName() + ")"; + SetNodeStatusMessageIfChanged(deviceMsg, fb::NodeStatusMessageType::INFO); + } + const uint64_t deltaNumerator = pins.FixedStepTiming.DeltaSeconds.x; const uint64_t deltaDenominator = pins.FixedStepTiming.DeltaSeconds.y; @@ -190,8 +190,21 @@ struct SystemAudioInputNode : NodeContext audioBufferDesc.Size = static_cast(newBufferSize); audioBufferDesc.Usage = nosBufferUsage(NOS_BUFFER_USAGE_STORAGE_BUFFER | NOS_BUFFER_USAGE_TRANSFER_DST | NOS_BUFFER_USAGE_TRANSFER_SRC); - audioBufferDesc.MemoryFlags = - nosMemoryFlags(NOS_MEMORY_FLAGS_HOST_VISIBLE | NOS_MEMORY_FLAGS_FORCE_HOST_MEMORY); + // DOWNLOAD flips VMA from HOST_ACCESS_SEQUENTIAL_WRITE (which lets + // it pick write-combined memory) to HOST_ACCESS_RANDOM (cached + // memory). The engine already requests VK_MEMORY_PROPERTY_HOST_- + // COHERENT_BIT in either case, so host↔device coherence is fine + // without this flag — but the buffer is ALSO read by a consumer + // node (AudioOscilloscope) running on a different engine runner + // thread. Write-combined memory doesn't participate in normal + // CPU cache coherence between cores, so the consumer's reads + // could miss the producer's writes until some unrelated sync + // event flushed things. Cached memory fixes this, at the cost + // of slightly slower sequential writes (unmeasurable at audio + // sample volumes). 
+ audioBufferDesc.MemoryFlags = nosMemoryFlags(NOS_MEMORY_FLAGS_HOST_VISIBLE | + NOS_MEMORY_FLAGS_DOWNLOAD | + NOS_MEMORY_FLAGS_FORCE_HOST_MEMORY); audioBufferDesc.ElementType = NOS_BUFFER_ELEMENT_TYPE_INT32; AudioPacketBuffer = sys::vulkan::CreateBuffer(audioBufferDesc, "SystemAudioInput AudioBuffer"); @@ -209,7 +222,7 @@ struct SystemAudioInputNode : NodeContext return NOS_RESULT_FAILED; } - if (Active && Capture) + if (active && Capture) { // Don't update the status message every frame based on whether // this single frame delivered audio — ReadSamples flips true/false @@ -259,7 +272,6 @@ struct SystemAudioInputNode : NodeContext TypedObjectRef AudioPacketBuffer; uint64_t AccumulatedSampleNumerator = 0; uint64_t CurrentSampleIndex = 0; - bool Active = false; bool NeedsReinitialize = false; std::string LastStatusMessage; std::unique_ptr Capture; From 59c0a962dec1c07f760b93830e06e4584fed6e0c Mon Sep 17 00:00:00 2001 From: "M. Samil Atesoglu" Date: Wed, 22 Apr 2026 18:09:14 +0300 Subject: [PATCH 4/4] Audio (Windows): fix include order for functiondiscoverykeys_devpkey.h functiondiscoverykeys_devpkey.h references PROPERTYKEY, which mmdeviceapi.h brings in. Include mmdeviceapi.h first so the property-key definitions see a declared PROPERTYKEY. --- Source/SystemAudioCaptureWindows.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Source/SystemAudioCaptureWindows.cpp b/Source/SystemAudioCaptureWindows.cpp index 10e38cb..b62c151 100644 --- a/Source/SystemAudioCaptureWindows.cpp +++ b/Source/SystemAudioCaptureWindows.cpp @@ -11,8 +11,8 @@ #include #include #include -#include #include +#include _COM_SMARTPTR_TYPEDEF(IMMDeviceEnumerator, __uuidof(IMMDeviceEnumerator)); _COM_SMARTPTR_TYPEDEF(IMMDevice, __uuidof(IMMDevice));