From 4832cda1b28d09026840255e6c4dfa9e8796a789 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Fri, 6 Feb 2026 09:33:50 -0800 Subject: [PATCH 1/7] Add profiling instrumentation for NAM building blocks Adds a profiling framework (NAM/profiling.h, NAM/profiling.cpp) with NAM_PROFILE_START()/NAM_PROFILE_ADD() macros and 14 timing categories. Supports both desktop (std::chrono) and ARM Cortex-M7 (DWT cycle counter) backends. Profiling is compile-time gated via -DNAM_PROFILING. Instruments wavenet _Layer::Process() and _LayerArray::ProcessInner() with per-category timing, and adds profiling reset/print calls to the benchmodel tool. Co-Authored-By: Claude Opus 4.6 --- NAM/conv1d.cpp | 4 ++ NAM/dsp.cpp | 4 ++ NAM/film.h | 4 ++ NAM/profiling.cpp | 47 +++++++++++++ NAM/profiling.h | 153 +++++++++++++++++++++++++++++++++++++++++++ NAM/wavenet.cpp | 23 +++++++ tools/benchmodel.cpp | 7 ++ 7 files changed, 242 insertions(+) create mode 100644 NAM/profiling.cpp create mode 100644 NAM/profiling.h diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp index 9bbbc02..d440f0c 100644 --- a/NAM/conv1d.cpp +++ b/NAM/conv1d.cpp @@ -1,4 +1,5 @@ #include "conv1d.h" +#include "profiling.h" #include namespace nam @@ -143,6 +144,9 @@ void Conv1D::SetMaxBufferSize(const int maxBufferSize) void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) { + // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp) + // to avoid double-counting when Conv1D is called from within profiled blocks. + // Write input to ring buffer _input_buffer.Write(input, num_frames); diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp index 05dab09..b644af3 100644 --- a/NAM/dsp.cpp +++ b/NAM/dsp.cpp @@ -8,6 +8,7 @@ #include #include "dsp.h" +#include "profiling.h" #include "registry.h" #define tanh_impl_ std::tanh @@ -443,6 +444,9 @@ Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int nu void nam::Conv1x1::process_(const Eigen::Ref& input, const int num_frames) { + // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp) + // to provide meaningful categories (input_mixin, layer1x1, head1x1, rechannel) + // rather than generic conv1x1. assert(num_frames <= _output.cols()); if (this->_is_depthwise) diff --git a/NAM/film.h b/NAM/film.h index f0f86fb..eeb750a 100644 --- a/NAM/film.h +++ b/NAM/film.h @@ -81,9 +81,13 @@ class FiLM assert(num_frames <= condition.cols()); assert(num_frames <= _output.cols()); + // Conv1x1 to compute scale/shift from condition _cond_to_scale_shift.process_(condition, num_frames); const auto& scale_shift = _cond_to_scale_shift.GetOutput(); + // Note: FiLM time is included in the caller's profiling category (e.g., conv1d, input_mixin) + // rather than tracked separately, to avoid double-counting. 
+  const auto scale = scale_shift.topRows(get_input_dim()).leftCols(num_frames);
   if (_do_shift)
   {
diff --git a/NAM/profiling.cpp b/NAM/profiling.cpp
new file mode 100644
index 0000000..65d430e
--- /dev/null
+++ b/NAM/profiling.cpp
@@ -0,0 +1,47 @@
+#include "profiling.h"
+
+#ifdef NAM_PROFILING
+
+#if defined(__ARM_ARCH_7EM__) || defined(ARM_MATH_CM7)
+// ARM Cortex-M7: Use DWT cycle counter for precise timing
+#include "stm32h7xx.h"
+
+namespace nam {
+namespace profiling {
+
+Timings g_timings;
+
+// CPU frequency in MHz (Daisy runs at 480 MHz)
+static constexpr uint32_t CPU_FREQ_MHZ = 480;
+
+uint32_t get_time_us() {
+  // DWT->CYCCNT gives cycle count
+  // Divide by CPU_FREQ_MHZ to get microseconds
+  return DWT->CYCCNT / CPU_FREQ_MHZ;
+}
+
+} // namespace profiling
+} // namespace nam
+
+#else
+// Non-ARM: Use std::chrono for timing (for testing on desktop)
+#include <chrono>
+
+namespace nam {
+namespace profiling {
+
+Timings g_timings;
+
+uint32_t get_time_us() {
+  using namespace std::chrono;
+  static auto start = high_resolution_clock::now();
+  auto now = high_resolution_clock::now();
+  return (uint32_t)duration_cast<microseconds>(now - start).count();
+}
+
+} // namespace profiling
+} // namespace nam
+
+#endif // ARM check
+
+#endif // NAM_PROFILING
diff --git a/NAM/profiling.h b/NAM/profiling.h
new file mode 100644
index 0000000..71031fe
--- /dev/null
+++ b/NAM/profiling.h
@@ -0,0 +1,153 @@
+#pragma once
+
+// Comprehensive profiling for NAM building blocks
+// Enable with -DNAM_PROFILING
+//
+// Usage:
+// 1. Call nam::profiling::reset() before benchmark
+// 2. Run model processing
+// 3. Call nam::profiling::print_results() to display breakdown
+//
+// Categories cover all WaveNet operations including FiLM modulation.
+
+#ifdef NAM_PROFILING
+
+#include <cstdint>
+#include <cstdio>
+
+namespace nam {
+namespace profiling {
+
+// Timing accumulators (in microseconds)
+struct Timings {
+  // Dilated convolution (Conv1D)
+  uint32_t conv1d = 0;
+
+  // Pointwise convolutions (Conv1x1 variants)
+  uint32_t input_mixin = 0; // Input mixing Conv1x1
+  uint32_t layer1x1 = 0; // Layer 1x1 (residual projection)
+  uint32_t head1x1 = 0; // Head 1x1 (skip connection projection)
+  uint32_t rechannel = 0; // Rechannel Conv1x1 (input/output)
+  uint32_t conv1x1 = 0; // Other Conv1x1 (catch-all for non-WaveNet uses)
+
+  // Activation
+  uint32_t activation = 0; // Activation functions (tanh, ReLU, Softsign, etc.)
+ + // FiLM modulation + uint32_t film = 0; // Feature-wise Linear Modulation (scale/shift) + + // Memory operations + uint32_t copies = 0; // Memory copies and additions + uint32_t setzero = 0; // setZero() calls + uint32_t ringbuf = 0; // Ring buffer operations (Write, Read, Advance) + + // Conditioning + uint32_t condition = 0; // Condition DSP processing + + // LSTM (for LSTM models) + uint32_t lstm = 0; // LSTM cell computations + + // Catch-all + uint32_t other = 0; // Everything else + + void reset() { + conv1d = 0; + input_mixin = 0; + layer1x1 = 0; + head1x1 = 0; + rechannel = 0; + conv1x1 = 0; + activation = 0; + film = 0; + copies = 0; + setzero = 0; + ringbuf = 0; + condition = 0; + lstm = 0; + other = 0; + } + + uint32_t total() const { + return conv1d + input_mixin + layer1x1 + head1x1 + rechannel + conv1x1 + activation + film + copies + setzero + ringbuf + condition + lstm + other; + } +}; + +// Global timing accumulator +extern Timings g_timings; + +// Get current time in microseconds (platform-specific) +uint32_t get_time_us(); + +// Reset profiling counters +inline void reset() { g_timings.reset(); } + +// Print profiling results to stdout +inline void print_results() { + const auto& t = g_timings; + uint32_t total = t.total(); + + printf("\nProfiling breakdown:\n"); + printf("%-12s %8s %6s\n", "Category", "Time(ms)", "%%"); + printf("%-12s %8s %6s\n", "--------", "--------", "----"); + + auto print_row = [total](const char* name, uint32_t us) { + if (us > 0 || total == 0) { + uint32_t pct = total > 0 ? (us * 100 / total) : 0; + printf("%-12s %8.1f %5lu%%\n", name, us / 1000.0f, (unsigned long)pct); + } + }; + + print_row("Conv1D", t.conv1d); + print_row("InputMixin", t.input_mixin); + print_row("Layer1x1", t.layer1x1); + print_row("Head1x1", t.head1x1); + print_row("Rechannel", t.rechannel); + print_row("Conv1x1", t.conv1x1); + print_row("Activation", t.activation); + print_row("FiLM", t.film); + print_row("Copies", t.copies); + print_row("SetZero", t.setzero); + print_row("RingBuf", t.ringbuf); + print_row("Condition", t.condition); + print_row("LSTM", t.lstm); + print_row("Other", t.other); + + printf("%-12s %8s %6s\n", "--------", "--------", "----"); + printf("%-12s %8.1f %5s\n", "Total", total / 1000.0f, "100%"); +} + +// Helper macros for timing sections +// Usage: +// NAM_PROFILE_START(); +// // ... code to profile ... 
+// NAM_PROFILE_ADD(conv1d); // Adds elapsed time to conv1d, resets timer + +#define NAM_PROFILE_START() uint32_t _prof_start = nam::profiling::get_time_us() +#define NAM_PROFILE_ADD(category) do { \ + uint32_t _prof_now = nam::profiling::get_time_us(); \ + nam::profiling::g_timings.category += (_prof_now - _prof_start); \ + _prof_start = _prof_now; \ +} while(0) + +// Variant that doesn't reset the timer (for one-shot measurements) +#define NAM_PROFILE_ADD_NORESTART(category) \ + nam::profiling::g_timings.category += (nam::profiling::get_time_us() - _prof_start) + +} // namespace profiling +} // namespace nam + +#else // NAM_PROFILING not defined + +// No-op macros when profiling is disabled +#define NAM_PROFILE_START() ((void)0) +#define NAM_PROFILE_ADD(category) ((void)0) +#define NAM_PROFILE_ADD_NORESTART(category) ((void)0) + +namespace nam { +namespace profiling { + inline void reset() {} + inline void print_results() {} +} // namespace profiling +} // namespace nam + +#endif // NAM_PROFILING diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp index 7d9b5d0..6112169 100644 --- a/NAM/wavenet.cpp +++ b/NAM/wavenet.cpp @@ -6,6 +6,7 @@ #include #include "get_dsp.h" +#include "profiling.h" #include "registry.h" #include "wavenet.h" @@ -89,6 +90,8 @@ void nam::wavenet::_Layer::set_weights_(std::vector::iterator& weights) void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::MatrixXf& condition, const int num_frames) { + NAM_PROFILE_START(); + const long bottleneck = this->_bottleneck; // Use the actual bottleneck value, not the doubled output channels // Step 1: input convolutions @@ -107,6 +110,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& conv_output = this->_conv.GetOutput(); this->_conv_post_film->Process_(conv_output, condition, num_frames); } + NAM_PROFILE_ADD(conv1d); if (this->_input_mixin_pre_film) { @@ -123,8 +127,12 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& input_mixin_output = this->_input_mixin.GetOutput(); this->_input_mixin_post_film->Process_(input_mixin_output, condition, num_frames); } + NAM_PROFILE_ADD(input_mixin); + this->_z.leftCols(num_frames).noalias() = _conv.GetOutput().leftCols(num_frames) + _input_mixin.GetOutput().leftCols(num_frames); + NAM_PROFILE_ADD(copies); + if (this->_activation_pre_film) { this->_activation_pre_film->Process_(this->_z, condition, num_frames); @@ -139,6 +147,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_gating_mode == GatingMode::NONE) { this->_activation->apply(this->_z.leftCols(num_frames)); + NAM_PROFILE_ADD(activation); if (this->_activation_post_film) { this->_activation_post_film->Process_(this->_z, condition, num_frames); @@ -146,6 +155,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z, num_frames); + NAM_PROFILE_ADD(layer1x1); } } else if (this->_gating_mode == GatingMode::GATED) @@ -155,6 +165,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma auto input_block = this->_z.leftCols(num_frames); auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames); this->_gating_activation->apply(input_block, output_block); + NAM_PROFILE_ADD(activation); if (this->_activation_post_film) { // Use Process() for blocks and copy result back @@ -165,6 +176,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, 
const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames); + NAM_PROFILE_ADD(layer1x1); } } else if (this->_gating_mode == GatingMode::BLENDED) @@ -174,6 +186,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma auto input_block = this->_z.leftCols(num_frames); auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames); this->_blending_activation->apply(input_block, output_block); + NAM_PROFILE_ADD(activation); if (this->_activation_post_film) { // Use Process() for blocks and copy result back @@ -184,6 +197,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames); + NAM_PROFILE_ADD(layer1x1); if (this->_layer1x1_post_film) { Eigen::MatrixXf& layer1x1_output = this->_layer1x1->GetOutput(); @@ -207,6 +221,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& head1x1_output = this->_head1x1->GetOutput(); this->_head1x1_post_film->Process_(head1x1_output, condition, num_frames); } + NAM_PROFILE_ADD(head1x1); this->_output_head.leftCols(num_frames).noalias() = this->_head1x1->GetOutput().leftCols(num_frames); } else // No head 1x1 @@ -230,6 +245,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma // If layer1x1 is inactive, residual connection is just the input (identity) this->_output_next_layer.leftCols(num_frames).noalias() = input.leftCols(num_frames); } + NAM_PROFILE_ADD(copies); } // LayerArray ================================================================= @@ -298,9 +314,12 @@ void nam::wavenet::_LayerArray::Process(const Eigen::MatrixXf& layer_inputs, con void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs, const Eigen::MatrixXf& condition, const int num_frames) { + NAM_PROFILE_START(); + // Process rechannel and get output this->_rechannel.process_(layer_inputs, num_frames); Eigen::MatrixXf& rechannel_output = _rechannel.GetOutput(); + NAM_PROFILE_ADD(rechannel); // Process layers for (size_t i = 0; i < this->_layers.size(); i++) @@ -329,7 +348,11 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs this->_layers[last_layer].GetOutputNextLayer().leftCols(num_frames); // Process head rechannel +#ifdef NAM_PROFILING + _prof_start = nam::profiling::get_time_us(); // Reset timer for accurate head_rechannel measurement +#endif _head_rechannel.process_(this->_head_inputs, num_frames); + NAM_PROFILE_ADD(rechannel); } diff --git a/tools/benchmodel.cpp b/tools/benchmodel.cpp index 39c14b0..42556f5 100644 --- a/tools/benchmodel.cpp +++ b/tools/benchmodel.cpp @@ -4,6 +4,7 @@ #include "NAM/dsp.h" #include "NAM/get_dsp.h" +#include "NAM/profiling.h" using std::chrono::duration; using std::chrono::duration_cast; @@ -62,6 +63,9 @@ int main(int argc, char* argv[]) outputPtrs[ch] = outputBuffers[ch].data(); } + // Reset profiling counters before benchmark + nam::profiling::reset(); + std::cout << "Running benchmark\n"; auto t1 = high_resolution_clock::now(); for (size_t i = 0; i < numBuffers; i++) @@ -80,6 +84,9 @@ int main(int argc, char* argv[]) std::cout << ms_int.count() << "ms\n"; std::cout << ms_double.count() << "ms\n"; + + // Print profiling breakdown if enabled + nam::profiling::print_results(); } else { From 5c535246e531c7721c449b6c3c60871cbee92831 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Fri, 6 Feb 2026 
10:20:55 -0800 Subject: [PATCH 2/7] Fixed build flags for benchmodel --- tools/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 8118e08..94adfce 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -16,6 +16,8 @@ add_executable(run_tests run_tests.cpp test/allocation_tracking.cpp ${NAM_SOURCE # Compile run_tests without optimizations to ensure allocation tracking works correctly # Also ensure assertions are enabled (NDEBUG is not defined) so tests actually run set_target_properties(run_tests PROPERTIES COMPILE_OPTIONS "-O0") +# Benchmodel should be built with NAM_PROFILING set +target_compile_definitions(benchmodel PRIVATE NAM_PROFILING) # Ensure assertions are enabled for run_tests by removing NDEBUG if it was set # Release/RelWithDebInfo/MinSizeRel build types automatically define NDEBUG # We use a compile option to undefine it, which works on GCC, Clang, and MSVC @@ -61,4 +63,4 @@ endif() # /Users/steve/src/NeuralAmpModelerCore/Dependencies/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h # Don't let this break my build on debug: set_source_files_properties(../NAM/dsp.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") -set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") \ No newline at end of file +set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") From 84deb8ab6a17c3dbb77e7efe32716c06d1b2db84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Mon, 9 Feb 2026 10:31:07 -0800 Subject: [PATCH 3/7] Added a command line tool to output memory usage for a given .nam file --- tools/CMakeLists.txt | 2 + tools/memory_usage.cpp | 611 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 613 insertions(+) create mode 100644 tools/memory_usage.cpp diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 94adfce..8f02f20 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -12,6 +12,7 @@ include_directories(tools ${NAM_DEPS_PATH}/nlohmann) add_executable(loadmodel loadmodel.cpp ${NAM_SOURCES}) add_executable(benchmodel benchmodel.cpp ${NAM_SOURCES}) +add_executable(memory_usage memory_usage.cpp) add_executable(run_tests run_tests.cpp test/allocation_tracking.cpp ${NAM_SOURCES}) # Compile run_tests without optimizations to ensure allocation tracking works correctly # Also ensure assertions are enabled (NDEBUG is not defined) so tests actually run @@ -34,6 +35,7 @@ endif() source_group(NAM ${CMAKE_CURRENT_SOURCE_DIR} FILES ${NAM_SOURCES}) target_compile_features(${TOOLS} PUBLIC cxx_std_20) +target_compile_features(memory_usage PUBLIC cxx_std_20) set_target_properties(${TOOLS} PROPERTIES diff --git a/tools/memory_usage.cpp b/tools/memory_usage.cpp new file mode 100644 index 0000000..853ca8f --- /dev/null +++ b/tools/memory_usage.cpp @@ -0,0 +1,611 @@ +// memory_usage.cpp — Report total memory required to host a NAM model at runtime. +// +// Usage: memory_usage [--buffer-size N] +// +// Parses the .nam JSON config and computes weight memory (learned parameters stored +// in Eigen matrices/vectors) and buffer memory (intermediate computation/state that +// depends on maxBufferSize) without instantiating the model. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "json.hpp" + +using json = nlohmann::json; + +static constexpr int DEFAULT_BUFFER_SIZE = 2048; +static constexpr long INPUT_BUFFER_SAFETY_FACTOR = 32; + +// ─── Result accumulator ───────────────────────────────────────────────────── + +struct MemoryResult +{ + size_t weight_bytes = 0; + size_t buffer_bytes = 0; + + void add_weights(size_t floats) { weight_bytes += floats * sizeof(float); } + void add_buffers(size_t floats) { buffer_bytes += floats * sizeof(float); } + + MemoryResult& operator+=(const MemoryResult& o) + { + weight_bytes += o.weight_bytes; + buffer_bytes += o.buffer_bytes; + return *this; + } +}; + +// ─── Conv1x1 ──────────────────────────────────────────────────────────────── + +// Conv1x1 stores either a full (out_channels x in_channels) matrix (possibly +// block-diagonal when grouped), or a depthwise weight vector when groups == +// in_channels == out_channels. +static MemoryResult conv1x1_memory(int in_ch, int out_ch, bool bias, int groups, int M) +{ + MemoryResult r; + bool depthwise = (groups == in_ch && in_ch == out_ch); + if (depthwise) + r.add_weights(in_ch); // _depthwise_weight(in_ch) + else + r.add_weights((size_t)out_ch * in_ch); // _weight(out_ch, in_ch) + if (bias) + r.add_weights(out_ch); // _bias(out_ch) + r.add_buffers((size_t)out_ch * M); // _output(out_ch, M) + return r; +} + +// ─── Conv1D ───────────────────────────────────────────────────────────────── + +// Conv1D stores kernel_size weight matrices (each out_ch x in_ch) or depthwise +// vectors, plus a bias vector, a ring buffer, and an output buffer. +static MemoryResult conv1d_memory(int in_ch, int out_ch, int kernel_size, bool bias, int dilation, int groups, int M) +{ + MemoryResult r; + bool depthwise = (groups == in_ch && in_ch == out_ch); + if (depthwise) + r.add_weights((size_t)kernel_size * in_ch); // _depthwise_weight[k](in_ch) + else + r.add_weights((size_t)kernel_size * out_ch * in_ch); // _weight[k](out_ch, in_ch) + if (bias) + r.add_weights(out_ch); // _bias(out_ch) + + // Ring buffer: storage = (in_ch, 2 * max_lookback + M) + // max_lookback = (kernel_size - 1) * dilation + long max_lookback = (kernel_size > 0) ? (long)(kernel_size - 1) * dilation : 0; + long ring_storage = 2 * max_lookback + M; + r.add_buffers((size_t)in_ch * ring_storage); // _input_buffer._storage + + // Output buffer: (out_ch, M) + r.add_buffers((size_t)out_ch * M); // _output + + return r; +} + +// ─── FiLM ─────────────────────────────────────────────────────────────────── + +struct FiLMParams +{ + bool active = false; + bool shift = true; + int groups = 1; +}; + +static MemoryResult film_memory(int condition_dim, int input_dim, const FiLMParams& fp, int M) +{ + if (!fp.active) + return {}; + MemoryResult r; + int scale_shift_dim = fp.shift ? 2 * input_dim : input_dim; + // _cond_to_scale_shift is a Conv1x1(condition_dim -> scale_shift_dim, bias=true, groups) + r += conv1x1_memory(condition_dim, scale_shift_dim, true, fp.groups, M); + // _output(input_dim, M) + r.add_buffers((size_t)input_dim * M); + return r; +} + +// ─── BatchNorm ────────────────────────────────────────────────────────────── + +static MemoryResult batchnorm_memory(int dim) +{ + MemoryResult r; + // Stores scale(dim) + loc(dim) derived from running_mean, running_var, weight, bias, eps + // The source values are consumed from weights array; only scale + loc are stored at runtime. 
+ r.add_weights(2 * (size_t)dim); + return r; +} + +// ─── LSTM ─────────────────────────────────────────────────────────────────── + +static MemoryResult lstm_memory(const json& config) +{ + MemoryResult r; + int num_layers = config["num_layers"]; + int input_size = config["input_size"]; + int hidden_size = config["hidden_size"]; + int in_channels = config.value("in_channels", 1); + int out_channels = config.value("out_channels", 1); + + for (int i = 0; i < num_layers; i++) + { + int cell_input = (i == 0) ? input_size : hidden_size; + // _w(4*H, I+H) + r.add_weights((size_t)4 * hidden_size * (cell_input + hidden_size)); + // _b(4*H) + r.add_weights(4 * (size_t)hidden_size); + // _xh(I+H) — stores initial hidden state in the hidden portion + r.add_weights((size_t)(cell_input + hidden_size)); + // _c(H) — initial cell state + r.add_weights((size_t)hidden_size); + + // Buffers: _ifgo(4*H) + r.add_buffers(4 * (size_t)hidden_size); + // Note: _xh and _c are also modified during inference but they are + // loaded from weights (initial state), so counted as weights above. + } + + // _head_weight(out_channels, hidden_size) + r.add_weights((size_t)out_channels * hidden_size); + // _head_bias(out_channels) + r.add_weights(out_channels); + + // Top-level buffers: _input(input_size), _output(out_channels) + r.add_buffers(input_size); + r.add_buffers(out_channels); + + return r; +} + +// ─── Linear ───────────────────────────────────────────────────────────────── + +static MemoryResult linear_memory(const json& config) +{ + MemoryResult r; + int receptive_field = config["receptive_field"]; + bool bias = config["bias"]; + int in_channels = config.value("in_channels", 1); + int out_channels = config.value("out_channels", 1); + + // _weight(receptive_field) + r.add_weights(receptive_field); + // _bias (scalar float) + if (bias) + r.add_weights(1); + + // Buffer base: _input_buffers = in_channels vectors of (32 * receptive_field) + r.add_buffers((size_t)in_channels * INPUT_BUFFER_SAFETY_FACTOR * receptive_field); + // _output_buffers: resized per-call, not pre-allocated to a fixed size + // (depends on num_frames, not maxBufferSize) + + return r; +} + +// ─── ConvNet ──────────────────────────────────────────────────────────────── + +static MemoryResult convnet_memory(const json& config, int M) +{ + MemoryResult r; + int channels = config["channels"]; + std::vector dilations = config["dilations"]; + bool batchnorm = config["batchnorm"]; + int groups = config.value("groups", 1); + int in_channels = config.value("in_channels", 1); + int out_channels = config.value("out_channels", 1); + + int max_dilation = *std::max_element(dilations.begin(), dilations.end()); + + // Buffer base class: _input_buffers = in_channels * (32 * max_dilation) + int receptive_field = max_dilation; // passed to Buffer as receptive_field + r.add_buffers((size_t)in_channels * INPUT_BUFFER_SAFETY_FACTOR * receptive_field); + + // ConvNet blocks + for (size_t i = 0; i < dilations.size(); i++) + { + int block_in = (i == 0) ? 
in_channels : channels; + int block_out = channels; + // Conv1D with kernel_size=2, bias=!batchnorm + r += conv1d_memory(block_in, block_out, 2, !batchnorm, dilations[i], groups, M); + // Optional batchnorm + if (batchnorm) + r += batchnorm_memory(block_out); + // _output(out_channels, M) per block + r.add_buffers((size_t)block_out * M); + } + + // _block_vals: 1 entry of (channels, buffer_size) + // buffer_size = input_buffers[0].size() = 32 * receptive_field + long buffer_size = INPUT_BUFFER_SAFETY_FACTOR * receptive_field; + r.add_buffers((size_t)channels * buffer_size); + + // _head: weight(out_channels, channels) + bias(out_channels) + r.add_weights((size_t)out_channels * channels); + r.add_weights(out_channels); + + // _head_output is resized per-call, not a fixed pre-allocation + + return r; +} + +// ─── WaveNet helpers ──────────────────────────────────────────────────────── + +static FiLMParams parse_film_params(const json& layer_config, const std::string& key) +{ + FiLMParams fp; + if (layer_config.find(key) == layer_config.end() || layer_config[key] == false) + return fp; // inactive + const json& fc = layer_config[key]; + fp.active = fc.value("active", true); + fp.shift = fc.value("shift", true); + fp.groups = fc.value("groups", 1); + return fp; +} + +enum class GatingMode +{ + NONE, + GATED, + BLENDED +}; + +static std::vector parse_gating_modes(const json& layer_config, size_t num_layers) +{ + std::vector modes; + + auto parse_str = [](const std::string& s) -> GatingMode { + if (s == "gated") + return GatingMode::GATED; + if (s == "blended") + return GatingMode::BLENDED; + return GatingMode::NONE; + }; + + if (layer_config.find("gating_mode") != layer_config.end()) + { + if (layer_config["gating_mode"].is_array()) + { + for (const auto& gm : layer_config["gating_mode"]) + modes.push_back(parse_str(gm.get())); + } + else + { + GatingMode mode = parse_str(layer_config["gating_mode"].get()); + modes.resize(num_layers, mode); + } + } + else if (layer_config.find("gated") != layer_config.end()) + { + bool gated = layer_config["gated"]; + modes.resize(num_layers, gated ? GatingMode::GATED : GatingMode::NONE); + } + else + { + modes.resize(num_layers, GatingMode::NONE); + } + return modes; +} + +// WaveNet _Layer memory +static MemoryResult wavenet_layer_memory(int condition_size, int channels, int bottleneck, int kernel_size, int dilation, + GatingMode gating_mode, int groups_input, int groups_input_mixin, + bool layer1x1_active, int layer1x1_groups, bool head1x1_active, + int head1x1_out_channels, int head1x1_groups, const FiLMParams& conv_pre_film, + const FiLMParams& conv_post_film, const FiLMParams& input_mixin_pre_film, + const FiLMParams& input_mixin_post_film, + const FiLMParams& activation_pre_film, + const FiLMParams& activation_post_film, + const FiLMParams& layer1x1_post_film, const FiLMParams& head1x1_post_film, + int M) +{ + MemoryResult r; + bool gated = (gating_mode != GatingMode::NONE); + int conv_out = gated ? 
2 * bottleneck : bottleneck; + + // _conv: Conv1D(channels -> conv_out, kernel_size, bias=true, dilation, groups_input) + r += conv1d_memory(channels, conv_out, kernel_size, true, dilation, groups_input, M); + + // _input_mixin: Conv1x1(condition_size -> conv_out, bias=false, groups_input_mixin) + r += conv1x1_memory(condition_size, conv_out, false, groups_input_mixin, M); + + // _layer1x1 (optional): Conv1x1(bottleneck -> channels, bias=true, layer1x1_groups) + if (layer1x1_active) + r += conv1x1_memory(bottleneck, channels, true, layer1x1_groups, M); + + // _head1x1 (optional): Conv1x1(bottleneck -> head1x1_out_channels, bias=true, head1x1_groups) + if (head1x1_active) + r += conv1x1_memory(bottleneck, head1x1_out_channels, true, head1x1_groups, M); + + // Buffers: _z(conv_out, M) + r.add_buffers((size_t)conv_out * M); + // _output_next_layer(channels, M) + r.add_buffers((size_t)channels * M); + // _output_head: if head1x1 active -> (head1x1_out_channels, M), else (bottleneck, M) + int head_out = head1x1_active ? head1x1_out_channels : bottleneck; + r.add_buffers((size_t)head_out * M); + + // FiLM modules (up to 8) + r += film_memory(condition_size, channels, conv_pre_film, M); + r += film_memory(condition_size, conv_out, conv_post_film, M); + r += film_memory(condition_size, condition_size, input_mixin_pre_film, M); + r += film_memory(condition_size, conv_out, input_mixin_post_film, M); + r += film_memory(condition_size, conv_out, activation_pre_film, M); + r += film_memory(condition_size, bottleneck, activation_post_film, M); + if (layer1x1_active) + r += film_memory(condition_size, channels, layer1x1_post_film, M); + if (head1x1_active) + r += film_memory(condition_size, head1x1_out_channels, head1x1_post_film, M); + + return r; +} + +// WaveNet _LayerArray memory +static MemoryResult wavenet_layer_array_memory(const json& layer_config, int M) +{ + MemoryResult r; + int input_size = layer_config["input_size"]; + int condition_size = layer_config["condition_size"]; + int head_size = layer_config["head_size"]; + int channels = layer_config["channels"]; + int bottleneck = layer_config.value("bottleneck", channels); + int kernel_size = layer_config["kernel_size"]; + std::vector dilations = layer_config["dilations"]; + size_t num_layers = dilations.size(); + bool head_bias = layer_config["head_bias"]; + + int groups_input = layer_config.value("groups_input", 1); + int groups_input_mixin = layer_config.value("groups_input_mixin", 1); + + // layer1x1 params + bool layer1x1_active = true; + int layer1x1_groups = 1; + if (layer_config.find("layer1x1") != layer_config.end()) + { + layer1x1_active = layer_config["layer1x1"]["active"]; + layer1x1_groups = layer_config["layer1x1"]["groups"]; + } + + // head1x1 params + bool head1x1_active = false; + int head1x1_out_channels = channels; + int head1x1_groups = 1; + if (layer_config.find("head1x1") != layer_config.end()) + { + head1x1_active = layer_config["head1x1"]["active"]; + head1x1_out_channels = layer_config["head1x1"]["out_channels"]; + head1x1_groups = layer_config["head1x1"]["groups"]; + } + + // Gating modes + std::vector gating_modes = parse_gating_modes(layer_config, num_layers); + + // FiLM params + FiLMParams conv_pre = parse_film_params(layer_config, "conv_pre_film"); + FiLMParams conv_post = parse_film_params(layer_config, "conv_post_film"); + FiLMParams input_mixin_pre = parse_film_params(layer_config, "input_mixin_pre_film"); + FiLMParams input_mixin_post = parse_film_params(layer_config, "input_mixin_post_film"); + FiLMParams 
activation_pre = parse_film_params(layer_config, "activation_pre_film"); + FiLMParams activation_post = parse_film_params(layer_config, "activation_post_film"); + FiLMParams layer1x1_post = parse_film_params(layer_config, "layer1x1_post_film"); + FiLMParams head1x1_post = parse_film_params(layer_config, "head1x1_post_film"); + + // _rechannel: Conv1x1(input_size -> channels, bias=false) + r += conv1x1_memory(input_size, channels, false, 1, M); + + // Per-layer + for (size_t i = 0; i < num_layers; i++) + { + r += wavenet_layer_memory(condition_size, channels, bottleneck, kernel_size, dilations[i], gating_modes[i], + groups_input, groups_input_mixin, layer1x1_active, layer1x1_groups, head1x1_active, + head1x1_out_channels, head1x1_groups, conv_pre, conv_post, input_mixin_pre, + input_mixin_post, activation_pre, activation_post, layer1x1_post, head1x1_post, M); + } + + // _head_rechannel: Conv1x1(head_output_size -> head_size, bias=head_bias) + int head_output_size = head1x1_active ? head1x1_out_channels : bottleneck; + r += conv1x1_memory(head_output_size, head_size, head_bias, 1, M); + + // Buffers: _layer_outputs(channels, M) + r.add_buffers((size_t)channels * M); + // _head_inputs(head_output_size, M) + r.add_buffers((size_t)head_output_size * M); + + return r; +} + +// Forward declaration for recursive condition_dsp +static MemoryResult compute_memory(const std::string& architecture, const json& config, int M); + +// WaveNet top-level memory +static MemoryResult wavenet_memory(const json& config, int M) +{ + MemoryResult r; + int in_channels = config.value("in_channels", 1); + + // condition_dim = in_channels (from _get_condition_dim()) + int condition_dim = in_channels; + + // Recursive condition_dsp + bool has_condition_dsp = false; + int condition_output_channels = condition_dim; + if (config.find("condition_dsp") != config.end()) + { + has_condition_dsp = true; + const json& cdsp = config["condition_dsp"]; + std::string cdsp_arch = cdsp["architecture"]; + json cdsp_config = cdsp["config"]; + r += compute_memory(cdsp_arch, cdsp_config, M); + // condition_output_channels comes from the condition_dsp's output + // For now, we use condition_size from first layer as a proxy + // (the actual model validates this match) + if (config.find("layers") != config.end() && config["layers"].size() > 0) + condition_output_channels = config["layers"][0]["condition_size"]; + } + + // _condition_input(condition_dim, M) + r.add_buffers((size_t)condition_dim * M); + + // _condition_output + if (!has_condition_dsp) + { + // _condition_output(condition_dim, M) + r.add_buffers((size_t)condition_dim * M); + } + else + { + // _condition_output(condition_output_channels, M) + r.add_buffers((size_t)condition_output_channels * M); + // _condition_dsp_input_buffers: condition_dim vectors of M doubles/floats + // These are std::vector> where NAM_SAMPLE is double + r.add_buffers((size_t)condition_dim * M * (sizeof(double) / sizeof(float))); + // _condition_dsp_output_buffers: condition_output_channels vectors of M doubles + r.add_buffers((size_t)condition_output_channels * M * (sizeof(double) / sizeof(float))); + // Pointer arrays are negligible + } + + // Layer arrays + for (const auto& layer_config : config["layers"]) + r += wavenet_layer_array_memory(layer_config, M); + + // _head_scale (1 float) — it's a weight + r.add_weights(1); + + return r; +} + +// ─── Dispatch ─────────────────────────────────────────────────────────────── + +static MemoryResult compute_memory(const std::string& architecture, const json& 
config, int M) +{ + if (architecture == "WaveNet") + return wavenet_memory(config, M); + if (architecture == "LSTM") + return lstm_memory(config); + if (architecture == "ConvNet") + return convnet_memory(config, M); + if (architecture == "Linear") + return linear_memory(config); + throw std::runtime_error("Unknown architecture: " + architecture); +} + +// ─── Formatting helpers ───────────────────────────────────────────────────── + +static std::string format_bytes(size_t bytes) +{ + char buf[64]; + if (bytes < 1024) + snprintf(buf, sizeof(buf), "%zu bytes", bytes); + else if (bytes < 1024 * 1024) + snprintf(buf, sizeof(buf), "%.2f KB", bytes / 1024.0); + else + snprintf(buf, sizeof(buf), "%.2f MB", bytes / (1024.0 * 1024.0)); + return buf; +} + +static std::string format_with_commas(size_t n) +{ + std::string s = std::to_string(n); + int insert_pos = (int)s.length() - 3; + while (insert_pos > 0) + { + s.insert(insert_pos, ","); + insert_pos -= 3; + } + return s; +} + +// ─── Main ─────────────────────────────────────────────────────────────────── + +int main(int argc, char* argv[]) +{ + if (argc < 2) + { + fprintf(stderr, "Usage: memory_usage [--buffer-size N]\n"); + return 1; + } + + const char* model_path = argv[1]; + int buffer_size = DEFAULT_BUFFER_SIZE; + + for (int i = 2; i < argc; i++) + { + if (strcmp(argv[i], "--buffer-size") == 0 && i + 1 < argc) + { + buffer_size = atoi(argv[++i]); + if (buffer_size <= 0) + { + fprintf(stderr, "Error: buffer size must be positive\n"); + return 1; + } + } + else + { + fprintf(stderr, "Unknown option: %s\n", argv[i]); + return 1; + } + } + + // Read and parse JSON + std::ifstream file(model_path); + if (!file.is_open()) + { + fprintf(stderr, "Error: cannot open %s\n", model_path); + return 1; + } + + json j; + try + { + file >> j; + } + catch (const std::exception& e) + { + fprintf(stderr, "Error parsing JSON: %s\n", e.what()); + return 1; + } + + std::string architecture = j["architecture"]; + json config = j["config"]; + + // Cross-check: count weights in JSON + size_t json_weight_count = 0; + if (j.find("weights") != j.end()) + json_weight_count = j["weights"].size(); + + double sample_rate = -1.0; + if (j.find("sample_rate") != j.end()) + sample_rate = j["sample_rate"]; + + try + { + MemoryResult result = compute_memory(architecture, config, buffer_size); + size_t total = result.weight_bytes + result.buffer_bytes; + + printf("Model: %s\n", model_path); + printf("Architecture: %s\n", architecture.c_str()); + if (sample_rate > 0) + printf("Sample rate: %.0f Hz\n", sample_rate); + printf("\n"); + printf("Weights: %s bytes (%s)\n", format_with_commas(result.weight_bytes).c_str(), + format_bytes(result.weight_bytes).c_str()); + printf("Buffers: %s bytes (%s) [buffer size: %d]\n", format_with_commas(result.buffer_bytes).c_str(), + format_bytes(result.buffer_bytes).c_str(), buffer_size); + printf("Total: %s bytes (%s)\n", format_with_commas(total).c_str(), format_bytes(total).c_str()); + + if (json_weight_count > 0) + { + printf("\nJSON weights: %zu values (%s bytes)\n", json_weight_count, + format_with_commas(json_weight_count * sizeof(float)).c_str()); + } + } + catch (const std::exception& e) + { + fprintf(stderr, "Error computing memory: %s\n", e.what()); + return 1; + } + + return 0; +} From 725c8ca814d491c0709f2c129df807e9e7356c59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= <5733+jfsantos@users.noreply.github.com> Date: Wed, 11 Feb 2026 16:42:43 -0800 Subject: [PATCH 4/7] Bugfix - checking that condition_dsp is not 
null in the JSON (#220) --- NAM/wavenet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp index 6112169..a955d4e 100644 --- a/NAM/wavenet.cpp +++ b/NAM/wavenet.cpp @@ -596,7 +596,7 @@ std::unique_ptr nam::wavenet::Factory(const nlohmann::json& config, st const double expectedSampleRate) { std::unique_ptr condition_dsp = nullptr; - if (config.find("condition_dsp") != config.end()) + if (config.find("condition_dsp") != config.end() && !config["condition_dsp"].is_null()) { const nlohmann::json& condition_dsp_json = config["condition_dsp"]; condition_dsp = nam::get_dsp(condition_dsp_json); From c95605599a100ff68aaba8af014a3124c4bfbfb9 Mon Sep 17 00:00:00 2001 From: Steven Atkinson Date: Wed, 11 Feb 2026 19:15:32 -0800 Subject: [PATCH 5/7] [BUGFIX, BREAKING] Make activation base class abstract, fix PReLU implementation (#223) * Make activation apply method pure virtual instead of no-op default * Fix bugs * Refactor to throw std::invalid_argument in debug mode, add tests --- NAM/activations.h | 35 ++++++++++++++++--- tools/run_tests.cpp | 5 +-- tools/test/test_activations.cpp | 61 +++++++++++++++++++++++++++++---- 3 files changed, 89 insertions(+), 12 deletions(-) diff --git a/NAM/activations.h b/NAM/activations.h index 68d5025..a05c456 100644 --- a/NAM/activations.h +++ b/NAM/activations.h @@ -2,6 +2,8 @@ #include #include // expf +#include // std::cerr (kept for potential debug use) +#include // std::invalid_argument #include #include #include @@ -150,7 +152,7 @@ class Activation { apply(block.data(), block.rows() * block.cols()); } - virtual void apply(float* data, long size) {} + virtual void apply(float* data, long size) = 0; static Ptr get_activation(const std::string name); static Ptr get_activation(const ActivationConfig& config); @@ -165,13 +167,13 @@ class Activation static std::unordered_map _activations; }; -// identity function activation +// identity function activation--"do nothing" class ActivationIdentity : public nam::activations::Activation { public: ActivationIdentity() = default; ~ActivationIdentity() = default; - // Inherit the default apply methods which do nothing + virtual void apply(float* data, long size) override {}; }; class ActivationTanh : public Activation @@ -276,6 +278,24 @@ class ActivationPReLU : public Activation } ActivationPReLU(std::vector ns) { negative_slopes = ns; } + void apply(float* data, long size) override + { + // Assume column-major (this is brittle) +#ifndef NDEBUG + if (size % negative_slopes.size() != 0) + { + throw std::invalid_argument("PReLU.apply(*data, size) was given an array of size " + std::to_string(size) + + " but the activation has " + std::to_string(negative_slopes.size()) + + " channels, which doesn't divide evenly."); + } +#endif + for (long pos = 0; pos < size; pos++) + { + const float negative_slope = negative_slopes[pos % negative_slopes.size()]; + data[pos] = leaky_relu(data[pos], negative_slope); + } + } + void apply(Eigen::MatrixXf& matrix) override { // Matrix is organized as (channels, time_steps) @@ -285,7 +305,14 @@ class ActivationPReLU : public Activation std::vector slopes_for_channels = negative_slopes; // Fail loudly if input has more channels than activation - assert(actual_channels == negative_slopes.size()); +#ifndef NDEBUG + if (actual_channels != negative_slopes.size()) + { + throw std::invalid_argument("PReLU: Received " + std::to_string(actual_channels) + + " channels, but activation has " + std::to_string(negative_slopes.size()) + + " channels"); + } 
+#endif // Apply each negative slope to its corresponding channel for (unsigned long channel = 0; channel < actual_channels; channel++) diff --git a/tools/run_tests.cpp b/tools/run_tests.cpp index 9b3bdec..1c8c34a 100644 --- a/tools/run_tests.cpp +++ b/tools/run_tests.cpp @@ -48,8 +48,9 @@ int main() test_activations::TestPReLU::test_core_function(); test_activations::TestPReLU::test_per_channel_behavior(); - // This is enforced by an assert so it doesn't need to be tested - // test_activations::TestPReLU::test_wrong_number_of_channels(); + test_activations::TestPReLU::test_wrong_number_of_channels_matrix(); + test_activations::TestPReLU::test_wrong_size_array(); + test_activations::TestPReLU::test_valid_array_size(); // Typed ActivationConfig tests test_activations::TestTypedActivationConfig::test_simple_config(); diff --git a/tools/test/test_activations.cpp b/tools/test/test_activations.cpp index a8dd705..55d61ff 100644 --- a/tools/test/test_activations.cpp +++ b/tools/test/test_activations.cpp @@ -220,9 +220,10 @@ class TestPReLU assert(fabs(data(1, 2) - 0.0f) < 1e-6); // 0.0 (unchanged) } - static void test_wrong_number_of_channels() + static void test_wrong_number_of_channels_matrix() { - // Test that we fail when we have more channels than slopes + // Test that we fail when matrix has more channels than slopes + // Note: This validation only runs in debug builds (#ifndef NDEBUG) Eigen::MatrixXf data(3, 2); // 3 channels, 2 time steps // Initialize with test data @@ -232,21 +233,69 @@ class TestPReLU std::vector slopes = {0.01f, 0.05f}; nam::activations::ActivationPReLU prelu(slopes); - // Apply the activation +#ifndef NDEBUG + // In debug mode, this should throw std::invalid_argument bool caught = false; try { prelu.apply(data); } - catch (const std::runtime_error& e) + catch (const std::invalid_argument& e) { caught = true; } - catch (...) 
+ assert(caught && "Expected std::invalid_argument for channel count mismatch"); +#endif + } + + static void test_wrong_size_array() + { + // Test that we fail when array size doesn't divide evenly by channel count + // Note: This validation only runs in debug builds (#ifndef NDEBUG) + + // Create PReLU with 2 channels + std::vector slopes = {0.01f, 0.05f}; + nam::activations::ActivationPReLU prelu(slopes); + + // Array of size 5 doesn't divide evenly by 2 channels + std::vector data = {-1.0f, -2.0f, 0.5f, 1.0f, -0.5f}; + +#ifndef NDEBUG + // In debug mode, this should throw std::invalid_argument + bool caught = false; + try + { + prelu.apply(data.data(), (long)data.size()); + } + catch (const std::invalid_argument& e) { + caught = true; } + assert(caught && "Expected std::invalid_argument for array size mismatch"); +#endif + } + + static void test_valid_array_size() + { + // Test that valid array sizes work correctly + + // Create PReLU with 2 channels + std::vector slopes = {0.1f, 0.2f}; + nam::activations::ActivationPReLU prelu(slopes); + + // Array of size 6 divides evenly by 2 channels (3 time steps per channel) + std::vector data = {-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f}; + + // Should not throw + prelu.apply(data.data(), (long)data.size()); - assert(caught); + // Verify results: alternating between slope 0.1 and 0.2 + assert(fabs(data[0] - (-0.1f)) < 1e-6); // channel 0, slope 0.1 + assert(fabs(data[1] - (-0.2f)) < 1e-6); // channel 1, slope 0.2 + assert(fabs(data[2] - (-0.1f)) < 1e-6); // channel 0, slope 0.1 + assert(fabs(data[3] - (-0.2f)) < 1e-6); // channel 1, slope 0.2 + assert(fabs(data[4] - (-0.1f)) < 1e-6); // channel 0, slope 0.1 + assert(fabs(data[5] - (-0.2f)) < 1e-6); // channel 1, slope 0.2 } }; From d499f74fb75711641dfc6207ba5931902f58661f Mon Sep 17 00:00:00 2001 From: Steven Atkinson Date: Wed, 11 Feb 2026 19:46:05 -0800 Subject: [PATCH 6/7] Add TONE3000 support note in README.md (#224) --- README.md | 9 +++++++++ media/tone3000-logo.svg | 31 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 media/tone3000-logo.svg diff --git a/README.md b/README.md index 6e5aeda..f28bc97 100644 --- a/README.md +++ b/README.md @@ -12,3 +12,12 @@ You should be able to run it locally to test if you'd like. ## Sharp edges This library uses [Eigen](http://eigen.tuxfamily.org) to do the linear algebra routines that its neural networks require. Since these models hold their parameters as eigen object members, there is a risk with certain compilers and compiler optimizations that their memory is not aligned properly. This can be worked around by providing two preprocessor macros: `EIGEN_MAX_ALIGN_BYTES 0` and `EIGEN_DONT_VECTORIZE`, though this will probably harm performance. See [Structs Having Eigen Members](http://eigen.tuxfamily.org/dox-3.2/group__TopicStructHavingEigenMembers.html) for more information. This is being tracked as [Issue 67](https://github.com/sdatkinson/NeuralAmpModelerCore/issues/67). + +## Sponsors + +
+  <img src="media/tone3000-logo.svg" alt="Tone3000 logo">
+</div>
+ +Development of version 0.4.0 of this library has been generously supported by [TONE3000](https://tone3000.com). +**Thank you!** diff --git a/media/tone3000-logo.svg b/media/tone3000-logo.svg new file mode 100644 index 0000000..469592d --- /dev/null +++ b/media/tone3000-logo.svg @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From d68514da4ae5d68146c6cceaac1d1ca22c27f110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Thu, 12 Feb 2026 11:16:10 -0800 Subject: [PATCH 7/7] Replace hardcoded profiling struct with dynamic registry The Timings struct hardcoded 14 named fields, requiring manual updates to reset(), total(), print_results(), and every call site whenever a category was added or removed. Replace with a flat-array registry where types are registered at file scope via register_type(), returning an integer index for O(1) accumulation in the hot path. Also adds NAM_PROFILE_RESTART() macro to replace a raw #ifdef block in wavenet.cpp. --- NAM/profiling.cpp | 45 ++++++++++++++- NAM/profiling.h | 136 ++++++++++++---------------------------------- NAM/wavenet.cpp | 40 ++++++++------ 3 files changed, 101 insertions(+), 120 deletions(-) diff --git a/NAM/profiling.cpp b/NAM/profiling.cpp index 65d430e..885872e 100644 --- a/NAM/profiling.cpp +++ b/NAM/profiling.cpp @@ -9,7 +9,8 @@ namespace nam { namespace profiling { -Timings g_timings; +ProfilingEntry g_entries[MAX_PROFILING_TYPES] = {}; +int g_num_entries = 0; // CPU frequency in MHz (Daisy runs at 480 MHz) static constexpr uint32_t CPU_FREQ_MHZ = 480; @@ -30,7 +31,8 @@ uint32_t get_time_us() { namespace nam { namespace profiling { -Timings g_timings; +ProfilingEntry g_entries[MAX_PROFILING_TYPES] = {}; +int g_num_entries = 0; uint32_t get_time_us() { using namespace std::chrono; @@ -44,4 +46,43 @@ uint32_t get_time_us() { #endif // ARM check +namespace nam { +namespace profiling { + +int register_type(const char* name) { + int idx = g_num_entries++; + g_entries[idx].name = name; + g_entries[idx].accumulated_us = 0; + return idx; +} + +void reset() { + for (int i = 0; i < g_num_entries; i++) + g_entries[i].accumulated_us = 0; +} + +void print_results() { + uint32_t total = 0; + for (int i = 0; i < g_num_entries; i++) + total += g_entries[i].accumulated_us; + + printf("\nProfiling breakdown:\n"); + printf("%-12s %8s %6s\n", "Category", "Time(ms)", "%"); + printf("%-12s %8s %6s\n", "--------", "--------", "----"); + + for (int i = 0; i < g_num_entries; i++) { + uint32_t us = g_entries[i].accumulated_us; + if (us > 0) { + uint32_t pct = total > 0 ? (us * 100 / total) : 0; + printf("%-12s %8.1f %5lu%%\n", g_entries[i].name, us / 1000.0f, (unsigned long)pct); + } + } + + printf("%-12s %8s %6s\n", "--------", "--------", "----"); + printf("%-12s %8.1f %5s\n", "Total", total / 1000.0f, "100%"); +} + +} // namespace profiling +} // namespace nam + #endif // NAM_PROFILING diff --git a/NAM/profiling.h b/NAM/profiling.h index 71031fe..4db570b 100644 --- a/NAM/profiling.h +++ b/NAM/profiling.h @@ -1,14 +1,17 @@ #pragma once -// Comprehensive profiling for NAM building blocks +// Dynamic profiling registry for NAM building blocks // Enable with -DNAM_PROFILING // // Usage: -// 1. Call nam::profiling::reset() before benchmark -// 2. Run model processing -// 3. Call nam::profiling::print_results() to display breakdown -// -// Categories cover all WaveNet operations including FiLM modulation. +// 1. 
Register profiling types at file scope (static init): +// static int PROF_FOO = nam::profiling::register_type("Foo"); +// 2. Call nam::profiling::reset() before benchmark +// 3. In hot path: +// NAM_PROFILE_START(); +// // ... code ... +// NAM_PROFILE_ADD(PROF_FOO); +// 4. Call nam::profiling::print_results() to display breakdown #ifdef NAM_PROFILING @@ -18,120 +21,48 @@ namespace nam { namespace profiling { -// Timing accumulators (in microseconds) -struct Timings { - // Dilated convolution (Conv1D) - uint32_t conv1d = 0; - - // Pointwise convolutions (Conv1x1 variants) - uint32_t input_mixin = 0; // Input mixing Conv1x1 - uint32_t layer1x1 = 0; // Layer 1x1 (residual projection) - uint32_t head1x1 = 0; // Head 1x1 (skip connection projection) - uint32_t rechannel = 0; // Rechannel Conv1x1 (input/output) - uint32_t conv1x1 = 0; // Other Conv1x1 (catch-all for non-WaveNet uses) - - // Activation - uint32_t activation = 0; // Activation functions (tanh, ReLU, Softsign, etc.) - - // FiLM modulation - uint32_t film = 0; // Feature-wise Linear Modulation (scale/shift) - - // Memory operations - uint32_t copies = 0; // Memory copies and additions - uint32_t setzero = 0; // setZero() calls - uint32_t ringbuf = 0; // Ring buffer operations (Write, Read, Advance) - - // Conditioning - uint32_t condition = 0; // Condition DSP processing - - // LSTM (for LSTM models) - uint32_t lstm = 0; // LSTM cell computations - - // Catch-all - uint32_t other = 0; // Everything else - - void reset() { - conv1d = 0; - input_mixin = 0; - layer1x1 = 0; - head1x1 = 0; - rechannel = 0; - conv1x1 = 0; - activation = 0; - film = 0; - copies = 0; - setzero = 0; - ringbuf = 0; - condition = 0; - lstm = 0; - other = 0; - } - - uint32_t total() const { - return conv1d + input_mixin + layer1x1 + head1x1 + rechannel + conv1x1 + activation + film + copies + setzero + ringbuf + condition + lstm + other; - } +constexpr int MAX_PROFILING_TYPES = 32; + +struct ProfilingEntry { + const char* name; + uint32_t accumulated_us; }; -// Global timing accumulator -extern Timings g_timings; +extern ProfilingEntry g_entries[MAX_PROFILING_TYPES]; +extern int g_num_entries; + +// Register a named profiling type. Returns index for fast accumulation. +// Called at static-init time or during setup, NOT in the hot path. +int register_type(const char* name); // Get current time in microseconds (platform-specific) uint32_t get_time_us(); -// Reset profiling counters -inline void reset() { g_timings.reset(); } +// Reset all profiling counters +void reset(); // Print profiling results to stdout -inline void print_results() { - const auto& t = g_timings; - uint32_t total = t.total(); - - printf("\nProfiling breakdown:\n"); - printf("%-12s %8s %6s\n", "Category", "Time(ms)", "%%"); - printf("%-12s %8s %6s\n", "--------", "--------", "----"); - - auto print_row = [total](const char* name, uint32_t us) { - if (us > 0 || total == 0) { - uint32_t pct = total > 0 ? 
(us * 100 / total) : 0; - printf("%-12s %8.1f %5lu%%\n", name, us / 1000.0f, (unsigned long)pct); - } - }; - - print_row("Conv1D", t.conv1d); - print_row("InputMixin", t.input_mixin); - print_row("Layer1x1", t.layer1x1); - print_row("Head1x1", t.head1x1); - print_row("Rechannel", t.rechannel); - print_row("Conv1x1", t.conv1x1); - print_row("Activation", t.activation); - print_row("FiLM", t.film); - print_row("Copies", t.copies); - print_row("SetZero", t.setzero); - print_row("RingBuf", t.ringbuf); - print_row("Condition", t.condition); - print_row("LSTM", t.lstm); - print_row("Other", t.other); - - printf("%-12s %8s %6s\n", "--------", "--------", "----"); - printf("%-12s %8.1f %5s\n", "Total", total / 1000.0f, "100%"); -} +void print_results(); // Helper macros for timing sections // Usage: // NAM_PROFILE_START(); // // ... code to profile ... -// NAM_PROFILE_ADD(conv1d); // Adds elapsed time to conv1d, resets timer +// NAM_PROFILE_ADD(PROF_FOO); // Adds elapsed time to entry, resets timer #define NAM_PROFILE_START() uint32_t _prof_start = nam::profiling::get_time_us() -#define NAM_PROFILE_ADD(category) do { \ +#define NAM_PROFILE_ADD(idx) do { \ uint32_t _prof_now = nam::profiling::get_time_us(); \ - nam::profiling::g_timings.category += (_prof_now - _prof_start); \ + nam::profiling::g_entries[idx].accumulated_us += (_prof_now - _prof_start); \ _prof_start = _prof_now; \ } while(0) // Variant that doesn't reset the timer (for one-shot measurements) -#define NAM_PROFILE_ADD_NORESTART(category) \ - nam::profiling::g_timings.category += (nam::profiling::get_time_us() - _prof_start) +#define NAM_PROFILE_ADD_NORESTART(idx) \ + nam::profiling::g_entries[idx].accumulated_us += (nam::profiling::get_time_us() - _prof_start) + +// Reset the timer without recording (for re-syncing mid-function) +#define NAM_PROFILE_RESTART() _prof_start = nam::profiling::get_time_us() } // namespace profiling } // namespace nam @@ -140,8 +71,9 @@ inline void print_results() { // No-op macros when profiling is disabled #define NAM_PROFILE_START() ((void)0) -#define NAM_PROFILE_ADD(category) ((void)0) -#define NAM_PROFILE_ADD_NORESTART(category) ((void)0) +#define NAM_PROFILE_ADD(idx) ((void)0) +#define NAM_PROFILE_ADD_NORESTART(idx) ((void)0) +#define NAM_PROFILE_RESTART() ((void)0) namespace nam { namespace profiling { diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp index a955d4e..d9ca43e 100644 --- a/NAM/wavenet.cpp +++ b/NAM/wavenet.cpp @@ -10,6 +10,16 @@ #include "registry.h" #include "wavenet.h" +#ifdef NAM_PROFILING +static int PROF_CONV1D = nam::profiling::register_type("Conv1D"); +static int PROF_INPUT_MIXIN = nam::profiling::register_type("InputMixin"); +static int PROF_LAYER1X1 = nam::profiling::register_type("Layer1x1"); +static int PROF_HEAD1X1 = nam::profiling::register_type("Head1x1"); +static int PROF_RECHANNEL = nam::profiling::register_type("Rechannel"); +static int PROF_ACTIVATION = nam::profiling::register_type("Activation"); +static int PROF_COPIES = nam::profiling::register_type("Copies"); +#endif + // Layer ====================================================================== void nam::wavenet::_Layer::SetMaxBufferSize(const int maxBufferSize) @@ -110,7 +120,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& conv_output = this->_conv.GetOutput(); this->_conv_post_film->Process_(conv_output, condition, num_frames); } - NAM_PROFILE_ADD(conv1d); + NAM_PROFILE_ADD(PROF_CONV1D); if (this->_input_mixin_pre_film) { @@ -127,11 +137,11 @@ void 
nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& input_mixin_output = this->_input_mixin.GetOutput(); this->_input_mixin_post_film->Process_(input_mixin_output, condition, num_frames); } - NAM_PROFILE_ADD(input_mixin); + NAM_PROFILE_ADD(PROF_INPUT_MIXIN); this->_z.leftCols(num_frames).noalias() = _conv.GetOutput().leftCols(num_frames) + _input_mixin.GetOutput().leftCols(num_frames); - NAM_PROFILE_ADD(copies); + NAM_PROFILE_ADD(PROF_COPIES); if (this->_activation_pre_film) { @@ -147,7 +157,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_gating_mode == GatingMode::NONE) { this->_activation->apply(this->_z.leftCols(num_frames)); - NAM_PROFILE_ADD(activation); + NAM_PROFILE_ADD(PROF_ACTIVATION); if (this->_activation_post_film) { this->_activation_post_film->Process_(this->_z, condition, num_frames); @@ -155,7 +165,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z, num_frames); - NAM_PROFILE_ADD(layer1x1); + NAM_PROFILE_ADD(PROF_LAYER1X1); } } else if (this->_gating_mode == GatingMode::GATED) @@ -165,7 +175,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma auto input_block = this->_z.leftCols(num_frames); auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames); this->_gating_activation->apply(input_block, output_block); - NAM_PROFILE_ADD(activation); + NAM_PROFILE_ADD(PROF_ACTIVATION); if (this->_activation_post_film) { // Use Process() for blocks and copy result back @@ -176,7 +186,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames); - NAM_PROFILE_ADD(layer1x1); + NAM_PROFILE_ADD(PROF_LAYER1X1); } } else if (this->_gating_mode == GatingMode::BLENDED) @@ -186,7 +196,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma auto input_block = this->_z.leftCols(num_frames); auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames); this->_blending_activation->apply(input_block, output_block); - NAM_PROFILE_ADD(activation); + NAM_PROFILE_ADD(PROF_ACTIVATION); if (this->_activation_post_film) { // Use Process() for blocks and copy result back @@ -197,7 +207,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames); - NAM_PROFILE_ADD(layer1x1); + NAM_PROFILE_ADD(PROF_LAYER1X1); if (this->_layer1x1_post_film) { Eigen::MatrixXf& layer1x1_output = this->_layer1x1->GetOutput(); @@ -221,7 +231,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& head1x1_output = this->_head1x1->GetOutput(); this->_head1x1_post_film->Process_(head1x1_output, condition, num_frames); } - NAM_PROFILE_ADD(head1x1); + NAM_PROFILE_ADD(PROF_HEAD1X1); this->_output_head.leftCols(num_frames).noalias() = this->_head1x1->GetOutput().leftCols(num_frames); } else // No head 1x1 @@ -245,7 +255,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma // If layer1x1 is inactive, residual connection is just the input (identity) this->_output_next_layer.leftCols(num_frames).noalias() = input.leftCols(num_frames); } - NAM_PROFILE_ADD(copies); + NAM_PROFILE_ADD(PROF_COPIES); } // LayerArray 
================================================================= @@ -319,7 +329,7 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs // Process rechannel and get output this->_rechannel.process_(layer_inputs, num_frames); Eigen::MatrixXf& rechannel_output = _rechannel.GetOutput(); - NAM_PROFILE_ADD(rechannel); + NAM_PROFILE_ADD(PROF_RECHANNEL); // Process layers for (size_t i = 0; i < this->_layers.size(); i++) @@ -348,11 +358,9 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs this->_layers[last_layer].GetOutputNextLayer().leftCols(num_frames); // Process head rechannel -#ifdef NAM_PROFILING - _prof_start = nam::profiling::get_time_us(); // Reset timer for accurate head_rechannel measurement -#endif + NAM_PROFILE_RESTART(); _head_rechannel.process_(this->_head_inputs, num_frames); - NAM_PROFILE_ADD(rechannel); + NAM_PROFILE_ADD(PROF_RECHANNEL); }
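
A note on the ARM timing backend introduced in PATCH 1/7: get_time_us() reads DWT->CYCCNT, but none of these patches starts the counter. On a bare Cortex-M7 the cycle counter is disabled out of reset and must be enabled once at startup; board support layers (e.g. libDaisy on the Daisy platform mentioned in the comment) may already do this, so treat the following as an assumption to verify rather than a required change. A minimal sketch using standard CMSIS register names:

#include "stm32h7xx.h" // CMSIS device header, same as NAM/profiling.cpp uses

// Call once at startup, before any profiling is attempted.
// Assumption: no BSP has already enabled the counter.
static void enable_cycle_counter()
{
  CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk; // power up the DWT/trace block
  DWT->CYCCNT = 0;                                // reset the cycle counter
  DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk;            // start counting CPU cycles
}

Since CYCCNT is 32 bits wide, the derived microsecond clock wraps roughly every 2^32 / 480e6 ≈ 8.9 s at 480 MHz; a START/ADD interval that happens to straddle a wrap records a bogus delta, though the millisecond-scale sections profiled here will almost always land inside one period.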
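
To make the conv1d_memory() accounting from PATCH 3/7 concrete, here is a worked example with illustrative sizes (round numbers, not taken from any real model): in_ch = out_ch = 16, kernel_size = 3, dilation = 64, bias = true, groups = 1, M = 2048.

  weights:  kernel_size * out_ch * in_ch = 3 * 16 * 16 = 768 floats
            bias: out_ch                 =                16 floats
            total                        = 784 floats   = 3,136 bytes

  ring buffer: max_lookback = (3 - 1) * 64 = 128
               in_ch * (2 * 128 + 2048)   = 16 * 2,304  = 36,864 floats
  output:      out_ch * M                 = 16 * 2,048  = 32,768 floats
  buffers total                           = 69,632 floats = 278,528 bytes (272 KB)

At realistic buffer sizes the M-dependent buffers dwarf the ~3 KB of weights, which is why the tool reports the two pools separately and takes --buffer-size as a parameter.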
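
The same exercise for lstm_memory(), with num_layers = 1, input_size = 1, hidden_size = 16, in_channels = out_channels = 1 (again purely illustrative):

  cell:  _w: 4H * (I + H) = 64 * 17 = 1,088 floats
         _b: 4H           =             64 floats
         _xh: I + H       =             17 floats
         _c: H            =             16 floats
  head:  16 + 1           =             17 floats
  weights total           = 1,202 floats = 4,808 bytes

  buffers: _ifgo (4H) + _input (I) + _output (1) = 66 floats = 264 bytes

Nothing here scales with the audio buffer size, which is why lstm_memory() (like linear_memory()) takes no M parameter.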
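
Finally, a minimal self-contained sketch of driving the PATCH 7/7 registry from outside wavenet.cpp, assuming the translation unit is built with -DNAM_PROFILING and linked against NAM/profiling.cpp. PROF_MYBLOCK, the "MyBlock" category name, and DoWork() are hypothetical names for illustration, not identifiers from these patches:

#include "NAM/profiling.h"

#ifdef NAM_PROFILING
// Register once at static-init time; the returned index gives O(1)
// accumulation in the hot path (same pattern as the PROF_* globals in wavenet.cpp).
static int PROF_MYBLOCK = nam::profiling::register_type("MyBlock");
#endif

static void DoWork() // hypothetical stand-in for a real DSP block
{
  volatile float acc = 0.0f;
  for (int i = 0; i < (1 << 20); i++)
    acc = acc + 1e-6f * (float)i;
}

int main()
{
  nam::profiling::reset(); // clear all counters before the benchmark
  for (int i = 0; i < 100; i++)
  {
    NAM_PROFILE_START(); // declares and starts a local timer
    DoWork();
    NAM_PROFILE_ADD(PROF_MYBLOCK); // accumulate elapsed time and restart the timer
  }
  nam::profiling::print_results(); // per-category breakdown with percentages
  return 0;
}

When NAM_PROFILING is not defined, the macros compile to ((void)0) and reset()/print_results() become inline no-ops, so instrumented call sites cost nothing in release builds.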