From 4832cda1b28d09026840255e6c4dfa9e8796a789 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Fri, 6 Feb 2026 09:33:50 -0800 Subject: [PATCH 1/7] Add profiling instrumentation for NAM building blocks Adds a profiling framework (NAM/profiling.h, NAM/profiling.cpp) with NAM_PROFILE_START()/NAM_PROFILE_ADD() macros and 14 timing categories. Supports both desktop (std::chrono) and ARM Cortex-M7 (DWT cycle counter) backends. Profiling is compile-time gated via -DNAM_PROFILING. Instruments wavenet _Layer::Process() and _LayerArray::ProcessInner() with per-category timing, and adds profiling reset/print calls to the benchmodel tool. Co-Authored-By: Claude Opus 4.6 --- NAM/conv1d.cpp | 4 ++ NAM/dsp.cpp | 4 ++ NAM/film.h | 4 ++ NAM/profiling.cpp | 47 +++++++++++++ NAM/profiling.h | 153 +++++++++++++++++++++++++++++++++++++++++++ NAM/wavenet.cpp | 23 +++++++ tools/benchmodel.cpp | 7 ++ 7 files changed, 242 insertions(+) create mode 100644 NAM/profiling.cpp create mode 100644 NAM/profiling.h diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp index 9bbbc02..d440f0c 100644 --- a/NAM/conv1d.cpp +++ b/NAM/conv1d.cpp @@ -1,4 +1,5 @@ #include "conv1d.h" +#include "profiling.h" #include namespace nam @@ -143,6 +144,9 @@ void Conv1D::SetMaxBufferSize(const int maxBufferSize) void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) { + // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp) + // to avoid double-counting when Conv1D is called from within profiled blocks. + // Write input to ring buffer _input_buffer.Write(input, num_frames); diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp index 05dab09..b644af3 100644 --- a/NAM/dsp.cpp +++ b/NAM/dsp.cpp @@ -8,6 +8,7 @@ #include #include "dsp.h" +#include "profiling.h" #include "registry.h" #define tanh_impl_ std::tanh @@ -443,6 +444,9 @@ Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int nu void nam::Conv1x1::process_(const Eigen::Ref& input, const int num_frames) { + // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp) + // to provide meaningful categories (input_mixin, layer1x1, head1x1, rechannel) + // rather than generic conv1x1. assert(num_frames <= _output.cols()); if (this->_is_depthwise) diff --git a/NAM/film.h b/NAM/film.h index f0f86fb..eeb750a 100644 --- a/NAM/film.h +++ b/NAM/film.h @@ -81,9 +81,13 @@ class FiLM assert(num_frames <= condition.cols()); assert(num_frames <= _output.cols()); + // Conv1x1 to compute scale/shift from condition _cond_to_scale_shift.process_(condition, num_frames); const auto& scale_shift = _cond_to_scale_shift.GetOutput(); + // Note: FiLM time is included in the caller's profiling category (e.g., conv1d, input_mixin) + // rather than tracked separately, to avoid double-counting. 
+  const auto scale = scale_shift.topRows(get_input_dim()).leftCols(num_frames);
   if (_do_shift)
   {
diff --git a/NAM/profiling.cpp b/NAM/profiling.cpp
new file mode 100644
index 0000000..65d430e
--- /dev/null
+++ b/NAM/profiling.cpp
@@ -0,0 +1,47 @@
+#include "profiling.h"
+
+#ifdef NAM_PROFILING
+
+#if defined(__ARM_ARCH_7EM__) || defined(ARM_MATH_CM7)
+// ARM Cortex-M7: Use DWT cycle counter for precise timing
+#include "stm32h7xx.h"
+
+namespace nam {
+namespace profiling {
+
+Timings g_timings;
+
+// CPU frequency in MHz (Daisy runs at 480 MHz)
+static constexpr uint32_t CPU_FREQ_MHZ = 480;
+
+uint32_t get_time_us() {
+  // DWT->CYCCNT gives cycle count
+  // Divide by CPU_FREQ_MHZ to get microseconds
+  return DWT->CYCCNT / CPU_FREQ_MHZ;
+}
+
+} // namespace profiling
+} // namespace nam
+
+#else
+// Non-ARM: Use std::chrono for timing (for testing on desktop)
+#include <chrono>
+
+namespace nam {
+namespace profiling {
+
+Timings g_timings;
+
+uint32_t get_time_us() {
+  using namespace std::chrono;
+  static auto start = high_resolution_clock::now();
+  auto now = high_resolution_clock::now();
+  return (uint32_t)duration_cast<microseconds>(now - start).count();
+}
+
+} // namespace profiling
+} // namespace nam
+
+#endif // ARM check
+
+#endif // NAM_PROFILING
diff --git a/NAM/profiling.h b/NAM/profiling.h
new file mode 100644
index 0000000..71031fe
--- /dev/null
+++ b/NAM/profiling.h
@@ -0,0 +1,153 @@
+#pragma once
+
+// Comprehensive profiling for NAM building blocks
+// Enable with -DNAM_PROFILING
+//
+// Usage:
+// 1. Call nam::profiling::reset() before benchmark
+// 2. Run model processing
+// 3. Call nam::profiling::print_results() to display breakdown
+//
+// Categories cover all WaveNet operations including FiLM modulation.
+
+#ifdef NAM_PROFILING
+
+#include <cstdint>
+#include <cstdio>
+
+namespace nam {
+namespace profiling {
+
+// Timing accumulators (in microseconds)
+struct Timings {
+  // Dilated convolution (Conv1D)
+  uint32_t conv1d = 0;
+
+  // Pointwise convolutions (Conv1x1 variants)
+  uint32_t input_mixin = 0; // Input mixing Conv1x1
+  uint32_t layer1x1 = 0; // Layer 1x1 (residual projection)
+  uint32_t head1x1 = 0; // Head 1x1 (skip connection projection)
+  uint32_t rechannel = 0; // Rechannel Conv1x1 (input/output)
+  uint32_t conv1x1 = 0; // Other Conv1x1 (catch-all for non-WaveNet uses)
+
+  // Activation
+  uint32_t activation = 0; // Activation functions (tanh, ReLU, Softsign, etc.)
+ + // FiLM modulation + uint32_t film = 0; // Feature-wise Linear Modulation (scale/shift) + + // Memory operations + uint32_t copies = 0; // Memory copies and additions + uint32_t setzero = 0; // setZero() calls + uint32_t ringbuf = 0; // Ring buffer operations (Write, Read, Advance) + + // Conditioning + uint32_t condition = 0; // Condition DSP processing + + // LSTM (for LSTM models) + uint32_t lstm = 0; // LSTM cell computations + + // Catch-all + uint32_t other = 0; // Everything else + + void reset() { + conv1d = 0; + input_mixin = 0; + layer1x1 = 0; + head1x1 = 0; + rechannel = 0; + conv1x1 = 0; + activation = 0; + film = 0; + copies = 0; + setzero = 0; + ringbuf = 0; + condition = 0; + lstm = 0; + other = 0; + } + + uint32_t total() const { + return conv1d + input_mixin + layer1x1 + head1x1 + rechannel + conv1x1 + activation + film + copies + setzero + ringbuf + condition + lstm + other; + } +}; + +// Global timing accumulator +extern Timings g_timings; + +// Get current time in microseconds (platform-specific) +uint32_t get_time_us(); + +// Reset profiling counters +inline void reset() { g_timings.reset(); } + +// Print profiling results to stdout +inline void print_results() { + const auto& t = g_timings; + uint32_t total = t.total(); + + printf("\nProfiling breakdown:\n"); + printf("%-12s %8s %6s\n", "Category", "Time(ms)", "%%"); + printf("%-12s %8s %6s\n", "--------", "--------", "----"); + + auto print_row = [total](const char* name, uint32_t us) { + if (us > 0 || total == 0) { + uint32_t pct = total > 0 ? (us * 100 / total) : 0; + printf("%-12s %8.1f %5lu%%\n", name, us / 1000.0f, (unsigned long)pct); + } + }; + + print_row("Conv1D", t.conv1d); + print_row("InputMixin", t.input_mixin); + print_row("Layer1x1", t.layer1x1); + print_row("Head1x1", t.head1x1); + print_row("Rechannel", t.rechannel); + print_row("Conv1x1", t.conv1x1); + print_row("Activation", t.activation); + print_row("FiLM", t.film); + print_row("Copies", t.copies); + print_row("SetZero", t.setzero); + print_row("RingBuf", t.ringbuf); + print_row("Condition", t.condition); + print_row("LSTM", t.lstm); + print_row("Other", t.other); + + printf("%-12s %8s %6s\n", "--------", "--------", "----"); + printf("%-12s %8.1f %5s\n", "Total", total / 1000.0f, "100%"); +} + +// Helper macros for timing sections +// Usage: +// NAM_PROFILE_START(); +// // ... code to profile ... 
+// NAM_PROFILE_ADD(conv1d); // Adds elapsed time to conv1d, resets timer + +#define NAM_PROFILE_START() uint32_t _prof_start = nam::profiling::get_time_us() +#define NAM_PROFILE_ADD(category) do { \ + uint32_t _prof_now = nam::profiling::get_time_us(); \ + nam::profiling::g_timings.category += (_prof_now - _prof_start); \ + _prof_start = _prof_now; \ +} while(0) + +// Variant that doesn't reset the timer (for one-shot measurements) +#define NAM_PROFILE_ADD_NORESTART(category) \ + nam::profiling::g_timings.category += (nam::profiling::get_time_us() - _prof_start) + +} // namespace profiling +} // namespace nam + +#else // NAM_PROFILING not defined + +// No-op macros when profiling is disabled +#define NAM_PROFILE_START() ((void)0) +#define NAM_PROFILE_ADD(category) ((void)0) +#define NAM_PROFILE_ADD_NORESTART(category) ((void)0) + +namespace nam { +namespace profiling { + inline void reset() {} + inline void print_results() {} +} // namespace profiling +} // namespace nam + +#endif // NAM_PROFILING diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp index 7d9b5d0..6112169 100644 --- a/NAM/wavenet.cpp +++ b/NAM/wavenet.cpp @@ -6,6 +6,7 @@ #include #include "get_dsp.h" +#include "profiling.h" #include "registry.h" #include "wavenet.h" @@ -89,6 +90,8 @@ void nam::wavenet::_Layer::set_weights_(std::vector::iterator& weights) void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::MatrixXf& condition, const int num_frames) { + NAM_PROFILE_START(); + const long bottleneck = this->_bottleneck; // Use the actual bottleneck value, not the doubled output channels // Step 1: input convolutions @@ -107,6 +110,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& conv_output = this->_conv.GetOutput(); this->_conv_post_film->Process_(conv_output, condition, num_frames); } + NAM_PROFILE_ADD(conv1d); if (this->_input_mixin_pre_film) { @@ -123,8 +127,12 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& input_mixin_output = this->_input_mixin.GetOutput(); this->_input_mixin_post_film->Process_(input_mixin_output, condition, num_frames); } + NAM_PROFILE_ADD(input_mixin); + this->_z.leftCols(num_frames).noalias() = _conv.GetOutput().leftCols(num_frames) + _input_mixin.GetOutput().leftCols(num_frames); + NAM_PROFILE_ADD(copies); + if (this->_activation_pre_film) { this->_activation_pre_film->Process_(this->_z, condition, num_frames); @@ -139,6 +147,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_gating_mode == GatingMode::NONE) { this->_activation->apply(this->_z.leftCols(num_frames)); + NAM_PROFILE_ADD(activation); if (this->_activation_post_film) { this->_activation_post_film->Process_(this->_z, condition, num_frames); @@ -146,6 +155,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z, num_frames); + NAM_PROFILE_ADD(layer1x1); } } else if (this->_gating_mode == GatingMode::GATED) @@ -155,6 +165,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma auto input_block = this->_z.leftCols(num_frames); auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames); this->_gating_activation->apply(input_block, output_block); + NAM_PROFILE_ADD(activation); if (this->_activation_post_film) { // Use Process() for blocks and copy result back @@ -165,6 +176,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, 
const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames); + NAM_PROFILE_ADD(layer1x1); } } else if (this->_gating_mode == GatingMode::BLENDED) @@ -174,6 +186,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma auto input_block = this->_z.leftCols(num_frames); auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames); this->_blending_activation->apply(input_block, output_block); + NAM_PROFILE_ADD(activation); if (this->_activation_post_film) { // Use Process() for blocks and copy result back @@ -184,6 +197,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames); + NAM_PROFILE_ADD(layer1x1); if (this->_layer1x1_post_film) { Eigen::MatrixXf& layer1x1_output = this->_layer1x1->GetOutput(); @@ -207,6 +221,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& head1x1_output = this->_head1x1->GetOutput(); this->_head1x1_post_film->Process_(head1x1_output, condition, num_frames); } + NAM_PROFILE_ADD(head1x1); this->_output_head.leftCols(num_frames).noalias() = this->_head1x1->GetOutput().leftCols(num_frames); } else // No head 1x1 @@ -230,6 +245,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma // If layer1x1 is inactive, residual connection is just the input (identity) this->_output_next_layer.leftCols(num_frames).noalias() = input.leftCols(num_frames); } + NAM_PROFILE_ADD(copies); } // LayerArray ================================================================= @@ -298,9 +314,12 @@ void nam::wavenet::_LayerArray::Process(const Eigen::MatrixXf& layer_inputs, con void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs, const Eigen::MatrixXf& condition, const int num_frames) { + NAM_PROFILE_START(); + // Process rechannel and get output this->_rechannel.process_(layer_inputs, num_frames); Eigen::MatrixXf& rechannel_output = _rechannel.GetOutput(); + NAM_PROFILE_ADD(rechannel); // Process layers for (size_t i = 0; i < this->_layers.size(); i++) @@ -329,7 +348,11 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs this->_layers[last_layer].GetOutputNextLayer().leftCols(num_frames); // Process head rechannel +#ifdef NAM_PROFILING + _prof_start = nam::profiling::get_time_us(); // Reset timer for accurate head_rechannel measurement +#endif _head_rechannel.process_(this->_head_inputs, num_frames); + NAM_PROFILE_ADD(rechannel); } diff --git a/tools/benchmodel.cpp b/tools/benchmodel.cpp index 39c14b0..42556f5 100644 --- a/tools/benchmodel.cpp +++ b/tools/benchmodel.cpp @@ -4,6 +4,7 @@ #include "NAM/dsp.h" #include "NAM/get_dsp.h" +#include "NAM/profiling.h" using std::chrono::duration; using std::chrono::duration_cast; @@ -62,6 +63,9 @@ int main(int argc, char* argv[]) outputPtrs[ch] = outputBuffers[ch].data(); } + // Reset profiling counters before benchmark + nam::profiling::reset(); + std::cout << "Running benchmark\n"; auto t1 = high_resolution_clock::now(); for (size_t i = 0; i < numBuffers; i++) @@ -80,6 +84,9 @@ int main(int argc, char* argv[]) std::cout << ms_int.count() << "ms\n"; std::cout << ms_double.count() << "ms\n"; + + // Print profiling breakdown if enabled + nam::profiling::print_results(); } else { From 5c535246e531c7721c449b6c3c60871cbee92831 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Fri, 6 Feb 2026 
10:20:55 -0800 Subject: [PATCH 2/7] Fixed build flags for benchmodel --- tools/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 8118e08..94adfce 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -16,6 +16,8 @@ add_executable(run_tests run_tests.cpp test/allocation_tracking.cpp ${NAM_SOURCE # Compile run_tests without optimizations to ensure allocation tracking works correctly # Also ensure assertions are enabled (NDEBUG is not defined) so tests actually run set_target_properties(run_tests PROPERTIES COMPILE_OPTIONS "-O0") +# Benchmodel should be built with NAM_PROFILING set +target_compile_definitions(benchmodel PRIVATE NAM_PROFILING) # Ensure assertions are enabled for run_tests by removing NDEBUG if it was set # Release/RelWithDebInfo/MinSizeRel build types automatically define NDEBUG # We use a compile option to undefine it, which works on GCC, Clang, and MSVC @@ -61,4 +63,4 @@ endif() # /Users/steve/src/NeuralAmpModelerCore/Dependencies/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h # Don't let this break my build on debug: set_source_files_properties(../NAM/dsp.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") -set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") \ No newline at end of file +set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") From 84deb8ab6a17c3dbb77e7efe32716c06d1b2db84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Mon, 9 Feb 2026 10:31:07 -0800 Subject: [PATCH 3/7] Added a command line tool to output memory usage for a given .nam file --- tools/CMakeLists.txt | 2 + tools/memory_usage.cpp | 611 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 613 insertions(+) create mode 100644 tools/memory_usage.cpp diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 94adfce..8f02f20 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -12,6 +12,7 @@ include_directories(tools ${NAM_DEPS_PATH}/nlohmann) add_executable(loadmodel loadmodel.cpp ${NAM_SOURCES}) add_executable(benchmodel benchmodel.cpp ${NAM_SOURCES}) +add_executable(memory_usage memory_usage.cpp) add_executable(run_tests run_tests.cpp test/allocation_tracking.cpp ${NAM_SOURCES}) # Compile run_tests without optimizations to ensure allocation tracking works correctly # Also ensure assertions are enabled (NDEBUG is not defined) so tests actually run @@ -34,6 +35,7 @@ endif() source_group(NAM ${CMAKE_CURRENT_SOURCE_DIR} FILES ${NAM_SOURCES}) target_compile_features(${TOOLS} PUBLIC cxx_std_20) +target_compile_features(memory_usage PUBLIC cxx_std_20) set_target_properties(${TOOLS} PROPERTIES diff --git a/tools/memory_usage.cpp b/tools/memory_usage.cpp new file mode 100644 index 0000000..853ca8f --- /dev/null +++ b/tools/memory_usage.cpp @@ -0,0 +1,611 @@ +// memory_usage.cpp — Report total memory required to host a NAM model at runtime. +// +// Usage: memory_usage [--buffer-size N] +// +// Parses the .nam JSON config and computes weight memory (learned parameters stored +// in Eigen matrices/vectors) and buffer memory (intermediate computation/state that +// depends on maxBufferSize) without instantiating the model. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "json.hpp" + +using json = nlohmann::json; + +static constexpr int DEFAULT_BUFFER_SIZE = 2048; +static constexpr long INPUT_BUFFER_SAFETY_FACTOR = 32; + +// ─── Result accumulator ───────────────────────────────────────────────────── + +struct MemoryResult +{ + size_t weight_bytes = 0; + size_t buffer_bytes = 0; + + void add_weights(size_t floats) { weight_bytes += floats * sizeof(float); } + void add_buffers(size_t floats) { buffer_bytes += floats * sizeof(float); } + + MemoryResult& operator+=(const MemoryResult& o) + { + weight_bytes += o.weight_bytes; + buffer_bytes += o.buffer_bytes; + return *this; + } +}; + +// ─── Conv1x1 ──────────────────────────────────────────────────────────────── + +// Conv1x1 stores either a full (out_channels x in_channels) matrix (possibly +// block-diagonal when grouped), or a depthwise weight vector when groups == +// in_channels == out_channels. +static MemoryResult conv1x1_memory(int in_ch, int out_ch, bool bias, int groups, int M) +{ + MemoryResult r; + bool depthwise = (groups == in_ch && in_ch == out_ch); + if (depthwise) + r.add_weights(in_ch); // _depthwise_weight(in_ch) + else + r.add_weights((size_t)out_ch * in_ch); // _weight(out_ch, in_ch) + if (bias) + r.add_weights(out_ch); // _bias(out_ch) + r.add_buffers((size_t)out_ch * M); // _output(out_ch, M) + return r; +} + +// ─── Conv1D ───────────────────────────────────────────────────────────────── + +// Conv1D stores kernel_size weight matrices (each out_ch x in_ch) or depthwise +// vectors, plus a bias vector, a ring buffer, and an output buffer. +static MemoryResult conv1d_memory(int in_ch, int out_ch, int kernel_size, bool bias, int dilation, int groups, int M) +{ + MemoryResult r; + bool depthwise = (groups == in_ch && in_ch == out_ch); + if (depthwise) + r.add_weights((size_t)kernel_size * in_ch); // _depthwise_weight[k](in_ch) + else + r.add_weights((size_t)kernel_size * out_ch * in_ch); // _weight[k](out_ch, in_ch) + if (bias) + r.add_weights(out_ch); // _bias(out_ch) + + // Ring buffer: storage = (in_ch, 2 * max_lookback + M) + // max_lookback = (kernel_size - 1) * dilation + long max_lookback = (kernel_size > 0) ? (long)(kernel_size - 1) * dilation : 0; + long ring_storage = 2 * max_lookback + M; + r.add_buffers((size_t)in_ch * ring_storage); // _input_buffer._storage + + // Output buffer: (out_ch, M) + r.add_buffers((size_t)out_ch * M); // _output + + return r; +} + +// ─── FiLM ─────────────────────────────────────────────────────────────────── + +struct FiLMParams +{ + bool active = false; + bool shift = true; + int groups = 1; +}; + +static MemoryResult film_memory(int condition_dim, int input_dim, const FiLMParams& fp, int M) +{ + if (!fp.active) + return {}; + MemoryResult r; + int scale_shift_dim = fp.shift ? 2 * input_dim : input_dim; + // _cond_to_scale_shift is a Conv1x1(condition_dim -> scale_shift_dim, bias=true, groups) + r += conv1x1_memory(condition_dim, scale_shift_dim, true, fp.groups, M); + // _output(input_dim, M) + r.add_buffers((size_t)input_dim * M); + return r; +} + +// ─── BatchNorm ────────────────────────────────────────────────────────────── + +static MemoryResult batchnorm_memory(int dim) +{ + MemoryResult r; + // Stores scale(dim) + loc(dim) derived from running_mean, running_var, weight, bias, eps + // The source values are consumed from weights array; only scale + loc are stored at runtime. 
+ r.add_weights(2 * (size_t)dim); + return r; +} + +// ─── LSTM ─────────────────────────────────────────────────────────────────── + +static MemoryResult lstm_memory(const json& config) +{ + MemoryResult r; + int num_layers = config["num_layers"]; + int input_size = config["input_size"]; + int hidden_size = config["hidden_size"]; + int in_channels = config.value("in_channels", 1); + int out_channels = config.value("out_channels", 1); + + for (int i = 0; i < num_layers; i++) + { + int cell_input = (i == 0) ? input_size : hidden_size; + // _w(4*H, I+H) + r.add_weights((size_t)4 * hidden_size * (cell_input + hidden_size)); + // _b(4*H) + r.add_weights(4 * (size_t)hidden_size); + // _xh(I+H) — stores initial hidden state in the hidden portion + r.add_weights((size_t)(cell_input + hidden_size)); + // _c(H) — initial cell state + r.add_weights((size_t)hidden_size); + + // Buffers: _ifgo(4*H) + r.add_buffers(4 * (size_t)hidden_size); + // Note: _xh and _c are also modified during inference but they are + // loaded from weights (initial state), so counted as weights above. + } + + // _head_weight(out_channels, hidden_size) + r.add_weights((size_t)out_channels * hidden_size); + // _head_bias(out_channels) + r.add_weights(out_channels); + + // Top-level buffers: _input(input_size), _output(out_channels) + r.add_buffers(input_size); + r.add_buffers(out_channels); + + return r; +} + +// ─── Linear ───────────────────────────────────────────────────────────────── + +static MemoryResult linear_memory(const json& config) +{ + MemoryResult r; + int receptive_field = config["receptive_field"]; + bool bias = config["bias"]; + int in_channels = config.value("in_channels", 1); + int out_channels = config.value("out_channels", 1); + + // _weight(receptive_field) + r.add_weights(receptive_field); + // _bias (scalar float) + if (bias) + r.add_weights(1); + + // Buffer base: _input_buffers = in_channels vectors of (32 * receptive_field) + r.add_buffers((size_t)in_channels * INPUT_BUFFER_SAFETY_FACTOR * receptive_field); + // _output_buffers: resized per-call, not pre-allocated to a fixed size + // (depends on num_frames, not maxBufferSize) + + return r; +} + +// ─── ConvNet ──────────────────────────────────────────────────────────────── + +static MemoryResult convnet_memory(const json& config, int M) +{ + MemoryResult r; + int channels = config["channels"]; + std::vector dilations = config["dilations"]; + bool batchnorm = config["batchnorm"]; + int groups = config.value("groups", 1); + int in_channels = config.value("in_channels", 1); + int out_channels = config.value("out_channels", 1); + + int max_dilation = *std::max_element(dilations.begin(), dilations.end()); + + // Buffer base class: _input_buffers = in_channels * (32 * max_dilation) + int receptive_field = max_dilation; // passed to Buffer as receptive_field + r.add_buffers((size_t)in_channels * INPUT_BUFFER_SAFETY_FACTOR * receptive_field); + + // ConvNet blocks + for (size_t i = 0; i < dilations.size(); i++) + { + int block_in = (i == 0) ? 
in_channels : channels; + int block_out = channels; + // Conv1D with kernel_size=2, bias=!batchnorm + r += conv1d_memory(block_in, block_out, 2, !batchnorm, dilations[i], groups, M); + // Optional batchnorm + if (batchnorm) + r += batchnorm_memory(block_out); + // _output(out_channels, M) per block + r.add_buffers((size_t)block_out * M); + } + + // _block_vals: 1 entry of (channels, buffer_size) + // buffer_size = input_buffers[0].size() = 32 * receptive_field + long buffer_size = INPUT_BUFFER_SAFETY_FACTOR * receptive_field; + r.add_buffers((size_t)channels * buffer_size); + + // _head: weight(out_channels, channels) + bias(out_channels) + r.add_weights((size_t)out_channels * channels); + r.add_weights(out_channels); + + // _head_output is resized per-call, not a fixed pre-allocation + + return r; +} + +// ─── WaveNet helpers ──────────────────────────────────────────────────────── + +static FiLMParams parse_film_params(const json& layer_config, const std::string& key) +{ + FiLMParams fp; + if (layer_config.find(key) == layer_config.end() || layer_config[key] == false) + return fp; // inactive + const json& fc = layer_config[key]; + fp.active = fc.value("active", true); + fp.shift = fc.value("shift", true); + fp.groups = fc.value("groups", 1); + return fp; +} + +enum class GatingMode +{ + NONE, + GATED, + BLENDED +}; + +static std::vector parse_gating_modes(const json& layer_config, size_t num_layers) +{ + std::vector modes; + + auto parse_str = [](const std::string& s) -> GatingMode { + if (s == "gated") + return GatingMode::GATED; + if (s == "blended") + return GatingMode::BLENDED; + return GatingMode::NONE; + }; + + if (layer_config.find("gating_mode") != layer_config.end()) + { + if (layer_config["gating_mode"].is_array()) + { + for (const auto& gm : layer_config["gating_mode"]) + modes.push_back(parse_str(gm.get())); + } + else + { + GatingMode mode = parse_str(layer_config["gating_mode"].get()); + modes.resize(num_layers, mode); + } + } + else if (layer_config.find("gated") != layer_config.end()) + { + bool gated = layer_config["gated"]; + modes.resize(num_layers, gated ? GatingMode::GATED : GatingMode::NONE); + } + else + { + modes.resize(num_layers, GatingMode::NONE); + } + return modes; +} + +// WaveNet _Layer memory +static MemoryResult wavenet_layer_memory(int condition_size, int channels, int bottleneck, int kernel_size, int dilation, + GatingMode gating_mode, int groups_input, int groups_input_mixin, + bool layer1x1_active, int layer1x1_groups, bool head1x1_active, + int head1x1_out_channels, int head1x1_groups, const FiLMParams& conv_pre_film, + const FiLMParams& conv_post_film, const FiLMParams& input_mixin_pre_film, + const FiLMParams& input_mixin_post_film, + const FiLMParams& activation_pre_film, + const FiLMParams& activation_post_film, + const FiLMParams& layer1x1_post_film, const FiLMParams& head1x1_post_film, + int M) +{ + MemoryResult r; + bool gated = (gating_mode != GatingMode::NONE); + int conv_out = gated ? 
2 * bottleneck : bottleneck; + + // _conv: Conv1D(channels -> conv_out, kernel_size, bias=true, dilation, groups_input) + r += conv1d_memory(channels, conv_out, kernel_size, true, dilation, groups_input, M); + + // _input_mixin: Conv1x1(condition_size -> conv_out, bias=false, groups_input_mixin) + r += conv1x1_memory(condition_size, conv_out, false, groups_input_mixin, M); + + // _layer1x1 (optional): Conv1x1(bottleneck -> channels, bias=true, layer1x1_groups) + if (layer1x1_active) + r += conv1x1_memory(bottleneck, channels, true, layer1x1_groups, M); + + // _head1x1 (optional): Conv1x1(bottleneck -> head1x1_out_channels, bias=true, head1x1_groups) + if (head1x1_active) + r += conv1x1_memory(bottleneck, head1x1_out_channels, true, head1x1_groups, M); + + // Buffers: _z(conv_out, M) + r.add_buffers((size_t)conv_out * M); + // _output_next_layer(channels, M) + r.add_buffers((size_t)channels * M); + // _output_head: if head1x1 active -> (head1x1_out_channels, M), else (bottleneck, M) + int head_out = head1x1_active ? head1x1_out_channels : bottleneck; + r.add_buffers((size_t)head_out * M); + + // FiLM modules (up to 8) + r += film_memory(condition_size, channels, conv_pre_film, M); + r += film_memory(condition_size, conv_out, conv_post_film, M); + r += film_memory(condition_size, condition_size, input_mixin_pre_film, M); + r += film_memory(condition_size, conv_out, input_mixin_post_film, M); + r += film_memory(condition_size, conv_out, activation_pre_film, M); + r += film_memory(condition_size, bottleneck, activation_post_film, M); + if (layer1x1_active) + r += film_memory(condition_size, channels, layer1x1_post_film, M); + if (head1x1_active) + r += film_memory(condition_size, head1x1_out_channels, head1x1_post_film, M); + + return r; +} + +// WaveNet _LayerArray memory +static MemoryResult wavenet_layer_array_memory(const json& layer_config, int M) +{ + MemoryResult r; + int input_size = layer_config["input_size"]; + int condition_size = layer_config["condition_size"]; + int head_size = layer_config["head_size"]; + int channels = layer_config["channels"]; + int bottleneck = layer_config.value("bottleneck", channels); + int kernel_size = layer_config["kernel_size"]; + std::vector dilations = layer_config["dilations"]; + size_t num_layers = dilations.size(); + bool head_bias = layer_config["head_bias"]; + + int groups_input = layer_config.value("groups_input", 1); + int groups_input_mixin = layer_config.value("groups_input_mixin", 1); + + // layer1x1 params + bool layer1x1_active = true; + int layer1x1_groups = 1; + if (layer_config.find("layer1x1") != layer_config.end()) + { + layer1x1_active = layer_config["layer1x1"]["active"]; + layer1x1_groups = layer_config["layer1x1"]["groups"]; + } + + // head1x1 params + bool head1x1_active = false; + int head1x1_out_channels = channels; + int head1x1_groups = 1; + if (layer_config.find("head1x1") != layer_config.end()) + { + head1x1_active = layer_config["head1x1"]["active"]; + head1x1_out_channels = layer_config["head1x1"]["out_channels"]; + head1x1_groups = layer_config["head1x1"]["groups"]; + } + + // Gating modes + std::vector gating_modes = parse_gating_modes(layer_config, num_layers); + + // FiLM params + FiLMParams conv_pre = parse_film_params(layer_config, "conv_pre_film"); + FiLMParams conv_post = parse_film_params(layer_config, "conv_post_film"); + FiLMParams input_mixin_pre = parse_film_params(layer_config, "input_mixin_pre_film"); + FiLMParams input_mixin_post = parse_film_params(layer_config, "input_mixin_post_film"); + FiLMParams 
activation_pre = parse_film_params(layer_config, "activation_pre_film"); + FiLMParams activation_post = parse_film_params(layer_config, "activation_post_film"); + FiLMParams layer1x1_post = parse_film_params(layer_config, "layer1x1_post_film"); + FiLMParams head1x1_post = parse_film_params(layer_config, "head1x1_post_film"); + + // _rechannel: Conv1x1(input_size -> channels, bias=false) + r += conv1x1_memory(input_size, channels, false, 1, M); + + // Per-layer + for (size_t i = 0; i < num_layers; i++) + { + r += wavenet_layer_memory(condition_size, channels, bottleneck, kernel_size, dilations[i], gating_modes[i], + groups_input, groups_input_mixin, layer1x1_active, layer1x1_groups, head1x1_active, + head1x1_out_channels, head1x1_groups, conv_pre, conv_post, input_mixin_pre, + input_mixin_post, activation_pre, activation_post, layer1x1_post, head1x1_post, M); + } + + // _head_rechannel: Conv1x1(head_output_size -> head_size, bias=head_bias) + int head_output_size = head1x1_active ? head1x1_out_channels : bottleneck; + r += conv1x1_memory(head_output_size, head_size, head_bias, 1, M); + + // Buffers: _layer_outputs(channels, M) + r.add_buffers((size_t)channels * M); + // _head_inputs(head_output_size, M) + r.add_buffers((size_t)head_output_size * M); + + return r; +} + +// Forward declaration for recursive condition_dsp +static MemoryResult compute_memory(const std::string& architecture, const json& config, int M); + +// WaveNet top-level memory +static MemoryResult wavenet_memory(const json& config, int M) +{ + MemoryResult r; + int in_channels = config.value("in_channels", 1); + + // condition_dim = in_channels (from _get_condition_dim()) + int condition_dim = in_channels; + + // Recursive condition_dsp + bool has_condition_dsp = false; + int condition_output_channels = condition_dim; + if (config.find("condition_dsp") != config.end()) + { + has_condition_dsp = true; + const json& cdsp = config["condition_dsp"]; + std::string cdsp_arch = cdsp["architecture"]; + json cdsp_config = cdsp["config"]; + r += compute_memory(cdsp_arch, cdsp_config, M); + // condition_output_channels comes from the condition_dsp's output + // For now, we use condition_size from first layer as a proxy + // (the actual model validates this match) + if (config.find("layers") != config.end() && config["layers"].size() > 0) + condition_output_channels = config["layers"][0]["condition_size"]; + } + + // _condition_input(condition_dim, M) + r.add_buffers((size_t)condition_dim * M); + + // _condition_output + if (!has_condition_dsp) + { + // _condition_output(condition_dim, M) + r.add_buffers((size_t)condition_dim * M); + } + else + { + // _condition_output(condition_output_channels, M) + r.add_buffers((size_t)condition_output_channels * M); + // _condition_dsp_input_buffers: condition_dim vectors of M doubles/floats + // These are std::vector> where NAM_SAMPLE is double + r.add_buffers((size_t)condition_dim * M * (sizeof(double) / sizeof(float))); + // _condition_dsp_output_buffers: condition_output_channels vectors of M doubles + r.add_buffers((size_t)condition_output_channels * M * (sizeof(double) / sizeof(float))); + // Pointer arrays are negligible + } + + // Layer arrays + for (const auto& layer_config : config["layers"]) + r += wavenet_layer_array_memory(layer_config, M); + + // _head_scale (1 float) — it's a weight + r.add_weights(1); + + return r; +} + +// ─── Dispatch ─────────────────────────────────────────────────────────────── + +static MemoryResult compute_memory(const std::string& architecture, const json& 
config, int M) +{ + if (architecture == "WaveNet") + return wavenet_memory(config, M); + if (architecture == "LSTM") + return lstm_memory(config); + if (architecture == "ConvNet") + return convnet_memory(config, M); + if (architecture == "Linear") + return linear_memory(config); + throw std::runtime_error("Unknown architecture: " + architecture); +} + +// ─── Formatting helpers ───────────────────────────────────────────────────── + +static std::string format_bytes(size_t bytes) +{ + char buf[64]; + if (bytes < 1024) + snprintf(buf, sizeof(buf), "%zu bytes", bytes); + else if (bytes < 1024 * 1024) + snprintf(buf, sizeof(buf), "%.2f KB", bytes / 1024.0); + else + snprintf(buf, sizeof(buf), "%.2f MB", bytes / (1024.0 * 1024.0)); + return buf; +} + +static std::string format_with_commas(size_t n) +{ + std::string s = std::to_string(n); + int insert_pos = (int)s.length() - 3; + while (insert_pos > 0) + { + s.insert(insert_pos, ","); + insert_pos -= 3; + } + return s; +} + +// ─── Main ─────────────────────────────────────────────────────────────────── + +int main(int argc, char* argv[]) +{ + if (argc < 2) + { + fprintf(stderr, "Usage: memory_usage [--buffer-size N]\n"); + return 1; + } + + const char* model_path = argv[1]; + int buffer_size = DEFAULT_BUFFER_SIZE; + + for (int i = 2; i < argc; i++) + { + if (strcmp(argv[i], "--buffer-size") == 0 && i + 1 < argc) + { + buffer_size = atoi(argv[++i]); + if (buffer_size <= 0) + { + fprintf(stderr, "Error: buffer size must be positive\n"); + return 1; + } + } + else + { + fprintf(stderr, "Unknown option: %s\n", argv[i]); + return 1; + } + } + + // Read and parse JSON + std::ifstream file(model_path); + if (!file.is_open()) + { + fprintf(stderr, "Error: cannot open %s\n", model_path); + return 1; + } + + json j; + try + { + file >> j; + } + catch (const std::exception& e) + { + fprintf(stderr, "Error parsing JSON: %s\n", e.what()); + return 1; + } + + std::string architecture = j["architecture"]; + json config = j["config"]; + + // Cross-check: count weights in JSON + size_t json_weight_count = 0; + if (j.find("weights") != j.end()) + json_weight_count = j["weights"].size(); + + double sample_rate = -1.0; + if (j.find("sample_rate") != j.end()) + sample_rate = j["sample_rate"]; + + try + { + MemoryResult result = compute_memory(architecture, config, buffer_size); + size_t total = result.weight_bytes + result.buffer_bytes; + + printf("Model: %s\n", model_path); + printf("Architecture: %s\n", architecture.c_str()); + if (sample_rate > 0) + printf("Sample rate: %.0f Hz\n", sample_rate); + printf("\n"); + printf("Weights: %s bytes (%s)\n", format_with_commas(result.weight_bytes).c_str(), + format_bytes(result.weight_bytes).c_str()); + printf("Buffers: %s bytes (%s) [buffer size: %d]\n", format_with_commas(result.buffer_bytes).c_str(), + format_bytes(result.buffer_bytes).c_str(), buffer_size); + printf("Total: %s bytes (%s)\n", format_with_commas(total).c_str(), format_bytes(total).c_str()); + + if (json_weight_count > 0) + { + printf("\nJSON weights: %zu values (%s bytes)\n", json_weight_count, + format_with_commas(json_weight_count * sizeof(float)).c_str()); + } + } + catch (const std::exception& e) + { + fprintf(stderr, "Error computing memory: %s\n", e.what()); + return 1; + } + + return 0; +} From 725c8ca814d491c0709f2c129df807e9e7356c59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= <5733+jfsantos@users.noreply.github.com> Date: Wed, 11 Feb 2026 16:42:43 -0800 Subject: [PATCH 4/7] Bugfix - checking that condition_dsp is not 
null in the JSON (#220) --- NAM/wavenet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp index 6112169..a955d4e 100644 --- a/NAM/wavenet.cpp +++ b/NAM/wavenet.cpp @@ -596,7 +596,7 @@ std::unique_ptr nam::wavenet::Factory(const nlohmann::json& config, st const double expectedSampleRate) { std::unique_ptr condition_dsp = nullptr; - if (config.find("condition_dsp") != config.end()) + if (config.find("condition_dsp") != config.end() && !config["condition_dsp"].is_null()) { const nlohmann::json& condition_dsp_json = config["condition_dsp"]; condition_dsp = nam::get_dsp(condition_dsp_json); From c95605599a100ff68aaba8af014a3124c4bfbfb9 Mon Sep 17 00:00:00 2001 From: Steven Atkinson Date: Wed, 11 Feb 2026 19:15:32 -0800 Subject: [PATCH 5/7] [BUGFIX, BREAKING] Make activation base class abstract, fix PReLU implementation (#223) * Make activation apply method pure virtual instead of no-op default * Fix bugs * Refactor to throw std::invalid_argument in debug mode, add tests --- NAM/activations.h | 35 ++++++++++++++++--- tools/run_tests.cpp | 5 +-- tools/test/test_activations.cpp | 61 +++++++++++++++++++++++++++++---- 3 files changed, 89 insertions(+), 12 deletions(-) diff --git a/NAM/activations.h b/NAM/activations.h index 68d5025..a05c456 100644 --- a/NAM/activations.h +++ b/NAM/activations.h @@ -2,6 +2,8 @@ #include #include // expf +#include // std::cerr (kept for potential debug use) +#include // std::invalid_argument #include #include #include @@ -150,7 +152,7 @@ class Activation { apply(block.data(), block.rows() * block.cols()); } - virtual void apply(float* data, long size) {} + virtual void apply(float* data, long size) = 0; static Ptr get_activation(const std::string name); static Ptr get_activation(const ActivationConfig& config); @@ -165,13 +167,13 @@ class Activation static std::unordered_map _activations; }; -// identity function activation +// identity function activation--"do nothing" class ActivationIdentity : public nam::activations::Activation { public: ActivationIdentity() = default; ~ActivationIdentity() = default; - // Inherit the default apply methods which do nothing + virtual void apply(float* data, long size) override {}; }; class ActivationTanh : public Activation @@ -276,6 +278,24 @@ class ActivationPReLU : public Activation } ActivationPReLU(std::vector ns) { negative_slopes = ns; } + void apply(float* data, long size) override + { + // Assume column-major (this is brittle) +#ifndef NDEBUG + if (size % negative_slopes.size() != 0) + { + throw std::invalid_argument("PReLU.apply(*data, size) was given an array of size " + std::to_string(size) + + " but the activation has " + std::to_string(negative_slopes.size()) + + " channels, which doesn't divide evenly."); + } +#endif + for (long pos = 0; pos < size; pos++) + { + const float negative_slope = negative_slopes[pos % negative_slopes.size()]; + data[pos] = leaky_relu(data[pos], negative_slope); + } + } + void apply(Eigen::MatrixXf& matrix) override { // Matrix is organized as (channels, time_steps) @@ -285,7 +305,14 @@ class ActivationPReLU : public Activation std::vector slopes_for_channels = negative_slopes; // Fail loudly if input has more channels than activation - assert(actual_channels == negative_slopes.size()); +#ifndef NDEBUG + if (actual_channels != negative_slopes.size()) + { + throw std::invalid_argument("PReLU: Received " + std::to_string(actual_channels) + + " channels, but activation has " + std::to_string(negative_slopes.size()) + + " channels"); + } 
+#endif // Apply each negative slope to its corresponding channel for (unsigned long channel = 0; channel < actual_channels; channel++) diff --git a/tools/run_tests.cpp b/tools/run_tests.cpp index 9b3bdec..1c8c34a 100644 --- a/tools/run_tests.cpp +++ b/tools/run_tests.cpp @@ -48,8 +48,9 @@ int main() test_activations::TestPReLU::test_core_function(); test_activations::TestPReLU::test_per_channel_behavior(); - // This is enforced by an assert so it doesn't need to be tested - // test_activations::TestPReLU::test_wrong_number_of_channels(); + test_activations::TestPReLU::test_wrong_number_of_channels_matrix(); + test_activations::TestPReLU::test_wrong_size_array(); + test_activations::TestPReLU::test_valid_array_size(); // Typed ActivationConfig tests test_activations::TestTypedActivationConfig::test_simple_config(); diff --git a/tools/test/test_activations.cpp b/tools/test/test_activations.cpp index a8dd705..55d61ff 100644 --- a/tools/test/test_activations.cpp +++ b/tools/test/test_activations.cpp @@ -220,9 +220,10 @@ class TestPReLU assert(fabs(data(1, 2) - 0.0f) < 1e-6); // 0.0 (unchanged) } - static void test_wrong_number_of_channels() + static void test_wrong_number_of_channels_matrix() { - // Test that we fail when we have more channels than slopes + // Test that we fail when matrix has more channels than slopes + // Note: This validation only runs in debug builds (#ifndef NDEBUG) Eigen::MatrixXf data(3, 2); // 3 channels, 2 time steps // Initialize with test data @@ -232,21 +233,69 @@ class TestPReLU std::vector slopes = {0.01f, 0.05f}; nam::activations::ActivationPReLU prelu(slopes); - // Apply the activation +#ifndef NDEBUG + // In debug mode, this should throw std::invalid_argument bool caught = false; try { prelu.apply(data); } - catch (const std::runtime_error& e) + catch (const std::invalid_argument& e) { caught = true; } - catch (...) 
+ assert(caught && "Expected std::invalid_argument for channel count mismatch"); +#endif + } + + static void test_wrong_size_array() + { + // Test that we fail when array size doesn't divide evenly by channel count + // Note: This validation only runs in debug builds (#ifndef NDEBUG) + + // Create PReLU with 2 channels + std::vector slopes = {0.01f, 0.05f}; + nam::activations::ActivationPReLU prelu(slopes); + + // Array of size 5 doesn't divide evenly by 2 channels + std::vector data = {-1.0f, -2.0f, 0.5f, 1.0f, -0.5f}; + +#ifndef NDEBUG + // In debug mode, this should throw std::invalid_argument + bool caught = false; + try + { + prelu.apply(data.data(), (long)data.size()); + } + catch (const std::invalid_argument& e) { + caught = true; } + assert(caught && "Expected std::invalid_argument for array size mismatch"); +#endif + } + + static void test_valid_array_size() + { + // Test that valid array sizes work correctly + + // Create PReLU with 2 channels + std::vector slopes = {0.1f, 0.2f}; + nam::activations::ActivationPReLU prelu(slopes); + + // Array of size 6 divides evenly by 2 channels (3 time steps per channel) + std::vector data = {-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f}; + + // Should not throw + prelu.apply(data.data(), (long)data.size()); - assert(caught); + // Verify results: alternating between slope 0.1 and 0.2 + assert(fabs(data[0] - (-0.1f)) < 1e-6); // channel 0, slope 0.1 + assert(fabs(data[1] - (-0.2f)) < 1e-6); // channel 1, slope 0.2 + assert(fabs(data[2] - (-0.1f)) < 1e-6); // channel 0, slope 0.1 + assert(fabs(data[3] - (-0.2f)) < 1e-6); // channel 1, slope 0.2 + assert(fabs(data[4] - (-0.1f)) < 1e-6); // channel 0, slope 0.1 + assert(fabs(data[5] - (-0.2f)) < 1e-6); // channel 1, slope 0.2 } }; From d499f74fb75711641dfc6207ba5931902f58661f Mon Sep 17 00:00:00 2001 From: Steven Atkinson Date: Wed, 11 Feb 2026 19:46:05 -0800 Subject: [PATCH 6/7] Add TONE3000 support note in README.md (#224) --- README.md | 9 +++++++++ media/tone3000-logo.svg | 31 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 media/tone3000-logo.svg diff --git a/README.md b/README.md index 6e5aeda..f28bc97 100644 --- a/README.md +++ b/README.md @@ -12,3 +12,12 @@ You should be able to run it locally to test if you'd like. ## Sharp edges This library uses [Eigen](http://eigen.tuxfamily.org) to do the linear algebra routines that its neural networks require. Since these models hold their parameters as eigen object members, there is a risk with certain compilers and compiler optimizations that their memory is not aligned properly. This can be worked around by providing two preprocessor macros: `EIGEN_MAX_ALIGN_BYTES 0` and `EIGEN_DONT_VECTORIZE`, though this will probably harm performance. See [Structs Having Eigen Members](http://eigen.tuxfamily.org/dox-3.2/group__TopicStructHavingEigenMembers.html) for more information. This is being tracked as [Issue 67](https://github.com/sdatkinson/NeuralAmpModelerCore/issues/67). + +## Sponsors + +
+  <img src="media/tone3000-logo.svg" alt="Tone3000 logo">
+</div>
+ +Development of version 0.4.0 of this library has been generously supported by [TONE3000](https://tone3000.com). +**Thank you!** diff --git a/media/tone3000-logo.svg b/media/tone3000-logo.svg new file mode 100644 index 0000000..469592d --- /dev/null +++ b/media/tone3000-logo.svg @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From d68514da4ae5d68146c6cceaac1d1ca22c27f110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Thu, 12 Feb 2026 11:16:10 -0800 Subject: [PATCH 7/7] Replace hardcoded profiling struct with dynamic registry The Timings struct hardcoded 14 named fields, requiring manual updates to reset(), total(), print_results(), and every call site whenever a category was added or removed. Replace with a flat-array registry where types are registered at file scope via register_type(), returning an integer index for O(1) accumulation in the hot path. Also adds NAM_PROFILE_RESTART() macro to replace a raw #ifdef block in wavenet.cpp. --- NAM/profiling.cpp | 45 ++++++++++++++- NAM/profiling.h | 136 ++++++++++++---------------------------------- NAM/wavenet.cpp | 40 ++++++++------ 3 files changed, 101 insertions(+), 120 deletions(-) diff --git a/NAM/profiling.cpp b/NAM/profiling.cpp index 65d430e..885872e 100644 --- a/NAM/profiling.cpp +++ b/NAM/profiling.cpp @@ -9,7 +9,8 @@ namespace nam { namespace profiling { -Timings g_timings; +ProfilingEntry g_entries[MAX_PROFILING_TYPES] = {}; +int g_num_entries = 0; // CPU frequency in MHz (Daisy runs at 480 MHz) static constexpr uint32_t CPU_FREQ_MHZ = 480; @@ -30,7 +31,8 @@ uint32_t get_time_us() { namespace nam { namespace profiling { -Timings g_timings; +ProfilingEntry g_entries[MAX_PROFILING_TYPES] = {}; +int g_num_entries = 0; uint32_t get_time_us() { using namespace std::chrono; @@ -44,4 +46,43 @@ uint32_t get_time_us() { #endif // ARM check +namespace nam { +namespace profiling { + +int register_type(const char* name) { + int idx = g_num_entries++; + g_entries[idx].name = name; + g_entries[idx].accumulated_us = 0; + return idx; +} + +void reset() { + for (int i = 0; i < g_num_entries; i++) + g_entries[i].accumulated_us = 0; +} + +void print_results() { + uint32_t total = 0; + for (int i = 0; i < g_num_entries; i++) + total += g_entries[i].accumulated_us; + + printf("\nProfiling breakdown:\n"); + printf("%-12s %8s %6s\n", "Category", "Time(ms)", "%"); + printf("%-12s %8s %6s\n", "--------", "--------", "----"); + + for (int i = 0; i < g_num_entries; i++) { + uint32_t us = g_entries[i].accumulated_us; + if (us > 0) { + uint32_t pct = total > 0 ? (us * 100 / total) : 0; + printf("%-12s %8.1f %5lu%%\n", g_entries[i].name, us / 1000.0f, (unsigned long)pct); + } + } + + printf("%-12s %8s %6s\n", "--------", "--------", "----"); + printf("%-12s %8.1f %5s\n", "Total", total / 1000.0f, "100%"); +} + +} // namespace profiling +} // namespace nam + #endif // NAM_PROFILING diff --git a/NAM/profiling.h b/NAM/profiling.h index 71031fe..4db570b 100644 --- a/NAM/profiling.h +++ b/NAM/profiling.h @@ -1,14 +1,17 @@ #pragma once -// Comprehensive profiling for NAM building blocks +// Dynamic profiling registry for NAM building blocks // Enable with -DNAM_PROFILING // // Usage: -// 1. Call nam::profiling::reset() before benchmark -// 2. Run model processing -// 3. Call nam::profiling::print_results() to display breakdown -// -// Categories cover all WaveNet operations including FiLM modulation. +// 1. 
Register profiling types at file scope (static init): +// static int PROF_FOO = nam::profiling::register_type("Foo"); +// 2. Call nam::profiling::reset() before benchmark +// 3. In hot path: +// NAM_PROFILE_START(); +// // ... code ... +// NAM_PROFILE_ADD(PROF_FOO); +// 4. Call nam::profiling::print_results() to display breakdown #ifdef NAM_PROFILING @@ -18,120 +21,48 @@ namespace nam { namespace profiling { -// Timing accumulators (in microseconds) -struct Timings { - // Dilated convolution (Conv1D) - uint32_t conv1d = 0; - - // Pointwise convolutions (Conv1x1 variants) - uint32_t input_mixin = 0; // Input mixing Conv1x1 - uint32_t layer1x1 = 0; // Layer 1x1 (residual projection) - uint32_t head1x1 = 0; // Head 1x1 (skip connection projection) - uint32_t rechannel = 0; // Rechannel Conv1x1 (input/output) - uint32_t conv1x1 = 0; // Other Conv1x1 (catch-all for non-WaveNet uses) - - // Activation - uint32_t activation = 0; // Activation functions (tanh, ReLU, Softsign, etc.) - - // FiLM modulation - uint32_t film = 0; // Feature-wise Linear Modulation (scale/shift) - - // Memory operations - uint32_t copies = 0; // Memory copies and additions - uint32_t setzero = 0; // setZero() calls - uint32_t ringbuf = 0; // Ring buffer operations (Write, Read, Advance) - - // Conditioning - uint32_t condition = 0; // Condition DSP processing - - // LSTM (for LSTM models) - uint32_t lstm = 0; // LSTM cell computations - - // Catch-all - uint32_t other = 0; // Everything else - - void reset() { - conv1d = 0; - input_mixin = 0; - layer1x1 = 0; - head1x1 = 0; - rechannel = 0; - conv1x1 = 0; - activation = 0; - film = 0; - copies = 0; - setzero = 0; - ringbuf = 0; - condition = 0; - lstm = 0; - other = 0; - } - - uint32_t total() const { - return conv1d + input_mixin + layer1x1 + head1x1 + rechannel + conv1x1 + activation + film + copies + setzero + ringbuf + condition + lstm + other; - } +constexpr int MAX_PROFILING_TYPES = 32; + +struct ProfilingEntry { + const char* name; + uint32_t accumulated_us; }; -// Global timing accumulator -extern Timings g_timings; +extern ProfilingEntry g_entries[MAX_PROFILING_TYPES]; +extern int g_num_entries; + +// Register a named profiling type. Returns index for fast accumulation. +// Called at static-init time or during setup, NOT in the hot path. +int register_type(const char* name); // Get current time in microseconds (platform-specific) uint32_t get_time_us(); -// Reset profiling counters -inline void reset() { g_timings.reset(); } +// Reset all profiling counters +void reset(); // Print profiling results to stdout -inline void print_results() { - const auto& t = g_timings; - uint32_t total = t.total(); - - printf("\nProfiling breakdown:\n"); - printf("%-12s %8s %6s\n", "Category", "Time(ms)", "%%"); - printf("%-12s %8s %6s\n", "--------", "--------", "----"); - - auto print_row = [total](const char* name, uint32_t us) { - if (us > 0 || total == 0) { - uint32_t pct = total > 0 ? 
(us * 100 / total) : 0; - printf("%-12s %8.1f %5lu%%\n", name, us / 1000.0f, (unsigned long)pct); - } - }; - - print_row("Conv1D", t.conv1d); - print_row("InputMixin", t.input_mixin); - print_row("Layer1x1", t.layer1x1); - print_row("Head1x1", t.head1x1); - print_row("Rechannel", t.rechannel); - print_row("Conv1x1", t.conv1x1); - print_row("Activation", t.activation); - print_row("FiLM", t.film); - print_row("Copies", t.copies); - print_row("SetZero", t.setzero); - print_row("RingBuf", t.ringbuf); - print_row("Condition", t.condition); - print_row("LSTM", t.lstm); - print_row("Other", t.other); - - printf("%-12s %8s %6s\n", "--------", "--------", "----"); - printf("%-12s %8.1f %5s\n", "Total", total / 1000.0f, "100%"); -} +void print_results(); // Helper macros for timing sections // Usage: // NAM_PROFILE_START(); // // ... code to profile ... -// NAM_PROFILE_ADD(conv1d); // Adds elapsed time to conv1d, resets timer +// NAM_PROFILE_ADD(PROF_FOO); // Adds elapsed time to entry, resets timer #define NAM_PROFILE_START() uint32_t _prof_start = nam::profiling::get_time_us() -#define NAM_PROFILE_ADD(category) do { \ +#define NAM_PROFILE_ADD(idx) do { \ uint32_t _prof_now = nam::profiling::get_time_us(); \ - nam::profiling::g_timings.category += (_prof_now - _prof_start); \ + nam::profiling::g_entries[idx].accumulated_us += (_prof_now - _prof_start); \ _prof_start = _prof_now; \ } while(0) // Variant that doesn't reset the timer (for one-shot measurements) -#define NAM_PROFILE_ADD_NORESTART(category) \ - nam::profiling::g_timings.category += (nam::profiling::get_time_us() - _prof_start) +#define NAM_PROFILE_ADD_NORESTART(idx) \ + nam::profiling::g_entries[idx].accumulated_us += (nam::profiling::get_time_us() - _prof_start) + +// Reset the timer without recording (for re-syncing mid-function) +#define NAM_PROFILE_RESTART() _prof_start = nam::profiling::get_time_us() } // namespace profiling } // namespace nam @@ -140,8 +71,9 @@ inline void print_results() { // No-op macros when profiling is disabled #define NAM_PROFILE_START() ((void)0) -#define NAM_PROFILE_ADD(category) ((void)0) -#define NAM_PROFILE_ADD_NORESTART(category) ((void)0) +#define NAM_PROFILE_ADD(idx) ((void)0) +#define NAM_PROFILE_ADD_NORESTART(idx) ((void)0) +#define NAM_PROFILE_RESTART() ((void)0) namespace nam { namespace profiling { diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp index a955d4e..d9ca43e 100644 --- a/NAM/wavenet.cpp +++ b/NAM/wavenet.cpp @@ -10,6 +10,16 @@ #include "registry.h" #include "wavenet.h" +#ifdef NAM_PROFILING +static int PROF_CONV1D = nam::profiling::register_type("Conv1D"); +static int PROF_INPUT_MIXIN = nam::profiling::register_type("InputMixin"); +static int PROF_LAYER1X1 = nam::profiling::register_type("Layer1x1"); +static int PROF_HEAD1X1 = nam::profiling::register_type("Head1x1"); +static int PROF_RECHANNEL = nam::profiling::register_type("Rechannel"); +static int PROF_ACTIVATION = nam::profiling::register_type("Activation"); +static int PROF_COPIES = nam::profiling::register_type("Copies"); +#endif + // Layer ====================================================================== void nam::wavenet::_Layer::SetMaxBufferSize(const int maxBufferSize) @@ -110,7 +120,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& conv_output = this->_conv.GetOutput(); this->_conv_post_film->Process_(conv_output, condition, num_frames); } - NAM_PROFILE_ADD(conv1d); + NAM_PROFILE_ADD(PROF_CONV1D); if (this->_input_mixin_pre_film) { @@ -127,11 +137,11 @@ void 
nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& input_mixin_output = this->_input_mixin.GetOutput(); this->_input_mixin_post_film->Process_(input_mixin_output, condition, num_frames); } - NAM_PROFILE_ADD(input_mixin); + NAM_PROFILE_ADD(PROF_INPUT_MIXIN); this->_z.leftCols(num_frames).noalias() = _conv.GetOutput().leftCols(num_frames) + _input_mixin.GetOutput().leftCols(num_frames); - NAM_PROFILE_ADD(copies); + NAM_PROFILE_ADD(PROF_COPIES); if (this->_activation_pre_film) { @@ -147,7 +157,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_gating_mode == GatingMode::NONE) { this->_activation->apply(this->_z.leftCols(num_frames)); - NAM_PROFILE_ADD(activation); + NAM_PROFILE_ADD(PROF_ACTIVATION); if (this->_activation_post_film) { this->_activation_post_film->Process_(this->_z, condition, num_frames); @@ -155,7 +165,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z, num_frames); - NAM_PROFILE_ADD(layer1x1); + NAM_PROFILE_ADD(PROF_LAYER1X1); } } else if (this->_gating_mode == GatingMode::GATED) @@ -165,7 +175,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma auto input_block = this->_z.leftCols(num_frames); auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames); this->_gating_activation->apply(input_block, output_block); - NAM_PROFILE_ADD(activation); + NAM_PROFILE_ADD(PROF_ACTIVATION); if (this->_activation_post_film) { // Use Process() for blocks and copy result back @@ -176,7 +186,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames); - NAM_PROFILE_ADD(layer1x1); + NAM_PROFILE_ADD(PROF_LAYER1X1); } } else if (this->_gating_mode == GatingMode::BLENDED) @@ -186,7 +196,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma auto input_block = this->_z.leftCols(num_frames); auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames); this->_blending_activation->apply(input_block, output_block); - NAM_PROFILE_ADD(activation); + NAM_PROFILE_ADD(PROF_ACTIVATION); if (this->_activation_post_film) { // Use Process() for blocks and copy result back @@ -197,7 +207,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames); - NAM_PROFILE_ADD(layer1x1); + NAM_PROFILE_ADD(PROF_LAYER1X1); if (this->_layer1x1_post_film) { Eigen::MatrixXf& layer1x1_output = this->_layer1x1->GetOutput(); @@ -221,7 +231,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& head1x1_output = this->_head1x1->GetOutput(); this->_head1x1_post_film->Process_(head1x1_output, condition, num_frames); } - NAM_PROFILE_ADD(head1x1); + NAM_PROFILE_ADD(PROF_HEAD1X1); this->_output_head.leftCols(num_frames).noalias() = this->_head1x1->GetOutput().leftCols(num_frames); } else // No head 1x1 @@ -245,7 +255,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma // If layer1x1 is inactive, residual connection is just the input (identity) this->_output_next_layer.leftCols(num_frames).noalias() = input.leftCols(num_frames); } - NAM_PROFILE_ADD(copies); + NAM_PROFILE_ADD(PROF_COPIES); } // LayerArray 
================================================================= @@ -319,7 +329,7 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs // Process rechannel and get output this->_rechannel.process_(layer_inputs, num_frames); Eigen::MatrixXf& rechannel_output = _rechannel.GetOutput(); - NAM_PROFILE_ADD(rechannel); + NAM_PROFILE_ADD(PROF_RECHANNEL); // Process layers for (size_t i = 0; i < this->_layers.size(); i++) @@ -348,11 +358,9 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs this->_layers[last_layer].GetOutputNextLayer().leftCols(num_frames); // Process head rechannel -#ifdef NAM_PROFILING - _prof_start = nam::profiling::get_time_us(); // Reset timer for accurate head_rechannel measurement -#endif + NAM_PROFILE_RESTART(); _head_rechannel.process_(this->_head_inputs, num_frames); - NAM_PROFILE_ADD(rechannel); + NAM_PROFILE_ADD(PROF_RECHANNEL); }
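
A note on the ARM timing backend introduced in PATCH 1/7: get_time_us() reads DWT->CYCCNT, but none of these patches starts the counter. On a bare Cortex-M7 the cycle counter is disabled out of reset and must be enabled once at startup; board support layers (e.g. libDaisy on the Daisy platform mentioned in the comment) may already do this, so treat the following as an assumption to verify rather than a required change. A minimal sketch using standard CMSIS register names:

#include "stm32h7xx.h" // CMSIS device header, same as NAM/profiling.cpp uses

// Call once at startup, before any profiling is attempted.
// Assumption: no BSP has already enabled the counter.
static void enable_cycle_counter()
{
  CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk; // power up the DWT/trace block
  DWT->CYCCNT = 0;                                // reset the cycle counter
  DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk;            // start counting CPU cycles
}

Since CYCCNT is 32 bits wide, the derived microsecond clock wraps roughly every 2^32 / 480e6 ≈ 8.9 s at 480 MHz; a START/ADD interval that happens to straddle a wrap records a bogus delta, though the millisecond-scale sections profiled here will almost always land inside one period.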
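
To make the conv1d_memory() accounting from PATCH 3/7 concrete, here is a worked example with illustrative sizes (round numbers, not taken from any real model): in_ch = out_ch = 16, kernel_size = 3, dilation = 64, bias = true, groups = 1, M = 2048.

  weights:  kernel_size * out_ch * in_ch = 3 * 16 * 16 = 768 floats
            bias: out_ch                 =                16 floats
            total                        = 784 floats   = 3,136 bytes

  ring buffer: max_lookback = (3 - 1) * 64 = 128
               in_ch * (2 * 128 + 2048)   = 16 * 2,304  = 36,864 floats
  output:      out_ch * M                 = 16 * 2,048  = 32,768 floats
  buffers total                           = 69,632 floats = 278,528 bytes (272 KB)

At realistic buffer sizes the M-dependent buffers dwarf the ~3 KB of weights, which is why the tool reports the two pools separately and takes --buffer-size as a parameter.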
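
The same exercise for lstm_memory(), with num_layers = 1, input_size = 1, hidden_size = 16, in_channels = out_channels = 1 (again purely illustrative):

  cell:  _w: 4H * (I + H) = 64 * 17 = 1,088 floats
         _b: 4H           =             64 floats
         _xh: I + H       =             17 floats
         _c: H            =             16 floats
  head:  16 + 1           =             17 floats
  weights total           = 1,202 floats = 4,808 bytes

  buffers: _ifgo (4H) + _input (I) + _output (1) = 66 floats = 264 bytes

Nothing here scales with the audio buffer size, which is why lstm_memory() (like linear_memory()) takes no M parameter.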
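
Finally, a minimal self-contained sketch of driving the PATCH 7/7 registry from outside wavenet.cpp, assuming the translation unit is built with -DNAM_PROFILING and linked against NAM/profiling.cpp. PROF_MYBLOCK, the "MyBlock" category name, and DoWork() are hypothetical names for illustration, not identifiers from these patches:

#include "NAM/profiling.h"

#ifdef NAM_PROFILING
// Register once at static-init time; the returned index gives O(1)
// accumulation in the hot path (same pattern as the PROF_* globals in wavenet.cpp).
static int PROF_MYBLOCK = nam::profiling::register_type("MyBlock");
#endif

static void DoWork() // hypothetical stand-in for a real DSP block
{
  volatile float acc = 0.0f;
  for (int i = 0; i < (1 << 20); i++)
    acc = acc + 1e-6f * (float)i;
}

int main()
{
  nam::profiling::reset(); // clear all counters before the benchmark
  for (int i = 0; i < 100; i++)
  {
    NAM_PROFILE_START(); // declares and starts a local timer
    DoWork();
    NAM_PROFILE_ADD(PROF_MYBLOCK); // accumulate elapsed time and restart the timer
  }
  nam::profiling::print_results(); // per-category breakdown with percentages
  return 0;
}

When NAM_PROFILING is not defined, the macros compile to ((void)0) and reset()/print_results() become inline no-ops, so instrumented call sites cost nothing in release builds.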