Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 31 additions & 4 deletions NAM/activations.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

#include <cassert>
#include <cmath> // expf
#include <iostream> // std::cerr (kept for potential debug use)
#include <stdexcept> // std::invalid_argument
#include <functional>
#include <memory>
#include <optional>
Expand Down Expand Up @@ -150,7 +152,7 @@ class Activation
{
apply(block.data(), block.rows() * block.cols());
}
virtual void apply(float* data, long size) {}
virtual void apply(float* data, long size) = 0;

static Ptr get_activation(const std::string name);
static Ptr get_activation(const ActivationConfig& config);
Expand All @@ -165,13 +167,13 @@ class Activation
static std::unordered_map<std::string, Ptr> _activations;
};

// Identity activation: a pass-through that leaves its input untouched.
class ActivationIdentity : public nam::activations::Activation
{
public:
  ActivationIdentity() = default;
  ~ActivationIdentity() = default;

  // Deliberately empty body: the buffer is left exactly as it was handed in.
  void apply(float* data, long size) override {}
};

class ActivationTanh : public Activation
Expand Down Expand Up @@ -276,6 +278,24 @@ class ActivationPReLU : public Activation
}
ActivationPReLU(std::vector<float> ns) { negative_slopes = ns; }

// Applies PReLU in place to a flat buffer of `size` floats.
//
// Element `pos` is transformed with negative_slopes[pos % channel_count], i.e. the buffer
// is read as consecutive groups holding one value per channel.
// NOTE(review): that corresponds to a column-major (channels, time_steps) matrix; this is
// brittle -- confirm every caller hands the data over in that layout.
//
// A non-positive `size` is a no-op.
//
// Throws std::invalid_argument when there is data to process but the activation holds no
// slopes, and (debug builds only) when `size` is not a multiple of the channel count.
void apply(float* data, long size) override
{
  if (size <= 0)
  {
    return; // Nothing to transform
  }

  const std::size_t channel_count = negative_slopes.size();
  if (channel_count == 0)
  {
    // Without this guard the modulo / slope lookup below would divide by zero.
    throw std::invalid_argument("PReLU.apply(*data, size) was called on an activation that has no channels.");
  }

  // Assume column-major (this is brittle)
#ifndef NDEBUG
  if (static_cast<std::size_t>(size) % channel_count != 0)
  {
    throw std::invalid_argument("PReLU.apply(*data, size) was given an array of size " + std::to_string(size)
                                + " but the activation has " + std::to_string(channel_count)
                                + " channels, which doesn't divide evenly.");
  }
#endif

  // Walk the channel index alongside the sample index; wrapping it by hand avoids a
  // division for every sample in the audio hot path.
  std::size_t channel = 0;
  for (long pos = 0; pos < size; pos++)
  {
    const float negative_slope = negative_slopes[channel];
    data[pos] = leaky_relu(data[pos], negative_slope);
    if (++channel == channel_count)
    {
      channel = 0;
    }
  }
}

void apply(Eigen::MatrixXf& matrix) override
{
// Matrix is organized as (channels, time_steps)
Expand All @@ -285,7 +305,14 @@ class ActivationPReLU : public Activation
std::vector<float> slopes_for_channels = negative_slopes;

// Fail loudly if input has more channels than activation
assert(actual_channels == negative_slopes.size());
#ifndef NDEBUG
if (actual_channels != negative_slopes.size())
{
throw std::invalid_argument("PReLU: Received " + std::to_string(actual_channels)
+ " channels, but activation has " + std::to_string(negative_slopes.size())
+ " channels");
}
#endif

// Apply each negative slope to its corresponding channel
for (unsigned long channel = 0; channel < actual_channels; channel++)
Expand Down
4 changes: 4 additions & 0 deletions NAM/conv1d.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "conv1d.h"
#include "profiling.h"
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this have to be included? Feels weird to include it if I'm doing e.g. a release build.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair, it should at very least be inside an #ifdef. I'll change that.

#include <stdexcept>

namespace nam
Expand Down Expand Up @@ -143,6 +144,9 @@ void Conv1D::SetMaxBufferSize(const int maxBufferSize)

void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
{
// Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp)
// to avoid double-counting when Conv1D is called from within profiled blocks.

// Write input to ring buffer
_input_buffer.Write(input, num_frames);

Expand Down
4 changes: 4 additions & 0 deletions NAM/dsp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <unordered_set>

#include "dsp.h"
#include "profiling.h"
#include "registry.h"

#define tanh_impl_ std::tanh
Expand Down Expand Up @@ -443,6 +444,9 @@ Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int nu

void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames)
{
// Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp)
// to provide meaningful categories (input_mixin, layer1x1, head1x1, rechannel)
// rather than generic conv1x1.
assert(num_frames <= _output.cols());

if (this->_is_depthwise)
Expand Down
4 changes: 4 additions & 0 deletions NAM/film.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,13 @@ class FiLM
assert(num_frames <= condition.cols());
assert(num_frames <= _output.cols());

// Conv1x1 to compute scale/shift from condition
_cond_to_scale_shift.process_(condition, num_frames);
const auto& scale_shift = _cond_to_scale_shift.GetOutput();

// Note: FiLM time is included in the caller's profiling category (e.g., conv1d, input_mixin)
// rather than tracked separately, to avoid double-counting.

const auto scale = scale_shift.topRows(get_input_dim()).leftCols(num_frames);
if (_do_shift)
{
Expand Down
88 changes: 88 additions & 0 deletions NAM/profiling.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#include "profiling.h"

#ifdef NAM_PROFILING

#if defined(__ARM_ARCH_7EM__) || defined(ARM_MATH_CM7)
// ARM Cortex-M7: Use DWT cycle counter for precise timing
#include "stm32h7xx.h"

namespace nam {
namespace profiling {

ProfilingEntry g_entries[MAX_PROFILING_TYPES] = {};
int g_num_entries = 0;

// CPU frequency in MHz (Daisy runs at 480 MHz)
static constexpr uint32_t CPU_FREQ_MHZ = 480;

// Returns a free-running microsecond timestamp derived from the DWT cycle counter.
//
// The 32-bit cycle counter wraps roughly every 8.9 s at 480 MHz. Dividing the raw counter
// by CPU_FREQ_MHZ would make the microsecond value wrap at ~8.9M instead of at 2^32, so the
// unsigned `now - start` subtraction done by the profiling macros would produce a huge
// bogus interval whenever a timed section straddles a counter wrap. Instead, the cycle
// delta since the previous call is accumulated (the delta itself is wrap-safe modulo 2^32),
// giving a timestamp that only wraps at 2^32 microseconds.
//
// Limitations: must be called at least once per counter wrap period to see every wrap, and
// is not reentrant (do not call concurrently from an interrupt handler and the main loop).
uint32_t get_time_us() {
  // Constant-initialised statics, so no run-time initialisation guard is needed.
  static uint32_t s_last_cycles = 0;
  static uint32_t s_leftover_cycles = 0; // Cycles not yet making up a whole microsecond; always < CPU_FREQ_MHZ
  static uint32_t s_elapsed_us = 0;

  const uint32_t cycles_now = DWT->CYCCNT;
  const uint32_t delta_cycles = cycles_now - s_last_cycles; // Unsigned arithmetic: correct across one wrap
  s_last_cycles = cycles_now;

  s_elapsed_us += delta_cycles / CPU_FREQ_MHZ;
  s_leftover_cycles += delta_cycles % CPU_FREQ_MHZ; // Sum stays below 2 * CPU_FREQ_MHZ, cannot overflow
  if (s_leftover_cycles >= CPU_FREQ_MHZ) {
    s_leftover_cycles -= CPU_FREQ_MHZ;
    ++s_elapsed_us;
  }
  return s_elapsed_us;
}

} // namespace profiling
} // namespace nam

#else
// Non-ARM: Use std::chrono for timing (for testing on desktop)
#include <chrono>

namespace nam {
namespace profiling {

ProfilingEntry g_entries[MAX_PROFILING_TYPES] = {};
int g_num_entries = 0;

// Returns the number of microseconds elapsed since the first call to this function.
//
// Uses std::chrono::steady_clock because it is guaranteed monotonic; high_resolution_clock
// may alias the system clock, which can jump backwards and corrupt measured intervals.
// The value is truncated to 32 bits, so it wraps modulo 2^32 microseconds; unsigned
// `now - start` differences taken by callers remain valid across that wrap.
uint32_t get_time_us() {
  using clock = std::chrono::steady_clock;
  // Captured once, on the first call, and used as the zero point for all later readings.
  static const clock::time_point start = clock::now();
  const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - start);
  return static_cast<uint32_t>(elapsed.count());
}

} // namespace profiling
} // namespace nam

#endif // ARM check

namespace nam {
namespace profiling {

int register_type(const char* name) {
int idx = g_num_entries++;
g_entries[idx].name = name;
g_entries[idx].accumulated_us = 0;
return idx;
}

void reset() {
for (int i = 0; i < g_num_entries; i++)
g_entries[i].accumulated_us = 0;
}

// Prints a per-category table (milliseconds and share of the total) to stdout.
//
// Categories with no accumulated time are omitted. The total and the percentage maths are
// carried out in 64 bits: in 32 bits `us * 100` overflows once a single category exceeds
// ~42.9 seconds, and the sum over all categories could wrap as well.
void print_results() {
  uint64_t total = 0;
  for (int i = 0; i < g_num_entries; i++)
    total += g_entries[i].accumulated_us;

  printf("\nProfiling breakdown:\n");
  printf("%-12s %8s %6s\n", "Category", "Time(ms)", "%");
  printf("%-12s %8s %6s\n", "--------", "--------", "----");

  for (int i = 0; i < g_num_entries; i++) {
    const uint32_t us = g_entries[i].accumulated_us;
    if (us == 0)
      continue; // Skip categories that never recorded any time

    // total >= us > 0 here, so the division is safe and the result is at most 100.
    const unsigned long pct = static_cast<unsigned long>(static_cast<uint64_t>(us) * 100 / total);
    printf("%-12s %8.1f %5lu%%\n", g_entries[i].name, us / 1000.0, pct);
  }

  printf("%-12s %8s %6s\n", "--------", "--------", "----");
  printf("%-12s %8.1f %5s\n", "Total", static_cast<double>(total) / 1000.0, "100%");
}

} // namespace profiling
} // namespace nam

#endif // NAM_PROFILING
85 changes: 85 additions & 0 deletions NAM/profiling.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#pragma once

// Dynamic profiling registry for NAM building blocks
// Enable with -DNAM_PROFILING
//
// Usage:
// 1. Register profiling types at file scope (static init):
// static int PROF_FOO = nam::profiling::register_type("Foo");
// 2. Call nam::profiling::reset() before benchmark
// 3. In hot path:
// NAM_PROFILE_START();
// // ... code ...
// NAM_PROFILE_ADD(PROF_FOO);
// 4. Call nam::profiling::print_results() to display breakdown

#ifdef NAM_PROFILING

#include <cstdint>
#include <cstdio>

namespace nam {
namespace profiling {

constexpr int MAX_PROFILING_TYPES = 32;

// One profiling category: a display label plus the time accumulated against it.
struct ProfilingEntry {
// Label printed by print_results(). register_type() stores the pointer as given (no copy).
const char* name;
// Microseconds added by the NAM_PROFILE_ADD* macros; set back to 0 by reset().
uint32_t accumulated_us;
};

extern ProfilingEntry g_entries[MAX_PROFILING_TYPES];
extern int g_num_entries;

// Register a named profiling type. Returns index for fast accumulation.
// Called at static-init time or during setup, NOT in the hot path.
int register_type(const char* name);

// Get current time in microseconds (platform-specific)
uint32_t get_time_us();

// Reset all profiling counters
void reset();

// Print profiling results to stdout
void print_results();

// Helper macros for timing sections
// Usage:
// NAM_PROFILE_START();
// // ... code to profile ...
// NAM_PROFILE_ADD(PROF_FOO); // Adds elapsed time to entry, resets timer

// Declares a local variable `_prof_start` holding the current timestamp. Because it expands
// to a declaration, it can appear only once per scope, and the macros below require that
// `_prof_start` is visible where they are used.
#define NAM_PROFILE_START() uint32_t _prof_start = nam::profiling::get_time_us()
// Adds the time elapsed since `_prof_start` to entry `idx`, then moves `_prof_start` to
// "now" so consecutive sections can be timed back to back. The unsigned subtraction stays
// correct across a wrap of the 32-bit timestamp.
#define NAM_PROFILE_ADD(idx) do { \
uint32_t _prof_now = nam::profiling::get_time_us(); \
nam::profiling::g_entries[idx].accumulated_us += (_prof_now - _prof_start); \
_prof_start = _prof_now; \
} while(0)

// Variant that doesn't reset the timer (for one-shot measurements)
// Expands to a bare expression statement (no do/while wrapper).
#define NAM_PROFILE_ADD_NORESTART(idx) \
nam::profiling::g_entries[idx].accumulated_us += (nam::profiling::get_time_us() - _prof_start)

// Reset the timer without recording (for re-syncing mid-function)
// Time elapsed since the previous start point is discarded, not attributed to any entry.
#define NAM_PROFILE_RESTART() _prof_start = nam::profiling::get_time_us()

} // namespace profiling
} // namespace nam

#else // NAM_PROFILING not defined

// No-op macros when profiling is disabled
#define NAM_PROFILE_START() ((void)0)
#define NAM_PROFILE_ADD(idx) ((void)0)
#define NAM_PROFILE_ADD_NORESTART(idx) ((void)0)
#define NAM_PROFILE_RESTART() ((void)0)

namespace nam {
namespace profiling {
// Inline stand-ins used when NAM_PROFILING is not defined, so that call sites (including
// file-scope `static int PROF_FOO = nam::profiling::register_type("Foo");`) compile
// unchanged in non-profiling builds.

// Registers nothing and returns -1: no slot exists when profiling is disabled. The
// profiling macros are no-ops in this configuration, so the value is never used as an index.
inline int register_type(const char* /*name*/) { return -1; }
// Nothing to reset when profiling is disabled.
inline void reset() {}
// Prints nothing when profiling is disabled.
inline void print_results() {}
} // namespace profiling
} // namespace nam

#endif // NAM_PROFILING
Loading