From bd05325a9b8cf8dd51a18fab6698b9a2d83c3f2d Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sun, 8 Feb 2026 04:09:08 +0800 Subject: [PATCH] Add WCET measurement for O(1) validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This introduces per-operation latency measurement under pathological scenarios to empirically bound the O(1) constant of TLSF malloc/free. - Four scenarios (malloc/free × worst/best case) with cycle-accurate timing (rdtsc on x86-64, cntvct_el0 on ARM64, mach_absolute_time on macOS). - Supports cold-cache mode (-C) with 64 MB thrash buffer to simulate interrupt-handler WCET. - Raw CSV output (-r) embeds the platform tick unit for correct plot labeling. --- .github/workflows/ci.yml | 74 ++++- Makefile | 25 +- scripts/wcet_plot.py | 233 +++++++++++++ tests/wcet.c | 695 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 1024 insertions(+), 3 deletions(-) create mode 100755 scripts/wcet_plot.py create mode 100644 tests/wcet.c diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e909b6b..5046c92 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,7 +16,7 @@ jobs: wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo gpg --dearmor -o /usr/share/keyrings/llvm-snapshot.gpg echo "deb [signed-by=/usr/share/keyrings/llvm-snapshot.gpg] https://apt.llvm.org/noble/ llvm-toolchain-noble main" | sudo tee /etc/apt/sources.list.d/llvm.list sudo apt-get update - sudo apt-get install -y clang-format-20 shfmt + sudo apt-get install -y clang-format-20 shfmt black env: DEBIAN_FRONTEND: noninteractive - name: Check format @@ -62,3 +62,75 @@ jobs: env: CC: ${{ matrix.cc }} CFLAGS: ${{ matrix.cflags }} + + wcet: + needs: [coding-style, build-and-test] + runs-on: ${{ matrix.runner }} + timeout-minutes: 10 + strategy: + fail-fast: false + matrix: + include: + - runner: ubuntu-24.04 + arch: x86-64 + - runner: ubuntu-24.04-arm + arch: arm64 + steps: + - uses: 
actions/checkout@v6
+      - name: Install dependencies
+        run: sudo apt-get update && sudo apt-get install -y python3-matplotlib
+        env:
+          DEBIAN_FRONTEND: noninteractive
+      - name: Build (release, no assert/check overhead)
+        run: make all CFLAGS="-Iinclude -std=gnu11 -O2 -Wall -Wextra -Wconversion"
+      - name: WCET measurement
+        # pipefail: without it the step reports tee's exit status and a crashing benchmark goes unnoticed
+        run: set -o pipefail; ./build/wcet -i 5000 -w 500 -r build/wcet_raw.csv | tee build/wcet_output.txt
+      - name: Generate plots
+        run: python3 scripts/wcet_plot.py build/wcet_raw.csv -o build/wcet
+      - name: Job summary
+        if: always()
+        run: |
+          {
+            echo "## WCET Results (${{ matrix.arch }})"
+            echo ""
+            echo '```'
+            cat build/wcet_output.txt 2>/dev/null || echo "WCET measurement failed or was skipped."
+            echo '```'
+            echo ""
+            echo "Box plot and histogram available in the **wcet-${{ matrix.arch }}** artifact."
+          } >> "$GITHUB_STEP_SUMMARY"
+      - name: Comment on PR
+        if: >-
+          always() && github.event_name == 'pull_request' &&
+          github.event.pull_request.head.repo.full_name == github.repository
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          # Hidden per-architecture tag: an empty marker would match (and overwrite) any PR comment
+          MARKER="<!-- wcet-results-${{ matrix.arch }} -->"
+          {
+            echo "$MARKER"
+            echo "## WCET Results (${{ matrix.arch }})"
+            echo ""
+            echo '```'
+            cat build/wcet_output.txt 2>/dev/null || echo "WCET measurement failed or was skipped."
+ echo '```' + } > /tmp/comment.md + + COMMENT_ID=$(gh api "repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \ + --paginate --jq ".[] | select(.body | startswith(\"$MARKER\")) | .id" | head -1) + + if [ -n "$COMMENT_ID" ]; then + gh api "repos/${{ github.repository }}/issues/comments/$COMMENT_ID" \ + -X PATCH -F body=@/tmp/comment.md + else + gh pr comment "${{ github.event.pull_request.number }}" -F /tmp/comment.md + fi + - name: Upload WCET artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: wcet-${{ matrix.arch }} + path: | + build/wcet_raw.csv + build/wcet_boxplot.png + build/wcet_histogram.png diff --git a/Makefile b/Makefile index cec9d13..277cd40 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,8 @@ OUT = build TARGETS = \ test \ - bench + bench \ + wcet TARGETS := $(addprefix $(OUT)/,$(TARGETS)) all: $(TARGETS) @@ -34,6 +35,9 @@ $(OUT)/test: $(OBJS) tests/test.c $(OUT)/bench: $(OBJS) tests/bench.c $(CC) $(CFLAGS) -o $@ -MMD -MF $@.d $^ $(LDFLAGS) -lm +$(OUT)/wcet: $(OBJS) tests/wcet.c + $(CC) $(CFLAGS) -o $@ -MMD -MF $@.d $^ $(LDFLAGS) -lm + $(OUT)/%.o: src/%.c @mkdir -p $(OUT) $(CC) $(CFLAGS) -c -o $@ -MMD -MF $@.d $< @@ -43,10 +47,27 @@ check: $(TARGETS) MALLOC_CHECK_=3 ./build/bench -l 10000 -i 3 -w 1 MALLOC_CHECK_=3 ./build/bench -s 32 -l 10000 -i 3 -w 1 MALLOC_CHECK_=3 ./build/bench -s 10:12345 -l 10000 -i 3 -w 1 + ./build/wcet -i 100 -w 10 + +# Full WCET measurement (10000 iterations, 1000 warmup) +wcet: all + ./build/wcet + +# Quick WCET check for development +wcet-quick: all + ./build/wcet -i 1000 -w 100 + +# WCET with raw output and analysis plots +wcet-plot: all + @mkdir -p $(OUT) + ./build/wcet -i 10000 -r $(OUT)/wcet_raw.csv -c > $(OUT)/wcet_summary.csv + python3 scripts/wcet_plot.py $(OUT)/wcet_raw.csv -o $(OUT)/wcet clean: $(RM) $(TARGETS) $(OBJS) $(deps) + $(RM) $(OUT)/wcet_raw.csv $(OUT)/wcet_summary.csv + $(RM) $(OUT)/wcet_boxplot.png $(OUT)/wcet_histogram.png -.PHONY: all check 
clean bench bench-quick +.PHONY: all check clean bench bench-quick wcet wcet-quick wcet-plot -include $(deps) diff --git a/scripts/wcet_plot.py b/scripts/wcet_plot.py new file mode 100755 index 0000000..54859fc --- /dev/null +++ b/scripts/wcet_plot.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 +""" +WCET analysis plots for TLSF allocator. + +Reads raw sample data from 'wcet -r' output and generates latency +distribution plots. Falls back to text summary when matplotlib is +unavailable (e.g., CI environments). + +Usage: + build/wcet -r samples.csv && python3 scripts/wcet_plot.py samples.csv + build/wcet -r samples.csv && python3 scripts/wcet_plot.py samples.csv -o build/wcet + +The output prefix (-o) controls where PNG files are written. Two plots +are generated: + {prefix}_boxplot.png - Box plot of all scenarios per size class + {prefix}_histogram.png - Latency histograms per scenario +""" + +import argparse +import csv +import sys +from collections import defaultdict + +try: + import matplotlib + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + + HAS_MATPLOTLIB = True +except ImportError: + HAS_MATPLOTLIB = False + + +def read_raw_csv(path): + """Read raw sample CSV: scenario,size,[unit,]value -> (data, unit).""" + data = defaultdict(lambda: defaultdict(list)) + unit = None + with open(path) as f: + reader = csv.DictReader(f) + for row in reader: + scenario = row["scenario"] + size = int(row["size"]) + value = int(row["value"]) + data[scenario][size].append(value) + if unit is None and "unit" in row: + unit = row["unit"] + return data, unit + + +def percentile(sorted_data, p): + """Return the p-th percentile from pre-sorted data.""" + if not sorted_data: + return 0 + idx = int(len(sorted_data) * p / 100.0) + if idx >= len(sorted_data): + idx = len(sorted_data) - 1 + return sorted_data[idx] + + +def text_report(data): + """Print text summary to stdout.""" + scenarios = sorted(data.keys()) + for scenario in scenarios: + print(f"\n {scenario}") + print( + 
f" {'Size':>6s} {'Min':>8s} {'P50':>8s} {'P90':>8s} " + f"{'P99':>8s} {'P99.9':>8s} {'Max':>8s}" + ) + for size in sorted(data[scenario].keys()): + v = sorted(data[scenario][size]) + print( + f" {size:>6d} {v[0]:>8d} {percentile(v, 50):>8d} " + f"{percentile(v, 90):>8d} {percentile(v, 99):>8d} " + f"{percentile(v, 99.9):>8d} {v[-1]:>8d}" + ) + + +def detect_unit(data, csv_unit=None): + """Return the tick unit, preferring the CSV-embedded value.""" + if csv_unit: + return csv_unit + # Fallback heuristic for legacy CSV files without a unit column + all_vals = [] + for scenario in data.values(): + for size_vals in scenario.values(): + all_vals.extend(size_vals[:100]) + if not all_vals: + return "ticks" + median = sorted(all_vals)[len(all_vals) // 2] + if median > 100000: + return "ns" + if median < 30: + return "ticks" + return "cycles" + + +def plot_boxplot(data, output_path, csv_unit=None): + """Box plot: all scenarios side by side for each allocation size.""" + scenarios = sorted(data.keys()) + sizes = sorted(next(iter(data.values())).keys()) + unit = detect_unit(data, csv_unit) + + n_sizes = len(sizes) + n_scenarios = len(scenarios) + + fig, axes = plt.subplots(1, n_sizes, figsize=(3.5 * n_sizes, 5), sharey=False) + if n_sizes == 1: + axes = [axes] + + colors = { + "malloc_worst": "#e74c3c", + "malloc_best": "#3498db", + "free_worst": "#e67e22", + "free_best": "#2ecc71", + } + + for ax, size in zip(axes, sizes): + box_data = [] + labels = [] + box_colors = [] + for scenario in scenarios: + vals = data[scenario].get(size, []) + box_data.append(vals) + label = scenario.replace("malloc_", "m/").replace("free_", "f/") + labels.append(label) + box_colors.append(colors.get(scenario, "#95a5a6")) + + bp = ax.boxplot( + box_data, + labels=labels, + showfliers=False, + patch_artist=True, + widths=0.6, + medianprops={"color": "black", "linewidth": 1.5}, + ) + for patch, color in zip(bp["boxes"], box_colors): + patch.set_facecolor(color) + patch.set_alpha(0.7) + + 
ax.set_title(f"{size}B", fontweight="bold") + ax.set_ylabel(f"Latency ({unit})" if ax == axes[0] else "") + ax.grid(True, alpha=0.3, axis="y") + ax.tick_params(axis="x", rotation=30) + + fig.suptitle("TLSF WCET: Per-Operation Latency", fontsize=13, y=1.02) + plt.tight_layout() + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f" Box plot: {output_path}") + + +def plot_histogram(data, output_path, csv_unit=None): + """Latency histograms: one subplot per scenario, all sizes overlaid.""" + scenarios = sorted(data.keys()) + sizes = sorted(next(iter(data.values())).keys()) + unit = detect_unit(data, csv_unit) + + n_scenarios = len(scenarios) + fig, axes = plt.subplots( + n_scenarios, 1, figsize=(10, 3 * n_scenarios), sharex=False + ) + if n_scenarios == 1: + axes = [axes] + + size_colors = plt.cm.viridis( + [i / max(len(sizes) - 1, 1) for i in range(len(sizes))] + ) + + for ax, scenario in zip(axes, scenarios): + for si, size in enumerate(sizes): + vals = data[scenario].get(size, []) + if not vals: + continue + # Clip outliers beyond p99.5 for cleaner histograms + sv = sorted(vals) + clip = sv[min(int(len(sv) * 0.995), len(sv) - 1)] + clipped = [v for v in vals if v <= clip] + ax.hist( + clipped, + bins=50, + alpha=0.5, + label=f"{size}B", + color=size_colors[si], + density=True, + ) + + ax.set_title(scenario, fontweight="bold") + ax.set_ylabel("Density") + ax.legend(fontsize=8, loc="upper right") + ax.grid(True, alpha=0.3, axis="y") + + axes[-1].set_xlabel(f"Latency ({unit})") + fig.suptitle("TLSF WCET: Latency Distributions", fontsize=13, y=1.01) + plt.tight_layout() + plt.savefig(output_path, dpi=150, bbox_inches="tight") + print(f" Histogram: {output_path}") + + +def main(): + parser = argparse.ArgumentParser( + description="Generate WCET analysis plots from raw sample data." 
+ ) + parser.add_argument("input", help="Raw CSV file from 'wcet -r'") + parser.add_argument( + "-o", + "--output", + default="wcet", + help="Output file prefix (default: wcet)", + ) + args = parser.parse_args() + + data, csv_unit = read_raw_csv(args.input) + if not data: + print("No data found in input file", file=sys.stderr) + return 1 + + # Always print text summary + print("WCET Summary:") + text_report(data) + print() + + if HAS_MATPLOTLIB: + plot_boxplot(data, f"{args.output}_boxplot.png", csv_unit) + plot_histogram(data, f"{args.output}_histogram.png", csv_unit) + else: + print("Note: matplotlib not available, skipping plot generation.") + print("Install with: pip install matplotlib") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/wcet.c b/tests/wcet.c new file mode 100644 index 0000000..faa743e --- /dev/null +++ b/tests/wcet.c @@ -0,0 +1,695 @@ +/* + * Copyright (c) 2016 National Cheng Kung University, Taiwan. + * All rights reserved. + * Use of this source code is governed by a BSD-style license. + * + * WCET (Worst-Case Execution Time) measurement for TLSF allocator. + * + * Measures per-operation latency under pathological scenarios to bound + * the O(1) constant of TLSF's malloc and free operations. + * + * Two worst-case scenarios (from TLSF-WCET): + * malloc: Small request from a pool with one huge free block. + * Full bitmap search + split + remainder insertion. + * free: Block sandwiched between two free neighbors. + * Two merges + two list removals + one insertion. + * + * Two best-case baselines for comparison: + * malloc: Exact bin hit, no split required. + * free: No merge possible (used neighbors on both sides). + * + * Timing: rdtsc on x86-64, cntvct_el0 on ARM64, clock_gettime fallback. + * Addresses TLSF-WCET limitations: clock() resolution, single-size + * testing, no optimization-level variation. 
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#ifdef __APPLE__
+#include <mach/mach_time.h>
+#else
+#include <time.h>
+#endif
+
+#include "tlsf.h"
+
+/* --- Timing primitives --- */
+
+typedef uint64_t tick_t;
+
+/*
+ * Platform-specific cycle/tick counter.
+ *
+ * x86-64: lfence + rdtsc gives cycle-accurate measurement with ~5 cycle
+ * overhead. lfence serializes preceding instructions without the full
+ * cost of cpuid (~100 cycles).
+ *
+ * ARM64 (Linux): cntvct_el0 reads the generic timer counter. Not true
+ * CPU cycles, but a fixed-frequency counter suitable for latency
+ * measurement. isb ensures instruction stream synchronization.
+ * Resolution depends on counter frequency (typically 25-100 MHz,
+ * i.e. 10-40 ns granularity); subtle regressions may be invisible.
+ * For cycle-accurate ARM measurements, enable userspace PMU access
+ * to PMCCNTR_EL0 via perf_event or a kernel module.
+ *
+ * macOS: mach_absolute_time() with timebase conversion to nanoseconds.
+ * Works on both Intel and Apple Silicon.
+ *
+ * Fallback: clock_gettime(CLOCK_MONOTONIC) gives nanosecond resolution,
+ * a major improvement over TLSF-WCET's clock() (millisecond resolution).
+ */ +#if defined(__x86_64__) || defined(__i386__) +#define TICK_UNIT "cycles" + +static inline tick_t read_tick(void) +{ + uint32_t lo, hi; + __asm__ __volatile__("lfence\n\trdtsc" : "=a"(lo), "=d"(hi)); + return ((tick_t) hi << 32) | lo; +} + +#elif defined(__aarch64__) && !defined(__APPLE__) +#define TICK_UNIT "ticks" + +static inline tick_t read_tick(void) +{ + tick_t val; + __asm__ __volatile__("isb\n\tmrs %0, cntvct_el0" : "=r"(val)); + return val; +} + +#elif defined(__APPLE__) +#define TICK_UNIT "ns" + +static mach_timebase_info_data_t tb_info; + +static inline tick_t read_tick(void) +{ + if (tb_info.denom == 0) + mach_timebase_info(&tb_info); + uint64_t ticks = mach_absolute_time(); + if (tb_info.numer <= tb_info.denom) + return ticks / tb_info.denom * tb_info.numer; +#ifdef __SIZEOF_INT128__ + return (uint64_t) (((__uint128_t) ticks * tb_info.numer) / tb_info.denom); +#else + return ticks * tb_info.numer / tb_info.denom; +#endif +} + +#else +#define TICK_UNIT "ns" + +static inline tick_t read_tick(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (tick_t) ts.tv_sec * 1000000000ULL + (tick_t) ts.tv_nsec; +} +#endif + +/* --- Statistics --- */ + +static int cmp_tick(const void *a, const void *b) +{ + tick_t va = *(const tick_t *) a; + tick_t vb = *(const tick_t *) b; + return (va > vb) - (va < vb); +} + +typedef struct { + tick_t min, max; + tick_t p50, p90, p99, p999; + double mean, stddev; +} latency_stats_t; + +static void compute_latency_stats(tick_t *samples, size_t n, latency_stats_t *s) +{ + if (!n) { + memset(s, 0, sizeof(*s)); + return; + } + + qsort(samples, n, sizeof(tick_t), cmp_tick); + + s->min = samples[0]; + s->max = samples[n - 1]; + s->p50 = samples[n / 2]; + + size_t p90_idx = (size_t) ((double) n * 0.90); + size_t p99_idx = (size_t) ((double) n * 0.99); + size_t p999_idx = (size_t) ((double) n * 0.999); + if (p90_idx >= n) + p90_idx = n - 1; + if (p99_idx >= n) + p99_idx = n - 1; + if (p999_idx >= n) + 
p999_idx = n - 1; + + s->p90 = samples[p90_idx]; + s->p99 = samples[p99_idx]; + s->p999 = samples[p999_idx]; + + double sum = 0; + for (size_t i = 0; i < n; i++) + sum += (double) samples[i]; + s->mean = sum / (double) n; + + double var = 0; + for (size_t i = 0; i < n; i++) { + double d = (double) samples[i] - s->mean; + var += d * d; + } + s->stddev = n > 1 ? sqrt(var / (double) (n - 1)) : 0; +} + +/* --- Cache control --- + * + * Cold-cache mode (-C): stride through a large buffer before each timed + * operation to evict the tlsf_t control structure (~7 KB) and heap block + * headers from L1/L2 cache. This simulates the real-time worst case + * where an allocation happens in an interrupt handler after a long idle + * period. + * + * Without -C, measurements reflect hot-cache behavior: the bitmap and + * block headers sit in L1/L2 from the preceding setup code. Hot-cache + * numbers isolate algorithmic cost; cold-cache numbers bound the true + * system-level WCET including memory latency. + */ + +#define THRASH_SIZE (64U << 20) /* 64 MB -- exceeds typical L2 and most L3 */ + +static char *thrash_buf; + +static void cache_thrash(void) +{ + if (!thrash_buf) + return; + /* Read every cache line to evict pool data from cache hierarchy. + * Volatile pointer prevents compiler from optimizing away reads. + */ + volatile const char *p = (volatile const char *) thrash_buf; + for (size_t i = 0; i < THRASH_SIZE; i += 64) + (void) p[i]; +} + +/* --- Scenario measurement functions --- + * + * Each function sets up the pathological heap state, then measures the + * target operation in a tight loop. The pool is re-initialized per + * iteration to ensure consistent starting conditions. + * + * Static pools (tlsf_pool_init) are used exclusively: no tlsf_resize + * callback, no arena growth during measurement. + */ + +/* + * malloc worst case: Pool has a single huge free block, request is small. 
+ *
+ * This forces the longest bitmap search path: the requested size maps
+ * to a low FL/SL bin, but the only available block is in a high FL bin.
+ * block_find_suitable must scan the SL bitmap (all zeros), then the FL
+ * bitmap to find the distant block. After extraction, the block is
+ * split and the remainder is inserted into its bin.
+ *
+ * Full path: adjust_size -> round_block_size -> mapping ->
+ * block_find_suitable (full bitmap search) -> remove_free_block ->
+ * block_use -> block_rtrim_free (split + insert) -> block_set_free
+ */
+static void measure_malloc_worst(char *pool,
+                                 size_t pool_size,
+                                 size_t alloc_size,
+                                 size_t iterations,
+                                 size_t warmup,
+                                 tick_t *samples)
+{
+    tlsf_t heap;
+
+    /* Untimed warmup rounds: fresh pool, one malloc, one free. */
+    for (size_t round = 0; round != warmup; ++round) {
+        tlsf_pool_init(&heap, pool, pool_size);
+        void *blk = tlsf_malloc(&heap, alloc_size);
+        assert(blk);
+        tlsf_free(&heap, blk);
+    }
+
+    /* Timed rounds: only the tlsf_malloc call sits between the two
+     * timestamp reads; samples[] receives one delta per round.
+     */
+    for (size_t round = 0; round != iterations; ++round) {
+        tlsf_pool_init(&heap, pool, pool_size);
+        cache_thrash();
+
+        const tick_t t0 = read_tick();
+        void *blk = tlsf_malloc(&heap, alloc_size);
+        const tick_t t1 = read_tick();
+
+        assert(blk);
+        (void) blk;
+        samples[round] = t1 - t0;
+    }
+}
+
+/*
+ * malloc best case: A block exists in the exact bin for the request.
+ *
+ * Setup: allocate a block of the target size, then allocate a separator
+ * (prevents coalescing), then free the target block. The freed block
+ * lands in its size-class bin. The subsequent measured malloc hits the
+ * exact bin immediately -- no bitmap scanning beyond the target SL.
+ * The block size matches the bin's minimum, so no split occurs.
+ * + * Short path: adjust_size -> round_block_size -> mapping -> + * block_find_suitable (immediate hit) -> remove_free_block -> + * block_use (no split) -> block_set_free + */ +static void measure_malloc_best(char *pool, + size_t pool_size, + size_t alloc_size, + size_t iterations, + size_t warmup, + tick_t *samples) +{ + tlsf_t t; + + for (size_t i = 0; i < warmup; i++) { + tlsf_pool_init(&t, pool, pool_size); + void *a = tlsf_malloc(&t, alloc_size); + void *sep = tlsf_malloc(&t, 1); + assert(a && sep); + tlsf_free(&t, a); + void *b = tlsf_malloc(&t, alloc_size); + assert(b); + (void) b; + } + + for (size_t i = 0; i < iterations; i++) { + tlsf_pool_init(&t, pool, pool_size); + void *a = tlsf_malloc(&t, alloc_size); + void *sep = tlsf_malloc(&t, 1); + assert(a && sep); + (void) sep; + tlsf_free(&t, a); + cache_thrash(); + + tick_t start = read_tick(); + void *b = tlsf_malloc(&t, alloc_size); + tick_t end = read_tick(); + + assert(b); + (void) b; + samples[i] = end - start; + } +} + +/* + * Allocate three adjacent blocks of the requested size from a static pool. + * + * TLSF's block_find_free updates the request size to mapping_size(found_bin), + * which can be far larger than the original request when the pool has a + * single huge free block (e.g., requesting 1024 from a 4MB pool allocates + * ~4MB due to bin-minimum inflation). This prevents multiple allocations + * from a fresh pool. + * + * Workaround: allocate each block at the inflated size, then immediately + * realloc down to the target size. Realloc's trim path (block_rtrim_used) + * splits the oversized block, returning the excess to the free list. + * The excess merges with any adjacent free block, making space for the + * next allocation. After three malloc+realloc cycles, we have three + * adjacent blocks of the correct size. 
+ */ +static void alloc_three_blocks(tlsf_t *t, + size_t alloc_size, + void **a, + void **b, + void **c) +{ + *a = tlsf_malloc(t, alloc_size); + assert(*a); + *a = tlsf_realloc(t, *a, alloc_size); + assert(*a); + + *b = tlsf_malloc(t, alloc_size); + assert(*b); + *b = tlsf_realloc(t, *b, alloc_size); + assert(*b); + + *c = tlsf_malloc(t, alloc_size); + assert(*c); + *c = tlsf_realloc(t, *c, alloc_size); + assert(*c); +} + +/* + * free worst case: Block sandwiched between two free neighbors. + * + * Setup: allocate three adjacent blocks A, B, C using the + * alloc+realloc trick (see alloc_three_blocks), then free A + * (left neighbor becomes free) and free C (right neighbor merges + * with the pool remainder, creating a large free block). + * Now B has a free block on each side. + * + * Freeing B triggers the maximum-work path: + * 1. block_merge_prev: remove A from free list, absorb A into B + * 2. block_merge_next: remove C+remainder from free list, absorb + * 3. block_insert: insert the fully merged block + * + * This is 2 list removals + 2 absorbs + 1 insertion = worst case. + */ +static void measure_free_worst(char *pool, + size_t pool_size, + size_t alloc_size, + size_t iterations, + size_t warmup, + tick_t *samples) +{ + tlsf_t t; + void *a, *b, *c; + + for (size_t i = 0; i < warmup; i++) { + tlsf_pool_init(&t, pool, pool_size); + alloc_three_blocks(&t, alloc_size, &a, &b, &c); + tlsf_free(&t, a); + tlsf_free(&t, c); + tlsf_free(&t, b); + } + + for (size_t i = 0; i < iterations; i++) { + tlsf_pool_init(&t, pool, pool_size); + alloc_three_blocks(&t, alloc_size, &a, &b, &c); + tlsf_free(&t, a); + tlsf_free(&t, c); + cache_thrash(); + + tick_t start = read_tick(); + tlsf_free(&t, b); + tick_t end = read_tick(); + + samples[i] = end - start; + } +} + +/* + * free best case: Block between two used neighbors (no merge). + * + * Setup: allocate three adjacent blocks A, B, C (using the + * alloc+realloc trick). All remain allocated. 
Freeing B finds used + * blocks on both sides, so block_merge_prev and block_merge_next both + * skip. Only a single list insertion occurs. + * + * Minimal path: block_from_payload -> block_set_free -> block_link_next + * -> block_merge_prev (skip) -> block_merge_next (skip) -> + * block_insert + */ +static void measure_free_best(char *pool, + size_t pool_size, + size_t alloc_size, + size_t iterations, + size_t warmup, + tick_t *samples) +{ + tlsf_t t; + void *a, *b, *c; + + for (size_t i = 0; i < warmup; i++) { + tlsf_pool_init(&t, pool, pool_size); + alloc_three_blocks(&t, alloc_size, &a, &b, &c); + tlsf_free(&t, b); + } + + for (size_t i = 0; i < iterations; i++) { + tlsf_pool_init(&t, pool, pool_size); + alloc_three_blocks(&t, alloc_size, &a, &b, &c); + cache_thrash(); + + tick_t start = read_tick(); + tlsf_free(&t, b); + tick_t end = read_tick(); + + samples[i] = end - start; + } +} + +/* --- Configuration --- */ + +static const size_t test_sizes[] = {16, 64, 256, 1024, 4096}; +#define NUM_SIZES (sizeof(test_sizes) / sizeof(test_sizes[0])) + +typedef void (*measure_fn)(char *, size_t, size_t, size_t, size_t, tick_t *); + +typedef struct { + const char *name; + const char *desc; + measure_fn measure; +} scenario_t; + +static const scenario_t scenarios[] = { + {"malloc_worst", "small alloc from single huge block", + measure_malloc_worst}, + {"malloc_best", "exact bin hit, no split", measure_malloc_best}, + {"free_worst", "sandwiched between two free blocks", measure_free_worst}, + {"free_best", "no merge (used neighbors)", measure_free_best}, +}; +#define NUM_SCENARIOS (sizeof(scenarios) / sizeof(scenarios[0])) + +/* --- Argument parsing --- */ + +static void usage(const char *prog) +{ + fprintf(stderr, + "TLSF WCET (Worst-Case Execution Time) measurement.\n\n" + "Measures per-operation latency under pathological scenarios\n" + "to bound the O(1) constant of TLSF malloc/free.\n\n" + "Usage: %s [options]\n\n" + "Options:\n" + " -i N Measured iterations per 
scenario (default: 10000)\n" + " -w N Warmup iterations (default: 1000)\n" + " -p SIZE Pool size in bytes (default: 4194304)\n" + " -c CSV output (machine-readable summary)\n" + " -r FILE Write raw samples to FILE (for plotting)\n" + " -C Cold-cache mode (64 MB thrash between iterations)\n" + " -h Show this help\n\n" + "Scenarios:\n", + prog); + + for (size_t i = 0; i < NUM_SCENARIOS; i++) + fprintf(stderr, " %-14s %s\n", scenarios[i].name, scenarios[i].desc); + + fprintf(stderr, + "\nTimer: %s\n\n" + "Example:\n" + " %s -i 10000 -c # CSV summary\n" + " %s -i 10000 -r samples.csv # raw data for plotting\n" + " %s -i 100 -w 10 # quick validation\n", + TICK_UNIT, prog, prog, prog); + exit(1); +} + +static size_t parse_size_arg(const char *arg, const char *name) +{ + char *end; + unsigned long val = strtoul(arg, &end, 0); + if (*end != '\0' || end == arg) { + fprintf(stderr, "Invalid %s: %s\n", name, arg); + exit(1); + } + return (size_t) val; +} + +/* --- Main --- */ + +#define DEFAULT_POOL_SIZE ((size_t) 4 << 20) /* 4 MB */ + +int main(int argc, char **argv) +{ + size_t iterations = 10000; + size_t warmup = 1000; + size_t pool_size = DEFAULT_POOL_SIZE; + bool csv_mode = false; + bool cold_cache = false; + const char *raw_file = NULL; + int opt; + + while ((opt = getopt(argc, argv, "i:w:p:cr:Ch")) > 0) { + switch (opt) { + case 'i': + iterations = parse_size_arg(optarg, "iterations"); + break; + case 'w': + warmup = parse_size_arg(optarg, "warmup"); + break; + case 'p': + pool_size = parse_size_arg(optarg, "pool size"); + break; + case 'c': + csv_mode = true; + break; + case 'r': + raw_file = optarg; + break; + case 'C': + cold_cache = true; + break; + case 'h': + default: + usage(argv[0]); + } + } + + if (!iterations) { + fprintf(stderr, "Error: iterations must be > 0\n"); + return 1; + } + if (pool_size < 4096) { + fprintf(stderr, "Error: pool size must be >= 4096\n"); + return 1; + } + + /* Allocate pool and sample buffer */ + char *pool = (char *) 
malloc(pool_size); + tick_t *samples = (tick_t *) malloc(iterations * sizeof(tick_t)); + if (!pool || !samples) { + fprintf(stderr, "Failed to allocate memory\n"); + free(pool); + free(samples); + return 1; + } + + if (cold_cache) { + thrash_buf = (char *) malloc(THRASH_SIZE); + if (!thrash_buf) { + fprintf(stderr, "Failed to allocate cache thrash buffer\n"); + free(pool); + free(samples); + return 1; + } + /* Touch all pages to prevent copy-on-write faults during measurement */ + memset(thrash_buf, 0xAA, THRASH_SIZE); + } + + FILE *raw_fp = NULL; + if (raw_file) { + raw_fp = fopen(raw_file, "w"); + if (!raw_fp) { + fprintf(stderr, "Failed to open %s\n", raw_file); + free(thrash_buf); + free(pool); + free(samples); + return 1; + } + fprintf(raw_fp, "scenario,size,unit,value\n"); + } + + /* Header */ + if (csv_mode) { + printf( + "scenario,size,samples,unit,min,p50,p90,p99,p999,max,mean," + "stddev\n"); + } else { + printf("TLSF WCET Analysis\n"); + printf("==================\n"); + printf("Timer: %s\n", TICK_UNIT); + printf("Cache: %s\n", thrash_buf ? 
"cold (64 MB thrash)" : "hot"); + printf("Pool: %zu bytes (%.1f MB)\n", pool_size, + (double) pool_size / (1024.0 * 1024.0)); + printf("Iterations: %zu (warmup: %zu)\n", iterations, warmup); + printf("Sizes: "); + for (size_t s = 0; s < NUM_SIZES; s++) + printf(" %zu", test_sizes[s]); + printf(" bytes\n\n"); + } + + /* Run all scenarios */ + for (size_t sc = 0; sc < NUM_SCENARIOS; sc++) { + if (!csv_mode) { + printf("--- %s (%s) ---\n", scenarios[sc].name, scenarios[sc].desc); + printf(" %6s %10s %10s %10s %10s %10s %10s %10s %10s\n", "size", + "min", "p50", "p90", "p99", "p99.9", "max", "mean", + "stddev"); + } + + for (size_t si = 0; si < NUM_SIZES; si++) { + size_t sz = test_sizes[si]; + + scenarios[sc].measure(pool, pool_size, sz, iterations, warmup, + samples); + + /* Write raw samples before sorting (compute_latency_stats sorts + * in place) */ + if (raw_fp) { + for (size_t i = 0; i < iterations; i++) + fprintf(raw_fp, "%s,%zu,%s,%" PRIu64 "\n", + scenarios[sc].name, sz, TICK_UNIT, samples[i]); + } + + latency_stats_t st; + compute_latency_stats(samples, iterations, &st); + + if (csv_mode) { + printf("%s,%zu,%zu,%s,%" PRIu64 ",%" PRIu64 ",%" PRIu64 + ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%.1f,%.1f\n", + scenarios[sc].name, sz, iterations, TICK_UNIT, st.min, + st.p50, st.p90, st.p99, st.p999, st.max, st.mean, + st.stddev); + } else { + printf(" %6zu %10" PRIu64 " %10" PRIu64 " %10" PRIu64 + " %10" PRIu64 " %10" PRIu64 " %10" PRIu64 + " %10.1f %10.1f\n", + sz, st.min, st.p50, st.p90, st.p99, st.p999, st.max, + st.mean, st.stddev); + } + } + + if (!csv_mode) + printf("\n"); + } + + /* Summary: worst/best ratios */ + if (!csv_mode) { + printf("--- worst/best ratio (p99) ---\n"); + printf(" %6s %10s %10s\n", "size", "malloc", "free"); + + for (size_t si = 0; si < NUM_SIZES; si++) { + size_t sz = test_sizes[si]; + latency_stats_t mw, mb, fw, fb; + + /* Re-measure with minimal iterations for ratio computation. 
+ * The samples array was already overwritten, so reuse it. + */ + scenarios[0].measure(pool, pool_size, sz, iterations, warmup, + samples); + compute_latency_stats(samples, iterations, &mw); + + scenarios[1].measure(pool, pool_size, sz, iterations, warmup, + samples); + compute_latency_stats(samples, iterations, &mb); + + scenarios[2].measure(pool, pool_size, sz, iterations, warmup, + samples); + compute_latency_stats(samples, iterations, &fw); + + scenarios[3].measure(pool, pool_size, sz, iterations, warmup, + samples); + compute_latency_stats(samples, iterations, &fb); + + double malloc_ratio = + mb.p99 ? (double) mw.p99 / (double) mb.p99 : 0; + double free_ratio = fb.p99 ? (double) fw.p99 / (double) fb.p99 : 0; + + printf(" %6zu %9.2fx %9.2fx\n", sz, malloc_ratio, free_ratio); + } + printf("\n"); + } + + if (raw_fp) + fclose(raw_fp); + free(thrash_buf); + free(samples); + free(pool); + + return 0; +}