From bd05325a9b8cf8dd51a18fab6698b9a2d83c3f2d Mon Sep 17 00:00:00 2001
From: Jim Huang <jserv@ccns.ncku.edu.tw>
Date: Sun, 8 Feb 2026 04:09:08 +0800
Subject: [PATCH] Add WCET measurement for O(1) validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This introduces per-operation latency measurement under pathological
scenarios to empirically bound the O(1) constant of TLSF malloc/free.
- Four scenarios (malloc/free × worst/best case) with cycle-accurate
  timing (rdtsc on x86-64, cntvct_el0 on ARM64, mach_absolute_time on
  macOS).
- Supports cold-cache mode (-C) with 64 MB thrash buffer to simulate
  interrupt-handler WCET.
- Raw CSV output (-r) embeds the platform tick unit for correct plot
  labeling.
---
 .github/workflows/ci.yml |  74 ++++-
 Makefile                 |  25 +-
 scripts/wcet_plot.py     | 233 +++++++++++++
 tests/wcet.c             | 695 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 1024 insertions(+), 3 deletions(-)
 create mode 100755 scripts/wcet_plot.py
 create mode 100644 tests/wcet.c

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e909b6b..5046c92 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -16,7 +16,7 @@ jobs:
           wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo gpg --dearmor -o /usr/share/keyrings/llvm-snapshot.gpg
           echo "deb [signed-by=/usr/share/keyrings/llvm-snapshot.gpg] https://apt.llvm.org/noble/ llvm-toolchain-noble main" | sudo tee /etc/apt/sources.list.d/llvm.list
           sudo apt-get update
-          sudo apt-get install -y clang-format-20 shfmt
+          sudo apt-get install -y clang-format-20 shfmt black
         env:
           DEBIAN_FRONTEND: noninteractive
       - name: Check format
@@ -62,3 +62,75 @@ jobs:
         env:
           CC: ${{ matrix.cc }}
           CFLAGS: ${{ matrix.cflags }}
+
+  wcet:
+    needs: [coding-style, build-and-test]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 10
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - runner: ubuntu-24.04
+            arch: x86-64
+          - runner: ubuntu-24.04-arm
+            arch: arm64
+    steps:
+      - uses: actions/checkout@v6
+      - name: Install dependencies
+        run: sudo apt-get update && sudo apt-get install -y python3-matplotlib
+        env:
+          DEBIAN_FRONTEND: noninteractive
+      - name: Build (release, no assert/check overhead)
+        run: make all CFLAGS="-Iinclude -std=gnu11 -O2 -Wall -Wextra -Wconversion"
+      - name: WCET measurement # pipefail: a wcet crash must fail the step, not be masked by tee
+        run: set -o pipefail; ./build/wcet -i 5000 -w 500 -r build/wcet_raw.csv | tee build/wcet_output.txt
+      - name: Generate plots
+        run: python3 scripts/wcet_plot.py build/wcet_raw.csv -o build/wcet
+      - name: Job summary
+        if: always()
+        run: |
+          {
+            echo "## WCET Results (${{ matrix.arch }})"
+            echo ""
+            echo '```'
+            cat build/wcet_output.txt 2>/dev/null || echo "WCET measurement failed or was skipped."
+            echo '```'
+            echo ""
+            echo "Box plot and histogram available in the **wcet-${{ matrix.arch }}** artifact."
+          } >> "$GITHUB_STEP_SUMMARY"
+      - name: Comment on PR
+        if: >-
+          always() && github.event_name == 'pull_request' &&
+          github.event.pull_request.head.repo.full_name == github.repository
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          MARKER="<!-- wcet-${{ matrix.arch }} -->"
+          {
+            echo "$MARKER"
+            echo "## WCET Results (${{ matrix.arch }})"
+            echo ""
+            echo '```'
+            cat build/wcet_output.txt 2>/dev/null || echo "WCET measurement failed or was skipped."
+            echo '```'
+          } > /tmp/comment.md
+
+          COMMENT_ID=$(gh api "repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \
+            --paginate --jq ".[] | select(.body | startswith(\"$MARKER\")) | .id" | head -1)
+
+          if [ -n "$COMMENT_ID" ]; then
+            gh api "repos/${{ github.repository }}/issues/comments/$COMMENT_ID" \
+              -X PATCH -F body=@/tmp/comment.md
+          else
+            gh pr comment "${{ github.event.pull_request.number }}" -F /tmp/comment.md
+          fi
+      - name: Upload WCET artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: wcet-${{ matrix.arch }}
+          path: |
+            build/wcet_raw.csv
+            build/wcet_boxplot.png
+            build/wcet_histogram.png
diff --git a/Makefile b/Makefile
index cec9d13..277cd40 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,8 @@ OUT = build
 
 TARGETS = \
 	test \
-	bench
+	bench \
+	wcet
 TARGETS := $(addprefix $(OUT)/,$(TARGETS))
 
 all: $(TARGETS)
@@ -34,6 +35,9 @@ $(OUT)/test: $(OBJS) tests/test.c
 $(OUT)/bench: $(OBJS) tests/bench.c
 	$(CC) $(CFLAGS) -o $@ -MMD -MF $@.d $^ $(LDFLAGS) -lm
 
+$(OUT)/wcet: $(OBJS) tests/wcet.c
+	$(CC) $(CFLAGS) -o $@ -MMD -MF $@.d $^ $(LDFLAGS) -lm
+
 $(OUT)/%.o: src/%.c
 	@mkdir -p $(OUT)
 	$(CC) $(CFLAGS) -c -o $@ -MMD -MF $@.d $<
@@ -43,10 +47,27 @@ check: $(TARGETS)
 	MALLOC_CHECK_=3 ./build/bench -l 10000 -i 3 -w 1
 	MALLOC_CHECK_=3 ./build/bench -s 32 -l 10000 -i 3 -w 1
 	MALLOC_CHECK_=3 ./build/bench -s 10:12345 -l 10000 -i 3 -w 1
+	./build/wcet -i 100 -w 10
+
+# Full WCET measurement (10000 iterations, 1000 warmup)
+wcet: all
+	$(OUT)/wcet
+
+# Quick WCET check for development
+wcet-quick: all
+	$(OUT)/wcet -i 1000 -w 100
+
+# WCET with raw output and analysis plots (binary and outputs both under OUT)
+wcet-plot: all
+	@mkdir -p $(OUT)
+	$(OUT)/wcet -i 10000 -r $(OUT)/wcet_raw.csv -c > $(OUT)/wcet_summary.csv
+	python3 scripts/wcet_plot.py $(OUT)/wcet_raw.csv -o $(OUT)/wcet
 
 clean:
 	$(RM) $(TARGETS) $(OBJS) $(deps)
+	$(RM) $(OUT)/wcet_raw.csv $(OUT)/wcet_summary.csv
+	$(RM) $(OUT)/wcet_boxplot.png $(OUT)/wcet_histogram.png
 
-.PHONY: all check clean bench bench-quick
+.PHONY: all check clean bench bench-quick wcet wcet-quick wcet-plot
 
 -include $(deps)
diff --git a/scripts/wcet_plot.py b/scripts/wcet_plot.py
new file mode 100755
index 0000000..54859fc
--- /dev/null
+++ b/scripts/wcet_plot.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+"""
+WCET analysis plots for TLSF allocator.
+
+Reads raw sample data from 'wcet -r' output and generates latency
+distribution plots.  Falls back to text summary when matplotlib is
+unavailable (e.g., CI environments).
+
+Usage:
+    build/wcet -r samples.csv && python3 scripts/wcet_plot.py samples.csv
+    build/wcet -r samples.csv && python3 scripts/wcet_plot.py samples.csv -o build/wcet
+
+The output prefix (-o) controls where PNG files are written.  Two plots
+are generated:
+    {prefix}_boxplot.png     - Box plot of all scenarios per size class
+    {prefix}_histogram.png   - Latency histograms per scenario
+"""
+
+import argparse
+import csv
+import sys
+from collections import defaultdict
+
+try:
+    import matplotlib
+
+    matplotlib.use("Agg")
+    import matplotlib.pyplot as plt
+
+    HAS_MATPLOTLIB = True
+except ImportError:
+    HAS_MATPLOTLIB = False
+
+
+def read_raw_csv(path):
+    """Read raw sample CSV: scenario,size,[unit,]value -> (data, unit)."""
+    data = defaultdict(lambda: defaultdict(list))
+    unit = None
+    # newline="" leaves line-ending handling to the csv module, as it expects.
+    with open(path, newline="") as f:
+        for row in csv.DictReader(f):
+            scenario = row["scenario"]
+            size = int(row["size"])
+            value = int(row["value"])
+            data[scenario][size].append(value)
+            if unit is None and "unit" in row:
+                unit = row["unit"]
+    return data, unit
+
+
+def percentile(sorted_data, p):
+    """Look up the p-th percentile (0-100) in data already sorted ascending."""
+    if not sorted_data:
+        return 0
+    count = len(sorted_data)
+    # Truncate the fractional rank, then clamp so p == 100 yields the maximum.
+    rank = min(int(count * p / 100.0), count - 1)
+    return sorted_data[rank]
+
+
+def text_report(data):
+    """Print text summary to stdout."""
+    scenarios = sorted(data.keys())
+    for scenario in scenarios:
+        print(f"\n  {scenario}")
+        print(
+            f"    {'Size':>6s} {'Min':>8s} {'P50':>8s} {'P90':>8s} "
+            f"{'P99':>8s} {'P99.9':>8s} {'Max':>8s}"
+        )
+        for size in sorted(data[scenario].keys()):
+            v = sorted(data[scenario][size])
+            print(
+                f"    {size:>6d} {v[0]:>8d} {percentile(v, 50):>8d} "
+                f"{percentile(v, 90):>8d} {percentile(v, 99):>8d} "
+                f"{percentile(v, 99.9):>8d} {v[-1]:>8d}"
+            )
+
+
+def detect_unit(data, csv_unit=None):
+    """Pick the latency unit label: the CSV's own value, else a guess."""
+    if csv_unit:
+        return csv_unit
+    # Legacy CSV without a unit column: guess from the magnitude of the
+    # first 100 samples of every (scenario, size) series.
+    pooled = sorted(
+        v for grp in data.values() for vals in grp.values() for v in vals[:100]
+    )
+    if not pooled:
+        return "ticks"
+    middle = pooled[len(pooled) // 2]
+    if middle > 100000:
+        return "ns"
+    if middle < 30:
+        return "ticks"
+    return "cycles"
+
+
+def plot_boxplot(data, output_path, csv_unit=None):
+    """Box plot: all scenarios side by side for each allocation size."""
+    scenarios = sorted(data.keys())
+    sizes = sorted(next(iter(data.values())).keys())
+    unit = detect_unit(data, csv_unit)
+
+    n_sizes = len(sizes)
+
+    fig, axes = plt.subplots(1, n_sizes, figsize=(3.5 * n_sizes, 5), sharey=False)
+    if n_sizes == 1:
+        axes = [axes]
+
+    colors = {
+        "malloc_worst": "#e74c3c",
+        "malloc_best": "#3498db",
+        "free_worst": "#e67e22",
+        "free_best": "#2ecc71",
+    }
+
+    for ax, size in zip(axes, sizes):
+        box_data = []
+        labels = []
+        box_colors = []
+        for scenario in scenarios:
+            vals = data[scenario].get(size, [])
+            box_data.append(vals)
+            label = scenario.replace("malloc_", "m/").replace("free_", "f/")
+            labels.append(label)
+            box_colors.append(colors.get(scenario, "#95a5a6"))
+
+        bp = ax.boxplot(
+            box_data,
+            showfliers=False,
+            patch_artist=True,
+            widths=0.6,
+            medianprops={"color": "black", "linewidth": 1.5},
+        )
+        # Label ticks separately: boxplot's `labels=` is deprecated since 3.9.
+        ax.set_xticklabels(labels)
+        for patch, color in zip(bp["boxes"], box_colors):
+            patch.set_facecolor(color)
+            patch.set_alpha(0.7)
+
+        ax.set_title(f"{size}B", fontweight="bold")
+        ax.set_ylabel(f"Latency ({unit})" if ax == axes[0] else "")
+        ax.grid(True, alpha=0.3, axis="y")
+        ax.tick_params(axis="x", rotation=30)
+
+    fig.suptitle("TLSF WCET: Per-Operation Latency", fontsize=13, y=1.02)
+    plt.tight_layout()
+    plt.savefig(output_path, dpi=150, bbox_inches="tight")
+    print(f"  Box plot: {output_path}")
+
+
+def plot_histogram(data, output_path, csv_unit=None):
+    """Latency histograms: one subplot per scenario, all sizes overlaid."""
+    scenarios = sorted(data.keys())
+    sizes = sorted(next(iter(data.values())).keys())
+    unit = detect_unit(data, csv_unit)
+
+    n_scenarios = len(scenarios)
+    fig, axes = plt.subplots(
+        n_scenarios, 1, figsize=(10, 3 * n_scenarios), sharex=False
+    )
+    if n_scenarios == 1:
+        axes = [axes]
+
+    size_colors = plt.cm.viridis(
+        [i / max(len(sizes) - 1, 1) for i in range(len(sizes))]
+    )
+
+    for ax, scenario in zip(axes, scenarios):
+        for si, size in enumerate(sizes):
+            vals = data[scenario].get(size, [])
+            if not vals:
+                continue
+            # Clip outliers beyond p99.5 for cleaner histograms
+            sv = sorted(vals)
+            clip = sv[min(int(len(sv) * 0.995), len(sv) - 1)]
+            clipped = [v for v in vals if v <= clip]
+            ax.hist(
+                clipped,
+                bins=50,
+                alpha=0.5,
+                label=f"{size}B",
+                color=size_colors[si],
+                density=True,
+            )
+
+        ax.set_title(scenario, fontweight="bold")
+        ax.set_ylabel("Density")
+        ax.legend(fontsize=8, loc="upper right")
+        ax.grid(True, alpha=0.3, axis="y")
+
+    axes[-1].set_xlabel(f"Latency ({unit})")
+    fig.suptitle("TLSF WCET: Latency Distributions", fontsize=13, y=1.01)
+    plt.tight_layout()
+    plt.savefig(output_path, dpi=150, bbox_inches="tight")
+    print(f"  Histogram: {output_path}")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Generate WCET analysis plots from raw sample data."
+    )
+    parser.add_argument("input", help="Raw CSV file from 'wcet -r'")
+    parser.add_argument(
+        "-o",
+        "--output",
+        default="wcet",
+        help="Output file prefix (default: wcet)",
+    )
+    args = parser.parse_args()
+
+    data, csv_unit = read_raw_csv(args.input)
+    if not data:
+        print("No data found in input file", file=sys.stderr)
+        return 1
+
+    # Always print text summary
+    print("WCET Summary:")
+    text_report(data)
+    print()
+
+    if HAS_MATPLOTLIB:
+        plot_boxplot(data, f"{args.output}_boxplot.png", csv_unit)
+        plot_histogram(data, f"{args.output}_histogram.png", csv_unit)
+    else:
+        print("Note: matplotlib not available, skipping plot generation.")
+        print("Install with: pip install matplotlib")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/wcet.c b/tests/wcet.c
new file mode 100644
index 0000000..faa743e
--- /dev/null
+++ b/tests/wcet.c
@@ -0,0 +1,695 @@
+/*
+ * Copyright (c) 2016 National Cheng Kung University, Taiwan.
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style license.
+ *
+ * WCET (Worst-Case Execution Time) measurement for TLSF allocator.
+ *
+ * Measures per-operation latency under pathological scenarios to bound
+ * the O(1) constant of TLSF's malloc and free operations.
+ *
+ * Two worst-case scenarios (from TLSF-WCET):
+ *   malloc: Small request from a pool with one huge free block.
+ *           Full bitmap search + split + remainder insertion.
+ *   free:   Block sandwiched between two free neighbors.
+ *           Two merges + two list removals + one insertion.
+ *
+ * Two best-case baselines for comparison:
+ *   malloc: Exact bin hit, no split required.
+ *   free:   No merge possible (used neighbors on both sides).
+ *
+ * Timing: rdtsc on x86, cntvct_el0 on ARM64 Linux, mach_absolute_time on
+ * macOS, clock_gettime otherwise.  Addresses TLSF-WCET limitations:
+ * clock() resolution, single-size testing, no optimization-level variation.
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#ifdef __APPLE__
+#include <mach/mach_time.h>
+#else
+#include <time.h>
+#endif
+
+#include "tlsf.h"
+
+/* --- Timing primitives --- */
+
+typedef uint64_t tick_t;
+
+/*
+ * Platform-specific cycle/tick counter.
+ *
+ * x86-64: lfence + rdtsc gives cycle-accurate measurement with ~5 cycle
+ * overhead.  lfence serializes preceding instructions without the full
+ * cost of cpuid (~100 cycles).
+ *
+ * ARM64 (Linux): cntvct_el0 reads the generic timer counter.  Not true
+ * CPU cycles, but a fixed-frequency counter suitable for latency
+ * measurement.  isb ensures instruction stream synchronization.
+ * Resolution depends on counter frequency (typically 25-100 MHz,
+ * i.e. 10-40 ns granularity); subtle regressions may be invisible.
+ * For cycle-accurate ARM measurements, enable userspace PMU access
+ * to PMCCNTR_EL0 via perf_event or a kernel module.
+ *
+ * macOS: mach_absolute_time() with timebase conversion to nanoseconds.
+ * Works on both Intel and Apple Silicon.
+ *
+ * Fallback: clock_gettime(CLOCK_MONOTONIC) gives nanosecond resolution,
+ * a major improvement over TLSF-WCET's clock() (millisecond resolution).
+ */
+#if defined(__x86_64__) || defined(__i386__)
+#define TICK_UNIT "cycles"
+
+static inline tick_t read_tick(void)
+{
+    uint32_t lo, hi;
+    __asm__ __volatile__("lfence\n\trdtsc" : "=a"(lo), "=d"(hi));
+    return ((tick_t) hi << 32) | lo;
+}
+
+#elif defined(__aarch64__) && !defined(__APPLE__)
+#define TICK_UNIT "ticks"
+
+static inline tick_t read_tick(void)
+{
+    tick_t val;
+    __asm__ __volatile__("isb\n\tmrs %0, cntvct_el0" : "=r"(val));
+    return val;
+}
+
+#elif defined(__APPLE__)
+#define TICK_UNIT "ns"
+
+static mach_timebase_info_data_t tb_info;
+
+static inline tick_t read_tick(void)
+{
+    if (tb_info.denom == 0)
+        mach_timebase_info(&tb_info);
+    uint64_t ticks = mach_absolute_time();
+    if (tb_info.numer <= tb_info.denom)
+        return ticks / tb_info.denom * tb_info.numer;
+#ifdef __SIZEOF_INT128__
+    return (uint64_t) (((__uint128_t) ticks * tb_info.numer) / tb_info.denom);
+#else
+    return ticks * tb_info.numer / tb_info.denom;
+#endif
+}
+
+#else
+#define TICK_UNIT "ns"
+
+static inline tick_t read_tick(void)
+{
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return (tick_t) ts.tv_sec * 1000000000ULL + (tick_t) ts.tv_nsec;
+}
+#endif
+
+/* --- Statistics --- */
+
+static int cmp_tick(const void *a, const void *b)
+{
+    const tick_t *lhs = (const tick_t *) a, *rhs = (const tick_t *) b;
+    /* qsort comparator, ascending; comparisons cannot overflow like a - b. */
+    return *lhs < *rhs ? -1 : (*lhs > *rhs ? 1 : 0);
+}
+
+typedef struct {
+    tick_t min, max;
+    tick_t p50, p90, p99, p999;
+    double mean, stddev;
+} latency_stats_t;
+
+static void compute_latency_stats(tick_t *samples, size_t n, latency_stats_t *s)
+{
+    if (!n) {
+        memset(s, 0, sizeof(*s));
+        return;
+    }
+
+    qsort(samples, n, sizeof(tick_t), cmp_tick);
+
+    s->min = samples[0];
+    s->max = samples[n - 1];
+    s->p50 = samples[n / 2];
+
+    size_t p90_idx = (size_t) ((double) n * 0.90);
+    size_t p99_idx = (size_t) ((double) n * 0.99);
+    size_t p999_idx = (size_t) ((double) n * 0.999);
+    if (p90_idx >= n)
+        p90_idx = n - 1;
+    if (p99_idx >= n)
+        p99_idx = n - 1;
+    if (p999_idx >= n)
+        p999_idx = n - 1;
+
+    s->p90 = samples[p90_idx];
+    s->p99 = samples[p99_idx];
+    s->p999 = samples[p999_idx];
+
+    double sum = 0;
+    for (size_t i = 0; i < n; i++)
+        sum += (double) samples[i];
+    s->mean = sum / (double) n;
+
+    double var = 0;
+    for (size_t i = 0; i < n; i++) {
+        double d = (double) samples[i] - s->mean;
+        var += d * d;
+    }
+    s->stddev = n > 1 ? sqrt(var / (double) (n - 1)) : 0;
+}
+
+/* --- Cache control ---
+ *
+ * Cold-cache mode (-C): stride through a large buffer before each timed
+ * operation to evict the tlsf_t control structure (~7 KB) and heap block
+ * headers from L1/L2 cache.  This simulates the real-time worst case
+ * where an allocation happens in an interrupt handler after a long idle
+ * period.
+ *
+ * Without -C, measurements reflect hot-cache behavior: the bitmap and
+ * block headers sit in L1/L2 from the preceding setup code.  Hot-cache
+ * numbers isolate algorithmic cost; cold-cache numbers bound the true
+ * system-level WCET including memory latency.
+ */
+
+#define THRASH_SIZE (64U << 20) /* 64 MB -- exceeds typical L2 and most L3 */
+
+static char *thrash_buf;
+
+static void cache_thrash(void)
+{
+    if (!thrash_buf)
+        return;
+    /* Read every cache line to evict pool data from cache hierarchy.
+     * Volatile pointer prevents compiler from optimizing away reads.
+     */
+    volatile const char *p = (volatile const char *) thrash_buf;
+    for (size_t i = 0; i < THRASH_SIZE; i += 64)
+        (void) p[i];
+}
+
+/* --- Scenario measurement functions ---
+ *
+ * Each function sets up the pathological heap state, then measures the
+ * target operation in a tight loop.  The pool is re-initialized per
+ * iteration to ensure consistent starting conditions.
+ *
+ * Static pools (tlsf_pool_init) are used exclusively: no tlsf_resize
+ * callback, no arena growth during measurement.
+ */
+
+/*
+ * malloc worst case: Pool has a single huge free block, request is small.
+ *
+ * This forces the longest bitmap search path: the requested size maps
+ * to a low FL/SL bin, but the only available block is in a high FL bin.
+ * block_find_suitable must scan the SL bitmap (all zeros), then the FL
+ * bitmap to find the distant block.  After extraction, the block is
+ * split and the remainder is inserted into its bin.
+ *
+ * Full path: adjust_size -> round_block_size -> mapping ->
+ *   block_find_suitable (full bitmap search) -> remove_free_block ->
+ *   block_use -> block_rtrim_free (split + insert) -> block_set_free
+ */
+static void measure_malloc_worst(char *pool,
+                                 size_t pool_size,
+                                 size_t alloc_size,
+                                 size_t iterations,
+                                 size_t warmup,
+                                 tick_t *samples)
+{
+    tlsf_t t;
+
+    for (size_t i = 0; i < warmup; i++) {
+        tlsf_pool_init(&t, pool, pool_size);
+        void *p = tlsf_malloc(&t, alloc_size);
+        assert(p);
+        tlsf_free(&t, p);
+    }
+
+    for (size_t i = 0; i < iterations; i++) {
+        tlsf_pool_init(&t, pool, pool_size);
+        cache_thrash();
+
+        tick_t start = read_tick();
+        void *p = tlsf_malloc(&t, alloc_size);
+        tick_t end = read_tick();
+
+        assert(p);
+        (void) p;
+        samples[i] = end - start;
+    }
+}
+
+/*
+ * malloc best case: A block exists in the exact bin for the request.
+ *
+ * Setup: allocate a block of the target size, then allocate a separator
+ * (prevents coalescing), then free the target block.  The freed block
+ * lands in its size-class bin.  The subsequent measured malloc hits the
+ * exact bin immediately -- no bitmap scanning beyond the target SL.
+ * The block size matches the bin's minimum, so no split occurs.
+ *
+ * Short path: adjust_size -> round_block_size -> mapping ->
+ *   block_find_suitable (immediate hit) -> remove_free_block ->
+ *   block_use (no split) -> block_set_free
+ */
+static void measure_malloc_best(char *pool,
+                                size_t pool_size,
+                                size_t alloc_size,
+                                size_t iterations,
+                                size_t warmup,
+                                tick_t *samples)
+{
+    tlsf_t t;
+
+    for (size_t i = 0; i < warmup; i++) {
+        tlsf_pool_init(&t, pool, pool_size);
+        void *a = tlsf_malloc(&t, alloc_size);
+        void *sep = tlsf_malloc(&t, 1);
+        assert(a && sep);
+        tlsf_free(&t, a);
+        void *b = tlsf_malloc(&t, alloc_size);
+        assert(b);
+        (void) b;
+    }
+
+    for (size_t i = 0; i < iterations; i++) {
+        tlsf_pool_init(&t, pool, pool_size);
+        void *a = tlsf_malloc(&t, alloc_size);
+        void *sep = tlsf_malloc(&t, 1);
+        assert(a && sep);
+        (void) sep;
+        tlsf_free(&t, a);
+        cache_thrash();
+
+        tick_t start = read_tick();
+        void *b = tlsf_malloc(&t, alloc_size);
+        tick_t end = read_tick();
+
+        assert(b);
+        (void) b;
+        samples[i] = end - start;
+    }
+}
+
+/*
+ * Allocate three adjacent blocks of the requested size from a static pool.
+ *
+ * TLSF's block_find_free updates the request size to mapping_size(found_bin),
+ * which can be far larger than the original request when the pool has a
+ * single huge free block (e.g., requesting 1024 from a 4MB pool allocates
+ * ~4MB due to bin-minimum inflation).  This prevents multiple allocations
+ * from a fresh pool.
+ *
+ * Workaround: allocate each block at the inflated size, then immediately
+ * realloc down to the target size.  Realloc's trim path (block_rtrim_used)
+ * splits the oversized block, returning the excess to the free list.
+ * The excess merges with any adjacent free block, making space for the
+ * next allocation.  After three malloc+realloc cycles, we have three
+ * adjacent blocks of the correct size.
+ */
+static void alloc_three_blocks(tlsf_t *t,
+                               size_t alloc_size,
+                               void **a,
+                               void **b,
+                               void **c)
+{
+    *a = tlsf_malloc(t, alloc_size);
+    assert(*a);
+    *a = tlsf_realloc(t, *a, alloc_size);
+    assert(*a);
+
+    *b = tlsf_malloc(t, alloc_size);
+    assert(*b);
+    *b = tlsf_realloc(t, *b, alloc_size);
+    assert(*b);
+
+    *c = tlsf_malloc(t, alloc_size);
+    assert(*c);
+    *c = tlsf_realloc(t, *c, alloc_size);
+    assert(*c);
+}
+
+/*
+ * free worst case: Block sandwiched between two free neighbors.
+ *
+ * Setup: allocate three adjacent blocks A, B, C using the
+ * alloc+realloc trick (see alloc_three_blocks), then free A
+ * (left neighbor becomes free) and free C (right neighbor merges
+ * with the pool remainder, creating a large free block).
+ * Now B has a free block on each side.
+ *
+ * Freeing B triggers the maximum-work path:
+ *   1. block_merge_prev: remove A from free list, absorb A into B
+ *   2. block_merge_next: remove C+remainder from free list, absorb
+ *   3. block_insert: insert the fully merged block
+ *
+ * This is 2 list removals + 2 absorbs + 1 insertion = worst case.
+ */
+static void measure_free_worst(char *pool,
+                               size_t pool_size,
+                               size_t alloc_size,
+                               size_t iterations,
+                               size_t warmup,
+                               tick_t *samples)
+{
+    tlsf_t t;
+    void *a, *b, *c;
+
+    for (size_t i = 0; i < warmup; i++) {
+        tlsf_pool_init(&t, pool, pool_size);
+        alloc_three_blocks(&t, alloc_size, &a, &b, &c);
+        tlsf_free(&t, a);
+        tlsf_free(&t, c);
+        tlsf_free(&t, b);
+    }
+
+    for (size_t i = 0; i < iterations; i++) {
+        tlsf_pool_init(&t, pool, pool_size);
+        alloc_three_blocks(&t, alloc_size, &a, &b, &c);
+        tlsf_free(&t, a);
+        tlsf_free(&t, c);
+        cache_thrash();
+
+        tick_t start = read_tick();
+        tlsf_free(&t, b);
+        tick_t end = read_tick();
+
+        samples[i] = end - start;
+    }
+}
+
+/*
+ * free best case: Block between two used neighbors (no merge).
+ *
+ * Setup: allocate three adjacent blocks A, B, C (using the
+ * alloc+realloc trick).  All remain allocated.  Freeing B finds used
+ * blocks on both sides, so block_merge_prev and block_merge_next both
+ * skip.  Only a single list insertion occurs.
+ *
+ * Minimal path: block_from_payload -> block_set_free -> block_link_next
+ *   -> block_merge_prev (skip) -> block_merge_next (skip) ->
+ *   block_insert
+ */
+static void measure_free_best(char *pool,
+                              size_t pool_size,
+                              size_t alloc_size,
+                              size_t iterations,
+                              size_t warmup,
+                              tick_t *samples)
+{
+    tlsf_t t;
+    void *a, *b, *c;
+
+    for (size_t i = 0; i < warmup; i++) {
+        tlsf_pool_init(&t, pool, pool_size);
+        alloc_three_blocks(&t, alloc_size, &a, &b, &c);
+        tlsf_free(&t, b);
+    }
+
+    for (size_t i = 0; i < iterations; i++) {
+        tlsf_pool_init(&t, pool, pool_size);
+        alloc_three_blocks(&t, alloc_size, &a, &b, &c);
+        cache_thrash();
+
+        tick_t start = read_tick();
+        tlsf_free(&t, b);
+        tick_t end = read_tick();
+
+        samples[i] = end - start;
+    }
+}
+
+/* --- Configuration --- */
+
+static const size_t test_sizes[] = {16, 64, 256, 1024, 4096};
+#define NUM_SIZES (sizeof(test_sizes) / sizeof(test_sizes[0]))
+
+typedef void (*measure_fn)(char *, size_t, size_t, size_t, size_t, tick_t *);
+
+typedef struct {
+    const char *name;
+    const char *desc;
+    measure_fn measure;
+} scenario_t;
+
+static const scenario_t scenarios[] = {
+    {"malloc_worst", "small alloc from single huge block",
+     measure_malloc_worst},
+    {"malloc_best", "exact bin hit, no split", measure_malloc_best},
+    {"free_worst", "sandwiched between two free blocks", measure_free_worst},
+    {"free_best", "no merge (used neighbors)", measure_free_best},
+};
+#define NUM_SCENARIOS (sizeof(scenarios) / sizeof(scenarios[0]))
+
+/* --- Argument parsing --- */
+
+static void usage(const char *prog)
+{
+    fprintf(stderr,
+            "TLSF WCET (Worst-Case Execution Time) measurement.\n\n"
+            "Measures per-operation latency under pathological scenarios\n"
+            "to bound the O(1) constant of TLSF malloc/free.\n\n"
+            "Usage: %s [options]\n\n"
+            "Options:\n"
+            "  -i N       Measured iterations per scenario (default: 10000)\n"
+            "  -w N       Warmup iterations (default: 1000)\n"
+            "  -p SIZE    Pool size in bytes (default: 4194304)\n"
+            "  -c         CSV output (machine-readable summary)\n"
+            "  -r FILE    Write raw samples to FILE (for plotting)\n"
+            "  -C         Cold-cache mode (64 MB thrash between iterations)\n"
+            "  -h         Show this help\n\n"
+            "Scenarios:\n",
+            prog);
+
+    for (size_t i = 0; i < NUM_SCENARIOS; i++)
+        fprintf(stderr, "  %-14s %s\n", scenarios[i].name, scenarios[i].desc);
+
+    fprintf(stderr,
+            "\nTimer: %s\n\n"
+            "Example:\n"
+            "  %s -i 10000 -c                    # CSV summary\n"
+            "  %s -i 10000 -r samples.csv         # raw data for plotting\n"
+            "  %s -i 100 -w 10                    # quick validation\n",
+            TICK_UNIT, prog, prog, prog);
+    exit(1);
+}
+
+static size_t parse_size_arg(const char *arg, const char *name)
+{
+    char *end; /* strtoul wraps "-1" to ULONG_MAX, so '-' is rejected below */
+    unsigned long val = strtoul(arg, &end, 0);
+    if (*end != '\0' || end == arg || strchr(arg, '-')) {
+        fprintf(stderr, "Invalid %s: %s\n", name, arg);
+        exit(1);
+    }
+    return (size_t) val;
+}
+
+/* --- Main --- */
+
+#define DEFAULT_POOL_SIZE ((size_t) 4 << 20) /* 4 MB */
+
+/* Parse options, time every scenario at every test size, then print latency
+ * stats (table or CSV) and worst/best p99 ratios. Returns 0, or 1 on error. */
+int main(int argc, char **argv)
+{
+    size_t iterations = 10000;
+    size_t warmup = 1000;
+    size_t pool_size = DEFAULT_POOL_SIZE;
+    bool csv_mode = false;
+    bool cold_cache = false;
+    const char *raw_file = NULL;
+    char *pool = NULL;
+    tick_t *samples = NULL;
+    uint64_t *p99 = NULL; /* p99[sc * NUM_SIZES + si], read by the summary */
+    FILE *raw_fp = NULL;
+    int rc = 1;
+    int opt;
+
+    while ((opt = getopt(argc, argv, "i:w:p:cr:Ch")) != -1) {
+        switch (opt) {
+        case 'i':
+            iterations = parse_size_arg(optarg, "iterations");
+            break;
+        case 'w':
+            warmup = parse_size_arg(optarg, "warmup");
+            break;
+        case 'p':
+            pool_size = parse_size_arg(optarg, "pool size");
+            break;
+        case 'c':
+            csv_mode = true;
+            break;
+        case 'r':
+            raw_file = optarg;
+            break;
+        case 'C':
+            cold_cache = true;
+            break;
+        case 'h':
+        default:
+            usage(argv[0]);
+        }
+    }
+
+    if (!iterations) {
+        fprintf(stderr, "Error: iterations must be > 0\n");
+        return 1;
+    }
+    /* Keep iterations * sizeof(tick_t) below from wrapping around size_t */
+    if (iterations > SIZE_MAX / sizeof(tick_t)) {
+        fprintf(stderr, "Error: iterations too large\n");
+        return 1;
+    }
+    if (pool_size < 4096) {
+        fprintf(stderr, "Error: pool size must be >= 4096\n");
+        return 1;
+    }
+
+    /* Allocate pool, sample buffer and the per-(scenario, size) p99 cache */
+    pool = (char *) malloc(pool_size);
+    samples = (tick_t *) malloc(iterations * sizeof(tick_t));
+    p99 = (uint64_t *) calloc(NUM_SCENARIOS * NUM_SIZES, sizeof(*p99));
+    if (!pool || !samples || !p99) {
+        fprintf(stderr, "Failed to allocate memory\n");
+        goto cleanup;
+    }
+
+    if (cold_cache) {
+        thrash_buf = (char *) malloc(THRASH_SIZE);
+        if (!thrash_buf) {
+            fprintf(stderr, "Failed to allocate cache thrash buffer\n");
+            goto cleanup;
+        }
+        /* Touch every page now so no page fault lands inside a measurement */
+        memset(thrash_buf, 0xAA, THRASH_SIZE);
+    }
+
+    if (raw_file) {
+        raw_fp = fopen(raw_file, "w");
+        if (!raw_fp) {
+            fprintf(stderr, "Failed to open %s\n", raw_file);
+            goto cleanup;
+        }
+        fprintf(raw_fp, "scenario,size,unit,value\n");
+    }
+
+    /* Header */
+    if (csv_mode) {
+        printf(
+            "scenario,size,samples,unit,min,p50,p90,p99,p999,max,mean,"
+            "stddev\n");
+    } else {
+        printf("TLSF WCET Analysis\n");
+        printf("==================\n");
+        printf("Timer:      %s\n", TICK_UNIT);
+        printf("Cache:      %s\n", thrash_buf ? "cold (64 MB thrash)" : "hot");
+        printf("Pool:       %zu bytes (%.1f MB)\n", pool_size,
+               (double) pool_size / (1024.0 * 1024.0));
+        printf("Iterations: %zu (warmup: %zu)\n", iterations, warmup);
+        printf("Sizes:     ");
+        for (size_t s = 0; s < NUM_SIZES; s++)
+            printf(" %zu", test_sizes[s]);
+        printf(" bytes\n\n");
+    }
+
+    /* Run all scenarios */
+    for (size_t sc = 0; sc < NUM_SCENARIOS; sc++) {
+        if (!csv_mode) {
+            printf("--- %s (%s) ---\n", scenarios[sc].name, scenarios[sc].desc);
+            printf("  %6s %10s %10s %10s %10s %10s %10s %10s %10s\n", "size",
+                   "min", "p50", "p90", "p99", "p99.9", "max", "mean",
+                   "stddev");
+        }
+
+        for (size_t si = 0; si < NUM_SIZES; si++) {
+            size_t sz = test_sizes[si];
+
+            scenarios[sc].measure(pool, pool_size, sz, iterations, warmup,
+                                  samples);
+
+            /* Write raw samples before sorting (compute_latency_stats sorts
+             * in place) */
+            if (raw_fp) {
+                for (size_t i = 0; i < iterations; i++)
+                    fprintf(raw_fp, "%s,%zu,%s,%" PRIu64 "\n",
+                            scenarios[sc].name, sz, TICK_UNIT, samples[i]);
+            }
+
+            latency_stats_t st;
+            compute_latency_stats(samples, iterations, &st);
+            p99[sc * NUM_SIZES + si] = st.p99;
+
+            if (csv_mode) {
+                printf("%s,%zu,%zu,%s,%" PRIu64 ",%" PRIu64 ",%" PRIu64
+                       ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%.1f,%.1f\n",
+                       scenarios[sc].name, sz, iterations, TICK_UNIT, st.min,
+                       st.p50, st.p90, st.p99, st.p999, st.max, st.mean,
+                       st.stddev);
+            } else {
+                printf("  %6zu %10" PRIu64 " %10" PRIu64 " %10" PRIu64
+                       " %10" PRIu64 " %10" PRIu64 " %10" PRIu64
+                       " %10.1f %10.1f\n",
+                       sz, st.min, st.p50, st.p90, st.p99, st.p999, st.max,
+                       st.mean, st.stddev);
+            }
+        }
+
+        if (!csv_mode)
+            printf("\n");
+    }
+
+    /* Summary: worst/best ratios from the p99 values already reported above,
+     * so they match the tables and nothing is measured twice. Relies on
+     * scenarios[0..3] being malloc worst/best, then free worst/best. */
+    if (!csv_mode) {
+        printf("--- worst/best ratio (p99) ---\n");
+        printf("  %6s %10s %10s\n", "size", "malloc", "free");
+
+        for (size_t si = 0; si < NUM_SIZES; si++) {
+            uint64_t mw = p99[0 * NUM_SIZES + si];
+            uint64_t mb = p99[1 * NUM_SIZES + si];
+            uint64_t fw = p99[2 * NUM_SIZES + si];
+            uint64_t fb = p99[3 * NUM_SIZES + si];
+
+            /* A zero best-case p99 would divide by zero; report 0 instead */
+            double malloc_ratio = mb ? (double) mw / (double) mb : 0;
+            double free_ratio = fb ? (double) fw / (double) fb : 0;
+
+            printf("  %6zu %9.2fx %9.2fx\n", test_sizes[si], malloc_ratio,
+                   free_ratio);
+        }
+        printf("\n");
+    }
+
+    rc = 0;
+
+cleanup:
+    /* A failed write or close leaves the raw CSV truncated: report it */
+    if (raw_fp) {
+        int write_failed = ferror(raw_fp);
+        if (fclose(raw_fp) != 0 || write_failed) {
+            fprintf(stderr, "Failed to write %s\n", raw_file);
+            rc = 1;
+        }
+    }
+    free(thrash_buf);
+    free(p99);
+    free(samples);
+    free(pool);
+    return rc;
+}