From 0b8e9a9e4a549a047be9d755ea5c12391b845c56 Mon Sep 17 00:00:00 2001 From: Stefan Andersson DAG Date: Mon, 25 May 2026 08:31:20 +0000 Subject: [PATCH] loaded-latency: add mode-2 multipass random chain (v1) Merge the 8-pass random Hamiltonian chain randomization (mode 2), which defeats the Neoverse V2 L2 prefetchers that learn the single-chain pattern used by mode 1. Changes: - args.c: add -R/--lat-randomize-mode to select randomization mode (1 = pair-swap shuffle, 2 = 8-pass random chain). -r remains shorthand for mode 1. - memlatency.c: * Add make_multipass_chain() building LAT_CHAIN_PASSES (=8) independent Hamiltonian cycles via ptr_t slots per cacheline, concatenated head-to-tail into one closed loop. Each pass uses an independent Fisher-Yates random visitation order and a different slot offset within each cacheline, defeating stride, next-line, and short-history temporal prefetchers. * Move the mode-1 pair-swap shuffle out of lat_initialize() into a new make_pairswap_chain() helper that uses an external order[] array (mirroring mode 2). Drop per-node order/index bookkeeping fields; nodes now carry only ->next. * Shrink local node_t in lat_initialize() accordingly; union is now { void *next; ptr_t ptrs[LAT_CHAIN_PASSES]; } + cacheline pad. * Replace dead #if 0 debug block with #ifdef LAT_DEBUG_CHAIN that derives cacheline index/slot from the pointer and buffer base. 
Signed-off-by: Stefan Andersson DAG Signed-off-by: Steven Miao --- loaded-latency/args.c | 16 ++- loaded-latency/memlatency.c | 258 +++++++++++++++++++++++++++--------- 2 files changed, 209 insertions(+), 65 deletions(-) diff --git a/loaded-latency/args.c b/loaded-latency/args.c index e590bb4..665a0ae 100644 --- a/loaded-latency/args.c +++ b/loaded-latency/args.c @@ -109,7 +109,10 @@ static void print_help(void) { " -j | --lat-cacheline-stride count number of cachelines to skip between loads for latency measurement\n" " -o | --lat-offset count number of deploads to advance secondary latency threads\n" " -c | --lat-clear-cache clear caches before latency run\n" -" -r | --lat-randomize randomize ordering of dependent loads\n" +" -r | --lat-randomize randomize ordering of dependent loads (mode 1: pair-swap shuffle)\n" +" -R | --lat-randomize-mode mode select randomization mode by number (1, 2, ...)\n" +" mode 1 = pair-swap shuffle (default for -r)\n" +" mode 2 = 8-way interleaved Hamiltonian chains (defeats HW prefetchers)\n" " -h | --lat-use-hugepages size hugepage size to use for latency. 
Use \"-h help\" to show known sizes\n" " -w | --lat-warmup-cpu cpu_num on which CPU to warm up latency loop (repeat for additional CPUs)\n" " -s | --lat-shared-memory use the same memory for all latency threads\n" @@ -169,6 +172,7 @@ void handle_args(int argc, char ** argv, args_t * pargs) { {"lat-offset", required_argument, 0, 'o'}, {"lat-clear-cache", no_argument, 0, 'c'}, {"lat-randomize", no_argument, 0, 'r'}, + {"lat-randomize-mode", required_argument, 0, 'R'}, {"lat-use-hugepages", required_argument, 0, 'h'}, {"lat-warmup-cpu", required_argument, 0, 'w'}, {"lat-shared-memory", no_argument, 0, 's'}, @@ -192,7 +196,7 @@ void handle_args(int argc, char ** argv, args_t * pargs) { while (1) { - int c = getopt_long(argc, argv, "D:S:d:Qq:f:t:l:n:e:i:z:j:o:crh:w:su:B:I:L:F:C:H:Z:W", long_options, NULL); + int c = getopt_long(argc, argv, "D:S:d:Qq:f:t:l:n:e:i:z:j:o:crR:h:w:su:B:I:L:F:C:H:Z:W", long_options, NULL); switch (c) { @@ -302,6 +306,14 @@ void handle_args(int argc, char ** argv, args_t * pargs) { pargs->lat_randomize = 1; break; + case 'R': // --lat-randomize-mode N + pargs->lat_randomize = (int) strtol(optarg, NULL, 0); + if (pargs->lat_randomize < 0) { + printf("invalid --lat-randomize-mode value: %s\n", optarg); + exit(-1); + } + break; + case 'h': // --lat-use-hugepages hugepage_size_string pargs->lat_use_hugepages = parse_hugepage_parameter('h', optarg); break; diff --git a/loaded-latency/memlatency.c b/loaded-latency/memlatency.c index 5267907..de1756f 100644 --- a/loaded-latency/memlatency.c +++ b/loaded-latency/memlatency.c @@ -31,90 +31,210 @@ #include "alloc.h" #include "memlatency.h" +#include + /* lat_initialize can be called from main.c for shared memory */ +/* + * LAT_CHAIN_PASSES — number of independent random passes through the + * buffer concatenated head-to-tail to form the mode-2 pointer chain. 
+ * + * Algorithm (make_multipass_chain): + * Each pass j produces one Hamiltonian cycle through all cachelines + * using slot j of the per-cacheline pointer array. The cacheline + * visitation order within each pass is an independent Fisher-Yates + * shuffle. Passes are concatenated so the last pointer of pass j + * links to the first pointer of pass j+1, and the last pointer of + * pass (PASSES-1) wraps to the first pointer of pass 0. Total + * chain length = cacheline_count * LAT_CHAIN_PASSES. + * + */ +#define LAT_CHAIN_PASSES 8 -void ** lat_initialize(size_t cacheline_bytes, - size_t cacheline_count, int randomize, int clear_cache, size_t cacheline_stride, int use_hugepages) { +typedef struct ptr_s { + struct ptr_s * next; +} ptr_t; - size_t i; +/* + * Header layout of the mode-1 node_t (first field only). + * node_t itself is declared inside lat_initialize() because its trailing + * pad depends on the runtime cacheline_bytes. node_hdr_t lets helper + * functions access ->next without seeing the local typedef. + */ +typedef struct { + void * next; +} node_hdr_t; - typedef struct { - void * next; - size_t order; - size_t index; - char buf[cacheline_bytes - sizeof(void *) - sizeof(size_t) - sizeof(size_t)]; - } node_t; +#define NODE_HDR(base, i, cbytes) \ + ((node_hdr_t *)((char *)(base) + (i) * (cbytes))) - // check that sizeof(node_t) == cacheline_bytes // XXX: might not be on 32-bit - if (sizeof(node_t) != cacheline_bytes) { - printf("in lat_setup, sizeof(node_t) = %zu, does not equal cacheline_bytes = %zu\n", - sizeof(node_t), cacheline_bytes); +/* + * Generate a random permutation of [0, count) using Fisher-Yates. + * Matches mode 1 style: size_t indices, lrand48(). 
+ */ +static void make_random_order(size_t * index, size_t count) { + size_t * selection = malloc(count * sizeof(size_t)); + if (selection == NULL) { + printf("make_random_order: malloc failed for count=%zu\n", count); exit(-1); } - if (cacheline_bytes % sizeof(void*)) { - printf("cacheline_bytes = %zu, is not an exact multiple of sizeof(void*) = %zu\n", cacheline_bytes, sizeof(void*)); - exit(-1); + for (size_t i = 0; i < count; i++) { + selection[i] = i; } - node_t * p = do_alloc(cacheline_bytes * cacheline_count, use_hugepages, cacheline_bytes); + size_t remaining = count; + for (size_t i = 0; i < count; i++) { + size_t n = (size_t) lrand48() % remaining; + index[i] = selection[n]; + remaining--; + selection[n] = selection[remaining]; + } - // order is the sequence of node_t elements to traverse. Initialize for sequential order. + free(selection); +} - for (i = 0; i < cacheline_count; i++) { - p[i].order = i; + +/* + * Mode 1: pair-swap shuffle. + * Build a permutation of stride-aligned cacheline offsets in an + * external array (mirroring make_multipass_chain), then wire + * node->next pointers by walking it. The nodes carry only ->next. 
+ */ +static void ** make_pairswap_chain(void * ptr, + size_t cacheline_bytes, size_t cacheline_count, size_t cacheline_stride, int randomize) { + + size_t count = cacheline_count / cacheline_stride; + if (count < 2) { + printf("make_pairswap_chain: need >=2 stride slots, got %zu\n", count); + exit(-1); } - // if randomize is used, randomly swap the order values + size_t * order = malloc(count * sizeof(size_t)); + if (order == NULL) { + printf("make_pairswap_chain: malloc failed for count=%zu\n", count); + exit(-1); + } + + /* Sequential permutation: order[k] = k-th stride-aligned offset */ + for (size_t k = 0; k < count; k++) { + order[k] = k * cacheline_stride; + } + /* Optional shuffle: 10 rounds of random pair swaps on the array */ if (randomize) { for (int rounds = 0; rounds < 10; rounds++) { - for (i = 0; i < cacheline_count; i+= cacheline_stride) { - size_t offset_a, offset_b, x; - + for (size_t k = 0; k < count; k++) { + size_t a, b, x; do { - offset_a = (lrand48() % (cacheline_count/cacheline_stride)) * cacheline_stride; - offset_b = (lrand48() % (cacheline_count/cacheline_stride)) * cacheline_stride; - } while (offset_a == offset_b); - - x = p[offset_a].order; - p[offset_a].order = p[offset_b].order; - p[offset_b].order = x; + a = (size_t) lrand48() % count; + b = (size_t) lrand48() % count; + } while (a == b); + x = order[a]; + order[a] = order[b]; + order[b] = x; } } } - // create the pointer loop using the ordering table + /* Wire pointer loop following the permutation */ + for (size_t k = 0; k < count - 1; k++) { + size_t cur = order[k]; + size_t next = order[k + 1]; + NODE_HDR(ptr, cur, cacheline_bytes)->next = + &NODE_HDR(ptr, next, cacheline_bytes)->next; + } + /* Close the cycle */ + { + size_t cur = order[count - 1]; + size_t first = order[0]; + NODE_HDR(ptr, cur, cacheline_bytes)->next = + &NODE_HDR(ptr, first, cacheline_bytes)->next; + } + + free(order); + return (void **) ptr; +} - for (i = 0; i < cacheline_count - cacheline_stride; i += 
cacheline_stride) { - p[p[i].order].next = &(p[p[i + cacheline_stride].order].next); - p[p[i].order].index = i; +/* + * Mode 2: build LAT_CHAIN_PASSES independent Hamiltonian cycles, + * each using slot j of every cacheline. Cycles are concatenated + * head-to-tail to form one closed loop of length + * cacheline_count * LAT_CHAIN_PASSES. + */ +static void ** make_multipass_chain(void * ptr, + size_t cacheline_count, size_t cacheline_bytes) { + + size_t * index = malloc(cacheline_count * sizeof(size_t)); + if (index == NULL) { + printf("make_multipass_chain: malloc failed for count=%zu\n", cacheline_count); + exit(-1); } - p[p[i].order].next = &(p[p[0].order].next); - p[p[i].order].index = i; + char * base = (char *) ptr; + ptr_t * first = NULL; + ptr_t * current = NULL; -#if 0 - // print out latency loop pointers for debug - printf("by pointer:\n"); - node_t * pp = (node_t *) ppvoid; - for (i = 0; i < cacheline_count; i++) { - printf("%zu\tpp=%p pp->next=%p delta=%ld bytes\n", i, pp, pp->next, (long) pp->next - (long) pp ); - pp = pp->next; + for (size_t j = 0; j < LAT_CHAIN_PASSES; j++) { + make_random_order(index, cacheline_count); + + for (size_t i = 0; i < cacheline_count; i++) { + ptr_t * next = (ptr_t *)(base + index[i] * cacheline_bytes + j * sizeof(ptr_t)); + if (current == NULL) { + first = next; + } else { + current->next = next; + } + current = next; + } } - printf("by entry:\n"); - pp = (node_t *) ppvoid; - for (i = 0; i < cacheline_count; i++) { - printf("pp[%zu]\t= %p, .next=%p\n", i, &(pp[i]), pp[i].next); + /* close the loop */ + current->next = first; + + free(index); + return (void **) first; +} + +void ** lat_initialize(size_t cacheline_bytes, + size_t cacheline_count, int randomize, int clear_cache, size_t cacheline_stride, int use_hugepages) { + + typedef struct { + union { + void * next; /* mode 1: single chain */ + ptr_t ptrs[LAT_CHAIN_PASSES]; /* mode 2: per-pass chain slots */ + }; + char buf[cacheline_bytes + - (sizeof(ptr_t) * 
LAT_CHAIN_PASSES > sizeof(void *) + ? sizeof(ptr_t) * LAT_CHAIN_PASSES + : sizeof(void *))]; + } node_t; + + // check that sizeof(node_t) == cacheline_bytes // XXX: might not be on 32-bit + if (sizeof(node_t) != cacheline_bytes) { + printf("in lat_setup, sizeof(node_t) = %zu, does not equal cacheline_bytes = %zu\n", + sizeof(node_t), cacheline_bytes); + exit(-1); + } + + if (cacheline_bytes % sizeof(void*)) { + printf("cacheline_bytes = %zu, is not an exact multiple of sizeof(void*) = %zu\n", cacheline_bytes, sizeof(void*)); + exit(-1); + } + + node_t * p = do_alloc(cacheline_bytes * cacheline_count, use_hugepages, cacheline_bytes); + + void ** head; + if (randomize == 2) { + head = make_multipass_chain(p, cacheline_count, cacheline_bytes); + } else { + head = make_pairswap_chain(p, cacheline_bytes, cacheline_count, cacheline_stride, randomize); } -#endif if (clear_cache) { - __builtin___clear_cache(p, p+cacheline_count); + __builtin___clear_cache((char *) p, (char *) p + cacheline_bytes * cacheline_count); } - return (void **) p; + return head; } @@ -232,18 +352,30 @@ void latency_thread (struct lat_thread_info * lat_tinfo) { size_t this_hwcounter = read_hwcounter(); -#if 0 - typedef struct { - void * next; - size_t order; - size_t index; - } partial_node_t; - - size_t current_index = ((partial_node_t *) p)->index; - - printf("CPU%d LATTHREAD%d: %.6f ns, %.6f cycles, cntvct=0x%08lx cntvct_diff=%lu p=%p index=%zu latency_samples=%zu\n", - cpu, thread_num, x_per_iter, x_per_iter/cycle_time_ns, this_hwcounter, - this_hwcounter - last_hwcounter, p, current_index, latency_samples); +#ifdef LAT_DEBUG_CHAIN + /* + * Chain-walk debug. `p` points at a node's ->next field. 
+ * With the new layout there is no stored index/order, so + * derive the cacheline index from the buffer base: + * line_base = p & ~(cacheline_bytes - 1) + * idx = (line_base - mem) / cacheline_bytes + * slot = (p - line_base) / sizeof(void *) + * slot is 0 for mode 1 and 0..LAT_CHAIN_PASSES-1 for mode 2. + */ + uintptr_t pu = (uintptr_t) p; + uintptr_t line_base = pu & ~((uintptr_t) cacheline_bytes - 1); + size_t slot = (size_t)((pu - line_base) / sizeof(void *)); + size_t current_index = (mem != NULL) + ? (size_t)(((char *) line_base - (char *) mem) / cacheline_bytes) + : (size_t) -1; + + printf("CPU%d LATTHREAD%d: %.6f ns, %.6f cycles, " + HWCOUNTER "=0x%08lx " HWCOUNTER "_diff=%lu " + "p=%p line=0x%lx idx=%zu slot=%zu samples=%zu\n", + cpu, thread_num, x_per_iter, x_per_iter / cycle_time_ns, + this_hwcounter, this_hwcounter - last_hwcounter, + (void *) p, (unsigned long) line_base, + current_index, slot, latency_samples); #else printf("CPU%d LATTHREAD%d: %.6f ns, %.6f cycles\n", cpu, thread_num, x_per_iter, x_per_iter/cycle_time_ns); #endif