diff --git a/Makefile b/Makefile
index 1ba57eb..9f5151c 100644
--- a/Makefile
+++ b/Makefile
@@ -26,6 +26,13 @@ ifeq ($(ARCH),aarch64)
  endif
 endif
 
+ifeq ($(ARCH),riscv64)
+ CAP ?= $(shell cat /proc/cpuinfo | grep -E 'zihintpause' | head -1)
+ ifneq (,$(findstring zihintpause,$(CAP)))
+  CFLAGS+=-march=rv64gc_zihintpause
+ endif
+endif
+
 EXE=multichase multiload fairness pingpong
 
 all: $(EXE)
diff --git a/br_asm.c b/br_asm.c
index 5dbbae7..f2a4507 100644
--- a/br_asm.c
+++ b/br_asm.c
@@ -83,6 +83,96 @@ int convert_pointers_to_branches(void *head, int chunk_size) {
   return base_chunk_size;
 }
 
+#elif defined(__riscv) && __riscv_xlen == 64
+// Each element becomes: an 8-instruction sequence that loads the address of
+// the next element into a0, then "jr a0" to continue the chain or "ret" to
+// end a chunk.
+// NOTE(review): assumes the chase loop calls an element as a function and
+// reads the next pointer from the return register (a0), as the x86-64 variant
+// does with rax -- confirm against the caller in multichase.c.
+#define RISCV64_A0 10u  // register number of a0 (x10)
+
+// Store one 32-bit instruction; RISC-V instructions are little-endian.
+static char *riscv64_emit_insn(char *p, uint32_t insn) {
+  for (int shift = 0; shift < 32; shift += 8) {
+    *p++ = (char)((insn >> shift) & 0xff);
+  }
+  return p;
+}
+
+// Encode an I-type instruction (addi, addiw, slli) with rd = rs1 = a0.
+static uint32_t riscv64_itype_a0(uint32_t opcode, uint32_t funct3,
+                                 uint32_t imm12) {
+  return ((imm12 & 0xfffu) << 20) | (RISCV64_A0 << 15) | (funct3 << 12) |
+         (RISCV64_A0 << 7) | opcode;
+}
+
+// Emit a fixed 32-byte sequence that loads imm64 into a0. lui/addiw build
+// bits 63..32; three slli/addi pairs then shift in bits 31..21, 20..10 and
+// 9..0. Each addi chunk is at most 11 bits wide, so its sign-extended 12-bit
+// immediate is never negative.
+static char *riscv64_emit_li_a0_imm64(char *p, uint64_t imm64) {
+  const uint32_t hi32 = (uint32_t)(imm64 >> 32);
+  const uint32_t lo32 = (uint32_t)imm64;
+  // lui a0: the +0x800 cancels addiw's sign extension of the low 12 bits.
+  p = riscv64_emit_insn(
+      p, ((hi32 + 0x800u) & 0xfffff000u) | (RISCV64_A0 << 7) | 0x37u);
+  p = riscv64_emit_insn(p, riscv64_itype_a0(0x1bu, 0, hi32));  // addiw
+  p = riscv64_emit_insn(p, riscv64_itype_a0(0x13u, 1, 11));    // slli 11
+  p = riscv64_emit_insn(p, riscv64_itype_a0(0x13u, 0, (lo32 >> 21) & 0x7ffu));
+  p = riscv64_emit_insn(p, riscv64_itype_a0(0x13u, 1, 11));    // slli 11
+  p = riscv64_emit_insn(p, riscv64_itype_a0(0x13u, 0, (lo32 >> 10) & 0x7ffu));
+  p = riscv64_emit_insn(p, riscv64_itype_a0(0x13u, 1, 10));    // slli 10
+  p = riscv64_emit_insn(p, riscv64_itype_a0(0x13u, 0, lo32 & 0x3ffu));
+  return p;
+}
+
+// jr a0 (jalr x0, 0(a0)): jump to the next element without writing ra.
+static char *riscv64_emit_jalr_a0(char *p) {
+  return riscv64_emit_insn(p, 0x00050067u);
+}
+
+// ret (jalr x0, 0(ra)): return to the caller, leaving the next pointer in a0.
+static char *riscv64_emit_ret(char *p) {
+  return riscv64_emit_insn(p, 0x00008067u);
+}
+
+// Rewrite the pointer cycle that starts at head into generated code: each
+// element's next pointer is replaced by "load next into a0; jump", and the
+// last element of every chunk ends in ret instead of a jump. Returns the
+// adjusted chunk size. Exits if bytes 8..35 of an element are not all zero.
+int convert_pointers_to_branches(void *head, int chunk_size) {
+  int remain = cycle_len(head);
+  chunk_size = (remain < chunk_size)
+                   ? remain
+                   : remain / (1 << lround(log2(1.0 * remain / chunk_size)));
+  int base_chunk_size = chunk_size;
+  int chunks_remaining = remain / chunk_size;
+  int chunk_count = 0;
+  const int br_code_len = 36;  // 8 insns to load a0 + 1 jump, 4 bytes each
+  char *p = (char *)head;
+  do {
+    if (!chunk_count) chunk_count = remain / chunks_remaining;
+    char *next = *((char **)p);
+    for (int i = 8; i < br_code_len; i++) {
+      if (p[i]) {
+        fprintf(stderr, "not enough space to convert a pointer to branches\n");
+        exit(1);
+      }
+    }
+    p = riscv64_emit_li_a0_imm64(p, (uint64_t)(uintptr_t)next);
+    --remain;
+    if (--chunk_count == 0) {
+      p = riscv64_emit_ret(p);
+      --chunks_remaining;
+    } else {
+      p = riscv64_emit_jalr_a0(p);
+    }
+    p = next;
+  } while (p != head);
+  return base_chunk_size;
+}
+
 #elif defined(__x86_64__)
 
 static char *x64_emit_mov_imm64_rax(char *p, uint64_t imm64) {
diff --git a/cpu_util.h b/cpu_util.h
index ded2a6d..949745f 100644
--- a/cpu_util.h
+++ b/cpu_util.h
@@ -40,6 +40,10 @@ static inline void cpu_relax(void) {
 }
 #elif defined(__aarch64__)
 #define cpu_relax() asm volatile("yield" ::: "memory")
+#elif defined(__riscv) && __riscv_xlen == 64
+// Raw encoding of the Zihintpause "pause" hint, so this assembles even when
+// -march lacks zihintpause; the encoding lies in the FENCE hint space.
+#define cpu_relax() asm volatile(".4byte 0x0100000f" ::: "memory")
 #else
 #warning "no cpu_relax for your cpu"
 #define cpu_relax() \
diff --git a/multichase.c b/multichase.c
index 095d355..738b6f8 100644
--- a/multichase.c
+++ b/multichase.c
@@ -487,11 +487,12 @@ static void *thread_start(void *data) {
     if (!strcmp(args->x.chase->name, "branch")) {
       void *p = args->x.cycle[0];
       args->x.branch_chunk_size = convert_pointers_to_branches(p, 200);
-#if defined(__aarch64__)
+#if defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
       __builtin___clear_cache(
           args->x.genchase_args->arena,
           args->x.genchase_args->arena + args->x.genchase_args->total_memory);
 #endif
+
     }
 
     // now flush our caches
diff --git a/multiload.c b/multiload.c
index 090c2e8..cce7527 100644
--- a/multiload.c
+++ b/multiload.c
@@ -601,6 +601,17 @@ static void load_stream_triad_nontemporal_injection_delay(per_thread_t *t) {
       if (i % num_elem_twocachelines == 0) delay_until_iteration(t->x.delay);
 #if defined(__aarch64__)
       asm volatile ("stnp %0, %1, [%2]" :: "r"(b[i]+c[i]), "r"(b[i+1]+c[i+1]), "r" (a+i));
+#elif defined(__riscv) && __riscv_xlen == 64
+      // sd (Store Doubleword) is a risc-v memory store instruction that stores a 64-bit (8-byte) value from a register to memory.
+      // "sd %0, (%2)\n\t" stores 64-bit value from register %0 to memory address contained in register %2,
+      // "sd %1, 8(%2)" stores 64-bit value from register %1 to memory address [%2] + 8.
+      asm volatile ("sd %0, (%2)\n\t"
+                    "sd %1, 8(%2)"
+                    : // no outputs
+                    : "r"(b[i]+c[i]),
+                      "r"(b[i+1]+c[i+1]),
+                      "r" (a+i)
+                    : "memory");
 #elif defined(__x86_64__)
       _mm_stream_si64(&((long long*)a)[i], b[i]+c[i]);
       _mm_stream_si64(&((long long*)a)[i+1], b[i+1]+c[i+1]);