diff --git a/mlkem/src/native/ppc64le/src/intt_ppc_asm.S b/mlkem/src/native/ppc64le/src/intt_ppc_asm.S index 2078b936b..5fa33e467 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc_asm.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc_asm.S @@ -20,17 +20,28 @@ .global MLK_ASM_NAMESPACE(intt_ppc_asm) MLK_ASM_FN_SYMBOL(intt_ppc_asm) + .cfi_startproc stdu 1, -352(1) + .cfi_def_cfa 1, 352 mflr 0 std 14, 56(1) + .cfi_offset 14, -296 std 15, 64(1) + .cfi_offset 15, -288 std 16, 72(1) + .cfi_offset 16, -280 std 17, 80(1) + .cfi_offset 17, -272 std 18, 88(1) + .cfi_offset 18, -264 std 19, 96(1) + .cfi_offset 19, -256 std 20, 104(1) + .cfi_offset 20, -248 std 21, 112(1) + .cfi_offset 21, -240 std 22, 120(1) + .cfi_offset 22, -232 li 10, 128 li 11, 144 li 12, 160 @@ -38,11 +49,17 @@ MLK_ASM_FN_SYMBOL(intt_ppc_asm) li 15, 192 li 16, 208 stxvd2x 52, 10, 1 + .cfi_offset 1144, -224 stxvd2x 53, 11, 1 + .cfi_offset 1145, -208 stxvd2x 54, 12, 1 + .cfi_offset 1146, -192 stxvd2x 55, 14, 1 + .cfi_offset 1147, -176 stxvd2x 56, 15, 1 + .cfi_offset 1148, -160 stxvd2x 57, 16, 1 + .cfi_offset 1149, -144 li 10, 224 li 11, 240 li 12, 256 @@ -50,11 +67,17 @@ MLK_ASM_FN_SYMBOL(intt_ppc_asm) li 15, 288 li 16, 304 stxvd2x 58, 10, 1 + .cfi_offset 1150, -128 stxvd2x 59, 11, 1 + .cfi_offset 1151, -112 stxvd2x 60, 12, 1 + .cfi_offset 1152, -96 stxvd2x 61, 14, 1 + .cfi_offset 1153, -80 stxvd2x 62, 15, 1 + .cfi_offset 1154, -64 stxvd2x 63, 16, 1 + .cfi_offset 1155, -48 lxvd2x 0, 0, 4 xxlxor 35, 35, 35 xxlor 3, 35, 35 @@ -84,7 +107,7 @@ MLK_ASM_FN_SYMBOL(intt_ppc_asm) mtctr 8 xxlor 37, 0, 0 -intt_ppc_asm_Loopf: +Lintt_ppc_asm_Loopf: lxvd2x 57, 0, 3 lxvd2x 58, 10, 3 lxvd2x 62, 11, 3 @@ -129,7 +152,7 @@ intt_ppc_asm_Loopf: stxvd2x 55, 17, 3 stxvd2x 60, 18, 3 addi 3, 3, 128 - bdnz intt_ppc_asm_Loopf + bdnz Lintt_ppc_asm_Loopf addi 3, 3, -512 nop nop @@ -3182,11 +3205,17 @@ intt_ppc_asm_Loopf: li 15, 192 li 16, 208 lxvd2x 52, 10, 1 + .cfi_restore 1144 lxvd2x 53, 11, 1 + .cfi_restore 1145 lxvd2x 54, 12, 1 + .cfi_restore 1146 lxvd2x 55, 14, 1 + .cfi_restore 1147 lxvd2x 56, 15, 1 + .cfi_restore 1148 lxvd2x 57, 16, 1 + .cfi_restore 1149 li 10, 224 li 11, 240 li 12, 256 @@ -3194,23 +3223,40 @@ intt_ppc_asm_Loopf: li 15, 288 li 16, 304 lxvd2x 58, 10, 1 + .cfi_restore 1150 lxvd2x 59, 11, 1 + .cfi_restore 1151 lxvd2x 60, 12, 1 + .cfi_restore 1152 lxvd2x 61, 14, 1 + .cfi_restore 1153 lxvd2x 62, 15, 1 + .cfi_restore 1154 lxvd2x 63, 16, 1 + .cfi_restore 1155 ld 14, 56(1) + .cfi_restore 14 ld 15, 64(1) + .cfi_restore 15 ld 16, 72(1) + .cfi_restore 16 ld 17, 80(1) + .cfi_restore 17 ld 18, 88(1) + .cfi_restore 18 ld 19, 96(1) + .cfi_restore 19 ld 20, 104(1) + .cfi_restore 20 ld 21, 112(1) + .cfi_restore 21 ld 22, 120(1) + .cfi_restore 22 mtlr 0 addi 1, 1, 352 + .cfi_def_cfa 1, 0 blr + .cfi_endproc MLK_ASM_FN_SIZE(intt_ppc_asm) diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S b/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S index 6a99943b8..8f574dd96 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S @@ -20,17 +20,28 @@ .global MLK_ASM_NAMESPACE(ntt_ppc_asm) MLK_ASM_FN_SYMBOL(ntt_ppc_asm) + .cfi_startproc stdu 1, -352(1) + .cfi_def_cfa 1, 352 mflr 0 std 14, 56(1) + .cfi_offset 14, -296 std 15, 64(1) + .cfi_offset 15, -288 std 16, 72(1) + .cfi_offset 16, -280 std 17, 80(1) + .cfi_offset 17, -272 std 18, 88(1) + .cfi_offset 18, -264 std 19, 96(1) + .cfi_offset 19, -256 std 20, 104(1) + .cfi_offset 20, -248 std 21, 112(1) + .cfi_offset 21, -240 std 22, 120(1) + .cfi_offset 22, -232 li 10, 128 li 11, 144 li 12, 160 @@ -38,11 +49,17 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) li 15, 192 li 16, 208 stxvd2x 52, 10, 1 + .cfi_offset 1144, -224 stxvd2x 53, 11, 1 + .cfi_offset 1145, -208 stxvd2x 54, 12, 1 + .cfi_offset 1146, -192 stxvd2x 55, 14, 1 + .cfi_offset 1147, -176 stxvd2x 56, 15, 1 + .cfi_offset 1148, -160 stxvd2x 57, 16, 1 + .cfi_offset 1149, -144 li 10, 224 li 11, 240 li 12, 256 @@ -50,11 +67,17 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) li 15, 288 li 16, 304 stxvd2x 58, 10, 1 + .cfi_offset 1150, -128 stxvd2x 59, 11, 1 + .cfi_offset 1151, -112 stxvd2x 60, 12, 1 + .cfi_offset 1152, -96 stxvd2x 61, 14, 1 + .cfi_offset 1153, -80 stxvd2x 62, 15, 1 + .cfi_offset 1154, -64 stxvd2x 63, 16, 1 + .cfi_offset 1155, -48 lvx 5, 0, 4 addi 14, 4, 112 addi 22, 4, 2128 @@ -1611,11 +1634,17 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) li 15, 192 li 16, 208 lxvd2x 52, 10, 1 + .cfi_restore 1144 lxvd2x 53, 11, 1 + .cfi_restore 1145 lxvd2x 54, 12, 1 + .cfi_restore 1146 lxvd2x 55, 14, 1 + .cfi_restore 1147 lxvd2x 56, 15, 1 + .cfi_restore 1148 lxvd2x 57, 16, 1 + .cfi_restore 1149 li 10, 224 li 11, 240 li 12, 256 @@ -1623,23 +1652,40 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) li 15, 288 li 16, 304 lxvd2x 58, 10, 1 + .cfi_restore 1150 lxvd2x 59, 11, 1 + .cfi_restore 1151 lxvd2x 60, 12, 1 + .cfi_restore 1152 lxvd2x 61, 14, 1 + .cfi_restore 1153 lxvd2x 62, 15, 1 + .cfi_restore 1154 lxvd2x 63, 16, 1 + .cfi_restore 1155 ld 14, 56(1) + .cfi_restore 14 ld 15, 64(1) + .cfi_restore 15 ld 16, 72(1) + .cfi_restore 16 ld 17, 80(1) + .cfi_restore 17 ld 18, 88(1) + .cfi_restore 18 ld 19, 96(1) + .cfi_restore 19 ld 20, 104(1) + .cfi_restore 20 ld 21, 112(1) + .cfi_restore 21 ld 22, 120(1) + .cfi_restore 22 mtlr 0 addi 1, 1, 352 + .cfi_def_cfa 1, 0 blr + .cfi_endproc MLK_ASM_FN_SIZE(ntt_ppc_asm) diff --git a/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S b/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S index 3993dcffc..3d81bd495 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S @@ -27,7 +27,9 @@ .global MLK_ASM_NAMESPACE(poly_tomont_ppc_asm) MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) + .cfi_startproc stdu 1, -320(1) + .cfi_def_cfa 1, 320 mflr 0 li 6, 128 li 7, 144 @@ -37,20 +39,31 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) li 11, 208 li 12, 224 stxvd2x 52, 6, 1 + .cfi_offset 1144, -192 stxvd2x 53, 7, 1 + .cfi_offset 1145, -176 stxvd2x 54, 8, 1 + .cfi_offset 1146, -160 stxvd2x 55, 9, 1 + .cfi_offset 1147, -144 stxvd2x 56, 10, 1 + .cfi_offset 1148, -128 stxvd2x 57, 11, 1 + .cfi_offset 1149, -112 stxvd2x 58, 12, 1 + .cfi_offset 1150, -96 li 6, 240 li 7, 256 li 8, 272 li 9, 288 stxvd2x 59, 6, 1 + .cfi_offset 1151, -80 stxvd2x 60, 7, 1 + .cfi_offset 1152, -64 stxvd2x 61, 8, 1 + .cfi_offset 1153, -48 stxvd2x 62, 9, 1 + .cfi_offset 1154, -32 li 6, 0 li 7, 80 li 8, 96 @@ -266,23 +279,36 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) li 11, 208 li 12, 224 lxvd2x 52, 6, 1 + .cfi_restore 1144 lxvd2x 53, 7, 1 + .cfi_restore 1145 lxvd2x 54, 8, 1 + .cfi_restore 1146 lxvd2x 55, 9, 1 + .cfi_restore 1147 lxvd2x 56, 10, 1 + .cfi_restore 1148 lxvd2x 57, 11, 1 + .cfi_restore 1149 lxvd2x 58, 12, 1 + .cfi_restore 1150 li 6, 240 li 7, 256 li 8, 272 li 9, 288 lxvd2x 59, 6, 1 + .cfi_restore 1151 lxvd2x 60, 7, 1 + .cfi_restore 1152 lxvd2x 61, 8, 1 + .cfi_restore 1153 lxvd2x 62, 9, 1 + .cfi_restore 1154 mtlr 0 addi 1, 1, 320 + .cfi_def_cfa 1, 0 blr + .cfi_endproc MLK_ASM_FN_SIZE(poly_tomont_ppc_asm) diff --git a/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S b/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S index 855316e05..2c26d4ec7 100644 --- a/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S +++ b/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S @@ -19,21 +19,31 @@ .global MLK_ASM_NAMESPACE(reduce_ppc_asm) MLK_ASM_FN_SYMBOL(reduce_ppc_asm) + .cfi_startproc stdu 1, -224(1) + .cfi_def_cfa 1, 224 mflr 0 std 14, 96(1) + .cfi_offset 14, -128 std 15, 104(1) + .cfi_offset 15, -120 std 16, 112(1) + .cfi_offset 16, -112 li 6, 128 li 7, 144 li 8, 160 li 9, 176 li 10, 192 stxvd2x 52, 6, 1 + .cfi_offset 1144, -96 stxvd2x 53, 7, 1 + .cfi_offset 1145, -80 stxvd2x 54, 8, 1 + .cfi_offset 1146, -64 stxvd2x 55, 9, 1 + .cfi_offset 1147, -48 stxvd2x 56, 10, 1 + .cfi_offset 1148, -32 vxor 7, 7, 7 li 6, 16 li 7, 32 @@ -684,21 +694,31 @@ MLK_ASM_FN_SYMBOL(reduce_ppc_asm) stxvd2x 33, 8, 3 stxvd2x 32, 9, 3 ld 14, 96(1) + .cfi_restore 14 ld 15, 104(1) + .cfi_restore 15 ld 16, 112(1) + .cfi_restore 16 li 6, 128 li 7, 144 li 8, 160 li 9, 176 li 10, 192 lxvd2x 52, 6, 1 + .cfi_restore 1144 lxvd2x 53, 7, 1 + .cfi_restore 1145 lxvd2x 54, 8, 1 + .cfi_restore 1146 lxvd2x 55, 9, 1 + .cfi_restore 1147 lxvd2x 56, 10, 1 + .cfi_restore 1148 mtlr 0 addi 1, 1, 224 + .cfi_def_cfa 1, 0 blr + .cfi_endproc MLK_ASM_FN_SIZE(reduce_ppc_asm) diff --git a/scripts/autogen b/scripts/autogen index b76d69ea3..cc782c27d 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -3237,9 +3237,7 @@ def update_via_simpasm( "-o", tmp.name, ] - # TODO: Support CFI for ppc64le - if arch != "ppc64le": - cmd += ["--cfify"] + cmd += ["--cfify"] if cross_prefix is not None: # Stick with llvm-objdump for disassembly cmd += ["--cc", cross_prefix + "gcc"] diff --git a/scripts/cfify b/scripts/cfify index f37a6fa30..ba268b191 100755 --- a/scripts/cfify +++ b/scripts/cfify @@ -7,7 +7,7 @@ directives for stack unwinding. Reads assembly from stdin (or --input) and emits the same assembly with architecture-appropriate `.cfi_*` directives inserted at prologue and -epilogue boundaries. Supports aarch64, x86_64, and armv81m targets. +epilogue boundaries. Supports aarch64, x86_64, armv81m, and ppc64le targets. Invoked by scripts/simpasm when --cfify is passed; see that script for the round-trip validation that guards correctness.""" @@ -199,6 +199,45 @@ def armv81m_parse_reglist(reglist_str): return regs +# ----------------------------------------------------------------------------- +# ppc64le module-scope constants and helpers +# ----------------------------------------------------------------------------- +# PPC64 ELFv2: the NTT/INTT/reduce/tomont routines are leaf functions that +# allocate a frame with `stdu 1, -N(1)`, hold LR in r0 across the body (no +# call, so the return address stays in its default LR location and needs no +# CFI), and spill a contiguous prefix of the callee-saved GPRs (r14-r31) plus +# the callee-saved Altivec vectors (v20-v31). The vector spills use indexed +# `stxvd2x VS, RA, 1` addressing, so the byte offset lives in the RA register +# set by a preceding `li`; we track those immediates to recover the offset. +PPC64LE_STDU_PATTERN = re.compile(r"(\s*)stdu\s+1,\s*-(\d+)\(1\)", re.IGNORECASE) +PPC64LE_ADDI_SP_PATTERN = re.compile(r"(\s*)addi\s+1,\s*1,\s*(\d+)\s*$", re.IGNORECASE) +PPC64LE_STD_PATTERN = re.compile(r"(\s*)std\s+(\d+),\s*(\d+)\(1\)", re.IGNORECASE) +PPC64LE_LD_PATTERN = re.compile(r"(\s*)ld\s+(\d+),\s*(\d+)\(1\)", re.IGNORECASE) +PPC64LE_LI_PATTERN = re.compile(r"(\s*)li\s+(\d+),\s*(-?\d+)\s*$", re.IGNORECASE) +PPC64LE_STXVD2X_PATTERN = re.compile( + r"(\s*)stxvd2x\s+(\d+),\s*(\d+),\s*1\s*$", re.IGNORECASE +) +PPC64LE_LXVD2X_PATTERN = re.compile( + r"(\s*)lxvd2x\s+(\d+),\s*(\d+),\s*1\s*$", re.IGNORECASE +) +PPC64LE_BLR_PATTERN = re.compile(r"(\s*)blr\s*$", re.IGNORECASE) + +# DWARF register numbers: GPRs map to 0-31 directly; Altivec v0-v31 are +# 1124-1155. A VSX register number 32-63 (as named by stxvd2x/lxvd2x) aliases +# the Altivec register VR(n-32). +PPC64LE_VR_DWARF_BASE = 1124 +PPC64LE_CALLEE_SAVED_GPR_LO = 14 +PPC64LE_CALLEE_SAVED_GPR_HI = 31 + + +def ppc64le_vsr_to_vr_dwarf(vsr): + """DWARF number of the Altivec VR aliased by VSX register `vsr`, or None + if `vsr` is not in the VR-aliased range (32-63).""" + if 32 <= vsr <= 63: + return PPC64LE_VR_DWARF_BASE + (vsr - 32) + return None + + def add_cfi_directives(text, arch): lines = text.split("\n") result = [] @@ -210,6 +249,11 @@ def add_cfi_directives(text, arch): # the current CFA register. x86_64_cfa_reg = "rsp" + # ppc64le: frame size from `stdu 1, -N(1)` (CFA = r1 + N), and the most + # recent `li REG, IMM` immediates used to resolve indexed vector spills. + ppc64le_frame_size = 0 + ppc64le_li_values = {} + while i < len(lines): line = lines[i].rstrip() @@ -524,6 +568,90 @@ def add_cfi_directives(text, arch): i += 1 continue + elif arch == "ppc64le": + # Frame allocation: stdu 1, -N(1) sets CFA = r1 + N. + match = PPC64LE_STDU_PATTERN.match(line) + if match: + indent, size_str = match.groups() + ppc64le_frame_size = int(size_str) + result.append(line) + result.append(f"{indent}.cfi_def_cfa 1, {ppc64le_frame_size}") + i += 1 + continue + + # Track li REG, IMM to resolve the byte offsets of indexed vector + # spills (stxvd2x/lxvd2x use base+index addressing). + match = PPC64LE_LI_PATTERN.match(line) + if match: + ppc64le_li_values[match.group(2)] = int(match.group(3)) + result.append(line) + i += 1 + continue + + # Callee-saved GPR spill: std RR, OFF(1). + match = PPC64LE_STD_PATTERN.match(line) + if match: + indent, reg_str, off_str = match.groups() + reg = int(reg_str) + result.append(line) + if PPC64LE_CALLEE_SAVED_GPR_LO <= reg <= PPC64LE_CALLEE_SAVED_GPR_HI: + cfa_off = int(off_str) - ppc64le_frame_size + result.append(f"{indent}.cfi_offset {reg}, {cfa_off}") + i += 1 + continue + + # Callee-saved GPR restore: ld RR, OFF(1). + match = PPC64LE_LD_PATTERN.match(line) + if match: + indent, reg_str, _ = match.groups() + reg = int(reg_str) + result.append(line) + if PPC64LE_CALLEE_SAVED_GPR_LO <= reg <= PPC64LE_CALLEE_SAVED_GPR_HI: + result.append(f"{indent}.cfi_restore {reg}") + i += 1 + continue + + # Callee-saved vector spill: stxvd2x VS, RA, 1 (base = sp). + match = PPC64LE_STXVD2X_PATTERN.match(line) + if match: + indent, vsr_str, idx_reg = match.groups() + result.append(line) + dwarf = ppc64le_vsr_to_vr_dwarf(int(vsr_str)) + if dwarf is not None and idx_reg in ppc64le_li_values: + cfa_off = ppc64le_li_values[idx_reg] - ppc64le_frame_size + result.append(f"{indent}.cfi_offset {dwarf}, {cfa_off}") + i += 1 + continue + + # Callee-saved vector restore: lxvd2x VS, RA, 1 (base = sp). + match = PPC64LE_LXVD2X_PATTERN.match(line) + if match: + indent, vsr_str, idx_reg = match.groups() + result.append(line) + dwarf = ppc64le_vsr_to_vr_dwarf(int(vsr_str)) + if dwarf is not None and idx_reg in ppc64le_li_values: + result.append(f"{indent}.cfi_restore {dwarf}") + i += 1 + continue + + # Frame deallocation: addi 1, 1, N restores CFA = r1 + 0. + match = PPC64LE_ADDI_SP_PATTERN.match(line) + if match: + indent = match.group(1) + result.append(line) + result.append(f"{indent}.cfi_def_cfa 1, 0") + i += 1 + continue + + # Function return: blr. + match = PPC64LE_BLR_PATTERN.match(line) + if match: + indent = match.group(1) + result.append(line) + result.append(f"{indent}.cfi_endproc") + i += 1 + continue + result.append(line) i += 1 @@ -543,7 +671,7 @@ def main(): ) parser.add_argument( "--arch", - choices=["aarch64", "x86_64", "armv81m"], + choices=["aarch64", "x86_64", "armv81m", "ppc64le"], default="aarch64", help="Target architecture (default: aarch64)", )