diff --git a/bintool/bintool.cpp b/bintool/bintool.cpp index 5cf73180..8d5cb225 100644 --- a/bintool/bintool.cpp +++ b/bintool/bintool.cpp @@ -201,7 +201,7 @@ void set_block_ignored(elf_file *elf, uint32_t block_addr) { uint32_t offset = block_addr + 4 - seg->physical_address(); if ((content[offset] & 0x7f) != PICOBIN_BLOCK_ITEM_PARTITION_TABLE) { DEBUG_LOG("setting block at %08x to ignored\n", block_addr); - content[offset] = 0x7e; + content[offset] = 0x7e; } elf->content(*seg, content); } @@ -233,7 +233,7 @@ void set_block_ignored(std::vector &bin, uint32_t storage_addr, uint32_ uint32_t offset = block_addr + 4 - storage_addr; if ((bin[offset] & 0x7f) != PICOBIN_BLOCK_ITEM_PARTITION_TABLE) { DEBUG_LOG("setting block at %08x to ignored\n", block_addr); - bin[offset] = 0x7e; + bin[offset] = 0x7e; } } diff --git a/enc_bootloader/CMakeLists.txt b/enc_bootloader/CMakeLists.txt index 45f3cc16..51f99e14 100644 --- a/enc_bootloader/CMakeLists.txt +++ b/enc_bootloader/CMakeLists.txt @@ -46,6 +46,10 @@ if (NOT USE_PRECOMPILED) target_compile_definitions(enc_bootloader PRIVATE PICO_STACK_SIZE=0x800 + # we cannot unreset clk_peri clocked h/w as we don't configure clk_peri + PICO_RUNTIME_SKIP_INIT_POST_CLOCK_RESETS=1 + RC_COUNT=0 # don't use rcp_count + # 0x20080000 -> 0x20081000 doesn't overlap the stack ROM_CHAIN_WORKSPACE=0x20080000) @@ -53,7 +57,7 @@ if (NOT USE_PRECOMPILED) pico_set_linker_script(enc_bootloader ${CMAKE_CURRENT_LIST_DIR}/memmap_mbedtls.ld) else() - target_sources(enc_bootloader PRIVATE aes.S) + target_sources(enc_bootloader PRIVATE aes.S hard_entry_point.S) target_compile_definitions(enc_bootloader PRIVATE PICO_STACK_SIZE=0x180 @@ -61,25 +65,45 @@ if (NOT USE_PRECOMPILED) ROM_CHAIN_WORKSPACE=0x20080200) pico_set_linker_script(enc_bootloader ${CMAKE_CURRENT_LIST_DIR}/memmap_enc_bootloader.ld) + + target_compile_definitions(enc_bootloader PRIVATE + # The following are to reduce the size of the binary + PICO_NO_PROGRAM_INFO=1 + PICO_CRT0_NO_RESET_SECTION=1 + 
+ HARDENING=1 + DOUBLE_HARDENING=1 + + INLINE_REF_ROUNDKEY_SHARES_S=1 # avoid need for canaries/calling check + INLINE_REF_ROUNDKEY_HVPERMS_S=1 # avoid need for canaries/calling check + INLINE_SHIFT_ROWS_S=1 # avoid need for canaries/calling check + INLINE_MAP_SBOX_S=1 # avoid need for canaries/calling check + + CALLER_INIT_RCP_COUNT=1 + ) endif() target_compile_definitions(enc_bootloader PRIVATE + # The following are to reduce the size of the binary # use stack guards, as AES variables are written near the stack PICO_USE_STACK_GUARDS=1 - # The following are to reduce the size of the binary - PICO_NO_PROGRAM_INFO=1 # No spinlocks used PICO_USE_SW_SPIN_LOCKS=0 # No heap is used PICO_HEAP_SIZE=0 - # These inits are not required - PICO_RUNTIME_SKIP_INIT_SPIN_LOCKS_RESET=1 - PICO_RUNTIME_SKIP_INIT_PER_CORE_IRQ_PRIORITIES=1 + # Note all runtime init is skipped via the linker script + PICO_BOOTROM_LOCKING_ENABLED=0 # Don't need any vtor irqs PICO_MINIMAL_STORED_VECTOR_TABLE=1 PICO_NO_RAM_VECTOR_TABLE=1 - ) + + PICO_USE_GPIO_COPROCESSOR=0 + FIB_WORKAROUND=1 + ) + if (ALLOW_DEBUGGING) + target_compile_definitions(enc_bootloader PRIVATE ALLOW_DEBUGGING=1) + endif() # print memory usage target_link_options(enc_bootloader PUBLIC -Wl,--print-memory-usage) diff --git a/enc_bootloader/aes.S b/enc_bootloader/aes.S index e3ef4644..fb23a21e 100644 --- a/enc_bootloader/aes.S +++ b/enc_bootloader/aes.S @@ -22,6 +22,16 @@ scratch RAM and the stack pointer is overwritten. 
#include "hardware/regs/trng.h" #include "hardware/rcp.h" +#if HARDENING +@ Number of calls to gen_rand_sha[_nonpres] +#define RND_COUNT_decrypt 394 // From decrypt up to call to ctr_crypt_s +#define RND_COUNT_ctr_crypt_s_init (17 + 32 * CT_BPERM) // Init phase of ctr_crypt_s +#define RND_COUNT_ctr_crypt_mainloop_A (4 + ST_VPERM + ST_SHAREC) +#define RND_COUNT_refreshchaff_and_lfsr 2 +#define RND_COUNT_remap 2 +#define RND_COUNT_decryption_end 3 +#endif + .global decrypt .global chaff @@ -39,19 +49,73 @@ scratch RAM and the stack pointer is overwritten. #define CTAG7 0x32 #define CTAG8 0x33 #define CTAG9 0x34 -#define CTAG10 0x35 @ not used -#define CTAG11 0x36 @ not used +#define CTAG10 0x35 +#define CTAG11 0x36 #define CTAG12 0x37 #define CTAG13 0x38 #define CTAG14 0x39 #define CTAG15 0x3a #define CTAG16 0x3b #define CTAG17 0x3c -#define CTAG18 0x3d @ not used +#define CTAG18 0x3d +#define CTAG19 0x3e +#define CTAG20 0x3f +#define CTAG21 0x29 @ number of blocks from the TRNG processed to initialise rstate_sha #define TRNG_BLOCKS 25 +@ if GEN_RAND_SHA==0 then we don't call the counting version +#if HARDENING && GEN_RAND_SHA +.macro check_rnd_count count +.if !(\count & 0xffffff00) +movs r1, #\count +.else +ldr r1, =\count +.endif +movs r0, #(\count & 1) ^ 1 +bl check_rnd_count_func +rcp_iequal_nodelay r1, r0 +#if DOUBLE_HARDENING +rcp_iequal_nodelay r0, r1 +#endif +.endm + +@ r1 has the expected count +@ Trashes r0 +.macro check_rnd_count_dynamic +mov r0, sp +bl check_rnd_count_func +rcp_iequal_nodelay r1, r0 +#if DOUBLE_HARDENING +rcp_iequal_nodelay r0, r1 +#endif +.endm + +.macro reset_rnd_count +bl reset_rnd_count_func +.endm + +.macro reset_rnd_count_checked +@ This version verifies that the count was actually reset +uxtb r0, r1 +bl reset_rnd_count_func +ldr r0, [r0] +bics r1, #0xff00ff +rcp_iequal_nodelay r1, r0 +.endm + +#else +@ No-op stubs so that every call site still assembles when the counting version is not built +.macro check_rnd_count count +.endm +.macro check_rnd_count_dynamic +.endm +.macro reset_rnd_count +.endm +.macro reset_rnd_count_checked +.endm +#endif @ The lower 
jitterpriorty is, the more the jitter .macro SET_COUNT n,jitterpriority .if RC_COUNT @@ -119,11 +180,12 @@ scratch RAM and the stack pointer is overwritten. workspace_start: @ chaff has to be at the start of scratch_y = 0x20081000 because this is assumed by the following macro, getchaffaddress -@ (It seems ADR does not work, nor is it possible to assert that chaff==0x20081000) -@ getchaffaddress is used by clear03 and clear01 and other sensitive cases which require the first load to be a random one -@ chaff has to be 0 mod 16 for other reasons +@ We need to set the chaff address directly with MOVs, rather than setting it with a load as normal, because at the point +@ the macro is called we have just done a load of a sensitive value at a known memory offset mod 16, and the idea is that +@ the next load is going to be of a random number (in the "chaff" memory) at that same offset mod 16, so we can't afford +@ to do a ldr \rx, =0x20081000 + \offset first, as this will load a non-random value from an uncontrolled memory location mod 16. +@ Ideally we'd avoid the magic number 0x20081000 by using ADR \rx, chaff+\offset, but the linker does not support this. 
.macro getchaffaddress rx,offset=0 -@ ldr \rx,=(chaff+\offset) mov \rx,#(0x1000+\offset) movt \rx,#0x2008 .endm @@ -158,7 +220,7 @@ statevperm: @ 12 mod 16 .space 4 @ vperm state rotation: only last two bits are operational; other bits random RKshareC: @ Round key common share C; see comment at init_key_4way for explanation .space 4 -RKshareCchange: @ Temporary used by ref_roundkey_share_s +RKshareCchange: @ Temporary used by ref_roundkey_shares_s .space 4 IV0: @ 2-way share of IV for block 0 .space 36 @ Considering IV0 as a word pointer, the format is IV = IV0[0,1,2,3] ^ (IV0[5,6,7,8],ror#16) @@ -216,6 +278,8 @@ jstate: @ 32-bit jitter state rstate_lfsr: @ 32-bit LFSR random state and constant used to step it .space 4 .word 0x1d872b41 @ constant that defines a maximal-length LFSR +rstate_count: +.space 4 rstate_all_end: @ Mark end of RNG data to allow selective memory wipe .if CT_BPERM @@ -248,6 +312,9 @@ init_rstate: movw r1,#SHA256_CSR_RESET|SHA256_CSR_START_BITS @ initialise SHA internal state by writing START bit str r1,[r5,#SHA256_CSR_OFFSET] str r6,[r4,#TRNG_SAMPLE_CNT1_OFFSET -TRNG_RNG_IMR_OFFSET] +#if HARDENING + movs r3, #0 +#endif movs r6,#TRNG_BLOCKS*2+1 @ odd so that we break out of the loop half-way through loading the SHA hardware, giving @ time for previous SHA computation to complete 2: @@ -256,9 +323,8 @@ init_rstate: str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] @ start ROSC if it is not already started str r1,[r4,#TRNG_RNG_ICR_OFFSET -TRNG_RNG_IMR_OFFSET] @ clear all interrupts (including EHR_VLD) adds r0,r4,#TRNG_EHR_DATA0_OFFSET -TRNG_RNG_IMR_OFFSET - movs r2,#TRNG_TRNG_BUSY_OFFSET -TRNG_RNG_IMR_OFFSET 1: - ldr r1,[r4,r2] @ wait for 192 ROSC samples to fill EHR,should take constant time + ldr r1,[r4,#TRNG_TRNG_BUSY_OFFSET -TRNG_RNG_IMR_OFFSET] @ wait for 192 ROSC samples to fill EHR,should take constant time cmp r1,#0 bne 1b subs r6,#1 @ done? 
@@ -267,13 +333,31 @@ init_rstate: 1: ldmia r0!,{r2} @ copy 6 EHR words to SHA-256, plus garbage (RND_SOURCE_ENABLE and SAMPLE_CNT1) str r2,[r5,#SHA256_WDATA_OFFSET] @ for a total of half a SHA-256 block +#if HARDENING + adds r3,#1 +#endif subs r1,#1 bne 1b +#if HARDENING + ldr r1, =TRNG_BASE+TRNG_EHR_DATA0_OFFSET+32 + rcp_iequal_nodelay r0, r1 +#endif ldr r2,[r5,#SHA256_SUM0_OFFSET] @ TRNG is now sampling again; use some SHA bits to modulate the chain length str r2,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] b.n 2b 3: +#if HARDENING + movs r2, #(TRNG_BLOCKS*2) * 8 + rcp_iequal_nodelay r2, r3 +#endif +#if HARDENING +@ good test that we are dealing with real hardware + ldr r2,[r5,#SHA256_CSR_OFFSET] + movw r1,#SHA256_CSR_RESET + rcp_iequal_nodelay r1, r2 + rcp_iequal_nodelay r2, r1 +#endif CHK_COUNT 25,6 str r1,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] @ turn off rand source and wipe SHA bits left in TRNG config; r1=0 str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] @@ -285,7 +369,11 @@ init_rstate: stmia r6,{r0-r3} CHK_COUNT 26,6 movs r0,#0 +#if !HARDENING strb r0,[r6] @ make sure rstate_sha[0] has byte 0 set to 0, representing "out of data" +#else + str r0,[r6] @ make sure rstate_sha[0] has word 0 set to 0, representing "out of data" (24-31) and 0 numbers generated (0-23) +#endif @ try to find a non-zero initialiser to create a non-degenerate LFSR random state ldr r1,[r5,#16] @ SHA SUM4 @@ -295,7 +383,7 @@ init_rstate: mov r1,r6 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-64} probability) 1: str r1,[r6,#rstate_lfsr-rstate_sha] - + @ try to find a non-zero initialiser to create a non-degenerate ROSC random state ldr r1,[r5,#24] @ SHA SUM6 cbnz r1,1f @ is word 6 non-zero? 
then use it @@ -306,7 +394,15 @@ init_rstate: ldr r2,=ROSC_RANDOM_OFFSET+ROSC_BASE str r1,[r2,#0] @ Initialise ROSC LFSR CHK_COUNT 27,6 - +#if HARDENING + ldr r3,=ROSC_RANDOM_OFFSET+ROSC_BASE + cbnz r1, 1f + rcp_panic +1: + ldr r3, [r3] + rcp_iequal_nodelay r1, r3 +#endif + .if GEN_RAND_SHA .if SH_JITTER movs r2,#0 @@ -317,9 +413,102 @@ init_rstate: CHK_COUNT 28,6 bx r14 +.thumb_func +decrypt: +@ r0=4-way key, r1=IV_shareA, r2=IV_shareB, r3=message buffer, [sp]=number of blocks + ldr r12,[sp] @ Pop 5th argument in r12 (which we are allowed to treat as scratch according to AAPCS) + push {r14} + GET_CANARY r14,CTAG3,6 +#if !CALLER_INIT_RCP_COUNT + SET_COUNT 23,6 +#endif + push {r4-r11,r14} + push {r0-r3,r12} @ Save the five arguments + bl reset_sha_trng + bl init_rstate +@ randomly re-share the LUT contents + ldr r4,=lut_a + mov r5,#64 @ 64 words = 256 bytes +1: + bl gen_rand_sha_nonpres + ldr r6,[r4,#lut_b-lut_a] @ EOR a random word into both shares + eors r6,r6,r0 +@if r0 is not EORed into only one share, then the LUT won't be right + str r6,[r4,#lut_b-lut_a] + ldr r6,[r4] +#if HARDENING + eors r7,r6,r0 + eors r8,r7,r6 + rcp_iequal_nodelay r8, r0 + stmia r4!,{r7} +#else + eors r6,r6,r0 + stmia r4!,{r6} +#endif + subs r5,r5,#1 + bne 1b +#if HARDENING + ldr r5,=lut_a + 256 + rcp_iequal_nodelay r4, r5 +#endif + CHK_COUNT 29,6 +#if HARDENING +@check again as this is quite important + rcp_iequal_nodelay r5, r4 +#endif + bl remap @ scramble the LUTs + pop {r0} @ pointer to 4way key data + bl init_key_4way + // todo alex this may trash r12; is that ok? 
+ bl lock_key + CHK_COUNT 32,6 + pop {r0-r3} @ r0=IV_shareA, r1=IV_shareB, r2=message, r3=num blocks + bl ctr_crypt_s + bl randomisechaff + clear03 + pop {r4-r11,r14} + CHK_CANARY r14,CTAG3,6 + pop {r15} + +.thumb_func +reset_sha_trng: + GET_CANARY r0,CTAG19,0 + ldr r1,=RESETS_BASE+RESETS_RESET_OFFSET + ldr r2,[r1] + ldr r3,=#RESETS_RESET_SHA256_BITS|RESETS_RESET_TRNG_BITS + orrs r2,r2,r3 + str r2,[r1] @ reset the SHA hardware and the TRNG hardware + CHK_COUNT 23,6 + bics r2,r2,r3 + str r2,[r1] @ release the reset + CHK_CANARY r0,CTAG19,0 + bx r14 + @ Put AES core code in first scratch area .section .scratch_x.aes,"ax",%progbits +@ if GEN_RAND_SHA==0 then we don't call the counting version +#if HARDENING && GEN_RAND_SHA +check_rnd_count_func: +@ NOTE: we don't bother with a canary here as we don't write anything + ldr r0,=rstate_sha + ldr r0, [r0] + rsbs r0,r0,#0 @ Negate bottom 24 bits to get the number of calls to gen_rand_sha[_nonpres] since the last reset + bfc r0,#24,#8 @ + bx r14 + +reset_rnd_count_func: + push {lr} + GET_CANARY lr,CTAG11,0 + ldr r0,=rstate_sha + ldrb r1, [r0, #3] + orrs r1, #1 + lsls r1, #24 + str r1, [r0] + CHK_CANARY lr,CTAG11,0 + pop {pc} +#endif + .if GEN_RAND_SHA @ we need SHA256_SUM0_OFFSET==8 (see note below) .if SHA256_SUM0_OFFSET!=8 @@ -330,74 +519,119 @@ init_rstate: @ Preserves r1-r13 .balign 4 gen_rand_sha: - push {r14} - GET_CANARY r14,CTAG1,2 - push {r1-r3,r14} + push {r1-r3,lr} + GET_CANARY r1,CTAG1,2 + push {r1} .if SH_JITTER ldr r2,=rstate_sha ldr r0,[r2,#jstate-rstate_sha] + lsls r3,r0,#30 + lsrs r3,#28 movs r1,#1 - ands r3,r0,#3 - movs r3,r3,lsl#2 - movs r3,r1,lsl r3 @ 1<<(4*(r0&3)) + lsls r3,r1,r3 @ 1<<(4*(r0&3)) udiv r3,r3,r1 @ Takes constant + (r0&3) cycles lsrs r0,r0,#2 bne 1f bl gen_rand_sha_nonpres ldr r2,=rstate_sha +#if HARDENING + ldr r1,[r2] @ Make this (SH_JITTER) not affect rnd_count + adds r1,r1,#1 @ (compensating for call to gen_rand_sha_nonpres which decrements the count by 1) + str r1,[r2] @ The purpose is 
to simplify check_rnd_count calls, and to avoid having to reset jstate frequently +#endif 1: str r0,[r2,#jstate-rstate_sha] .endif bl gen_rand_sha_nonpres - pop {r1-r3,r14} - CHK_CANARY r14,CTAG1,0 - pop {r15} + pop {r1} + CHK_CANARY r1,CTAG1,0 + pop {r1-r3,pc} @ Return single random word in r0 @ Trashes r1-r3 .balign 4 gen_rand_sha_nonpres: - ldr r0,=SHA256_BASE + push {lr} + GET_CANARY lr,CTAG18,0 ldr r2,=rstate_sha +#if !HARDENING + ldr r3,=SHA256_BASE ldrb r1,[r2] @ get word counter from bottom byte of rstate_sha[] (offset into SUM registers) - subs r3,r1,#4 @ decrement it to previous SUM register + subs r0,r1,#4 @ decrement it to previous SUM register ble 1f @ if the offset was 4 or less we have run out of SUM register values - ldr r0,[r0,r1] @ read value from SUM register: note that this relies on SHA256_SUM0_OFFSET==8 - strb r3,[r2] @ save updated SUM register offset in bottom byte of rstate_sha[] - bx r14 + strb r0,[r2] @ save updated SUM register offset in bottom byte of rstate_sha[] + ldr r0,[r3,r1] @ read value from SUM register: note that this relies on SHA256_SUM0_OFFSET==8 +#else + ldr r3,=SHA256_BASE + ldr r1,[r2] @ get word counter (8) : rand counter (24) from first word of rstate_sha[] (offset into SUM registers) + lsls r0, r1, #1 @ clear C (also set N which may force us down BLE path on skip of the sub below) + sbcs r0,r1,#0x04000000 @ decrement word counter for previous SUM register (and decrement rand counter due to C == 0) + str r0,[r2] @ save updated word counter / rand_counter in bottom word of rstate_sha[] + asrs r1, r0, #24 + ble 1f @ if the offset was 4 or less we have run out of SUM register values + + ldr r2,=SHA256_BASE + 4 + adds r2, r1 + adds r1, r3, r0, asr #24 + ldr r0, [r2], #-4 + rcp_iequal_nodelay r1, r2 +#endif + b gen_rand_sha_nonpres_exit 1: @ [CK_JITTER code was here] - movs r3,#SHA256_SUM6_OFFSET+1 - strb r3,[r2] @ reset word counter: the +1 is compensated for later + movs r0,#SHA256_SUM6_OFFSET+1 +#if !HARDENING + strb 
r0,[r2] @ reset word counter: the +1 is compensated for later +#else + strb r0,[r2,#3] @ reset word counter: the +1 is compensated for later +#endif movw r1,#(1<>30, vpermB=Bptr[4]>>30, and @ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4]) @ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16 -.balign 4 -.thumb_func -ref_roundkey_shares_s: - mov r11,#15 @ there are 15 expanded keys -ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds + +.macro ref_roundkey_shares_s_impl ldr r4,=rkey_s loadlfsr - steplfsr @ r0=change in RKshareC + steplfsr_check @ r0=change in RKshareC ldr r2,=RKshareCchange str r0,[r2] ldr r3,=RKshareC @@ -723,7 +918,7 @@ ref_roundkey_shares_s_loop: steplfsr; eors r5,r5,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 steplfsr; eors r6,r6,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 steplfsr; eors r7,r7,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 - steplfsr; eors r8,r8,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; str r3,[r4,r9,lsl#2] + steplfsr_check; eors r8,r8,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; str r3,[r4,r9,lsl#2] ldr r3,=RKshareCchange ldr r3,[r3] @@ -740,63 +935,23 @@ ref_roundkey_shares_s_loop: subs r11,r11,#1 bne ref_roundkey_shares_s_loop +#if HARDENING + ldr r5,=rkey_s + 40 * 15 + rcp_iequal_nodelay r4, r5 +#endif ldr r2,=rstate_lfsr @ restore rstate_lfsr savelfsr @ Save lfsr_state clear03 24 -ref_roundkey_shares_s_exit: - bx r14 - -.balign 4 -.thumb_func -@ Rotates roundkey vperms and RK_ROR rotations by random amounts -@ Trashes r0-r10 -@ If i = word number 0..3, -@ Aptr=memory word pointer to block of 20 
bytes containing H&V-rotated share A roundkey (similarly B), then -@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and -@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4]) -@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16 -ref_roundkey_hvperms_s: - movs r7,#30 -ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares - GET_CANARY r10,CTAG9,6 - push {r10,r14} - ldr r10,=rkey_s -ref_roundkey_hvperms_s_loop: - bl gen_rand_lfsr_nonpres @ r0=new vperm high|rotations - ldmia r10,{r2-r5,r9} @ r2-r5=roundkey share A/B, r9=old vperm high|rotations - str r0,[r10,#16] - mov r8,r0,lsr#30 @ r8=new vperm low - sub r6,r8,r9,lsr#30 @ r6=(new vperm low)-(old vperm low) | junk - mov r8,r6,lsl#3 @ r8=8*((new vperm low)-(old vperm low)) mod 32 - mov r0,r0,ror r8 - usub8 r0,r9,r0 @ i^th byte of r0 = (i^th byte of old rotations) - ((i+newvperm-oldvperm)^th byte of new rotations) - movs r2,r2,ror r0; ands r6,r6,#3; str r2,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 - movs r3,r3,ror r0; ands r6,r6,#3; str r3,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 - movs r4,r4,ror r0; ands r6,r6,#3; str r4,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 - movs r5,r5,ror r0; ands r6,r6,#3; str r5,[r10,r6,lsl#2] - adds r10,r10,#20 - subs r7,r7,#1 - bne ref_roundkey_hvperms_s_loop - clear03 28 -ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to analysis code - pop {r10,r14} - CHK_CANARY r10,CTAG9,6 - bx r14 +.endm -.else +.else // RK_ROR @ "refresh" shares of rkeys by random eor into both shares of each word, and also randomise the single word RKshareC @ Trashes r0-r11 -.balign 4 -.thumb_func -ref_roundkey_shares_s: - mov r11,#15 @ there are 15 expanded keys -ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds - GET_CANARY r4,CTAG8,6 - push {r4,r14} +.macro ref_roundkey_shares_s_impl ldr r4,=rkey_s loadlfsr - steplfsr @ r0=change in RKshareC + 
steplfsr_check @ r0=change in RKshareC ldr r3,=RKshareC ldr r5,[r3] eors r5,r5,r0 @@ -815,7 +970,7 @@ ref_roundkey_shares_s_loop: steplfsr; eors r5,r5,r0; and r9,r9,#3; eors r5,r5,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 steplfsr; eors r6,r6,r0; and r9,r9,#3; eors r6,r6,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 steplfsr; eors r7,r7,r0; and r9,r9,#3; eors r7,r7,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 - steplfsr; eors r8,r8,r0; and r9,r9,#3; eors r8,r8,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2] + steplfsr_check; eors r8,r8,r0; and r9,r9,#3; eors r8,r8,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2] subs r4,r4,#20 stmia r4,{r5-r8} @@ -827,20 +982,69 @@ ref_roundkey_shares_s_loop: bne ref_roundkey_shares_s_loop savelfsr clear03 24 -ref_roundkey_shares_s_exit: - pop {r4,r14} - CHK_CANARY r4,CTAG8,6 - bx r14 +#if HARDENING + ldr r5,=rkey_s + 40 * 15 + rcp_iequal_nodelay r4, r5 +#endif +.endm +.endif +.if INLINE_REF_ROUNDKEY_SHARES_S +.macro inline_ref_roundkey_shares_s +ref_roundkey_shares_s_starts: + mov r11,#15 @ there are 15 expanded keys + ref_roundkey_shares_s_impl +ref_roundkey_shares_s_end: +.endm +.else .balign 4 .thumb_func +ref_roundkey_shares_s: + mov r11,#15 @ there are 15 expanded keys +ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds + push {lr} + GET_CANARY lr,CTAG8,6 + ref_roundkey_shares_s_impl + CHK_CANARY lr,CTAG8,6 + pop {pc} +.endif + +.if RK_ROR + +@ Rotates roundkey vperms and RK_ROR rotations by random amounts +@ Trashes r0-r10 +@ If i = word number 0..3, +@ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then +@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and +@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4]) +@ roundkey shareB(i) = Bptr[i+vpermB mod 
4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16 +.macro ref_roundkey_hvperms_s_impl + ldr r10,=rkey_s +ref_roundkey_hvperms_s_loop: + bl gen_rand_lfsr_nonpres @ r0=new vperm high|rotations + ldmia r10,{r2-r5,r9} @ r2-r5=roundkey share A/B, r9=old vperm high|rotations + str r0,[r10,#16] + mov r8,r0,lsr#30 @ r8=new vperm low + sub r6,r8,r9,lsr#30 @ r6=(new vperm low)-(old vperm low) | junk + mov r8,r6,lsl#3 @ r8=8*((new vperm low)-(old vperm low)) mod 32 + mov r0,r0,ror r8 + usub8 r0,r9,r0 @ i^th byte of r0 = (i^th byte of old rotations) - ((i+newvperm-oldvperm)^th byte of new rotations) + movs r2,r2,ror r0; ands r6,r6,#3; str r2,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 + movs r3,r3,ror r0; ands r6,r6,#3; str r3,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 + movs r4,r4,ror r0; ands r6,r6,#3; str r4,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 + movs r5,r5,ror r0; ands r6,r6,#3; str r5,[r10,r6,lsl#2] + adds r10,r10,#20 + adds r7,r7,#1 + cmp r7, #30 + bne ref_roundkey_hvperms_s_loop + clear03 28 +.endm + +.else + @ Rotates roundkey vperms by random amounts @ Trashes r0-r9 -ref_roundkey_hvperms_s: - movs r7,#30 -ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares - GET_CANARY r0,CTAG9,6 - push {r0,r14} +.macro ref_roundkey_hvperms_s_impl bl gen_rand_lfsr_nonpres ldr r1,=rkey_s ref_roundkey_hvperms_s_loop: @@ -862,14 +1066,32 @@ ref_roundkey_hvperms_s_loop: ands r6,r6,#3; str r5,[r1,r6,lsl#2] adds r1,r1,#20 movs r0,r0,ror#2 - subs r7,r7,#1 + adds r7,r7,#1 + cmp r7, #30 bne ref_roundkey_hvperms_s_loop clear03 28 -ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to analysis code - pop {r0,r14} - CHK_CANARY r0,CTAG9,6 - bx r14 +.endm +.endif +.if INLINE_REF_ROUNDKEY_HVPERMS_S +.macro inline_ref_roundkey_hvperms_s +ref_roundkey_hvperms_s_starts: + movs r7,#0 + ref_roundkey_hvperms_s_impl +ref_roundkey_hvperms_s_end: +.endm +.else +.balign 4 +.thumb_func +ref_roundkey_hvperms_s: + movs r7,#0 
+ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares + GET_CANARY r0,CTAG9,6 + push {r0, lr} + ref_roundkey_hvperms_s_impl + pop {r0} + CHK_CANARY r0,CTAG9,6 + pop {pc} .endif .ltorg @@ -885,6 +1107,8 @@ ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to ana @ r8=rorig(8+(-!r1)%4), r9=rorig(8+(1-!r1)%4), ... @ Note: only low 2 bits of !r1 are used. The rest are random to add to the noise. addstatevperm: + push {r14} + GET_CANARY r14,CTAG20,0 ldr r2,[r1] adds r2,r2,r0 str r2,[r1] @@ -916,7 +1140,8 @@ addstatevperm: stmia r1!,{r2,r3} addstatevperm_exit: @ label exit point to be to able to specify to analysis code - bx r14 + CHK_CANARY r14,CTAG20,0 + pop {pc} .endif @ Conjugate lut_a, lut_b with (state) shareC @@ -925,9 +1150,11 @@ addstatevperm_exit: @ label exit point to be to able to specify to ana @ Arbitrarily choosing a0, b1 and d0 .balign 4 conjshareC: + push {r14} + GET_CANARY r14,CTAG21,0 .if ST_SHAREC - ldr r1,=shareC - ldr r0,[r1] @ Get shareC as a word (all bytes the same) + ldr r1,=shareA + ldr r0,[r1, #shareC-shareA] @ Get shareC as a word (all bytes the same) ldr r1,=lut_a @ Need to EOR share C into inputs of both lut_a and lut_b, and one of their outputs... ldr r2,[r1,#0x100] eors r2,r2,r0,lsr#24 @@ -938,11 +1165,10 @@ conjshareC: eors r2,r2,r0,lsl#8 str r2,[r1,#0x100] .endif - bx r14 + CHK_CANARY r14,CTAG21,0 + pop {pc} -.balign 4 -.thumb_func -shift_rows_s: +.macro shift_rows_s_impl @ First "rotate" the two most-significant bytes of the state by two registers @ Trashes r0-r3 @ Slightly faster (but not shorter?) 
with ubfx/bfi @@ -996,7 +1222,22 @@ shift_rows_s: eors r11,r11,r1 @ state[3]^=tb; clear01 @ barrier +.endm + +.if INLINE_SHIFT_ROWS_S +.macro inline_shift_rows_s +shift_rows_s_starts: + shift_rows_s_impl +shift_rows_s_end: +.endm +.else +.balign 4 +.thumb_func +@ Not going to use canaries here as it doesn't write anything - could be used to perturb register values, but not super worried about that yet +shift_rows_s: + shift_rows_s_impl bx r14 +.endif @ multiply polynomial over GF(2⁸) by c(x) = 0x03x³ + 0x01x² + 0x01x + 0x02 modulo x⁴+1 @ r0x00 is a register holding 0x00000000; r0x1b is a register holding 0x1b1b1b1b @@ -1036,6 +1277,7 @@ shift_rows_s: .balign 4 .thumb_func +@ Not going to use canaries here as it doesn't write anything - could be used to perturb register values, but not super worried about that yet @ Trashes r0-r3,r12 mix_cols_s: mov r2,#0x00000000 @@ -1071,12 +1313,7 @@ mix_cols_s: @ map all bytes of the state through the split LUT, lut_a and lut_b @ Trashes r0-r3,r12 -.balign 4 -.thumb_func -map_sbox_s: - GET_CANARY r12,CTAG12,3 - push {r12,r14} - +.macro map_sbox_s_impl ldr r0,=shareA @ Write out state share A to memory @ stmia r0,{r4-r7} @ Used to do a STM getchaffaddress r1 @@ -1160,10 +1397,29 @@ map_sbox_s: @ steplfsr; eors r6,r6,r0; mov r12,#0; eor r10,r10,r0,ror#16 @ steplfsr; eors r7,r7,r0; mov r12,#0; eor r11,r11,r0,ror#16 @ savelfsr +.endm + +.if INLINE_MAP_SBOX_S +.macro inline_map_sbox_s +map_sbox_s_starts: + // push {lr} + map_sbox_s_impl + // pop {lr} +map_sbox_s_end: +.endm +.else +.balign 4 +.thumb_func +map_sbox_s: + GET_CANARY r12,CTAG12,3 + push {r12,r14} + + map_sbox_s_impl pop {r12,r14} CHK_CANARY r12,CTAG12,5 bx r14 +.endif .ltorg @@ -1206,16 +1462,22 @@ refreshchaff_and_lfsr: bl gen_rand_sha_nonpres ldr r1,=rstate_lfsr ldr r2,[r1] +1: adds r2,r2,r0 - beq 1f @ Don't update LFSR state to 0 +@ note that r2 should not be 0 on entry, so both +@ r2 + r0, and r2 + r0 + r0 on the next loop should not both be 0 +@ if they are, we will 
loop + beq 1b @ Don't update LFSR state to 0 +#if HARDENING + beq 1b +#endif str r2,[r1] -1: @ Choose a random order to update chaff words to make 2nd order attacks harder movs r0,#12 ldr r1,=permscratch bl makesmallperm - + movs r1,#11 1: push {r1} @@ -1458,24 +1720,24 @@ init_key_4way: GET_CANARY r12,CTAG17,6 push {r0-r12,r14} - + @ Transfer 4-way key into local workspace, rerandomising the shares mov r5,r0 @ r5=4-way key input bl randomisechaff ldr r6,=rkey4way movs r7,#8 - b 1f -2: - adds r5,#64 @ Skip 64 byte gap for FIB workaround - subs r7,r7,#1 1: +#if FIB_WORKAROUND + cmp r7,#4 + bne 2f + adds r5,#64 @ Skip 64 byte gap for FIB workaround +2: +#endif ldmia r5!,{r1-r4} bl gen_rand_sha; eors r1,r1,r0; eors r4,r4,r0 bl gen_rand_sha; eors r2,r2,r0; eors r4,r4,r0 bl gen_rand_sha; eors r3,r3,r0; eors r4,r4,r0 stmia r6!,{r1-r4} - cmp r7,#5 - beq 2b subs r7,r7,#1 bne 1b @@ -1554,6 +1816,7 @@ init_key_expandloop: cmp r2,#52 bne init_key_expandloop + CHK_COUNT 30,6 pop {r0-r12,r14} CHK_CANARY r12,CTAG17,6 bx r14 @@ -1596,7 +1859,7 @@ addrkey_s: clear03_preserve_r3 add r12,r12,#20 @ r0=chaff+16, r3=statevperm, r4-r11=state, r12=roundkeyBptr - + bfi r0,r12,#0,#4 @ match chaff pointer (r0) to roundkey ptr (r12) mod 16 ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits ldr r2,[r0,#16] @ barrier load @@ -1605,7 +1868,7 @@ addrkey_s: bfi r0,r3,#0,#4 ldr r3,[r3] ldr r0,[r0] @ barrier load - + @ Read shareB of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareB of state, offset by -vpermstaterot @ r1=rkeyBrotdata, r2=vpermkeyrot-vpermstaterot, r3=RKshareC, r4-r11=state, r12=roundkeyB ptr .if RK_ROR @@ -1643,8 +1906,10 @@ ctr_crypt_s: push {r0-r12,r14} @ save all registers so that when we restore we overwrite any secrets push {r0-r3} - - SET_COUNT 93,6 + +#if !CALLER_INIT_RCP_COUNT + SET_COUNT 33,6 +#endif .if CT_BPERM @ Initialise 32 random numbers (which fit in half-words) @@ -1674,10 +1939,25 @@ ctr_crypt_s: bl gen_rand_sha_nonpres; eors r6,r6,r0; 
movs r1,#0; mov r10,r10,ror#16; eor r10,r10,r0,ror#16 bl gen_rand_sha_nonpres; eors r7,r7,r0; movs r1,#0; mov r11,r11,ror#16; eor r11,r11,r0,ror#16 ldr r0,=IV0 - stmia r0,{r4-r7} - adds r0,r0,#20 - stmia r0,{r8-r11} + stmia r0!,{r4-r7} + adds r1,r0,#4 + stmia r1,{r8-r11} @ "Decommission" IV0 so that it doesn't get stacked +#if 1 // approved by Alex - no side channel leakage it seems +#if HARDENING + // if this is skipped, r4 is likely random, so more 1 in 4 chance that ldmia will trap + // in any case very unlikely to load useful data below (and presumably the faulting address is uninteresting + // since it is already XORed with random data above) + movs r0, #32 + // note if r1 is unset, then we are reading from lut_a + movs r1, #0 + ldmia r1!, {r4, r5, r6, r7, r8, r9, r10, r11} + rcp_iequal_nodelay r0, r1 +#else + movs r0, #0 + ldmia r0, {r4, r5, r6, r7, r8, r9, r10, r11} +#endif +#else bl gen_rand_sha_nonpres; movs r4,r0 bl gen_rand_sha_nonpres; movs r5,r0 bl gen_rand_sha_nonpres; movs r6,r0 @@ -1686,11 +1966,14 @@ ctr_crypt_s: bl gen_rand_sha_nonpres; mov r9,r0 bl gen_rand_sha_nonpres; mov r10,r0 bl gen_rand_sha_nonpres; mov r11,r0 +#endif +@ Trashes r0, r1 + check_rnd_count (RND_COUNT_decrypt+RND_COUNT_ctr_crypt_s_init) pop {r1,r2} @ r1=cipher/plaintext buffer, r2=number of blocks movs r3,#0 - CHK_COUNT 93,6 + CHK_COUNT 33,6 ctr_crypt_mainloop: SET_COUNT 80,6 @@ -1700,31 +1983,98 @@ ctr_crypt_mainloop: push {r1-r3} @ It's OK for execution time to depend on the block counter r3 ("public"), but not the block number (secret) +@ Trashes r0, r1 + reset_rnd_count_checked + + // no point in having a branch if we should never take it (hardening/size fail) +#if REFCHAFF_PERIOD != 1 tst r3,#(REFCHAFF_PERIOD-1) bne 1f +#endif bl refreshchaff_and_lfsr 1: - - ldr r3,[r13,#8] @ get block count off the stack + ldr r3,[sp,#8] @ get block count off the stack + // no point in having a branch if we should never take it (hardening/size fail) +#if REMAP_PERIOD != 1 tst 
r3,#(REMAP_PERIOD-1) bne 1f +#endif bl remap @ shuffle the LUTs; this preserves R3 1: - CHK_COUNT 80,6 + CHK_COUNT 80,6 + ldr r0,[sp,#8] @ get block count off the stack +#if HARDENING +@ We check the random counts here. Note we start with the combined count and subtract, just because +@ it might make it marginally more difficult to get the right answer if skipping multiple instructions + movs r1, #(RND_COUNT_remap + RND_COUNT_refreshchaff_and_lfsr) +#if REMAP_PERIOD != 1 + tst r0, #(REMAP_PERIOD-1) + it ne + subne r1, #RND_COUNT_remap +#endif +#if REFCHAFF_PERIOD != 1 + tst r0, #(REFCHAFF_PERIOD-1) + it ne + subne r1, #RND_COUNT_refreshchaff_and_lfsr +#endif +@ r0=block count, r1=expected sha rand count, r3=block count + rcp_iequal_nodelay r0, r3 +@ r1=expected sha rand count, r3=block count + check_rnd_count_dynamic +#endif // HARDENING +@ r3=block count + +@ No point in having a branch if we should never take it (hardening/size fail) +#if REFROUNDKEYSHARES_PERIOD != 1 +#if HARDENING +// we want to check that we are calling enough +#warning REFROUNDKEYSHARES_PERIOD check needs hardening +#endif tst r3,#(REFROUNDKEYSHARES_PERIOD-1) - bne 1f + bne skip_ref_roundkey_shares_s +#endif +#if INLINE_REF_ROUNDKEY_SHARES_S + inline_ref_roundkey_shares_s +#else +#if HARDENING + // todo graham we could remove this for space, as I don't think r4 and r5 are equal +@ Make sure r4 != r5 on entry to ref_roundkey_shares_s + subs r4, r5, #1 +#endif bl ref_roundkey_shares_s @ refresh the round key shares -1: +#if HARDENING +@ r4 and r5 are set equal by ref_roundkey_shares (note we don't do a rnd_check as no sha random numbers are generated) + rcp_iequal_nodelay r4, r5 +#endif +#endif +skip_ref_roundkey_shares_s: - ldr r3,[r13,#8] @ get block count off the stack +#if REFROUNDKEYHVPERMS_PERIOD != 1 +#if HARDENING +// we want to check that we are calling enough +#warning REFROUNDKEYHVPERMS_PERIOD check needs hardening +#endif + ldr r3,[sp,#8] @ get block count off the stack tst 
r3,#(REFROUNDKEYHVPERMS_PERIOD-1) - bne 1f + bne skip_ref_roundkey_hvperms_s +#endif +#if INLINE_REF_ROUNDKEY_HVPERMS_S + inline_ref_roundkey_hvperms_s +#else bl ref_roundkey_hvperms_s @ refresh the round key vperms -1: +#if HARDENING + movs r0, #30 +@ r7 should be 30 on exit from ref_roundkey_hvperms_s + rcp_iequal_nodelay r0, r7 +#endif +#endif +skip_ref_roundkey_hvperms_s: CHK_COUNT 81,6 +@ Trashes r0, r1 + reset_rnd_count pop {r1-r3} @ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter @@ -1767,6 +2117,7 @@ ctr_crypt_mainloop: adds r4,r4,r7 @ r4=j if top bit of r6, else i subs r1,r1,#1 bpl 1b + // todo loop check pop {r1} mov r12,r4 .else @@ -1812,29 +2163,59 @@ processIV: @ non-target label to assist power analysis movs r2,#0 str r2,[r1] bl addstatevperm @ Initialise state vperm (use SHA RNG to start with, later refreshes are with LFSR RNG) +#if HARDENING + // r1 is set to lut_b by addstatevperm + ldr r0, =shareB + 0x10 + rcp_iequal_nodelay r0, r1 +#endif .endif +@ Trashes r0, r1 + check_rnd_count RND_COUNT_ctr_crypt_mainloop_A CHK_COUNT 84,6 +.if ST_SHAREC @ Avoid func call if the func is empty bl conjshareC @ Add the effect of shareC to lut_a, lut_b +#if HARDENING + // r1 is set to lut_b by conjshare + ldr r2,=lut_b + rcp_iequal_nodelay r1, r2 +#endif +.endif + // todo graham remove this count CHK_COUNT 85,6 @ now perform the 15 encryption rounds on (key, state=IV+x) @ here r4-r7, r8-r11: state mov r2,#0 @ round counter rounds_s_mainloop: +@ Trashes r0, r1 + reset_rnd_count_checked ldr r12,=rkey_s add r12,r12,r2,lsl#5 @ pointer to key shares for this round add r12,r12,r2,lsl#3 push {r2} @ save round count bl addrkey_s +.if INLINE_MAP_SBOX_S + inline_map_sbox_s +.else bl map_sbox_s +.endif +.if INLINE_SHIFT_ROWS_S + inline_shift_rows_s +.else bl shift_rows_s +.endif .if ST_VPERM - ldr r2,[r13] @ peek at stack to get round count + ldr r2,[sp] @ peek at stack to get round count cmp r2,#NUMREFSTATEVPERM bcs 1f bl gen_rand_lfsr_nonpres ldr
r1,=statevperm bl addstatevperm @ V shuffle of r4-r11 +#if HARDENING + // r1 is set to lut_b by addstatevperm + ldr r2, =shareB + 0x10 + rcp_iequal_nodelay r1, r2 +#endif 1: .endif pop {r2} @@ -1846,22 +2227,32 @@ rounds_s_mainloop: pop {r2} b rounds_s_mainloop 2: +#if HARDENING + movs r1, #14 + rcp_iequal_nodelay r1, r2 +#endif CHK_COUNT 86,6 ldr r12,=rkey_s+14*40 @ final round key shares + // todo graham check this is called bl addrkey_s CHK_COUNT 87,6 +.if ST_SHAREC @ Avoid func call if the func is empty + // todo alex, i assume that skipping this will cause bad things to happen anyway? bl conjshareC @ Undo the effect of shareC from lut_a, lut_b +.endif CHK_COUNT 88,6 .if ST_VPERM @ Undo the effects of vperm rotation recorded in statevperm ldr r1,=statevperm ldr r2,[r1] rsbs r0,r2,#0 +@ We don't check this is called since failing to undo this is probably going to break decryption +// todo alex is this fair? bl addstatevperm .endif pop {r1-r3,r12} - push {r3} + push {r1,r3} @ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered decryption_start: @@ -1901,10 +2292,13 @@ decryption_start: eors r3,r3,r0 str r3,[r1,#12] - sub r1,r1,r12,lsl#4 @ Restore r1 to point to start of buffer CHK_COUNT 90,6 - pop {r3} @ Restore block counter +@ Trashes r0, r1 + check_rnd_count RND_COUNT_decryption_end + + pop {r1,r3} @ Restore r1 to point to start of buffer + @ Restore block counter @ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter decryption_end: @@ -1912,33 +2306,61 @@ decryption_end: cmp r3,r2 CHK_COUNT 91,6 bne ctr_crypt_mainloop + // todo alex, is this necessary - if you don't do the right number of loops, you ain't gonna get far? 
+#if HARDENING + rcp_iequal_nodelay r2, r3 +#endif #if WIPE_MEMORY @ Wipe memory from workspace_start up to the stack pointer @ First fill everything (except the RNG state itself) with random numbers to avoid any possibly useful power signals ldr r4,=workspace_start - ldr r5,=rstate_all_start + add r5, r4, #rstate_all_start - workspace_start +#if HARDENING + ldr r7,=workspace_start + add r6, r4, #rstate_all_start - workspace_start + rcp_iequal_nodelay r4, r7 +#endif +#if HARDENING + // todo alex, is this necessary - if you don't do the right number of loops, you ain't gonna get far? +@ Recheck of above + rcp_iequal_nodelay r3, r2 +#endif 1: bl gen_rand_sha_nonpres stmia r4!,{r0} cmp r4,r5 bcc 1b +#if HARDENING + rcp_iequal_nodelay r4, r6 + mov r6,sp +#endif + // not if this load is skpped, then we are just erasing from where we left off before +.if rstate_all_end <= rstate_all_start +.err +.endif ldr r4,=rstate_all_end - mov r5,r13 @ gcc arm assembler says cmp r4,r13 is deprecated, so use another register + mov r5,sp @ gcc arm assembler says cmp r4,sp is deprecated, so use another register 1: bl gen_rand_sha_nonpres stmia r4!,{r0} cmp r4,r5 bcc 1b +#if HARDENING + rcp_iequal_nodelay r4, r6 +#endif @ Then fill everything with zeros so as not to leave behind clues about the RNG state ldr r4,=workspace_start movs r0,#0 - mov r5,r13 + mov r5,sp 1: stmia r4!,{r0} cmp r4,r5 bcc 1b +#if HARDENING + rcp_iequal_nodelay r4, r6 +#endif #endif .if GEN_RAND_SHA diff --git a/enc_bootloader/config.h b/enc_bootloader/config.h index 1573fbff..64c7fd90 100644 --- a/enc_bootloader/config.h +++ b/enc_bootloader/config.h @@ -54,6 +54,21 @@ #define CK_JITTER 1 // Use the ROSC clock to make ARM timings unpredictable #endif +#ifndef INLINE_REF_ROUNDKEY_SHARES_S +#define INLINE_REF_ROUNDKEY_SHARES_S 0 +#endif + +#ifndef INLINE_REF_ROUNDKEY_HVPERMS_S +#define INLINE_REF_ROUNDKEY_HVPERMS_S 0 +#endif + +#ifndef INLINE_SHIFT_ROWS_S +#define INLINE_SHIFT_ROWS_S 0 +#endif + +#ifndef 
INLINE_MAP_SBOX_S +#define INLINE_MAP_SBOX_S 0 +#endif //////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/enc_bootloader/enc_bootloader.c b/enc_bootloader/enc_bootloader.c index 4b7e40f0..2668150b 100644 --- a/enc_bootloader/enc_bootloader.c +++ b/enc_bootloader/enc_bootloader.c @@ -10,16 +10,14 @@ #include "boot/picoboot.h" #include "pico/bootrom.h" #include "hardware/structs/otp.h" - -#include "hardware/structs/trng.h" -#include "hardware/structs/sha256.h" - +#include "hardware/structs/mpu.h" #include "pico/binary_info.h" #include "hardware/clocks.h" -#include "hardware/xosc.h" #include "hardware/structs/rosc.h" +#include "hardware/rcp.h" +#include "hard_entry_point.h" #include "config.h" extern void decrypt(uint8_t* key4way, uint8_t* IV_OTPsalt, uint8_t* IV_public, uint8_t(*buf)[16], int nblk); @@ -32,6 +30,79 @@ extern void decrypt(uint8_t* key4way, uint8_t* IV_OTPsalt, uint8_t* IV_public, u #define XOSC_CALIBRATION 0 #if CK_JITTER + +static inline bool has_glitchless_mux(clock_handle_t clock) { + return clock == clk_sys || clock == clk_ref; +} + +static __force_inline void clock_configure_internal(clock_handle_t clock, uint32_t src, uint32_t auxsrc, uint32_t actual_freq, uint32_t div) { + clock_hw_t *clock_hw = &clocks_hw->clk[clock]; + + // If increasing divisor, set divisor before source. Otherwise set source + // before divisor. This avoids a momentary overspeed when e.g. switching + // to a faster source and increasing divisor to compensate. + if (div > clock_hw->div) + clock_hw->div = div; + + // If switching a glitchless slice (ref or sys) to an aux source, switch + // away from aux *first* to avoid passing glitches when changing aux mux. + // Assume (!!!) glitchless source 0 is no faster than the aux source. 
+ if (has_glitchless_mux(clock) && src == CLOCKS_CLK_SYS_CTRL_SRC_VALUE_CLKSRC_CLK_SYS_AUX) { + hw_clear_bits(&clock_hw->ctrl, CLOCKS_CLK_REF_CTRL_SRC_BITS); + while (!(clock_hw->selected & 1u)) + tight_loop_contents(); + } + // If no glitchless mux, cleanly stop the clock to avoid glitches + // propagating when changing aux mux. Note it would be a really bad idea + // to do this on one of the glitchless clocks (clk_sys, clk_ref). + else { + // Disable clock. On clk_ref and clk_sys this does nothing, + // all other clocks have the ENABLE bit in the same position. + if (clock != clk_sys && clock != clk_ref) { + pico_default_asm_volatile("b not_there\n"); // this branch should be elided by compiler in inlined func + hw_clear_bits(&clock_hw->ctrl, CLOCKS_CLK_GPOUT0_CTRL_ENABLE_BITS); + } + // if (configured_freq[clock] > 0) { + // // Delay for 3 cycles of the target clock, for ENABLE propagation. + // // Note XOSC_COUNT is not helpful here because XOSC is not + // // necessarily running, nor is timer... + // uint delay_cyc = configured_freq[clk_sys] / configured_freq[clock] + 1; + // busy_wait_at_least_cycles(delay_cyc * 3); + // } + } + + // Set aux mux first, and then glitchless mux if this clock has one + hw_write_masked(&clock_hw->ctrl, + (auxsrc << CLOCKS_CLK_SYS_CTRL_AUXSRC_LSB), + CLOCKS_CLK_SYS_CTRL_AUXSRC_BITS + ); + + if (has_glitchless_mux(clock)) { + hw_write_masked(&clock_hw->ctrl, + src << CLOCKS_CLK_REF_CTRL_SRC_LSB, + CLOCKS_CLK_REF_CTRL_SRC_BITS + ); + while (!(clock_hw->selected & (1u << src))) + tight_loop_contents(); + } + + // Enable clock. On clk_ref and clk_sys this does nothing, + // all other clocks have the ENABLE bit in the same position. + if (clock != clk_sys && clock != clk_ref) { + pico_default_asm_volatile("b not_there\n"); // code should me omitted + // hw_set_bits(&clock_hw->ctrl, CLOCKS_CLK_GPOUT0_CTRL_ENABLE_BITS); + } + + // Now that the source is configured, we can trust that the user-supplied + // divisor is a safe value. 
+ clock_hw->div = div; + //configured_freq[clock] = actual_freq; +} + +static __force_inline void clock_configure_int_divider_inline(clock_handle_t clock, uint32_t src, uint32_t auxsrc, uint32_t src_freq, uint32_t int_divider) { + clock_configure_internal(clock, src, auxsrc, src_freq / int_divider, int_divider << CLOCKS_CLK_GPOUT0_DIV_INT_LSB); +} + void runtime_init_clocks(void) { // Disable resus that may be enabled from previous software clocks_hw->resus.ctrl = 0; @@ -40,12 +111,12 @@ void runtime_init_clocks(void) { bi_decl(bi_ptr_int32(0, 0, rosc_drive, 0x7777)); // default drives of 0b111 (0x7) // Bump up ROSC speed to ~110MHz - rosc_hw->freqa = 0; // reset the drive strengths + rosc_hw->freqa = 0; // reset the drive strengths (note password not needed for this) rosc_hw->div = rosc_div | ROSC_DIV_VALUE_PASS; // set divider // Increment the freqency range one step at a time - this is safe provided the current config is not TOOHIGH // because ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM | ROSC_CTRL_FREQ_RANGE_VALUE_HIGH == ROSC_CTRL_FREQ_RANGE_VALUE_HIGH - static_assert((ROSC_CTRL_FREQ_RANGE_VALUE_LOW | ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM) == ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM); - static_assert((ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM | ROSC_CTRL_FREQ_RANGE_VALUE_HIGH) == ROSC_CTRL_FREQ_RANGE_VALUE_HIGH); + static_assert((ROSC_CTRL_FREQ_RANGE_VALUE_LOW | ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM) == ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM); // parenthesised: == binds tighter than | + static_assert((ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM | ROSC_CTRL_FREQ_RANGE_VALUE_HIGH) == ROSC_CTRL_FREQ_RANGE_VALUE_HIGH); hw_set_bits(&rosc_hw->ctrl, ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM); hw_set_bits(&rosc_hw->ctrl, ROSC_CTRL_FREQ_RANGE_VALUE_HIGH); @@ -55,8 +126,25 @@ void runtime_init_clocks(void) { // Not used with FREQ_RANGE_VALUE_HIGH, but should still be set to the maximum drive rosc_hw->freqb = (ROSC_FREQB_PASSWD_VALUE_PASS << ROSC_FREQB_PASSWD_LSB) | - ROSC_FREQB_DS7_LSB | ROSC_FREQB_DS6_LSB | ROSC_FREQB_DS5_LSB | ROSC_FREQB_DS4_LSB; + ROSC_FREQB_DS7_BITS
| ROSC_FREQB_DS6_BITS | ROSC_FREQB_DS5_BITS | ROSC_FREQB_DS4_BITS; +#if RC_COUNT + rcp_count_check_nodelay(STEP_RUNTIME_CLOCKS_INIT); +#endif +#if HARDENING + // force reload + rosc_hw_t *rosc_hw2; + pico_default_asm_volatile( + "ldr %0, =%c1\n" + : "=&r" (rosc_hw2) : "i" (ROSC_BASE), "r" (rosc_hw)); // note include rosc_hw to use a different register + uint32_t c = (*(volatile uint32_t *)&rosc_drive) | ROSC_FREQA_DS1_RANDOM_BITS | ROSC_FREQA_DS0_RANDOM_BITS; + rcp_iequal_nodelay(c, *(io_ro_16 *)(&rosc_hw2->freqa)); + rcp_iequal_nodelay(ROSC_FREQB_DS7_BITS | ROSC_FREQB_DS6_BITS | ROSC_FREQB_DS5_BITS | ROSC_FREQB_DS4_BITS, *(io_ro_16 *)(&rosc_hw2->freqb)); + rcp_iequal_nodelay((ROSC_CTRL_ENABLE_VALUE_ENABLE << ROSC_CTRL_ENABLE_LSB) | ROSC_CTRL_FREQ_RANGE_VALUE_HIGH, rosc_hw2->ctrl); +#endif +#if RC_COUNT + rcp_count_check_nodelay(STEP_RUNTIME_CLOCKS_INIT2); +#endif #if XOSC_CALIBRATION // Calibrate ROSC frequency if XOSC present - otherwise just configure bi_decl(bi_ptr_int32(0, 0, xosc_hz, 12000000)); // xosc freq in Hz @@ -101,69 +189,173 @@ void runtime_init_clocks(void) { } #endif // XOSC_CALIBRATION // CLK SYS = ROSC directly, as it's running slowly enough - clock_configure_int_divider(clk_sys, + clock_configure_int_divider_inline(clk_sys, CLOCKS_CLK_SYS_CTRL_SRC_VALUE_CLKSRC_CLK_SYS_AUX, CLOCKS_CLK_SYS_CTRL_AUXSRC_VALUE_ROSC_CLKSRC, ROSC_HZ, // this doesn't have to be accurate 1); - // Configure other clocks - none of these need to be accurate +#if RC_COUNT + rcp_count_check_nodelay(STEP_RUNTIME_CLOCKS_INIT3); +#endif // CLK_REF = ROSC / OTHER_CLK_DIV - this and other clocks aren't really used, so just need to be set to a low enough frequency - clock_configure_int_divider(clk_ref, + clock_configure_int_divider_inline(clk_ref, CLOCKS_CLK_REF_CTRL_SRC_VALUE_ROSC_CLKSRC_PH, 0, ROSC_HZ, OTHER_CLK_DIV); - // CLK USB (not used) - clock_configure_int_divider(clk_usb, - 0, // No GLMUX - CLOCKS_CLK_USB_CTRL_AUXSRC_VALUE_ROSC_CLKSRC_PH, - ROSC_HZ, - OTHER_CLK_DIV); - - 
// CLK ADC (not used) - clock_configure_int_divider(clk_adc, - 0, // No GLMUX - CLOCKS_CLK_ADC_CTRL_AUXSRC_VALUE_ROSC_CLKSRC_PH, - ROSC_HZ, - OTHER_CLK_DIV); - - // CLK PERI Used as reference clock for UART and SPI serial. (not used) - clock_configure_int_divider(clk_peri, - 0, - CLOCKS_CLK_PERI_CTRL_AUXSRC_VALUE_CLK_SYS, - ROSC_HZ, - OTHER_CLK_DIV); - - // CLK_HSTX Transmit bit clock for the HSTX peripheral. (not used) - clock_configure_int_divider(clk_hstx, - 0, - CLOCKS_CLK_HSTX_CTRL_AUXSRC_VALUE_CLK_SYS, - ROSC_HZ, - OTHER_CLK_DIV); +#if RC_COUNT + rcp_count_check_nodelay(STEP_RUNTIME_CLOCKS_INIT4); +#endif + // we don't need any other clocks +#if HARDENING + static_assert(CLOCKS_CLK_REF_CTRL_OFFSET == clk_ref * sizeof(clock_hw_t), ""); + static_assert(clk_sys > clk_ref, ""); + // prevent GCC from re-using ptr + io_ro_32 *clock_ref_ctrl; + pico_default_asm_volatile( + "ldr %0, =%c1\n" + : "=r" (clock_ref_ctrl) : "i" (CLOCKS_BASE + clk_ref * sizeof(clock_hw_t))); + // check clock_sys + rcp_iequal_nodelay(clock_ref_ctrl[(clk_sys - clk_ref) * sizeof(clock_hw_t) / 4], + (CLOCKS_CLK_SYS_CTRL_AUXSRC_VALUE_ROSC_CLKSRC << CLOCKS_CLK_SYS_CTRL_AUXSRC_LSB) | + (CLOCKS_CLK_SYS_CTRL_SRC_VALUE_CLKSRC_CLK_SYS_AUX << CLOCKS_CLK_SYS_CTRL_SRC_LSB)); + // check clock_ref + rcp_iequal_nodelay(*clock_ref_ctrl, 0); +#endif +#if RC_COUNT + rcp_count_check_nodelay(STEP_RUNTIME_CLOCKS_INIT5); +#endif } #endif bi_decl(bi_ptr_int32(0, 0, otp_key_page, 29)); +// Stop the compiler from constant-folding a hardware base pointer into the +// pointers to individual registers, in cases where constant folding has +// produced redundant 32-bit pointer literals that could have been load/store +// offsets. (Note typeof(ptr+0) gives non-const, for +r constraint.) E.g. 
+// uart_hw_t *uart0 = __get_opaque_ptr(uart0_hw); +#define __get_opaque_ptr(ptr) ({ \ + typeof((ptr)+0) __opaque_ptr = (ptr); \ + asm ("" : "+r"(__opaque_ptr)); \ + __opaque_ptr; \ +}) + // The function lock_key() is called from decrypt() after key initialisation is complete and before decryption begins. // That is a suitable point to lock the OTP area where key information is stored. void lock_key() { - otp_hw->sw_lock[otp_key_page] = 0xf; - otp_hw->sw_lock[otp_key_page + 1] = 0xf; + io_rw_32 *sw_lock = __get_opaque_ptr(&otp_hw->sw_lock[0]) + otp_key_page; +#if HARDENING + // prevent compiler from re-using sw_lock pointer + io_rw_32 *sw_lock2; + pico_default_asm( + "ldr %0, =%1\n" + : "=r" (sw_lock2) + : "i" (OTP_BASE + OTP_SW_LOCK0_OFFSET) + ); + sw_lock2 += otp_key_page; + sw_lock[0] = 0xf; + sw_lock[1] = 0xf; + uint32_t v = sw_lock2[1]; + v = (v << 4) | sw_lock2[0]; + uint32_t ff1; + pico_default_asm_volatile( + "movs %0, #0xff" + : "=r" (ff1) + ); + uint32_t ff2; + pico_default_asm_volatile( + "movw %0, #0xff" + : "=r" (ff2) + ); + rcp_iequal(v, ff1); +#if RC_COUNT + rcp_count_check_nodelay(31); +#endif + rcp_iequal(ff2, v); +#else + sw_lock[0] = 0xf; + sw_lock[1] = 0xf; +#if RC_COUNT + rcp_count_check_nodelay(31); +#endif +#endif } +static __force_inline void lock_all(void) { + io_rw_32 *sw_lock = __get_opaque_ptr(&otp_hw->sw_lock[0]) + otp_key_page; +#if HARDENING + // we only actually need to lock page 2 but we lock and check 0, 1, 2 anyway + // prevent compiler from re-using sw_lock pointer + io_rw_32 *sw_lock2; + pico_default_asm( + "ldr %0, =%1\n" + : "=r" (sw_lock2) + : "i" (OTP_BASE + OTP_SW_LOCK0_OFFSET) + ); + sw_lock2 += otp_key_page; + sw_lock[0] = 0xf; + sw_lock[1] = 0xf; + uint32_t v = sw_lock2[1]; + sw_lock[2] = 0xf; + v = (v << 4) | sw_lock2[0]; + v = (v << 4) | sw_lock2[2]; + uint32_t fff1; + pico_default_asm_volatile( + "movw %0, #0xfff" + : "=r" (fff1) + ); + uint32_t fff2; + pico_default_asm_volatile( + "movw %0, #0xfff" + : "=r" (fff2) + 
); + rcp_iequal(v, fff1); + rcp_iequal(fff2, v); +#else + sw_lock[0] = 0xf; + sw_lock[1] = 0xf; + sw_lock[2] = 0xf; +#endif + +} +#define PIN_R PICO_DEFAULT_LED_PIN +#define PIN_G 6 + +#ifndef USE_LED +#if ALLOW_DEBUGGING +#define USE_LED 1 +#endif +#endif int main() { +#if USE_LED +#if ALLOW_DEBUGGING + reset_block_mask((1u << RESET_IO_BANK0) | (1u << RESET_PADS_BANK0)); + unreset_block_mask_wait_blocking((1u << RESET_IO_BANK0) | (1u << RESET_PADS_BANK0)); +#endif + gpio_init(PIN_G); + gpio_set_dir(PIN_G, GPIO_OUT); + gpio_put(PIN_G, 1); +#endif +#if RC_COUNT + rcp_count_check_nodelay(STEP_MAIN); +#endif + bi_decl(bi_ptr_int32(0, 0, data_start_addr, 0x20000000)); bi_decl(bi_ptr_int32(0, 0, data_size, 0x78000)); bi_decl(bi_ptr_string(0, 0, iv, "0123456789abcdef", 17)); // Read key directly from OTP - guarded reads will throw a bus fault if there are any errors +#if ALLOW_DEBUGGING + uint16_t* otp_data = (uint16_t*)OTP_DATA_BASE; +#else uint16_t* otp_data = (uint16_t*)OTP_DATA_GUARDED_BASE; +#endif decrypt( (uint8_t*)&(otp_data[otp_key_page * 0x40]), (uint8_t*)&(otp_data[(otp_key_page + 2) * 0x40]), @@ -172,23 +364,77 @@ int main() { data_size/16 ); - // Lock the IV salt - otp_hw->sw_lock[otp_key_page + 2] = 0xf; + lock_all(); - // Increase stack limit by 0x100 +#if ALLOW_DEBUGGING && !defined(NDEBUG) + // check locks + io_ro_32 *otp_data_raw = (io_ro_32 *)OTP_DATA_RAW_BASE; + rcp_iequal(otp_data_raw[otp_key_page * 0x40], -1); + rcp_iequal(otp_data_raw[otp_key_page * 0x40 + 0x40], -1); + rcp_iequal(otp_data_raw[otp_key_page * 0x40 + 0x80], -1); +#endif + // Increase stack space by 0x100 pico_default_asm_volatile( "mrs r0, msplim\n" "subs r0, 0x100\n" "msr msplim, r0" :::"r0"); +#if HARDENING + // this is the only one we want to leave in place + static_assert(MPU_REGION_FLASH == 7, ""); + // disable regions 0-3 (note rnr should be 0 already) + // todo we should check here that we decrypted enough + mpu_hw->rlar = 0; + mpu_hw->rlar_a1 = 0; + mpu_hw->rlar_a2 = 0; + 
mpu_hw->rlar_a3 = 0; + // todo make this configurable (if you want MPU disabled) +#if 1 + // disable region 7 + mpu_hw->rnr = MPU_REGION_FLASH; + mpu_hw->rlar = 0; + // disable MPU + mpu_hw->ctrl = 0; +#endif +#endif + +#if USE_LED + gpio_put(PIN_G, 0); + __compiler_memory_barrier(); +#endif // Chain into decrypted image - rom_chain_image( + // todo make sure the image is NOT in flash (perhaps we also disallow execution from flash as an option) + int chain_result = rom_chain_image( (uint8_t*)ROM_CHAIN_WORKSPACE, 4 * 1024, data_start_addr, data_size ); - __breakpoint(); + chain_result = -chain_result; +#if USE_LED + gpio_init(PIN_R); + gpio_set_dir(PIN_R, GPIO_OUT); + gpio_put(PIN_R, 1); + + + gpio_put(PIN_G, 0); + for (int j = 0; j < 10000000; j++) { __nop(); } + for (int i = 0; i < chain_result; i++) { + gpio_put(PIN_G, 0); + for (int j = 0; j < 5000000; j++) { __nop(); } + gpio_put(PIN_G, 1); + for (int j = 0; j < 5000000; j++) { __nop(); } + } + gpio_put(PIN_G, 0); +#endif + +#if ALLOW_DEBUGGING + rom_connect_internal_flash(); + rom_flash_exit_xip(); + rom_flash_range_erase(0x100000, 0x80000, (1u << 16), 0xd8); + rom_flash_range_program(0x100000, (void *)data_start_addr, data_size); + rom_reset_usb_boot(0, 0); +#endif } diff --git a/enc_bootloader/enc_bootloader.elf b/enc_bootloader/enc_bootloader.elf index e1007488..80460550 100755 Binary files a/enc_bootloader/enc_bootloader.elf and b/enc_bootloader/enc_bootloader.elf differ diff --git a/enc_bootloader/enc_bootloader_mbedtls.elf b/enc_bootloader/enc_bootloader_mbedtls.elf index 5c278639..24719cd0 100644 Binary files a/enc_bootloader/enc_bootloader_mbedtls.elf and b/enc_bootloader/enc_bootloader_mbedtls.elf differ diff --git a/enc_bootloader/hard_entry_point.S b/enc_bootloader/hard_entry_point.S new file mode 100644 index 00000000..a7508606 --- /dev/null +++ b/enc_bootloader/hard_entry_point.S @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2020 Raspberry Pi (Trading) Ltd. 
+ * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include "pico.h" +#include "pico/asm_helper.S" +#include "pico/platform/cpu_regs.h" +#include "boot/bootrom_constants.h" +#include "hardware/rcp.h" +#include "hardware/regs/bootram.h" +#include "hardware/regs/resets.h" +#include "hard_entry_point.h" + +pico_default_asm_setup + +#if !PICO_NO_FLASH +#error expected PICO_NO_FLASH +#endif +// ELF entry point: +.type _entry_point,%function +.type _reset_handler,%function +.thumb_func +.global _entry_point, _reset_handler +_entry_point: +_reset_handler: +#if !ALLOW_DEBUGGING + // note we assume that RCP is already initialized by bootrom +#else + // just enable the RCP which is fine if it already was (we assume no other co-processors are enabled at this point to save space) + ldr r0, = PPB_BASE + M33_CPACR_OFFSET + movs r1, #ARM_CPU_PREFIXED(CPACR_CP7_BITS) + str r1, [r0] + // only initialize canary seeds if they haven't been (as to do so twice is a fault) + mrc p7, #1, apsr_nzcv, c0, c0, #0 + bmi 1f + // i dont think it much matters what we initialized to, as to have gotten here we must have not + // gone thru the bootrom (which a secure boot would have) + mcrr p7, #8, r0, r0, c0 + mcrr p7, #8, r0, r0, c1 + sev +1: + ldr r0, =__StackTop + msr msp, r0 + ldr r0, =__vectors + ldr r1, =(PPB_BASE + ARM_CPU_PREFIXED(VTOR_OFFSET)) + str r0, [r1] +#endif + + ldr r0, =PPB_BASE + M33_MPU_CTRL_OFFSET + adr r6, mpu_regions + // set region 7 (flash) + ldmia r6!, {r1, r2, r3, r4} + stmia r0!, {r1, r2, r3, r4} + // sp should have low 3 bits == 0 (which is all the bits RNR is) + str sp, [r0, #M33_MPU_RNR_OFFSET - (M33_MPU_CTRL_OFFSET + 16)] +#if HARDENING + sub lr, lr // we don't need lr and it is unlikely to be something that we will read from memory if this is skipped + ldr r1, [r0, #M33_MPU_RNR_OFFSET - (M33_MPU_CTRL_OFFSET + 16)] + // RNR should read back as zero + rcp_iequal_nodelay r1, lr +#endif + subs r0, #8 +#if HARDENING + rcp_iequal_nodelay lr, r1 // re-check of the RNR read-back; lr and r1 are only set up in the HARDENING block above +#endif + ldmia r6!,
{r1, r2, r3, r4, r5, r8, r9, r10} + stmia r0!, {r1, r2, r3, r4, r5, r8, r9, r10} +#if HARDENING + ldr r1, = PPB_BASE + M33_MPU_RLAR_A3_OFFSET + 4 + rcp_iequal_nodelay r0, r1 + adr r7, mpu_regions + 48 + rcp_iequal_nodelay r6, r7 + // check this again for good measure + // todo can remove if we do some tt checks + rcp_iequal_nodelay r1, r0 + // todo this is less useful, because if we laoded garbage into the MPU it may cause a fault anyway, + // however it is only one bit for enabling, so we definitely should do some tt to prove it is enabled + rcp_iequal_nodelay r7, r6 +#endif + + // todo is this also part of ALLOW_DEBUGGING (is it done by LOAD_MAP?, or indeed by later code) + // Zero out the BSS + ldr r1, =__bss_start__ + ldr r2, =__bss_end__ + movs r0, #0 + b bss_fill_test +bss_fill_loop: + stm r1!, {r0} +bss_fill_test: + cmp r1, r2 + bne bss_fill_loop +#if HARDENING + rcp_iequal_nodelay r1, r2 +#endif + // runtime_init is inlined here, to avoid a bunch of ROP attackable functions (note + // runtime_run_initializers is particularly bad as it calls a list of function pointers) + // + // can revisit this when we have a hardened SDK option +#if 0 + bl runtime_init +#else +#if !PICO_RP2350 +#error RP2350 init only supported +#endif + rcp_count_set_nodelay STEP_RUNTIME_CLOCKS_INIT + // runtime_init_install_stack_guard + ldr r1, =__StackBottom + // todo harden + msr msplim, r1 + + // runtime_init_early_resets - note we actually reset more than the standard runtime_init + // as we know more about our environment + ldr r1, =RESETS_BASE + RESETS_RESET_OFFSET + REG_ALIAS_SET_BITS +#if ALLOW_DEBUGGING + ldr r0, =~(RESETS_RESET_SYSCFG_BITS | RESETS_RESET_PLL_SYS_BITS | RESETS_RESET_PLL_USB_BITS) // include USB PLL in case we are running from it +#else + ldr r0, =~(RESETS_RESET_SYSCFG_BITS | RESETS_RESET_PLL_SYS_BITS) +#endif + str r0, [r1] + + bl runtime_init_clocks + rcp_count_check_nodelay STEP_RUNTIME_CLOCKS_INIT_DONE + + // note: there is no runtime_init_post_clocks_reset 
as there are no peripherals that need turning on + // all we really care about is lock BOOTROM_LOCK_ENABLE for now, because we don't want bootrom locking enabled + // and without it, the bootrom will ignore the reset + ldr r1, =BOOTRAM_BASE + BOOTRAM_BOOTLOCK0_OFFSET + BOOTROM_LOCK_ENABLE * 4 + str r1, [r1] // any write unlocks + + // note: there is no runtime_init_bootrom_reset as we will have come in via the bootrom + // todo however think about somehow being watchdogged back in? + + // note there is no runtime_init_per_corebootrom_reset as it is a no-op on Arm +#endif + + bl main +#if ALLOW_DEBUGGING + bkpt #0 +#endif + rcp_panic +#if HARDENING + rcp_panic +//#if DOUBLE_HARDENING + rcp_panic +//#endif +#endif + +.p2align 2 +.global data_cpy_table +data_cpy_table: +.word 0 + +#define MPU_REGION_RW_XN(n, rbar, rlar) \ + .word rbar + M33_MPU_RBAR_XN_BITS + (0 << M33_MPU_RBAR_AP_LSB), \ + (rlar) + M33_MPU_RLAR_EN_BITS + 0x10 // note 0x10 will be written but not read back + +#define MPU_REGION_RO_XN(n, rbar, rlar) \ + .word rbar + M33_MPU_RBAR_XN_BITS + (2 << M33_MPU_RBAR_AP_LSB), \ + (rlar) + M33_MPU_RLAR_EN_BITS + 0x10 // note 0x10 will be written but not read back + +#define MPU_REGION_RO(n, rbar, rlar) \ + .word rbar + (2 << M33_MPU_RBAR_AP_LSB), \ + (rlar) + M33_MPU_RLAR_EN_BITS + 0x10 // note 0x10 will be written but not read back + +.p2align 2 +mpu_regions: + /* ctrl */ .word M33_MPU_CTRL_PRIVDEFENA_BITS | M33_MPU_CTRL_ENABLE_BITS + /* rnr */ .word 7 // set for the initial load +#if MPU_REGION_FLASH != 7 || \ + MPU_REGION_RAM != 0 || \ + MPU_REGION_SCRATCH_X != 1 || \ + MPU_REGION_SCRATCH_Y_DATA != 2 || \ + MPU_REGION_SCRATCH_Y_CODE != 3 +#error MPU regions should be in order +#endif + // todo what about XIP_CACHE binaries?
+ MPU_REGION_RO_XN(MPU_REGION_FLASH, + XIP_BASE, + XIP_END - 0x20) +mpu_regions_middle: + MPU_REGION_RW_XN(MPU_REGION_RAM, + SRAM_BASE, + SRAM_SCRATCH_X_BASE - 0x20) + MPU_REGION_RO(MPU_REGION_SCRATCH_X, + SRAM_SCRATCH_X_BASE, + SRAM_SCRATCH_Y_BASE - 0x20) + MPU_REGION_RW_XN(MPU_REGION_SCRATCH_Y_DATA, + SRAM_SCRATCH_Y_BASE, + __text_start - 0x20) + MPU_REGION_RO(MPU_REGION_SCRATCH_Y_CODE, + __text_start, + __data_start__ - 0x20) +mpu_regions_end: + +.if mpu_regions_middle - mpu_regions != 16 +.err unexpected region size +.endif +.if mpu_regions_end - mpu_regions_middle != 32 +.err unexpected region size +.endif + + diff --git a/enc_bootloader/hard_entry_point.h b/enc_bootloader/hard_entry_point.h new file mode 100644 index 00000000..2c69df7a --- /dev/null +++ b/enc_bootloader/hard_entry_point.h @@ -0,0 +1,23 @@ +/** +* Copyright (c) 2025 Raspberry Pi (Trading) Ltd. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#pragma once + +#define MPU_REGION_RAM 0 +#define MPU_REGION_SCRATCH_X 1 +#define MPU_REGION_SCRATCH_Y_DATA 2 +#define MPU_REGION_SCRATCH_Y_CODE 3 + +#define MPU_REGION_FLASH 7 + +#define STEP_RUNTIME_CLOCKS_INIT 16 +#define STEP_RUNTIME_CLOCKS_INIT2 17 +#define STEP_RUNTIME_CLOCKS_INIT3 18 +#define STEP_RUNTIME_CLOCKS_INIT4 19 +#define STEP_RUNTIME_CLOCKS_INIT5 20 +#define STEP_RUNTIME_CLOCKS_INIT_DONE 21 +#define STEP_MAIN 22 +// note decryot: expects count to be 23 diff --git a/enc_bootloader/memmap_enc_bootloader.ld b/enc_bootloader/memmap_enc_bootloader.ld index 6b08c5b3..d92441cc 100644 --- a/enc_bootloader/memmap_enc_bootloader.ld +++ b/enc_bootloader/memmap_enc_bootloader.ld @@ -24,7 +24,7 @@ MEMORY { RAM_START(rwx) : ORIGIN = 0x20080000, LENGTH = 0x44 - SCRATCH_X(rwx) : ORIGIN = 0x20080044, LENGTH = 0xFBC + SCRATCH_X(rwx) : ORIGIN = 0x20080044, LENGTH = 0xfbc SCRATCH_Y(rwx) : ORIGIN = 0x20081000, LENGTH = 0x800 RAM(rwx) : ORIGIN = 0x20081800, LENGTH = 0x800 } @@ -33,13 +33,7 @@ ENTRY(_entry_point) SECTIONS { - /* Note unlike RP2040, we start 
the image with a vector table even for - NO_FLASH builds. On Arm, the bootrom expects a VT at the start of the - image by default; on RISC-V, the default is to enter the image at its - lowest address, so an IMAGEDEF item is required to specify the - nondefault entry point. */ - - .start_text : { + .start : { __logical_binary_start = .; /* Vectors require 512-byte alignment on v8-M when >48 IRQs are used, so we would waste RAM if the vector table were not at the @@ -52,6 +46,7 @@ SECTIONS } > RAM_START .text : { + __text_start = .; __reset_start = .; KEEP (*(.reset)) __reset_end = .; @@ -123,26 +118,29 @@ SECTIONS KEEP(*(.mutex_array)) PROVIDE_HIDDEN (__mutex_array_end = .); + /* We skip all the initializers as we need very little runtime, and preinit_array items + * are __used, so would never be garbage collected even if referenced + */ . = ALIGN(4); /* preinit data */ - PROVIDE_HIDDEN (__preinit_array_start = .); - KEEP(*(SORT(.preinit_array.*))) - KEEP(*(.preinit_array)) - PROVIDE_HIDDEN (__preinit_array_end = .); +/* PROVIDE_HIDDEN (__preinit_array_start = .); */ +/* KEEP(*(SORT(.preinit_array.*)))*/ +/* KEEP(*(.preinit_array))*/ +/* PROVIDE_HIDDEN (__preinit_array_end = .);*/ . = ALIGN(4); /* init data */ - PROVIDE_HIDDEN (__init_array_start = .); - KEEP(*(SORT(.init_array.*))) - KEEP(*(.init_array)) - PROVIDE_HIDDEN (__init_array_end = .); +/* PROVIDE_HIDDEN (__init_array_start = .);*/ +/* KEEP(*(SORT(.init_array.*)))*/ +/* KEEP(*(.init_array))*/ +/* PROVIDE_HIDDEN (__init_array_end = .);*/ . = ALIGN(4); /* finit data */ - PROVIDE_HIDDEN (__fini_array_start = .); - *(SORT(.fini_array.*)) - *(.fini_array) - PROVIDE_HIDDEN (__fini_array_end = .); +/* PROVIDE_HIDDEN (__fini_array_start = .);*/ +/* *(SORT(.fini_array.*))*/ +/* *(.fini_array)*/ +/* PROVIDE_HIDDEN (__fini_array_end = .);*/ *(.jcr) . 
= ALIGN(4); @@ -237,6 +235,7 @@ SECTIONS __StackOneBottom = __StackOneTop - SIZEOF(.stack1_dummy); __StackBottom = __StackTop - SIZEOF(.stack_dummy); PROVIDE(__stack = __StackTop); + ASSERT(__StackBottom >= __scratch_y_end__, "stack collision in scratch_y") /* picolibc and LLVM */ PROVIDE (__heap_start = __end__); diff --git a/main.cpp b/main.cpp index 61617fd0..ee9bdb61 100644 --- a/main.cpp +++ b/main.cpp @@ -184,9 +184,11 @@ const string getFiletypeName(enum filetype type) struct cancelled_exception : std::exception { }; struct not_mapped_exception : std::exception { + explicit not_mapped_exception(uint32_t addr) : addr(addr), std::exception() {} const char *what() const noexcept override { return "Hmm uncaught not mapped"; } + uint32_t addr; }; // from -> to @@ -287,14 +289,14 @@ template struct range_map { auto f = m.upper_bound(p); if (f == m.end()) { if (m.empty()) - throw not_mapped_exception(); + throw not_mapped_exception(p); } else if (f == m.begin()) { - throw not_mapped_exception(); + throw not_mapped_exception(p); } f--; assert(p >= f->first); if (p >= f->second.first) { - throw not_mapped_exception(); + throw not_mapped_exception(p); } return std::make_pair(mapping(p - f->first, f->second.first - f->first), f->second.second); } @@ -897,7 +899,7 @@ struct seal_command : public cmd { ).force_expand_help(true) % "BIN file options" + named_file_selection_x("outfile", 1) % "File to save to" + optional_untyped_file_selection_x("key", 2) % "Key file (.pem)" + - optional_untyped_file_selection_x("otp", 3) % "JSON file to save OTP to (will edit existing file if it exists)" + + optional_untyped_file_selection_x("otp", 3) % "JSON file to save OTP to (will edit existing file if it exists)" + ( option("--major") & integer("major").set(settings.seal.major_version) @@ -2152,14 +2154,14 @@ struct iostream_memory_access : public memory_access { assert(this_size); file->seekg(result.second + result.first.offset, ios::beg); file->read((char*)buffer, this_size); - } 
catch (not_mapped_exception &e) { + } catch (not_mapped_exception &) { if (zero_fill) { // address is not in a range, so fill up to next range with zeros this_size = rmap.next(address) - address; this_size = std::min(this_size, size); memset(buffer, 0, this_size); } else { - throw e; + throw; } } buffer += this_size; @@ -3794,8 +3796,8 @@ void info_guts(memory_access &raw_access, void *con) { } } fos.flush(); - } catch (not_mapped_exception&) { - std::cout << "\nfailed to read memory\n"; + } catch (not_mapped_exception&e) { + std::cout << "\nfailed to read memory at " << hex_string(e.addr) << "\n"; } } @@ -5283,7 +5285,7 @@ bool encrypt_command::execute(device_map &devices) { // Sign the final thing settings.seal.clear_sram = true; sign_guts_elf(enc_elf, private_key, public_key); - + auto out = get_file_idx(ios::out|ios::binary, 1); enc_elf->write(out); out->close();