From adb550afcc5e7612adf79cbdbf7d0e0b10e1ceb1 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Tue, 17 Feb 2026 11:29:05 +0100 Subject: [PATCH] use `read_unaligned` for f64 `vld` and `vldq` --- .../core_arch/src/aarch64/neon/generated.rs | 78 +++++-------------- crates/core_arch/src/aarch64/neon/mod.rs | 8 ++ .../spec/neon/aarch64.spec.yml | 15 ++-- 3 files changed, 34 insertions(+), 67 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index a0647551e4..28db407924 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -11488,16 +11488,9 @@ pub unsafe fn vld1q_p64(ptr: *const p64) -> poly64x2_t { #[inline(always)] #[target_feature(enable = "neon")] #[stable(feature = "neon_intrinsics", since = "1.59.0")] -#[cfg_attr(test, assert_instr(ld1))] -pub unsafe fn vld1_f64_x2(a: *const f64) -> float64x1x2_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld1x2.v1f64.p0" - )] - fn _vld1_f64_x2(a: *const f64) -> float64x1x2_t; - } - _vld1_f64_x2(a) +#[cfg_attr(test, assert_instr(ld))] +pub unsafe fn vld1_f64_x2(ptr: *const f64) -> float64x1x2_t { + crate::ptr::read_unaligned(ptr.cast()) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f64_x3)"] @@ -11506,16 +11499,9 @@ pub unsafe fn vld1_f64_x2(a: *const f64) -> float64x1x2_t { #[inline(always)] #[target_feature(enable = "neon")] #[stable(feature = "neon_intrinsics", since = "1.59.0")] -#[cfg_attr(test, assert_instr(ld1))] -pub unsafe fn vld1_f64_x3(a: *const f64) -> float64x1x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld1x3.v1f64.p0" - )] - fn _vld1_f64_x3(a: *const f64) -> float64x1x3_t; - } - _vld1_f64_x3(a) +#[cfg_attr(test, assert_instr(ld))] +pub unsafe fn vld1_f64_x3(ptr: *const f64) -> float64x1x3_t { + crate::ptr::read_unaligned(ptr.cast()) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_f64_x4)"] @@ -11524,16 +11510,9 @@ pub unsafe fn vld1_f64_x3(a: *const f64) -> float64x1x3_t { #[inline(always)] #[target_feature(enable = "neon")] #[stable(feature = "neon_intrinsics", since = "1.59.0")] -#[cfg_attr(test, assert_instr(ld1))] -pub unsafe fn vld1_f64_x4(a: *const f64) -> float64x1x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld1x4.v1f64.p0" - )] - fn _vld1_f64_x4(a: *const f64) -> float64x1x4_t; - } - _vld1_f64_x4(a) +#[cfg_attr(test, assert_instr(ld))] +pub unsafe fn vld1_f64_x4(ptr: *const f64) -> float64x1x4_t { + crate::ptr::read_unaligned(ptr.cast()) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f64_x2)"] @@ -11542,16 +11521,9 @@ pub unsafe fn vld1_f64_x4(a: *const f64) -> float64x1x4_t { #[inline(always)] #[target_feature(enable = "neon")] #[stable(feature = "neon_intrinsics", since = "1.59.0")] -#[cfg_attr(test, assert_instr(ld1))] -pub unsafe fn vld1q_f64_x2(a: *const f64) -> float64x2x2_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld1x2.v2f64.p0" - )] - fn _vld1q_f64_x2(a: *const f64) -> float64x2x2_t; - } - _vld1q_f64_x2(a) +#[cfg_attr(test, assert_instr(ld))] +pub unsafe fn vld1q_f64_x2(ptr: *const f64) -> float64x2x2_t { + crate::ptr::read_unaligned(ptr.cast()) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f64_x3)"] @@ -11560,16 +11532,9 @@ pub unsafe fn vld1q_f64_x2(a: *const f64) -> float64x2x2_t { #[inline(always)] #[target_feature(enable = "neon")] #[stable(feature = "neon_intrinsics", since = "1.59.0")] -#[cfg_attr(test, assert_instr(ld1))] -pub unsafe fn vld1q_f64_x3(a: *const f64) -> float64x2x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld1x3.v2f64.p0" - )] - fn _vld1q_f64_x3(a: *const f64) -> float64x2x3_t; - } - _vld1q_f64_x3(a) +#[cfg_attr(test, assert_instr(ld))] +pub unsafe fn vld1q_f64_x3(ptr: *const f64) -> float64x2x3_t { + crate::ptr::read_unaligned(ptr.cast()) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_f64_x4)"] @@ -11578,16 +11543,9 @@ pub unsafe fn vld1q_f64_x3(a: *const f64) -> float64x2x3_t { #[inline(always)] #[target_feature(enable = "neon")] #[stable(feature = "neon_intrinsics", since = "1.59.0")] -#[cfg_attr(test, assert_instr(ld1))] -pub unsafe fn vld1q_f64_x4(a: *const f64) -> float64x2x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld1x4.v2f64.p0" - )] - fn _vld1q_f64_x4(a: *const f64) -> float64x2x4_t; - } - _vld1q_f64_x4(a) +#[cfg_attr(test, assert_instr(ld))] +pub unsafe fn vld1q_f64_x4(ptr: *const f64) -> float64x2x4_t { + crate::ptr::read_unaligned(ptr.cast()) } #[doc = "Load single 2-element structure and replicate to all lanes of two registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_f64)"] diff --git a/crates/core_arch/src/aarch64/neon/mod.rs b/crates/core_arch/src/aarch64/neon/mod.rs index 135d0a156d..c39b3e93af 100644 --- a/crates/core_arch/src/aarch64/neon/mod.rs +++ b/crates/core_arch/src/aarch64/neon/mod.rs @@ -1093,6 +1093,14 @@ mod tests { test_vld1q_f32_x3(f32, 12, float32x4x3_t, vst1q_f32_x3, vld1q_f32_x3); test_vld1q_f32_x4(f32, 16, float32x4x4_t, vst1q_f32_x4, vld1q_f32_x4); + test_vld1_f64_x2(f64, 2, float64x1x2_t, vst1_f64_x2, vld1_f64_x2); + test_vld1_f64_x3(f64, 3, float64x1x3_t, vst1_f64_x3, vld1_f64_x3); + test_vld1_f64_x4(f64, 4, float64x1x4_t, vst1_f64_x4, vld1_f64_x4); + + test_vld1q_f64_x2(f64, 4, float64x2x2_t, vst1q_f64_x2, vld1q_f64_x2); + test_vld1q_f64_x3(f64, 6, float64x2x3_t, vst1q_f64_x3, vld1q_f64_x3); + test_vld1q_f64_x4(f64, 8, float64x2x4_t, vst1q_f64_x4, vld1q_f64_x4); + test_vld1_s8_x2(i8, 16, int8x8x2_t, vst1_s8_x2, vld1_s8_x2); test_vld1_s8_x3(i8, 24, int8x8x3_t, vst1_s8_x3, vld1_s8_x3); test_vld1_s8_x4(i8, 32, int8x8x4_t, vst1_s8_x4, vld1_s8_x4); diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index 95f23ebd9a..ec9d49a510 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -3479,10 +3479,10 @@ intrinsics: - name: "vld1{neon_type[1].no}" doc: "Load multiple single-element structures to one, two, three, or four registers" - arguments: ["a: {type[0]}"] + arguments: ["ptr: {type[0]}"] return_type: "{neon_type[1]}" attr: [*neon-stable] - assert_instr: [ld1] + assert_instr: [ld] safety: unsafe: [neon] types: @@ -3493,11 +3493,12 @@ intrinsics: - ["*const f64", float64x1x4_t] - ["*const f64", float64x2x4_t] compose: - - LLVMLink: - name: "vld1{neon_type[1].no}" - links: - - link: "llvm.aarch64.neon.ld1x{neon_type[1].tuple}.v{neon_type[1].lane}f{neon_type[1].base}.p0" - arch: aarch64,arm64ec + - FnCall: + - 'crate::ptr::read_unaligned' + - - MethodCall: + - ptr + - cast + - [] - name: "vld2{neon_type[1].lane_nox}" doc: Load multiple 2-element structures to two registers