|
4 | 4 | #include "pycore_strhex.h" // _Py_strhex_with_sep() |
5 | 5 | #include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency() |
6 | 6 |
|
| 7 | +/* Scalar hexlify: convert len bytes to 2*len hex characters. |
| 8 | + Uses table lookup via Py_hexdigits for the conversion. */ |
| 9 | +static inline void |
| 10 | +_Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len) |
| 11 | +{ |
| 12 | + /* Various optimizations like using math instead of a table lookup, |
| 13 | + manually unrolling the loop, storing the global table pointer locally, |
| 14 | + and doing wider dst writes have been tried and benchmarked; all produced |
| 15 | + nearly identical performance on gcc 15. Using a 256 entry uint16_t |
| 16 | + table was a bit slower. So we keep our old simple and obvious code. */ |
| 17 | + for (Py_ssize_t i = 0; i < len; i++) { |
| 18 | + unsigned char c = src[i]; |
| 19 | + *dst++ = Py_hexdigits[c >> 4]; |
| 20 | + *dst++ = Py_hexdigits[c & 0x0f]; |
| 21 | + } |
| 22 | +} |
| 23 | + |
| 24 | +/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions. |
| 25 | + Uses __builtin_shufflevector for portable interleave that compiles to |
| 26 | + native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always], |
| 27 | + NEON zip1/zip2 on ARM64 [always], & vzip on ARM32 when compiler flags |
| 28 | + for the target microarch allow it [try -march=native if running 32-bit |
| 29 | + on an RPi3 or later]). |
| 30 | +
|
| 31 | + Performance: |
| 32 | + - For more common small data it varies between 1.1-3x faster. |
| 33 | + - Up to 11x faster on larger data than the scalar code. |
| 34 | +
|
| 35 | + While faster is possible for big data using AVX2 or AVX512, that |
| 36 | + adds a ton of complication. Who ever really hexes huge data? |
| 37 | + The 16-64 byte boosts align nicely with md5 - sha512 hexdigests. |
| 38 | +*/ |
| 39 | +#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR |
| 40 | + |
/* 128-bit vector of 16 unsigned bytes */
typedef unsigned char v16u8 __attribute__((vector_size(16)));
/* 128-bit vector of 16 signed bytes - for efficient comparison.
   Using signed comparison generates pcmpgtb on x86-64 instead of
   the slower psubusb+pcmpeqb sequence from unsigned comparison.
   ARM NEON performs the same either way. */
typedef signed char v16s8 __attribute__((vector_size(16)));

/* Broadcast one byte value into every one of the 16 lanes.
   The per-lane fill loop below is folded by GCC/Clang into a single
   splat/broadcast; lane subscripting on vector types is a GNU C
   extension, same as the vector types themselves. */
static inline v16u8
v16u8_splat(unsigned char x)
{
    v16u8 v;
    for (int lane = 0; lane < 16; lane++) {
        v[lane] = x;
    }
    return v;
}

static inline v16s8
v16s8_splat(signed char x)
{
    v16s8 v;
    for (int lane = 0; lane < 16; lane++) {
        v[lane] = x;
    }
    return v;
}
| 61 | + |
| 62 | +/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration. |
| 63 | + Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32). */ |
| 64 | +static void |
| 65 | +_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len) |
| 66 | +{ |
| 67 | + const v16u8 mask_0f = v16u8_splat(0x0f); |
| 68 | + const v16u8 ascii_0 = v16u8_splat('0'); |
| 69 | + const v16u8 offset = v16u8_splat('a' - '0' - 10); /* 0x27 */ |
| 70 | + const v16s8 nine = v16s8_splat(9); |
| 71 | + |
| 72 | + Py_ssize_t i = 0; |
| 73 | + |
| 74 | + /* Process 16 bytes at a time */ |
| 75 | + for (; i + 16 <= len; i += 16, dst += 32) { |
| 76 | + /* Load 16 bytes (memcpy for safe unaligned access) */ |
| 77 | + v16u8 data; |
| 78 | + memcpy(&data, src + i, 16); |
| 79 | + |
| 80 | + /* Extract high and low nibbles using vector operators */ |
| 81 | + v16u8 hi = (data >> 4) & mask_0f; |
| 82 | + v16u8 lo = data & mask_0f; |
| 83 | + |
| 84 | + /* Compare > 9 using signed comparison for efficient codegen. |
| 85 | + Nibble values 0-15 are safely in signed byte range. |
| 86 | + This generates pcmpgtb on x86-64, avoiding the slower |
| 87 | + psubusb+pcmpeqb sequence from unsigned comparison. */ |
| 88 | + v16u8 hi_gt9 = (v16u8)((v16s8)hi > nine); |
| 89 | + v16u8 lo_gt9 = (v16u8)((v16s8)lo > nine); |
| 90 | + |
| 91 | + /* Convert nibbles to hex ASCII */ |
| 92 | + hi = hi + ascii_0 + (hi_gt9 & offset); |
| 93 | + lo = lo + ascii_0 + (lo_gt9 & offset); |
| 94 | + |
| 95 | + /* Interleave hi/lo nibbles using portable shufflevector. |
| 96 | + This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64, |
| 97 | + or vzip on ARM32. */ |
| 98 | + v16u8 result0 = __builtin_shufflevector(hi, lo, |
| 99 | + 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); |
| 100 | + v16u8 result1 = __builtin_shufflevector(hi, lo, |
| 101 | + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); |
| 102 | + |
| 103 | + /* Store 32 hex characters */ |
| 104 | + memcpy(dst, &result0, 16); |
| 105 | + memcpy(dst + 16, &result1, 16); |
| 106 | + } |
| 107 | + |
| 108 | + /* Scalar fallback for remaining 0-15 bytes */ |
| 109 | + _Py_hexlify_scalar(src + i, dst, len - i); |
| 110 | +} |
| 111 | + |
| 112 | +#endif /* HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR */ |
| 113 | + |
7 | 114 | static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen, |
8 | 115 | PyObject* sep, int bytes_per_sep_group, |
9 | 116 | const int return_bytes) |
@@ -82,13 +189,15 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen, |
82 | 189 | unsigned char c; |
83 | 190 |
|
84 | 191 | if (bytes_per_sep_group == 0) { |
85 | | - for (i = j = 0; i < arglen; ++i) { |
86 | | - assert((j + 1) < resultlen); |
87 | | - c = argbuf[i]; |
88 | | - retbuf[j++] = Py_hexdigits[c >> 4]; |
89 | | - retbuf[j++] = Py_hexdigits[c & 0x0f]; |
| 192 | +#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR |
| 193 | + if (arglen >= 16) { |
| 194 | + _Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen); |
| 195 | + } |
| 196 | + else |
| 197 | +#endif |
| 198 | + { |
| 199 | + _Py_hexlify_scalar((const unsigned char *)argbuf, retbuf, arglen); |
90 | 200 | } |
91 | | - assert(j == resultlen); |
92 | 201 | } |
93 | 202 | else { |
94 | 203 | /* The number of complete chunk+sep periods */ |
|
0 commit comments