Skip to content

Commit bc112a2

Browse files
authored
Merge branch 'main' into dataclasses_frozendict_tests
2 parents 97cec2d + 3dc8fdb commit bc112a2

File tree

9 files changed

+272
-11
lines changed

9 files changed

+272
-11
lines changed

Lib/difflib.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -942,10 +942,12 @@ def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
942942
cruncher.set_seq1(a[i])
943943
# Ordering by cheapest to most expensive ratio is very
944944
# valuable, most often getting out early.
945-
if (crqr() > best_ratio
946-
and cqr() > best_ratio
947-
and cr() > best_ratio):
948-
best_i, best_j, best_ratio = i, j, cr()
945+
if crqr() <= best_ratio or cqr() <= best_ratio:
946+
continue
947+
948+
ratio = cr()
949+
if ratio > best_ratio:
950+
best_i, best_j, best_ratio = i, j, ratio
949951

950952
if best_i is None:
951953
# found nothing to synch on yet - move to next j

Lib/test/test_bytes.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -584,6 +584,37 @@ def test_hex_separator_six_bytes(self):
584584
self.assertEqual(six_bytes.hex(':', -6), '0306090c0f12')
585585
self.assertEqual(six_bytes.hex(' ', -95), '0306090c0f12')
586586

587+
def test_hex_simd_boundaries(self):
588+
# Test lengths around the SIMD threshold (16 bytes) and its multiples.
589+
# SIMD processes 16 bytes at a time; smaller inputs use scalar code.
590+
for length in (14, 15, 16, 17, 31, 32, 33, 64, 65):
591+
data = self.type2test(bytes(range(length)))
592+
expected = ''.join(f'{b:02x}' for b in range(length))
593+
with self.subTest(length=length):
594+
self.assertEqual(data.hex(), expected)
595+
596+
def test_hex_nibble_boundaries(self):
597+
# Test the nibble value boundary at 9/10 (where '9' becomes 'a').
598+
# SIMD uses signed comparison for efficiency; verify correctness
599+
# at this boundary for various nibble combinations.
600+
boundary_bytes = self.type2test(bytes([
601+
0x09, # both nibbles: 0, 9
602+
0x0a, # both nibbles: 0, 10
603+
0x90, # both nibbles: 9, 0
604+
0x99, # both nibbles: 9, 9 (max all-digit)
605+
0x9a, # both nibbles: 9, 10
606+
0xa0, # both nibbles: 10, 0
607+
0xa9, # both nibbles: 10, 9
608+
0xaa, # both nibbles: 10, 10 (min all-letter)
609+
0x00, # min value
610+
0xff, # max value
611+
]))
612+
self.assertEqual(boundary_bytes.hex(), '090a90999aa0a9aa00ff')
613+
614+
# Repeat with 16+ bytes to exercise SIMD path
615+
simd_boundary = self.type2test(boundary_bytes * 2)
616+
self.assertEqual(simd_boundary.hex(), '090a90999aa0a9aa00ff' * 2)
617+
587618
def test_join(self):
588619
self.assertEqual(self.type2test(b"").join([]), b"")
589620
self.assertEqual(self.type2test(b"").join([b""]), b"")

Lib/test/test_dict.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1736,6 +1736,16 @@ class FrozenDictSlots(frozendict):
17361736

17371737

17381738
class FrozenDictTests(unittest.TestCase):
1739+
def test_constructor(self):
1740+
# frozendict.__init__() has no effect
1741+
d = frozendict(a=1, b=2, c=3)
1742+
d.__init__(x=1)
1743+
self.assertEqual(d, frozendict(a=1, b=2, c=3))
1744+
1745+
# dict constructor cannot be used on frozendict
1746+
with self.assertRaises(TypeError):
1747+
dict.__init__(d, x=1)
1748+
17391749
def test_copy(self):
17401750
d = frozendict(x=1, y=2)
17411751
d2 = d.copy()
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Speed up :meth:`bytes.hex`, :meth:`bytearray.hex`, :func:`binascii.hexlify`,
2+
and :mod:`hashlib` ``.hexdigest()`` operations with SIMD on x86-64, ARM64,
3+
and ARM32 with NEON when built with gcc (version 12 or higher) or clang
4+
(version 3 or higher) compilers. Around 1.1-3x faster for common 16-64 byte
5+
inputs such as hashlib hex digests, and up to 8x faster for larger data.

Objects/dictobject.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8143,7 +8143,6 @@ PyTypeObject PyFrozenDict_Type = {
81438143
.tp_richcompare = dict_richcompare,
81448144
.tp_iter = dict_iter,
81458145
.tp_methods = frozendict_methods,
8146-
.tp_init = dict_init,
81478146
.tp_alloc = _PyType_AllocNoTrack,
81488147
.tp_new = frozendict_new,
81498148
.tp_free = PyObject_GC_Del,

Python/pystrhex.c

Lines changed: 115 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,113 @@
44
#include "pycore_strhex.h" // _Py_strhex_with_sep()
55
#include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()
66

7+
/* Scalar hexlify: convert len bytes to 2*len hex characters.
8+
Uses table lookup via Py_hexdigits for the conversion. */
9+
static inline void
10+
_Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
11+
{
12+
/* Various optimizations like using math instead of a table lookup,
13+
manually unrolling the loop, storing the global table pointer locally,
14+
and doing wider dst writes have been tried and benchmarked; all produced
15+
nearly identical performance on gcc 15. Using a 256-entry uint16_t
16+
table was a bit slower. So we keep our old simple and obvious code. */
17+
for (Py_ssize_t i = 0; i < len; i++) {
18+
unsigned char c = src[i];
19+
*dst++ = Py_hexdigits[c >> 4];
20+
*dst++ = Py_hexdigits[c & 0x0f];
21+
}
22+
}
23+
24+
/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions.
25+
Uses __builtin_shufflevector for portable interleave that compiles to
26+
native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always],
27+
NEON zip1/zip2 on ARM64 [always], & vzip on ARM32 when compiler flags
28+
for the target microarch allow it [try -march=native if running 32-bit
29+
on an RPi3 or later]).
30+
31+
Performance:
32+
- For more common small data it varies between 1.1-3x faster.
33+
- Up to 11x faster on larger data than the scalar code.
34+
35+
While faster is possible for big data using AVX2 or AVX512, that
36+
adds a ton of complication. Who ever really hexes huge data?
37+
The 16-64 byte boosts align nicely with md5 - sha512 hexdigests.
38+
*/
39+
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
40+
41+
/* 128-bit vector of 16 unsigned bytes */
42+
typedef unsigned char v16u8 __attribute__((vector_size(16)));
43+
/* 128-bit vector of 16 signed bytes - for efficient comparison.
44+
Using signed comparison generates pcmpgtb on x86-64 instead of
45+
the slower psubusb+pcmpeqb sequence from unsigned comparison.
46+
ARM NEON performs the same either way. */
47+
typedef signed char v16s8 __attribute__((vector_size(16)));
48+
49+
/* Splat a byte value across all 16 lanes */
50+
static inline v16u8
51+
v16u8_splat(unsigned char x)
52+
{
53+
return (v16u8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
54+
}
55+
56+
static inline v16s8
57+
v16s8_splat(signed char x)
58+
{
59+
return (v16s8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
60+
}
61+
62+
/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration.
63+
Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32). */
64+
static void
65+
_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
66+
{
67+
const v16u8 mask_0f = v16u8_splat(0x0f);
68+
const v16u8 ascii_0 = v16u8_splat('0');
69+
const v16u8 offset = v16u8_splat('a' - '0' - 10); /* 0x27 */
70+
const v16s8 nine = v16s8_splat(9);
71+
72+
Py_ssize_t i = 0;
73+
74+
/* Process 16 bytes at a time */
75+
for (; i + 16 <= len; i += 16, dst += 32) {
76+
/* Load 16 bytes (memcpy for safe unaligned access) */
77+
v16u8 data;
78+
memcpy(&data, src + i, 16);
79+
80+
/* Extract high and low nibbles using vector operators */
81+
v16u8 hi = (data >> 4) & mask_0f;
82+
v16u8 lo = data & mask_0f;
83+
84+
/* Compare > 9 using signed comparison for efficient codegen.
85+
Nibble values 0-15 are safely in signed byte range.
86+
This generates pcmpgtb on x86-64, avoiding the slower
87+
psubusb+pcmpeqb sequence from unsigned comparison. */
88+
v16u8 hi_gt9 = (v16u8)((v16s8)hi > nine);
89+
v16u8 lo_gt9 = (v16u8)((v16s8)lo > nine);
90+
91+
/* Convert nibbles to hex ASCII */
92+
hi = hi + ascii_0 + (hi_gt9 & offset);
93+
lo = lo + ascii_0 + (lo_gt9 & offset);
94+
95+
/* Interleave hi/lo nibbles using portable shufflevector.
96+
This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64,
97+
or vzip on ARM32. */
98+
v16u8 result0 = __builtin_shufflevector(hi, lo,
99+
0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
100+
v16u8 result1 = __builtin_shufflevector(hi, lo,
101+
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
102+
103+
/* Store 32 hex characters */
104+
memcpy(dst, &result0, 16);
105+
memcpy(dst + 16, &result1, 16);
106+
}
107+
108+
/* Scalar fallback for remaining 0-15 bytes */
109+
_Py_hexlify_scalar(src + i, dst, len - i);
110+
}
111+
112+
#endif /* HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR */
113+
7114
static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
8115
PyObject* sep, int bytes_per_sep_group,
9116
const int return_bytes)
@@ -82,13 +189,15 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
82189
unsigned char c;
83190

84191
if (bytes_per_sep_group == 0) {
85-
for (i = j = 0; i < arglen; ++i) {
86-
assert((j + 1) < resultlen);
87-
c = argbuf[i];
88-
retbuf[j++] = Py_hexdigits[c >> 4];
89-
retbuf[j++] = Py_hexdigits[c & 0x0f];
192+
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
193+
if (arglen >= 16) {
194+
_Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
195+
}
196+
else
197+
#endif
198+
{
199+
_Py_hexlify_scalar((const unsigned char *)argbuf, retbuf, arglen);
90200
}
91-
assert(j == resultlen);
92201
}
93202
else {
94203
/* The number of complete chunk+sep periods */

configure

Lines changed: 66 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

configure.ac

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5017,6 +5017,41 @@ AS_VAR_IF([ac_cv_builtin_atomic], [yes], [
50175017
AC_DEFINE(HAVE_BUILTIN_ATOMIC, 1, [Has builtin __atomic_load_n() and __atomic_store_n() functions])
50185018
])
50195019

5020+
# Check for __builtin_shufflevector with 128-bit vector support on an
5021+
# architecture where it compiles to worthwhile native SIMD instructions.
5022+
# Used for SIMD-accelerated bytes.hex() in Python/pystrhex.c.
5023+
AC_CACHE_CHECK([for __builtin_shufflevector], [ac_cv_efficient_builtin_shufflevector], [
5024+
AC_LINK_IFELSE([
5025+
AC_LANG_PROGRAM([[
5026+
/* __builtin_shufflevector is available on many platforms, but 128-bit
5027+
vector code is only worthwhile on architectures with native SIMD:
5028+
x86-64 (SSE2, always available), ARM64 (NEON, always available),
5029+
or ARM32 when NEON is enabled via compiler flags (e.g. -march=native
5030+
on RPi3+). On ARM32 without NEON (e.g. armv6 builds), the compiler
5031+
has the builtin but generates slow scalar code instead. */
5032+
#if !defined(__x86_64__) && !defined(__aarch64__) && \
5033+
!(defined(__arm__) && defined(__ARM_NEON))
5034+
# error "128-bit vector SIMD not worthwhile on this architecture"
5035+
#endif
5036+
typedef unsigned char v16u8 __attribute__((vector_size(16)));
5037+
]], [[
5038+
v16u8 a = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
5039+
v16u8 b = {16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
5040+
v16u8 c = __builtin_shufflevector(a, b,
5041+
0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
5042+
(void)c;
5043+
return 0;
5044+
]])
5045+
],[ac_cv_efficient_builtin_shufflevector=yes],[ac_cv_efficient_builtin_shufflevector=no])
5046+
])
5047+
5048+
AS_VAR_IF([ac_cv_efficient_builtin_shufflevector], [yes], [
5049+
AC_DEFINE([HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR], [1],
5050+
[Define if compiler supports __builtin_shufflevector with 128-bit
5051+
vectors AND the target architecture has native SIMD (not just API
5052+
availability)])
5053+
])
5054+
50205055
# --with-mimalloc
50215056
AC_MSG_CHECKING([for --with-mimalloc])
50225057
AC_ARG_WITH([mimalloc],

pyconfig.h.in

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,10 @@
324324
/* Define to 1 if you have the <editline/readline.h> header file. */
325325
#undef HAVE_EDITLINE_READLINE_H
326326

327+
/* Define if compiler supports __builtin_shufflevector with 128-bit vectors
328+
AND the target architecture has native SIMD (not just API availability) */
329+
#undef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
330+
327331
/* Define to 1 if you have the <endian.h> header file. */
328332
#undef HAVE_ENDIAN_H
329333

0 commit comments

Comments
 (0)