diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1208b35..5bcd2dc 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,6 +5,16 @@ NEXT ~~~~~~~~~~~~~~~~~ - Drop support for Python 3.8 +- Remove deprecated ``xxhash.VERSION_TUPLE`` +- Streaming hash objects (``xxh32``, ``xxh64``, ``xxh3_64``, + ``xxh3_128``) in the default module are no longer thread-safe; this removes + per-object locking overhead and restores performance as the primary goal +- Add ``xxhash.threadsafe`` submodule for users who need to share a + streaming hash object across threads; it provides the same API with a + per-object lock +- Both the default module and ``xxhash.threadsafe`` are provided on + free-threading (no-GIL) Python builds, matching the API on regular GIL + builds v3.7.0 2025-04-25 ~~~~~~~~~~~~~~~~~ diff --git a/README.rst b/README.rst index 3ec4e9c..c1abbb1 100644 --- a/README.rst +++ b/README.rst @@ -251,6 +251,36 @@ And aliases: | xxh128_intdigest = xxh3_128_intdigest | xxh128_hexdigest = xxh3_128_hexdigest +Thread safety +------------- + +The default ``xxhash`` module is optimized for speed. Streaming hash objects +(``xxh32``, ``xxh64``, ``xxh3_64``, ``xxh3_128`` / ``xxh128``) are **not** +thread-safe: do not call ``update()``, ``digest()``, ``copy()``, ``reset()``, +or any other method on the same object from multiple threads without +external synchronization. + +One-shot functions (``xxh32_digest``, ``xxh64_hexdigest``, ``xxh3_128_digest``, +etc.) are stateless and always safe to call concurrently. + +Concurrent ``update()`` / ``reset()`` on a shared streaming hash object is +discouraged even with locking — prefer one-shot functions or per-thread hash +objects. If you must share a streaming hash across threads, use the +``xxhash.threadsafe`` submodule. It provides the same API with a per-object +lock that serializes all access to the internal xxHash state: + +.. 
code-block:: python + + >>> from xxhash import threadsafe + >>> h = threadsafe.xxh64() + >>> # h can be updated from multiple threads, but concurrent update/reset + >>> # still adds overhead and is not recommended + +The same two-module split is provided on free-threading (no-GIL) Python +builds: the default module is unlocked, and ``xxhash.threadsafe`` provides a +locked variant. + + Caveats ------- diff --git a/setup.py b/setup.py index 6fed6f6..9b4d77d 100644 --- a/setup.py +++ b/setup.py @@ -2,6 +2,7 @@ from pathlib import Path from setuptools import Extension, setup +from setuptools.command.build_ext import build_ext as _build_ext if os.getenv("XXHASH_LINK_SO"): libraries = ["xxhash"] @@ -12,15 +13,51 @@ source = ["src/_xxhash.c", "deps/xxhash/xxhash.c"] include_dirs = ["deps/xxhash"] +# The default ``xxhash._xxhash`` extension is built without per-object locks +# for maximum performance. Users who need to share a streaming hash object +# across threads can use ``xxhash._xxhash_threadsafe`` (exposed as the public +# ``xxhash.threadsafe`` submodule), which is compiled from the same source +# with locking enabled. + +_ext_kwargs = { + "sources": source, + "include_dirs": include_dirs, + "libraries": libraries, +} + ext_modules = [ Extension( "_xxhash", - source, - include_dirs=include_dirs, - libraries=libraries, - ) + **_ext_kwargs, + ), + Extension( + "_xxhash_threadsafe", + define_macros=[("XXHASH_WITH_LOCK", "1"), ("XXHASH_MODULE_NAME", "_xxhash_threadsafe")], + **_ext_kwargs, + ), ] + +class build_ext(_build_ext): + """Build each extension in its own temp directory. + + Both extensions are built from the same ``src/_xxhash.c`` source file. + Without separate temp directories their object files would overwrite + each other, causing one variant to be linked with the wrong macros. + + ``try/finally`` restores ``self.build_temp`` so that incremental builds + (where ``build_ext`` may be reused) still work correctly. 
+ """ + + def build_extension(self, ext): + old_build_temp = self.build_temp + self.build_temp = os.path.join(old_build_temp, ext.name) + try: + super().build_extension(ext) + finally: + self.build_temp = old_build_temp + + d = Path(__file__).parent long_description = d.joinpath("README.rst").read_text() + "\n" + d.joinpath("CHANGELOG.rst").read_text() @@ -58,5 +95,6 @@ ], python_requires=">=3.9", ext_modules=ext_modules, + cmdclass={"build_ext": build_ext}, package_data={"xxhash": ["py.typed", "**.pyi"]}, ) diff --git a/src/_xxhash.c b/src/_xxhash.c index d79f006..5c3b025 100644 --- a/src/_xxhash.c +++ b/src/_xxhash.c @@ -35,60 +35,71 @@ /* ------------------------------------------------------------------ */ /* Lock type & helpers */ /* ------------------------------------------------------------------ */ -#if PY_VERSION_HEX >= 0x030d0000 /* Python 3.13+: always-on PyMutex (3.15+ style) */ -# define XXHASH_LOCK_FIELD PyMutex mutex; -# define XXHASH_LOCK_INIT(o) ((void)((o)->mutex = (PyMutex){0})) -# define XXHASH_LOCK_IS_ACTIVE(o) 1 -# define XXHASH_LOCK_MAYBE_INIT(o, len) ((void)0) -# define XXHASH_LOCK_FINI(o) ((void)0) -# define XXHASH_LOCK_ACQUIRE(o) PyMutex_Lock(&(o)->mutex) -# define XXHASH_LOCK_ACQUIRE_BLOCKING(o) XXHASH_LOCK_ACQUIRE(o) -# define XXHASH_LOCK_RELEASE(o) PyMutex_Unlock(&(o)->mutex) -#else /* Python 3.9-3.12: PyThread_type_lock */ -# define XXHASH_LOCK_FIELD PyThread_type_lock lock; -# define XXHASH_LOCK_INIT(o) ((o)->lock = NULL) -# define XXHASH_LOCK_IS_ACTIVE(o) ((o)->lock != NULL) +#ifdef XXHASH_WITH_LOCK +# if PY_VERSION_HEX >= 0x030d0000 /* Python 3.13+: always-on PyMutex (3.15+ style) */ +# define XXHASH_LOCK_FIELD PyMutex mutex; +# define XXHASH_LOCK_INIT(o) ((void)((o)->mutex = (PyMutex){0})) +# define XXHASH_LOCK_IS_ACTIVE(o) 1 +# define XXHASH_LOCK_MAYBE_INIT(o, len) ((void)0) +# define XXHASH_LOCK_FINI(o) ((void)0) +# define XXHASH_LOCK_ACQUIRE(o) PyMutex_Lock(&(o)->mutex) +# define XXHASH_LOCK_ACQUIRE_BLOCKING(o) 
XXHASH_LOCK_ACQUIRE(o) +# define XXHASH_LOCK_RELEASE(o) PyMutex_Unlock(&(o)->mutex) +# else /* Python 3.9-3.12: PyThread_type_lock */ +# define XXHASH_LOCK_FIELD PyThread_type_lock lock; +# define XXHASH_LOCK_INIT(o) ((o)->lock = NULL) +# define XXHASH_LOCK_IS_ACTIVE(o) ((o)->lock != NULL) /* Lazy allocation on first large update */ -# define XXHASH_LOCK_MAYBE_INIT(o, len) \ - do { \ - if ((o)->lock == NULL && (len) >= XXHASH_GIL_MINSIZE) { \ - (o)->lock = PyThread_allocate_lock(); \ - /* fail? lock stays NULL, fall back to non-threaded code. */ \ - } \ - } while (0) -# define XXHASH_LOCK_FINI(o) do { if ((o)->lock) \ - PyThread_free_lock((o)->lock); \ - } while (0) +# define XXHASH_LOCK_MAYBE_INIT(o, len) \ + do { \ + if ((o)->lock == NULL && (len) >= XXHASH_GIL_MINSIZE) { \ + (o)->lock = PyThread_allocate_lock(); \ + /* fail? lock stays NULL, fall back to non-threaded code. */ \ + } \ + } while (0) +# define XXHASH_LOCK_FINI(o) do { if ((o)->lock) \ + PyThread_free_lock((o)->lock); \ + } while (0) /* Acquire lock when GIL is already released — simple blocking acquire. * Only acquires if lock has been allocated (lazy init). */ -# define XXHASH_LOCK_ACQUIRE_BLOCKING(o) \ - do { \ - if ((o)->lock) { \ - PyThread_acquire_lock((o)->lock, WAIT_LOCK); \ - } \ - } while (0) +# define XXHASH_LOCK_ACQUIRE_BLOCKING(o) \ + do { \ + if ((o)->lock) { \ + PyThread_acquire_lock((o)->lock, WAIT_LOCK); \ + } \ + } while (0) /* Acquire lock with the GIL held — non-blocking try first, then release * GIL and block if contested (matches hashlib's ENTER_HASHLIB in 3.9-3.12). * Only acquires if lock has been allocated (lazy init). */ -# define XXHASH_LOCK_ACQUIRE(o) \ - do { \ - if ((o)->lock) { \ - if (!PyThread_acquire_lock((o)->lock, NOWAIT_LOCK)) { \ - /* Lock contested – release GIL while waiting. 
*/ \ - Py_BEGIN_ALLOW_THREADS \ - PyThread_acquire_lock((o)->lock, WAIT_LOCK); \ - Py_END_ALLOW_THREADS \ - } \ - } \ - } while (0) - -# define XXHASH_LOCK_RELEASE(o) \ - do { \ - if ((o)->lock) { \ - PyThread_release_lock((o)->lock); \ - } \ - } while (0) +# define XXHASH_LOCK_ACQUIRE(o) \ + do { \ + if ((o)->lock) { \ + if (!PyThread_acquire_lock((o)->lock, NOWAIT_LOCK)) { \ + /* Lock contested – release GIL while waiting. */ \ + Py_BEGIN_ALLOW_THREADS \ + PyThread_acquire_lock((o)->lock, WAIT_LOCK); \ + Py_END_ALLOW_THREADS \ + } \ + } \ + } while (0) + +# define XXHASH_LOCK_RELEASE(o) \ + do { \ + if ((o)->lock) { \ + PyThread_release_lock((o)->lock); \ + } \ + } while (0) +# endif +#else /* !XXHASH_WITH_LOCK */ +# define XXHASH_LOCK_FIELD +# define XXHASH_LOCK_INIT(o) ((void)0) +# define XXHASH_LOCK_IS_ACTIVE(o) 0 +# define XXHASH_LOCK_MAYBE_INIT(o, len) ((void)0) +# define XXHASH_LOCK_FINI(o) ((void)0) +# define XXHASH_LOCK_ACQUIRE(o) ((void)0) +# define XXHASH_LOCK_ACQUIRE_BLOCKING(o) ((void)0) +# define XXHASH_LOCK_RELEASE(o) ((void)0) #endif /* Data size threshold for releasing the GIL during hash. */ @@ -98,6 +109,15 @@ #define VALUE_TO_STRING(x) TOSTRING(x) #define XXHASH_VERSION XXH_VERSION_MAJOR.XXH_VERSION_MINOR.XXH_VERSION_RELEASE +/* Module name is parameterised so the same source file can be compiled as + * both xxhash._xxhash and xxhash._xxhash_threadsafe. */ +#ifndef XXHASH_MODULE_NAME +# define XXHASH_MODULE_NAME _xxhash +#endif +#define XXHASH_PASTE2(a, b) a ## b +#define XXHASH_PASTE(a, b) XXHASH_PASTE2(a, b) +#define XXHASH_PYINIT(name) XXHASH_PASTE(PyInit_, name) + #define XXH32_DIGESTSIZE 4 #define XXH32_BLOCKSIZE 16 #define XXH64_DIGESTSIZE 8 @@ -105,13 +125,10 @@ #define XXH128_DIGESTSIZE 16 #define XXH128_BLOCKSIZE 64 -#ifndef Py_ALWAYS_INLINE -# define Py_ALWAYS_INLINE -#endif /* Hex lookup table for hexdigest(). */ /* Get a buffer from an object. Rejects str with hashlib-compatible error. 
*/ -static inline Py_ALWAYS_INLINE int +static inline int _get_buffer_or_str(PyObject *obj, Py_buffer *buf) { if (obj == Py_None) { @@ -624,11 +641,14 @@ static void PYXXH32_dealloc(PYXXH32Object *self) } /* Macro to generate _do_update for each hash type. - * Matches CPython 3.9-3.12 md5 pattern: release GIL first (for large data), - * then acquire lock, hash, release lock, re-acquire GIL. - * For small data, acquire lock with GIL held (try-then-block if contested). */ + * When XXHASH_WITH_LOCK is defined: matches CPython 3.9-3.12 md5 pattern, + * release GIL first (for large data), then acquire lock, hash, release lock, + * re-acquire GIL. For small data, acquire lock with GIL held + * (try-then-block if contested). + * When XXHASH_WITH_LOCK is not defined: no locking, but still release GIL + * for large data to avoid blocking other threads. */ #define XXHASH_DO_UPDATE(type, update_fn) \ -static inline Py_ALWAYS_INLINE void \ +static inline void \ PY##type##_do_update(PY##type##Object *self, Py_buffer *buf) \ { \ XXHASH_LOCK_MAYBE_INIT(self, buf->len); \ @@ -647,8 +667,14 @@ PY##type##_do_update(PY##type##Object *self, Py_buffer *buf) \ XXHASH_LOCK_RELEASE(self); \ } \ } else { \ - /* No lock: hash directly, no GIL release. 
*/ \ - update_fn(self->xxhash_state, buf->buf, buf->len); \ + /* No lock */ \ + if (buf->len > XXHASH_GIL_MINSIZE) { \ + Py_BEGIN_ALLOW_THREADS \ + update_fn(self->xxhash_state, buf->buf, buf->len); \ + Py_END_ALLOW_THREADS \ + } else { \ + update_fn(self->xxhash_state, buf->buf, buf->len); \ + } \ } \ PyBuffer_Release(buf); \ } @@ -737,9 +763,7 @@ _parse_init_args(PyObject *args, PyObject *kwargs, { Py_ssize_t nargs = PyTuple_GET_SIZE(args); - if (!kwargs) { - /* fast path: no keywords */ - } else { + if (kwargs) { Py_ssize_t pos = 0; PyObject *key, *val; while (PyDict_Next(kwargs, &pos, &key, &val)) { @@ -826,58 +850,63 @@ static int PY##type##_init(PY##type##Object *self, PyObject *args, \ XXHASH_INIT(XXH32, XXH32_reset, XXH32_update, XXH32_hash_t) -PyDoc_STRVAR( - PYXXH32_update_doc, - "update (data)\n\n" - "Update the xxh32 object with bytes-like data. Repeated calls are\n" - "equivalent to a single call with the concatenation of all the arguments."); - -static PyObject *PYXXH32_update(PYXXH32Object *self, PyObject *const *args, - Py_ssize_t nargs, PyObject *kwnames) -{ - PyObject *arg = NULL; - - /* validate keywords first */ - if (kwnames) { - Py_ssize_t nkw = PyTuple_GET_SIZE(kwnames); - for (Py_ssize_t i = 0; i < nkw; i++) { - PyObject *key = PyTuple_GET_ITEM(kwnames, i); - if (PyUnicode_CompareWithASCIIString(key, "data") == 0) { - if (nargs >= 1) { - PyErr_SetString(PyExc_TypeError, - "xxh32.update() got multiple values for argument 'data'"); - return NULL; - } - arg = args[nargs + i]; - } else { - PyErr_Format(PyExc_TypeError, - "'%U' is an invalid keyword argument for 'xxh32.update()'", - key); - return NULL; - } - } - } - - if (nargs >= 1) { - if (nargs > 1) { - PyErr_Format(PyExc_TypeError, - "xxh32.update() takes at most 1 positional argument (%zd given)", nargs); - return NULL; - } - arg = args[0]; - } - - if (!arg) { - PyErr_SetString(PyExc_TypeError, "xxh32.update() missing required argument 'data'"); - return NULL; - } - - Py_buffer buf; - if 
(_get_buffer_or_str(arg, &buf) < 0) - return NULL; - PYXXH32_do_update(self, &buf); - Py_RETURN_NONE; -} +#define XXHASH_UPDATE_METHOD(prefix, name) \ +PyDoc_STRVAR( \ + PY##prefix##_update_doc, \ + "update (data)\n\n" \ + "Update the " name " object with bytes-like data. Repeated calls are\n" \ + "equivalent to a single call with the concatenation of all the arguments."); \ + \ +static PyObject *PY##prefix##_update(PY##prefix##Object *self, \ + PyObject *const *args, \ + Py_ssize_t nargs, PyObject *kwnames) \ +{ \ + PyObject *arg = NULL; \ + \ + /* validate keywords first */ \ + if (kwnames) { \ + Py_ssize_t nkw = PyTuple_GET_SIZE(kwnames); \ + for (Py_ssize_t i = 0; i < nkw; i++) { \ + PyObject *key = PyTuple_GET_ITEM(kwnames, i); \ + if (PyUnicode_CompareWithASCIIString(key, "data") == 0) { \ + if (nargs >= 1) { \ + PyErr_SetString(PyExc_TypeError, \ + name ".update() got multiple values for argument 'data'"); \ + return NULL; \ + } \ + arg = args[nargs + i]; \ + } else { \ + PyErr_Format(PyExc_TypeError, \ + "'%U' is an invalid keyword argument for '" name ".update()'", \ + key); \ + return NULL; \ + } \ + } \ + } \ + \ + if (nargs >= 1) { \ + if (nargs > 1) { \ + PyErr_Format(PyExc_TypeError, \ + name ".update() takes at most 1 positional argument (%zd given)", nargs); \ + return NULL; \ + } \ + arg = args[0]; \ + } \ + \ + if (!arg) { \ + PyErr_SetString(PyExc_TypeError, \ + name ".update() missing required argument 'data'"); \ + return NULL; \ + } \ + \ + Py_buffer buf; \ + if (_get_buffer_or_str(arg, &buf) < 0) \ + return NULL; \ + PY##prefix##_do_update(self, &buf); \ + Py_RETURN_NONE; \ +} + +XXHASH_UPDATE_METHOD(XXH32, "xxh32") PyDoc_STRVAR( PYXXH32_digest_doc, @@ -1178,58 +1207,7 @@ static PyObject *PYXXH64_new(PyTypeObject *type, PyObject *args, PyObject *kwarg XXHASH_INIT(XXH64, XXH64_reset, XXH64_update, XXH64_hash_t) -PyDoc_STRVAR( - PYXXH64_update_doc, - "update (data)\n\n" - "Update the xxh64 object with bytes-like data. 
Repeated calls are\n" - "equivalent to a single call with the concatenation of all the arguments."); - -static PyObject *PYXXH64_update(PYXXH64Object *self, PyObject *const *args, - Py_ssize_t nargs, PyObject *kwnames) -{ - PyObject *arg = NULL; - - /* validate keywords first */ - if (kwnames) { - Py_ssize_t nkw = PyTuple_GET_SIZE(kwnames); - for (Py_ssize_t i = 0; i < nkw; i++) { - PyObject *key = PyTuple_GET_ITEM(kwnames, i); - if (PyUnicode_CompareWithASCIIString(key, "data") == 0) { - if (nargs >= 1) { - PyErr_SetString(PyExc_TypeError, - "xxh64.update() got multiple values for argument 'data'"); - return NULL; - } - arg = args[nargs + i]; - } else { - PyErr_Format(PyExc_TypeError, - "'%U' is an invalid keyword argument for 'xxh64.update()'", - key); - return NULL; - } - } - } - - if (nargs >= 1) { - if (nargs > 1) { - PyErr_Format(PyExc_TypeError, - "xxh64.update() takes at most 1 positional argument (%zd given)", nargs); - return NULL; - } - arg = args[0]; - } - - if (!arg) { - PyErr_SetString(PyExc_TypeError, "xxh64.update() missing required argument 'data'"); - return NULL; - } - - Py_buffer buf; - if (_get_buffer_or_str(arg, &buf) < 0) - return NULL; - PYXXH64_do_update(self, &buf); - Py_RETURN_NONE; -} +XXHASH_UPDATE_METHOD(XXH64, "xxh64") PyDoc_STRVAR( PYXXH64_digest_doc, @@ -1530,58 +1508,7 @@ static PyObject *PYXXH3_64_new(PyTypeObject *type, PyObject *args, PyObject *kwa XXHASH_INIT(XXH3_64, XXH3_64bits_reset_withSeed, XXH3_64bits_update, XXH64_hash_t) -PyDoc_STRVAR( - PYXXH3_64_update_doc, - "update (data)\n\n" - "Update the xxh3_64 object with bytes-like data. 
Repeated calls are\n" - "equivalent to a single call with the concatenation of all the arguments."); - -static PyObject *PYXXH3_64_update(PYXXH3_64Object *self, PyObject *const *args, - Py_ssize_t nargs, PyObject *kwnames) -{ - PyObject *arg = NULL; - - /* validate keywords first */ - if (kwnames) { - Py_ssize_t nkw = PyTuple_GET_SIZE(kwnames); - for (Py_ssize_t i = 0; i < nkw; i++) { - PyObject *key = PyTuple_GET_ITEM(kwnames, i); - if (PyUnicode_CompareWithASCIIString(key, "data") == 0) { - if (nargs >= 1) { - PyErr_SetString(PyExc_TypeError, - "xxh3_64.update() got multiple values for argument 'data'"); - return NULL; - } - arg = args[nargs + i]; - } else { - PyErr_Format(PyExc_TypeError, - "'%U' is an invalid keyword argument for 'xxh3_64.update()'", - key); - return NULL; - } - } - } - - if (nargs >= 1) { - if (nargs > 1) { - PyErr_Format(PyExc_TypeError, - "xxh3_64.update() takes at most 1 positional argument (%zd given)", nargs); - return NULL; - } - arg = args[0]; - } - - if (!arg) { - PyErr_SetString(PyExc_TypeError, "xxh3_64.update() missing required argument 'data'"); - return NULL; - } - - Py_buffer buf; - if (_get_buffer_or_str(arg, &buf) < 0) - return NULL; - PYXXH3_64_do_update(self, &buf); - Py_RETURN_NONE; -} +XXHASH_UPDATE_METHOD(XXH3_64, "xxh3_64") PyDoc_STRVAR( PYXXH3_64_digest_doc, @@ -1889,58 +1816,7 @@ static PyObject *PYXXH3_128_new(PyTypeObject *type, PyObject *args, PyObject *kw XXHASH_INIT(XXH3_128, XXH3_128bits_reset_withSeed, XXH3_128bits_update, XXH64_hash_t) -PyDoc_STRVAR( - PYXXH3_128_update_doc, - "update (data)\n\n" - "Update the xxh3_128 object with bytes-like data. 
Repeated calls are\n" - "equivalent to a single call with the concatenation of all the arguments."); - -static PyObject *PYXXH3_128_update(PYXXH3_128Object *self, PyObject *const *args, - Py_ssize_t nargs, PyObject *kwnames) -{ - PyObject *arg = NULL; - - /* validate keywords first */ - if (kwnames) { - Py_ssize_t nkw = PyTuple_GET_SIZE(kwnames); - for (Py_ssize_t i = 0; i < nkw; i++) { - PyObject *key = PyTuple_GET_ITEM(kwnames, i); - if (PyUnicode_CompareWithASCIIString(key, "data") == 0) { - if (nargs >= 1) { - PyErr_SetString(PyExc_TypeError, - "xxh3_128.update() got multiple values for argument 'data'"); - return NULL; - } - arg = args[nargs + i]; - } else { - PyErr_Format(PyExc_TypeError, - "'%U' is an invalid keyword argument for 'xxh3_128.update()'", - key); - return NULL; - } - } - } - - if (nargs >= 1) { - if (nargs > 1) { - PyErr_Format(PyExc_TypeError, - "xxh3_128.update() takes at most 1 positional argument (%zd given)", nargs); - return NULL; - } - arg = args[0]; - } - - if (!arg) { - PyErr_SetString(PyExc_TypeError, "xxh3_128.update() missing required argument 'data'"); - return NULL; - } - - Py_buffer buf; - if (_get_buffer_or_str(arg, &buf) < 0) - return NULL; - PYXXH3_128_do_update(self, &buf); - Py_RETURN_NONE; -} +XXHASH_UPDATE_METHOD(XXH3_128, "xxh3_128") PyDoc_STRVAR( PYXXH3_128_digest_doc, @@ -2230,7 +2106,10 @@ static PyModuleDef_Slot slots[] = { {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED}, #endif #if PY_VERSION_HEX >= 0x030d0000 - /* Python 3.13+: module is thread-safe with per-object lock */ + /* Both variants manage their own synchronization guarantees: + * the thread-safe variant uses a per-object lock; the default + * variant requires callers not to share streaming hash objects + * across threads. 
*/ {Py_mod_gil, Py_MOD_GIL_NOT_USED}, #endif {0, NULL} @@ -2254,7 +2133,7 @@ static PyMethodDef methods[] = { static struct PyModuleDef moduledef = { PyModuleDef_HEAD_INIT, - "_xxhash", + VALUE_TO_STRING(XXHASH_MODULE_NAME), NULL, 0, methods, @@ -2265,7 +2144,7 @@ static struct PyModuleDef moduledef = { }; PyMODINIT_FUNC -PyInit__xxhash(void) +XXHASH_PYINIT(XXHASH_MODULE_NAME)(void) { return PyModuleDef_Init(&moduledef); } diff --git a/tests/test_thread_safety.py b/tests/test_thread_safety.py index 24852ba..c254a4d 100644 --- a/tests/test_thread_safety.py +++ b/tests/test_thread_safety.py @@ -1,18 +1,15 @@ """ -Thread-safety tests for xxhash. +Thread-safety tests for xxhash.threadsafe. -Previously, the C extension released the GIL inside ``_do_update`` (via -``Py_BEGIN_ALLOW_THREADS`` / ``Py_END_ALLOW_THREADS``) while calling -``XXH*_update(self->xxhash_state, ...)``, and all other methods accessed the -same ``xxhash_state`` without any per-object lock — creating a data race. - -This has been fixed by adding a per-object ``PyThread_type_lock`` that protects -all access to ``xxhash_state``. The lock is acquired **after** releasing the -GIL in ``update()`` to avoid ABBA deadlocks. +The default ``xxhash`` module is optimized for speed and does not protect +streaming hash objects with a per-object lock. Concurrent access to the same +hash object from multiple threads is only safe when using the +``xxhash.threadsafe`` submodule, which adds a per-object lock around every +operation that touches the internal xxHash state. 
The tests below verify that: - * no crashes occur under concurrent access - * hash results are now deterministic (no data races) + * no crashes occur under concurrent access to ``threadsafe`` hash objects + * hash results are deterministic (no data races) """ import os @@ -20,7 +17,7 @@ import subprocess import signal import unittest -import xxhash +from xxhash import threadsafe as xxhash # --------------------------------------------------------------------------- @@ -47,7 +44,7 @@ def _run_in_subprocess(code: str, timeout: float = 60.0): # --------------------------------------------------------------------------- CONCURRENT_DIGEST_CODE = r""" -import sys, threading, xxhash +import sys, threading; from xxhash import threadsafe as xxhash h = xxhash.xxh32() BLOCK = b'x' * (4 * 1024 * 1024) # 4 MiB @@ -93,7 +90,7 @@ def digester(): # --------------------------------------------------------------------------- CONCURRENT_RESET_CODE = r""" -import sys, threading, xxhash +import sys, threading; from xxhash import threadsafe as xxhash h = xxhash.xxh32() BLOCK = b'x' * (4 * 1024 * 1024) @@ -139,7 +136,7 @@ def reseter(): # --------------------------------------------------------------------------- CONCURRENT_UPDATE_CODE = r""" -import sys, threading, xxhash +import sys, threading; from xxhash import threadsafe as xxhash h = xxhash.xxh32() BLOCK = b'x' * (4 * 1024 * 1024) @@ -184,7 +181,7 @@ def updater(): # --------------------------------------------------------------------------- NON_DETERMINISM_CODE = r""" -import sys, threading, xxhash +import sys, threading; from xxhash import threadsafe as xxhash h = xxhash.xxh32() BLOCK = b'x' * (4 * 1024 * 1024) @@ -224,7 +221,7 @@ def updater(): # --------------------------------------------------------------------------- XXH128_UPDATE_RESET_CODE = r""" -import sys, threading, xxhash +import sys, threading; from xxhash import threadsafe as xxhash h = xxhash.xxh128() BLOCK = b'x' * 16 # tiny block: more calls = more race windows 
@@ -278,7 +275,7 @@ def reseter(): # --------------------------------------------------------------------------- XXH128_UPDATE_COPY_CODE = r""" -import sys, threading, xxhash +import sys, threading; from xxhash import threadsafe as xxhash h = xxhash.xxh128() copies = [] @@ -337,7 +334,7 @@ def copier(): # --------------------------------------------------------------------------- XXH128_ALL_METHODS_CODE = r""" -import sys, random, threading, xxhash +import sys, random, threading; from xxhash import threadsafe as xxhash h = xxhash.xxh128() BLOCK = b'x' * 32 @@ -386,7 +383,7 @@ def worker(): # --------------------------------------------------------------------------- XXH64_AGGRESSIVE_RACE_CODE = r""" -import sys, threading, xxhash +import sys, threading; from xxhash import threadsafe as xxhash h = xxhash.xxh64() BLOCK = b'x' * 16 @@ -423,9 +420,10 @@ def worker(): class TestThreadSafety(unittest.TestCase): - """Verify that concurrent access to a single hash object does not crash. + """Verify that concurrent access to a single threadsafe hash object works. - With per-object locking in place, concurrent access is now safe. + These tests import ``xxhash.threadsafe`` (not the default ``xxhash`` + module), which uses a per-object lock around all streaming operations. We run each scenario many times (``REPETITIONS``) to verify that no crashes, deadlocks, or unexpected exceptions occur. 
""" @@ -509,8 +507,7 @@ class TestNonDeterminism(unittest.TestCase): TIMEOUT = int(os.environ.get("XXHASH_TEST_TIMEOUT", "120")) def test_concurrent_update_is_deterministic(self): - """Concurrent update() should be deterministic (FIXED: per-object - lock now prevents data races).""" + """Concurrent update() on a threadsafe object should be deterministic.""" digests = set() for i in range(self.SAMPLES): rc, out, err = _run_in_subprocess( diff --git a/xxhash/__init__.py b/xxhash/__init__.py index 5fe0087..d1723b5 100644 --- a/xxhash/__init__.py +++ b/xxhash/__init__.py @@ -18,7 +18,7 @@ XXHASH_VERSION, ) -from .version import VERSION, VERSION_TUPLE +from .version import VERSION xxh128 = xxh3_128 @@ -26,13 +26,13 @@ xxh128_intdigest = xxh3_128_intdigest xxh128_digest = xxh3_128_digest -algorithms_available = set([ +algorithms_available = { "xxh32", "xxh64", "xxh3_64", "xxh128", "xxh3_128", -]) +} algorithms_guaranteed = algorithms_available @@ -59,7 +59,6 @@ "xxh128_intdigest", "xxh128_hexdigest", "VERSION", - "VERSION_TUPLE", "XXHASH_VERSION", "algorithms_available", "algorithms_guaranteed", diff --git a/xxhash/__init__.pyi b/xxhash/__init__.pyi index 7490e30..91741d2 100644 --- a/xxhash/__init__.pyi +++ b/xxhash/__init__.pyi @@ -8,8 +8,6 @@ _DataType = _Buffer VERSION: str XXHASH_VERSION: str -#: Deprecated, will be removed in the next major release -VERSION_TUPLE: tuple[int, ...] 
algorithms_available: set[str] algorithms_guaranteed: set[str] @@ -36,7 +34,6 @@ __all__: list[str] = [ "xxh128_intdigest", "xxh128_hexdigest", "VERSION", - "VERSION_TUPLE", "XXHASH_VERSION", "algorithms_available", "algorithms_guaranteed", diff --git a/xxhash/threadsafe.py b/xxhash/threadsafe.py new file mode 100644 index 0000000..0e49785 --- /dev/null +++ b/xxhash/threadsafe.py @@ -0,0 +1,62 @@ +from xxhash.version import VERSION +from xxhash._xxhash_threadsafe import ( + xxh32, + xxh32_digest, + xxh32_intdigest, + xxh32_hexdigest, + xxh64, + xxh64_digest, + xxh64_intdigest, + xxh64_hexdigest, + xxh3_64, + xxh3_64_digest, + xxh3_64_intdigest, + xxh3_64_hexdigest, + xxh3_128, + xxh3_128_digest, + xxh3_128_intdigest, + xxh3_128_hexdigest, + XXHASH_VERSION, +) + +xxh128 = xxh3_128 +xxh128_digest = xxh3_128_digest +xxh128_intdigest = xxh3_128_intdigest +xxh128_hexdigest = xxh3_128_hexdigest + +algorithms_available = { + "xxh32", + "xxh64", + "xxh3_64", + "xxh128", + "xxh3_128", +} + +algorithms_guaranteed = algorithms_available + +__all__ = [ + "xxh32", + "xxh32_digest", + "xxh32_intdigest", + "xxh32_hexdigest", + "xxh64", + "xxh64_digest", + "xxh64_intdigest", + "xxh64_hexdigest", + "xxh3_64", + "xxh3_64_digest", + "xxh3_64_intdigest", + "xxh3_64_hexdigest", + "xxh3_128", + "xxh3_128_digest", + "xxh3_128_intdigest", + "xxh3_128_hexdigest", + "xxh128", + "xxh128_digest", + "xxh128_intdigest", + "xxh128_hexdigest", + "VERSION", + "XXHASH_VERSION", + "algorithms_available", + "algorithms_guaranteed", +] diff --git a/xxhash/threadsafe.pyi b/xxhash/threadsafe.pyi new file mode 100644 index 0000000..e90bffe --- /dev/null +++ b/xxhash/threadsafe.pyi @@ -0,0 +1,93 @@ +from typing import Protocol, final + +class _Buffer(Protocol): + """Objects that support the buffer protocol (PEP 688).""" + def __buffer__(self, flags: int, /) -> memoryview: ... 
+ +_DataType = _Buffer + +VERSION: str +XXHASH_VERSION: str + +algorithms_available: set[str] +algorithms_guaranteed: set[str] + +__all__: list[str] = [ + "xxh32", + "xxh32_digest", + "xxh32_intdigest", + "xxh32_hexdigest", + "xxh64", + "xxh64_digest", + "xxh64_intdigest", + "xxh64_hexdigest", + "xxh3_64", + "xxh3_64_digest", + "xxh3_64_intdigest", + "xxh3_64_hexdigest", + "xxh3_128", + "xxh3_128_digest", + "xxh3_128_intdigest", + "xxh3_128_hexdigest", + "xxh128", + "xxh128_digest", + "xxh128_intdigest", + "xxh128_hexdigest", + "VERSION", + "XXHASH_VERSION", + "algorithms_available", + "algorithms_guaranteed", +] + +class _Hasher: + def __init__(self, data: _DataType = ..., seed: int = ...) -> None: ... + def update(self, data: _DataType) -> None: ... + def digest(self) -> bytes: ... + def hexdigest(self) -> str: ... + def intdigest(self) -> int: ... + def copy(self) -> _Hasher: ... + def reset(self) -> None: ... + @property + def digestsize(self) -> int: ... + @property + def digest_size(self) -> int: ... + @property + def block_size(self) -> int: ... + @property + def name(self) -> str: ... + @property + def seed(self) -> int: ... + +@final +class xxh32(_Hasher): ... + +@final +class xxh64(_Hasher): ... + +@final +class xxh3_64(_Hasher): ... + +@final +class xxh3_128(_Hasher): ... + +xxh128 = xxh3_128 + +def xxh32_digest(data: _DataType, seed: int = ...) -> bytes: ... +def xxh32_hexdigest(data: _DataType, seed: int = ...) -> str: ... +def xxh32_intdigest(data: _DataType, seed: int = ...) -> int: ... + +def xxh64_digest(data: _DataType, seed: int = ...) -> bytes: ... +def xxh64_hexdigest(data: _DataType, seed: int = ...) -> str: ... +def xxh64_intdigest(data: _DataType, seed: int = ...) -> int: ... + +def xxh3_64_digest(data: _DataType, seed: int = ...) -> bytes: ... +def xxh3_64_hexdigest(data: _DataType, seed: int = ...) -> str: ... +def xxh3_64_intdigest(data: _DataType, seed: int = ...) -> int: ... + +def xxh3_128_digest(data: _DataType, seed: int = ...) 
-> bytes: ... +def xxh3_128_hexdigest(data: _DataType, seed: int = ...) -> str: ... +def xxh3_128_intdigest(data: _DataType, seed: int = ...) -> int: ... + +xxh128_digest = xxh3_128_digest +xxh128_hexdigest = xxh3_128_hexdigest +xxh128_intdigest = xxh3_128_intdigest diff --git a/xxhash/version.py b/xxhash/version.py index 17952b8..1f5645f 100644 --- a/xxhash/version.py +++ b/xxhash/version.py @@ -1,3 +1 @@ -VERSION = "3.8.0.dev9" -#: Deprecated, will be removed in the next major release -VERSION_TUPLE = (3, 8, 0) +VERSION = "4.0.0.dev0"