Skip to content

Commit b942917

Browse files
committed
gh-149449: Fix use-after-free in _PyUnicode_GetNameCAPI
The cache stored the raw struct pointer extracted from unicodedata's _ucnhash_CAPI capsule without keeping a reference to the owning capsule. Dropping the module from sys.modules and running gc.collect() then freed the struct while \N{...} decoding and the namereplace codec handler were still using it. Hold a strong reference to the capsule on the interpreter state and Py_CLEAR it in _PyUnicode_Fini.
1 parent fad0674 commit b942917

4 files changed

Lines changed: 62 additions & 4 deletions

File tree

Include/internal/pycore_interp_structs.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,11 @@ struct _Py_unicode_ids {
708708
struct _Py_unicode_state {
709709
struct _Py_unicode_fs_codec fs_codec;
710710

711+
// Cached pointer to the unicodedata module's _ucnhash_CAPI struct;
712+
// valid as long as ucnhash_capsule holds a strong reference to the
713+
// owning capsule. See _PyUnicode_GetNameCAPI().
711714
_PyUnicode_Name_CAPI *ucnhash_capi;
715+
PyObject *ucnhash_capsule;
712716

713717
// Unicode identifiers (_Py_Identifier): see _PyUnicode_FromId()
714718
struct _Py_unicode_ids ids;

Lib/test/test_unicodedata.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1106,6 +1106,22 @@ def test_failed_import_during_compiling(self):
11061106
"(can't load unicodedata module)"
11071107
self.assertIn(error, result.err.decode("ascii"))
11081108

1109+
def test_unicodedata_unload_reload(self):
1110+
# gh-149449: dropping unicodedata and running gc must not leave the
1111+
# cached _ucnhash_CAPI pointer dangling.
1112+
code = (
1113+
"import gc, sys\n"
1114+
"assert '\\N{GRINNING FACE}'.encode("
1115+
" 'ascii', errors='namereplace') == b'\\\\N{GRINNING FACE}'\n"
1116+
"compile(r\"x = '\\\\N{LATIN CAPITAL LETTER A}'\", '<x>', 'exec')\n"
1117+
"del sys.modules['unicodedata']\n"
1118+
"gc.collect()\n"
1119+
"assert '\\N{WINKING FACE}'.encode("
1120+
" 'ascii', errors='namereplace') == b'\\\\N{WINKING FACE}'\n"
1121+
"compile(r\"x = '\\\\N{LATIN CAPITAL LETTER B}'\", '<x>', 'exec')\n"
1122+
)
1123+
script_helper.assert_python_ok("-c", code)
1124+
11091125
def test_decimal_numeric_consistent(self):
11101126
# Test that decimal and numeric are consistent,
11111127
# i.e. if a character has a decimal value,
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix a use-after-free crash when the :mod:`unicodedata` module was removed
2+
from :data:`sys.modules` and garbage-collected between calls that decode
3+
``\N{...}`` escapes or use the ``namereplace`` codec error handler.

Objects/unicodeobject.c

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6445,13 +6445,44 @@ _PyUnicode_GetNameCAPI(void)
64456445
_PyUnicode_Name_CAPI *ucnhash_capi;
64466446

64476447
ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6448+
if (ucnhash_capi != NULL) {
6449+
return ucnhash_capi;
6450+
}
6451+
6452+
// The pointer we cache lives inside a PyCapsule owned by the
6453+
// unicodedata module. PyCapsule_Import() only returns the raw C
6454+
// pointer, so if unicodedata is later removed from sys.modules and
6455+
// garbage-collected, the capsule's destructor frees the underlying
6456+
// struct and any cached raw pointer is left dangling (gh-149449).
6457+
// Keep a strong reference to the capsule object on the interpreter
6458+
// state so the struct stays alive for the lifetime of the interpreter.
6459+
PyObject *module = PyImport_ImportModule("unicodedata");
6460+
if (module == NULL) {
6461+
return NULL;
6462+
}
6463+
PyObject *capsule = PyObject_GetAttrString(module, "_ucnhash_CAPI");
6464+
Py_DECREF(module);
6465+
if (capsule == NULL) {
6466+
return NULL;
6467+
}
6468+
ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_GetPointer(
6469+
capsule, PyUnicodeData_CAPSULE_NAME);
64486470
if (ucnhash_capi == NULL) {
6449-
ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6450-
PyUnicodeData_CAPSULE_NAME, 1);
6471+
Py_DECREF(capsule);
6472+
return NULL;
6473+
}
64516474

6452-
// It's fine if we overwrite the value here. It's always the same value.
6475+
// Install the capsule into the interpreter state. If another thread
6476+
// raced us and got there first, drop our reference; the pointer
6477+
// values are functionally equivalent.
6478+
PyObject *expected = NULL;
6479+
if (_Py_atomic_compare_exchange_ptr(
6480+
&interp->unicode.ucnhash_capsule, &expected, capsule)) {
64536481
_Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
64546482
}
6483+
else {
6484+
Py_DECREF(capsule);
6485+
}
64556486
return ucnhash_capi;
64566487
}
64576488

@@ -14959,8 +14990,12 @@ _PyUnicode_Fini(PyInterpreterState *interp)
1495914990
_PyUnicode_FiniEncodings(&state->fs_codec);
1496014991

1496114992
// bpo-47182: force a unicodedata CAPI capsule re-import on
14962-
// subsequent initialization of interpreter.
14993+
// subsequent initialization of interpreter. Releasing the capsule
14994+
// reference here triggers its destructor, which frees the malloc'd
14995+
// _PyUnicode_Name_CAPI struct; clearing the raw pointer is required
14996+
// so callers don't observe the dangling cache (gh-149449).
1496314997
interp->unicode.ucnhash_capi = NULL;
14998+
Py_CLEAR(interp->unicode.ucnhash_capsule);
1496414999

1496515000
unicode_clear_identifiers(state);
1496615001
}

0 commit comments

Comments
 (0)