diff --git a/mypy/typeshed/stubs/librt/librt/strings.pyi b/mypy/typeshed/stubs/librt/librt/strings.pyi index 01aee3ff758d..7a028f9e7859 100644 --- a/mypy/typeshed/stubs/librt/librt/strings.pyi +++ b/mypy/typeshed/stubs/librt/librt/strings.pyi @@ -47,3 +47,4 @@ def isspace(c: i32, /) -> bool: ... def isdigit(c: i32, /) -> bool: ... def isalnum(c: i32, /) -> bool: ... def isalpha(c: i32, /) -> bool: ... +def isidentifier(c: i32, /) -> bool: ... diff --git a/mypyc/ir/deps.py b/mypyc/ir/deps.py index 0cf58c83c27b..751845d3a324 100644 --- a/mypyc/ir/deps.py +++ b/mypyc/ir/deps.py @@ -116,5 +116,4 @@ def get_header(self) -> str: STRING_WRITER_EXTRA_OPS: Final = SourceDep("stringwriter_extra_ops.c") BYTEARRAY_EXTRA_OPS: Final = SourceDep("bytearray_extra_ops.c") STR_EXTRA_OPS: Final = SourceDep("str_extra_ops.c") -CODEPOINT_EXTRA_OPS: Final = SourceDep("codepoint_extra_ops.c") VECS_EXTRA_OPS: Final = SourceDep("vecs_extra_ops.c") diff --git a/mypyc/lib-rt/codepoint_extra_ops.c b/mypyc/lib-rt/codepoint_extra_ops.c deleted file mode 100644 index ca03eba4e6f5..000000000000 --- a/mypyc/lib-rt/codepoint_extra_ops.c +++ /dev/null @@ -1,8 +0,0 @@ -#include "codepoint_extra_ops.h" - -// Out-of-line bodies for codepoint helpers that are too large to inline. -// The classification helpers and the ASCII fast paths for case conversion -// stay inline in codepoint_extra_ops.h; this file holds the slow paths -// that round-trip through PyUnicode_FromOrdinal and CPython's Unicode -// machinery. Currently empty; populated as later commits add -// isidentifier, toupper, and tolower. diff --git a/mypyc/lib-rt/codepoint_extra_ops.h b/mypyc/lib-rt/codepoint_extra_ops.h deleted file mode 100644 index bb83f92e4b87..000000000000 --- a/mypyc/lib-rt/codepoint_extra_ops.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef MYPYC_CODEPOINT_EXTRA_OPS_H -#define MYPYC_CODEPOINT_EXTRA_OPS_H - -#include -#include -#include - -// Codepoint helpers for librt.strings. 
-// Inputs are signed int32_t for compatibility with mypyc's i32 type. -// Negative values are treated as non-codepoints and return false. - -static inline bool LibRTStrings_IsSpace(int32_t c) { - return c >= 0 && Py_UNICODE_ISSPACE((Py_UCS4)c); -} - -static inline bool LibRTStrings_IsDigit(int32_t c) { - return c >= 0 && Py_UNICODE_ISDIGIT((Py_UCS4)c); -} - -static inline bool LibRTStrings_IsAlnum(int32_t c) { - return c >= 0 && Py_UNICODE_ISALNUM((Py_UCS4)c); -} - -static inline bool LibRTStrings_IsAlpha(int32_t c) { - return c >= 0 && Py_UNICODE_ISALPHA((Py_UCS4)c); -} - -#endif // MYPYC_CODEPOINT_EXTRA_OPS_H diff --git a/mypyc/lib-rt/strings/librt_strings.c b/mypyc/lib-rt/strings/librt_strings.c index cbc3e5f753fa..d5245af9183f 100644 --- a/mypyc/lib-rt/strings/librt_strings.c +++ b/mypyc/lib-rt/strings/librt_strings.c @@ -4,7 +4,6 @@ #include #include #include "CPy.h" -#include "codepoint_extra_ops.h" #include "librt_strings.h" #define CPY_BOOL_ERROR 2 @@ -1154,15 +1153,50 @@ read_f64_be(PyObject *module, PyObject *const *args, size_t nargs) { return PyFloat_FromDouble(CPyBytes_ReadF64BEUnsafe(data + index)); } -// Codepoint classification helpers exposed to interpreted callers. -// The C-side names are prefixed `cp_` to avoid colliding with libc's -// isspace / isdigit / etc. Compiled callers go through the -// LibRTStrings_* static inlines in codepoint_extra_ops.h instead. -// -// All wrappers parse a single int argument as i32 (codepoint) and -// dispatch to the corresponding LibRTStrings_* function. The parse -// step accepts any int but rejects values outside the i32 range with -// OverflowError, matching the input domain of the compiled fast path. +// Codepoint classification helpers. Inputs are signed i32 for compatibility +// with mypyc's int32_rprimitive; negative values are non-codepoints and +// return false. 
Mypyc-compiled callers reach these through the librt.strings +// capsule API (see librt_strings_api.h); interpreted callers go through the +// `cp_*` Python wrappers below. + +bool LibRTStrings_IsSpace(int32_t c) { + return c >= 0 && Py_UNICODE_ISSPACE((Py_UCS4)c); +} + +bool LibRTStrings_IsDigit(int32_t c) { + return c >= 0 && Py_UNICODE_ISDIGIT((Py_UCS4)c); +} + +bool LibRTStrings_IsAlnum(int32_t c) { + return c >= 0 && Py_UNICODE_ISALNUM((Py_UCS4)c); +} + +bool LibRTStrings_IsAlpha(int32_t c) { + return c >= 0 && Py_UNICODE_ISALPHA((Py_UCS4)c); +} + +// True if c could start a valid identifier (XID_Start, per PEP 3131); values +// outside [0, 0x10FFFF] are rejected up front because PyUnicode_FromOrdinal +// fails on them. ASCII fast path covers `[A-Za-z_]`; other codepoints use +// PyUnicode_IsIdentifier. Aborts via CPyError_OutOfMemory if allocation fails. +bool LibRTStrings_IsIdentifier(int32_t c) { + if (c < 0 || c > 0x10FFFF) return false; + if (c < 128) { + return (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || c == '_'; + } + PyObject *s = PyUnicode_FromOrdinal((int)c); + if (s == NULL) { + CPyError_OutOfMemory(); + } + int r = PyUnicode_IsIdentifier(s); + Py_DECREF(s); + return r == 1; +} + +// Python-level wrappers (`cp_*`) for interpreted callers. The C-side names +// are prefixed `cp_` to avoid colliding with libc's isspace etc. // Parse a Python int as i32 codepoint. Returns 0 on success and writes // the value to *out; returns -1 on error with a Python exception set. 
@@ -1194,6 +1228,7 @@ DEFINE_CP_BOOL_WRAPPER(isspace, LibRTStrings_IsSpace) DEFINE_CP_BOOL_WRAPPER(isdigit, LibRTStrings_IsDigit) DEFINE_CP_BOOL_WRAPPER(isalnum, LibRTStrings_IsAlnum) DEFINE_CP_BOOL_WRAPPER(isalpha, LibRTStrings_IsAlpha) +DEFINE_CP_BOOL_WRAPPER(isidentifier, LibRTStrings_IsIdentifier) static PyMethodDef librt_strings_module_methods[] = { {"write_i16_le", (PyCFunction) write_i16_le, METH_FASTCALL, @@ -1268,6 +1303,9 @@ static PyMethodDef librt_strings_module_methods[] = { {"isalpha", cp_isalpha, METH_O, PyDoc_STR("Test whether a codepoint (i32) is a Unicode letter.") }, + {"isidentifier", cp_isidentifier, METH_O, + PyDoc_STR("Test whether a codepoint (i32) is a valid identifier start (XID_Start).") + }, {NULL, NULL, 0, NULL} }; @@ -1313,6 +1351,11 @@ librt_strings_module_exec(PyObject *m) (void *)StringWriter_type_internal, (void *)StringWriter_write_internal, (void *)grow_string_buffer, + (void *)LibRTStrings_IsSpace, + (void *)LibRTStrings_IsDigit, + (void *)LibRTStrings_IsAlnum, + (void *)LibRTStrings_IsAlpha, + (void *)LibRTStrings_IsIdentifier, }; PyObject *c_api_object = PyCapsule_New((void *)librt_strings_api, "librt.strings._C_API", NULL); if (PyModule_Add(m, "_C_API", c_api_object) < 0) { diff --git a/mypyc/lib-rt/strings/librt_strings.h b/mypyc/lib-rt/strings/librt_strings.h index e6236f795092..903cda6b0918 100644 --- a/mypyc/lib-rt/strings/librt_strings.h +++ b/mypyc/lib-rt/strings/librt_strings.h @@ -13,11 +13,11 @@ // API version -- more recent versions must maintain backward compatibility, i.e. // we can add new features but not remove or change existing features (unless // ABI version is changed, but see the comment above). -#define LIBRT_STRINGS_API_VERSION 4 +#define LIBRT_STRINGS_API_VERSION 5 // Number of functions in the capsule API. If you add a new function, also increase // LIBRT_STRINGS_API_VERSION. 
-#define LIBRT_STRINGS_API_LEN 14 +#define LIBRT_STRINGS_API_LEN 19 typedef struct { PyObject_HEAD diff --git a/mypyc/lib-rt/strings/librt_strings_api.h b/mypyc/lib-rt/strings/librt_strings_api.h index 536b90ad7f21..406543190daf 100644 --- a/mypyc/lib-rt/strings/librt_strings_api.h +++ b/mypyc/lib-rt/strings/librt_strings_api.h @@ -6,6 +6,7 @@ import_librt_strings(void); #include #include +#include #include "librt_strings.h" extern void *LibRTStrings_API[LIBRT_STRINGS_API_LEN]; @@ -24,6 +25,11 @@ extern void *LibRTStrings_API[LIBRT_STRINGS_API_LEN]; #define LibRTStrings_StringWriter_type_internal (*(PyTypeObject* (*)(void)) LibRTStrings_API[11]) #define LibRTStrings_StringWriter_write_internal (*(char (*)(PyObject *source, PyObject *value)) LibRTStrings_API[12]) #define LibRTStrings_grow_string_buffer (*(bool (*)(StringWriterObject *obj, Py_ssize_t n)) LibRTStrings_API[13]) +#define LibRTStrings_IsSpace (*(bool (*)(int32_t c)) LibRTStrings_API[14]) +#define LibRTStrings_IsDigit (*(bool (*)(int32_t c)) LibRTStrings_API[15]) +#define LibRTStrings_IsAlnum (*(bool (*)(int32_t c)) LibRTStrings_API[16]) +#define LibRTStrings_IsAlpha (*(bool (*)(int32_t c)) LibRTStrings_API[17]) +#define LibRTStrings_IsIdentifier (*(bool (*)(int32_t c)) LibRTStrings_API[18]) static inline bool CPyBytesWriter_Check(PyObject *obj) { diff --git a/mypyc/primitives/librt_strings_ops.py b/mypyc/primitives/librt_strings_ops.py index 93fa717cf529..f025c6e95b71 100644 --- a/mypyc/primitives/librt_strings_ops.py +++ b/mypyc/primitives/librt_strings_ops.py @@ -1,9 +1,4 @@ -from mypyc.ir.deps import ( - BYTES_WRITER_EXTRA_OPS, - CODEPOINT_EXTRA_OPS, - LIBRT_STRINGS, - STRING_WRITER_EXTRA_OPS, -) +from mypyc.ir.deps import BYTES_WRITER_EXTRA_OPS, LIBRT_STRINGS, STRING_WRITER_EXTRA_OPS from mypyc.ir.ops import ERR_MAGIC, ERR_MAGIC_OVERLAPPING, ERR_NEVER from mypyc.ir.rtypes import ( bool_rprimitive, @@ -402,7 +397,7 @@ return_type=bool_rprimitive, c_function_name="LibRTStrings_IsSpace", 
error_kind=ERR_NEVER, - dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], + dependencies=[LIBRT_STRINGS], ) function_op( @@ -411,7 +406,7 @@ return_type=bool_rprimitive, c_function_name="LibRTStrings_IsDigit", error_kind=ERR_NEVER, - dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], + dependencies=[LIBRT_STRINGS], ) function_op( @@ -420,7 +415,7 @@ return_type=bool_rprimitive, c_function_name="LibRTStrings_IsAlnum", error_kind=ERR_NEVER, - dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], + dependencies=[LIBRT_STRINGS], ) function_op( @@ -429,5 +424,17 @@ return_type=bool_rprimitive, c_function_name="LibRTStrings_IsAlpha", error_kind=ERR_NEVER, - dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], + dependencies=[LIBRT_STRINGS], +) + +# isidentifier checks XID_Start semantics for a single codepoint, matching +# str.isidentifier() on a 1-character string. The non-ASCII path allocates +# and aborts via CPyError_OutOfMemory on failure, so this stays ERR_NEVER. +function_op( + name="librt.strings.isidentifier", + arg_types=[int32_rprimitive], + return_type=bool_rprimitive, + c_function_name="LibRTStrings_IsIdentifier", + error_kind=ERR_NEVER, + dependencies=[LIBRT_STRINGS], ) diff --git a/mypyc/test-data/irbuild-librt-strings.test b/mypyc/test-data/irbuild-librt-strings.test index e5d18b6eb852..e3aaa49bd6f9 100644 --- a/mypyc/test-data/irbuild-librt-strings.test +++ b/mypyc/test-data/irbuild-librt-strings.test @@ -387,3 +387,17 @@ def is_a(c): L0: r0 = LibRTStrings_IsAlpha(c) return r0 + +[case testLibrtStringsIsIdentifierIR] +from librt.strings import isidentifier +from mypy_extensions import i32 + +def is_id(c: i32) -> bool: + return isidentifier(c) +[out] +def is_id(c): + c :: i32 + r0 :: bool +L0: + r0 = LibRTStrings_IsIdentifier(c) + return r0 diff --git a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test index aa38c713d384..0a3320ff6522 100644 --- a/mypyc/test-data/run-librt-strings.test +++ b/mypyc/test-data/run-librt-strings.test 
@@ -1443,7 +1443,7 @@ def test_new_without_init_is_usable() -> None: [case testLibrtStringsCodepointClassifiers_librt] from typing import Any from mypy_extensions import i32 -from librt.strings import isspace, isdigit, isalnum, isalpha +from librt.strings import isspace, isdigit, isalnum, isalpha, isidentifier from testutil import assertRaises @@ -1455,6 +1455,7 @@ def test_codepoint_classifiers() -> None: assert not isdigit(bad) assert not isalnum(bad) assert not isalpha(bad) + assert not isidentifier(bad) # Verify each codepoint primitive agrees with the matching str method # across all Unicode codepoints, including the ord(chr(i)) round-trip. # Any forces generic dispatch on the str side. @@ -1466,6 +1467,7 @@ def test_codepoint_classifiers() -> None: assert isdigit(o) == isdigit(i) == a.isdigit() assert isalnum(o) == isalnum(i) == a.isalnum() assert isalpha(o) == isalpha(i) == a.isalpha() + assert isidentifier(o) == isidentifier(i) == a.isidentifier() def test_codepoint_classifiers_via_any() -> None: @@ -1476,6 +1478,7 @@ def test_codepoint_classifiers_via_any() -> None: (isdigit, "5", "a"), (isalnum, "A", " "), (isalpha, "A", " "), + (isidentifier, "A", "0"), ): f: Any = fn assert f(ord(true_input)) is True