Skip to content

Commit 283b2f8

Browse files
committed
[mypyc] Add librt.strings.isidentifier codepoint primitive
Add librt.strings.isidentifier, which returns True if a codepoint can start a valid identifier (XID_Start, per PEP 3131). The ASCII fast path covers `[A-Za-z_]` inline; non-ASCII codepoints round-trip through PyUnicode_FromOrdinal + PyUnicode_IsIdentifier so the answer matches str.isidentifier on a 1-character string. The non-ASCII path is the first allocating helper in this series, so its body lives out-of-line in codepoint_extra_ops.c (it would otherwise be emitted as a separate copy in every translation unit that includes the header). If PyUnicode_FromOrdinal fails (OOM, or ValueError for a codepoint above 0x10FFFF), the helper swallows the exception via PyErr_Clear() and returns False, which keeps the function ERR_NEVER. This is documented at the call site so callers are not surprised by the silent failure. Stack: depends on the librt.strings.isspace primitive.
1 parent 6f0e77b commit 283b2f8

9 files changed

Lines changed: 75 additions & 4 deletions

File tree

mypy/typeshed/stubs/librt/librt/strings.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,4 @@ def isspace(c: i32, /) -> bool: ...
4747
def isdigit(c: i32, /) -> bool: ...
4848
def isalnum(c: i32, /) -> bool: ...
4949
def isalpha(c: i32, /) -> bool: ...
50+
def isidentifier(c: i32, /) -> bool: ...

mypyc/build.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,12 @@ class ModDesc(NamedTuple):
5454

5555
LIBRT_MODULES = [
5656
ModDesc("librt.internal", ["internal/librt_internal.c"], [], ["internal"]),
57-
ModDesc("librt.strings", ["strings/librt_strings.c"], [], ["strings"]),
57+
ModDesc(
58+
"librt.strings",
59+
["strings/librt_strings.c", "codepoint_extra_ops.c"],
60+
["codepoint_extra_ops.h"],
61+
["strings"],
62+
),
5863
ModDesc(
5964
"librt.base64",
6065
[

mypyc/lib-rt/codepoint_extra_ops.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,17 @@
44
// The classification helpers and the ASCII fast paths for case conversion
55
// stay inline in codepoint_extra_ops.h; this file holds the slow paths
66
// that round-trip through PyUnicode_FromOrdinal and CPython's Unicode
7-
// machinery. Currently empty; populated as later commits add
8-
// isidentifier, toupper, and tolower.
7+
// machinery.
8+
9+
bool LibRTStrings_IsIdentifier_slow(int32_t c) {
10+
PyObject *s = PyUnicode_FromOrdinal((int)c);
11+
if (s == NULL) {
12+
// OOM (or ValueError if c > 0x10FFFF). Swallow and return false to keep the function ERR_NEVER;
13+
// callers expect a defined answer, not a propagated exception.
14+
PyErr_Clear();
15+
return false;
16+
}
17+
int r = PyUnicode_IsIdentifier(s);
18+
Py_DECREF(s);
19+
return r == 1;
20+
}

mypyc/lib-rt/codepoint_extra_ops.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,23 @@ static inline bool LibRTStrings_IsAlpha(int32_t c) {
2525
return c >= 0 && Py_UNICODE_ISALPHA((Py_UCS4)c);
2626
}
2727

28+
// Slow path for non-ASCII isidentifier; defined out-of-line in
29+
// codepoint_extra_ops.c because it allocates and calls into CPython.
30+
bool LibRTStrings_IsIdentifier_slow(int32_t c);
31+
32+
// True if c could start a valid identifier (matches XID_Start
33+
// semantics, which is what str.isidentifier reports for a 1-character
34+
// string). The ASCII fast path covers `[A-Za-z_]` inline; non-ASCII
35+
// delegates to PyUnicode_IsIdentifier for correct PEP 3131 handling.
36+
// Returns false if the slow path fails (OOM or c > 0x10FFFF); the function stays ERR_NEVER.
37+
static inline bool LibRTStrings_IsIdentifier(int32_t c) {
38+
if (c < 0) return false;
39+
if (c < 128) {
40+
return (c >= 'a' && c <= 'z')
41+
|| (c >= 'A' && c <= 'Z')
42+
|| c == '_';
43+
}
44+
return LibRTStrings_IsIdentifier_slow(c);
45+
}
46+
2847
#endif // MYPYC_CODEPOINT_EXTRA_OPS_H

mypyc/lib-rt/setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ def run(self) -> None:
103103
"librt.strings",
104104
[
105105
"strings/librt_strings.c",
106+
"codepoint_extra_ops.c",
106107
"init.c",
107108
"int_ops.c",
108109
"exc_ops.c",

mypyc/lib-rt/strings/librt_strings.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,6 +1194,7 @@ DEFINE_CP_BOOL_WRAPPER(isspace, LibRTStrings_IsSpace)
11941194
DEFINE_CP_BOOL_WRAPPER(isdigit, LibRTStrings_IsDigit)
11951195
DEFINE_CP_BOOL_WRAPPER(isalnum, LibRTStrings_IsAlnum)
11961196
DEFINE_CP_BOOL_WRAPPER(isalpha, LibRTStrings_IsAlpha)
1197+
DEFINE_CP_BOOL_WRAPPER(isidentifier, LibRTStrings_IsIdentifier)
11971198

11981199
static PyMethodDef librt_strings_module_methods[] = {
11991200
{"write_i16_le", (PyCFunction) write_i16_le, METH_FASTCALL,
@@ -1268,6 +1269,9 @@ static PyMethodDef librt_strings_module_methods[] = {
12681269
{"isalpha", cp_isalpha, METH_O,
12691270
PyDoc_STR("Test whether a codepoint (i32) is a Unicode letter.")
12701271
},
1272+
{"isidentifier", cp_isidentifier, METH_O,
1273+
PyDoc_STR("Test whether a codepoint (i32) is a valid identifier start (XID_Start).")
1274+
},
12711275
{NULL, NULL, 0, NULL}
12721276
};
12731277

mypyc/primitives/librt_strings_ops.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,3 +431,15 @@
431431
error_kind=ERR_NEVER,
432432
dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS],
433433
)
434+
435+
# isidentifier checks XID_Start semantics for a single codepoint, matching
436+
# str.isidentifier() on a 1-character string. The non-ASCII path allocates
437+
# but swallows any failure (OOM or out-of-range codepoint), returning False and keeping the function ERR_NEVER.
438+
function_op(
439+
name="librt.strings.isidentifier",
440+
arg_types=[int32_rprimitive],
441+
return_type=bool_rprimitive,
442+
c_function_name="LibRTStrings_IsIdentifier",
443+
error_kind=ERR_NEVER,
444+
dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS],
445+
)

mypyc/test-data/irbuild-librt-strings.test

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,3 +387,17 @@ def is_a(c):
387387
L0:
388388
r0 = LibRTStrings_IsAlpha(c)
389389
return r0
390+
391+
[case testLibrtStringsIsIdentifierIR]
392+
from librt.strings import isidentifier
393+
from mypy_extensions import i32
394+
395+
def is_id(c: i32) -> bool:
396+
return isidentifier(c)
397+
[out]
398+
def is_id(c):
399+
c :: i32
400+
r0 :: bool
401+
L0:
402+
r0 = LibRTStrings_IsIdentifier(c)
403+
return r0

mypyc/test-data/run-librt-strings.test

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1443,7 +1443,7 @@ def test_new_without_init_is_usable() -> None:
14431443
[case testLibrtStringsCodepointClassifiers_librt]
14441444
from typing import Any
14451445
from mypy_extensions import i32
1446-
from librt.strings import isspace, isdigit, isalnum, isalpha
1446+
from librt.strings import isspace, isdigit, isalnum, isalpha, isidentifier
14471447

14481448
from testutil import assertRaises
14491449

@@ -1455,6 +1455,7 @@ def test_codepoint_classifiers() -> None:
14551455
assert not isdigit(bad)
14561456
assert not isalnum(bad)
14571457
assert not isalpha(bad)
1458+
assert not isidentifier(bad)
14581459
# Verify each codepoint primitive agrees with the matching str method
14591460
# across all Unicode codepoints, including the ord(chr(i)) round-trip.
14601461
# Any forces generic dispatch on the str side.
@@ -1466,6 +1467,7 @@ def test_codepoint_classifiers() -> None:
14661467
assert isdigit(o) == isdigit(i) == a.isdigit()
14671468
assert isalnum(o) == isalnum(i) == a.isalnum()
14681469
assert isalpha(o) == isalpha(i) == a.isalpha()
1470+
assert isidentifier(o) == isidentifier(i) == a.isidentifier()
14691471

14701472

14711473
def test_codepoint_classifiers_via_any() -> None:
@@ -1476,6 +1478,7 @@ def test_codepoint_classifiers_via_any() -> None:
14761478
(isdigit, "5", "a"),
14771479
(isalnum, "A", " "),
14781480
(isalpha, "A", " "),
1481+
(isidentifier, "A", "0"),
14791482
):
14801483
f: Any = fn
14811484
assert f(ord(true_input)) is True

0 commit comments

Comments
 (0)