diff --git a/package-lock.json b/package-lock.json index 2ee8657..0e471d0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -7,7 +7,8 @@ "workspaces": [ "packages/cel-spec", "packages/cel", - "packages/example" + "packages/example", + "packages/re2" ], "devDependencies": { "@arethetypeswrong/cli": "^0.18.2", @@ -476,6 +477,10 @@ "node": ">=14.17" } }, + "node_modules/@bufbuild/re2": { + "resolved": "packages/re2", + "link": true + }, "node_modules/@colors/colors": { "version": "1.5.0", "resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.5.0.tgz", @@ -929,29 +934,6 @@ "node": ">=18" } }, - "node_modules/@isaacs/balanced-match": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/@isaacs/balanced-match/-/balanced-match-4.0.1.tgz", - "integrity": "sha512-yzMTt9lEb8Gv7zRioUilSglI0c0smZ9k5D65677DLWLtWJaXIS3CqcGyUFByYKlnUj6TkjLVs54fBl6+TiGQDQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": "20 || >=22" - } - }, - "node_modules/@isaacs/brace-expansion": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/@isaacs/brace-expansion/-/brace-expansion-5.0.0.tgz", - "integrity": "sha512-ZT55BDLV0yv0RBm2czMiZ+SqCGO7AvmOM3G/w2xhVPH+te0aKgFjmBvGlL1dH+ql2tgGO3MVrbb3jCKyvpgnxA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@isaacs/balanced-match": "^4.0.1" - }, - "engines": { - "node": "20 || >=22" - } - }, "node_modules/@loaderkit/resolve": { "version": "1.0.4", "resolved": "https://registry.npmjs.org/@loaderkit/resolve/-/resolve-1.0.4.tgz", @@ -963,31 +945,18 @@ } }, "node_modules/@peggyjs/from-mem": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/@peggyjs/from-mem/-/from-mem-3.1.1.tgz", - "integrity": "sha512-m5OEjgJaePWpyNtQCvRZkpLoV+z44eh6QIO9yEwQuOThdUdkECO3wcKLT3tFA3H8WM5bxU/K/dpmo7r/X16UEw==", + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/@peggyjs/from-mem/-/from-mem-3.1.3.tgz", + "integrity": 
"sha512-LLlgtfXIaeYXoOYovOI0spLM8ZXaqkAlmcRRrLzHJzLMqkU6Sw0R4KMoCoHx1PjaP815pSCBlS+BN6aD8t1Jgg==", "dev": true, "license": "MIT", "dependencies": { - "semver": "7.7.2" + "semver": "7.7.4" }, "engines": { "node": ">=20.8" } }, - "node_modules/@peggyjs/from-mem/node_modules/semver": { - "version": "7.7.2", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz", - "integrity": "sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==", - "dev": true, - "license": "ISC", - "bin": { - "semver": "bin/semver.js" - }, - "engines": { - "node": ">=10" - } - }, "node_modules/@sindresorhus/is": { "version": "4.6.0", "resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-4.6.0.tgz", @@ -1036,6 +1005,20 @@ "typescript": "*" } }, + "node_modules/@unicode/unicode-15.0.0": { + "version": "1.6.16", + "resolved": "https://registry.npmjs.org/@unicode/unicode-15.0.0/-/unicode-15.0.0-1.6.16.tgz", + "integrity": "sha512-Ca38T4Hv/+CeUoOX4Oowm/0+OONr0sV0GqzLCzL/lo9AbxX6wweco55lNwOJJyB7v7bYKq+oBizINOlXbsjY8Q==", + "dev": true, + "license": "MIT" + }, + "node_modules/@unicode/unicode-16.0.0": { + "version": "1.6.16", + "resolved": "https://registry.npmjs.org/@unicode/unicode-16.0.0/-/unicode-16.0.0-1.6.16.tgz", + "integrity": "sha512-R2Vxi0XEsCMD9WOQT85O2npa7g+i4RsJ8Xtn+/KODLqa5wH5zCTn5an6JILJhMSfFGD3t3amES2XnvUHnKjMZQ==", + "dev": true, + "license": "MIT" + }, "node_modules/ansi-escapes": { "version": "7.2.0", "resolved": "https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-7.2.0.tgz", @@ -1088,6 +1071,29 @@ "dev": true, "license": "MIT" }, + "node_modules/balanced-match": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-4.0.4.tgz", + "integrity": "sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==", + "dev": true, + "license": "MIT", + "engines": { + "node": "18 || 20 || >=22" + } + }, + "node_modules/brace-expansion": { + "version": 
"5.0.5", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.5.tgz", + "integrity": "sha512-VZznLgtwhn+Mact9tfiwx64fA9erHH/MCXEUfB/0bX/6Fz6ny5EGTXYltMocqg4xFAQZtnO3DHWWXi8RiuN7cQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^4.0.2" + }, + "engines": { + "node": "18 || 20 || >=22" + } + }, "node_modules/chalk": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", @@ -1454,16 +1460,16 @@ } }, "node_modules/minimatch": { - "version": "10.1.1", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.1.1.tgz", - "integrity": "sha512-enIvLvRAFZYXJzkCYG5RKmPfrFArdLv+R+lbQ53BmIMLIry74bjKzX6iHAm8WYamJkhSSEabrWN5D97XnKObjQ==", + "version": "10.2.5", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.2.5.tgz", + "integrity": "sha512-MULkVLfKGYDFYejP07QOurDLLQpcjk7Fw+7jXS2R2czRQzR56yHRveU5NDJEOviH+hETZKSkIk5c+T23GjFUMg==", "dev": true, "license": "BlueOak-1.0.0", "dependencies": { - "@isaacs/brace-expansion": "^5.0.0" + "brace-expansion": "^5.0.5" }, "engines": { - "node": "20 || >=22" + "node": "18 || 20 || >=22" }, "funding": { "url": "https://github.com/sponsors/isaacs" @@ -1546,15 +1552,15 @@ "license": "MIT" }, "node_modules/peggy": { - "version": "5.0.6", - "resolved": "https://registry.npmjs.org/peggy/-/peggy-5.0.6.tgz", - "integrity": "sha512-Sud8Zus0JAgE+U4zwkJv29OOaXhviFI7J90/6cGfy3OoqR8dpnieeF9a46dj0bTtqiFnrFatldA6ltQyOJvNmg==", + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/peggy/-/peggy-5.1.0.tgz", + "integrity": "sha512-IEo5aYRZ2kXH4Qby06cjtL114PZnwLoTiA41vUmg2vPZgANn+c87m5BUurhuDr5/cu758ZlpgsAfBVx+hhO5+w==", "dev": true, "license": "MIT", "dependencies": { - "@peggyjs/from-mem": "3.1.1", - "commander": "^14.0.0", - "source-map-generator": "2.0.2" + "@peggyjs/from-mem": "3.1.3", + "commander": "^14.0.3", + "source-map-generator": "2.0.6" }, "bin": { "peggy": "bin/peggy.js" @@ -1577,9 +1583,9 @@ } }, 
"node_modules/peggy/node_modules/commander": { - "version": "14.0.2", - "resolved": "https://registry.npmjs.org/commander/-/commander-14.0.2.tgz", - "integrity": "sha512-TywoWNNRbhoD0BXs1P3ZEScW8W5iKrnbithIl0YH+uCmBd0QpPOA8yc82DS3BIE5Ma6FnBVUsJ7wVUDz4dvOWQ==", + "version": "14.0.3", + "resolved": "https://registry.npmjs.org/commander/-/commander-14.0.3.tgz", + "integrity": "sha512-H+y0Jo/T1RZ9qPP4Eh1pkcQcLRglraJaSLoyOtHxu6AapkjWVCy2Sit1QQ4x3Dng8qDlSsZEet7g5Pq06MvTgw==", "dev": true, "license": "MIT", "engines": { @@ -1620,9 +1626,9 @@ } }, "node_modules/semver": { - "version": "7.7.3", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz", - "integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==", + "version": "7.7.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.4.tgz", + "integrity": "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==", "dev": true, "license": "ISC", "bin": { @@ -1646,9 +1652,9 @@ } }, "node_modules/source-map-generator": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/source-map-generator/-/source-map-generator-2.0.2.tgz", - "integrity": "sha512-unCl5BQhF/us51DiT7SvlSY3QUPhyfAdHJxd8l7FXdwzqxli0UDMV2dEuei2SeGp3Z4rB/AJ9zKi1mGOp2K2ww==", + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/source-map-generator/-/source-map-generator-2.0.6.tgz", + "integrity": "sha512-IlassDs1Ve8nV6uyQZXF9kdkJpVKnMte2JZQXu13M0A5zwc+vu6+LNHfmxsHBMDtoZE21RHiKI0/xvpecZRCNg==", "dev": true, "license": "BSD-3-Clause", "engines": { @@ -1747,14 +1753,14 @@ } }, "node_modules/tinyglobby": { - "version": "0.2.15", - "resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.15.tgz", - "integrity": "sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ==", + "version": "0.2.16", + "resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.16.tgz", + 
"integrity": "sha512-pn99VhoACYR8nFHhxqix+uvsbXineAasWm5ojXoN8xEwK5Kd3/TrhNn1wByuD52UxWRLy8pu+kRMniEi6Eq9Zg==", "dev": true, "license": "MIT", "dependencies": { "fdir": "^6.5.0", - "picomatch": "^4.0.3" + "picomatch": "^4.0.4" }, "engines": { "node": ">=12.0.0" @@ -1782,9 +1788,9 @@ } }, "node_modules/tinyglobby/node_modules/picomatch": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz", - "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", + "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", "dev": true, "license": "MIT", "engines": { @@ -1958,6 +1964,16 @@ "node": ">=4" } }, + "node_modules/unicode-property-value-aliases": { + "version": "3.9.0", + "resolved": "https://registry.npmjs.org/unicode-property-value-aliases/-/unicode-property-value-aliases-3.9.0.tgz", + "integrity": "sha512-UFsQgFziRAXuFS9hN48PnwujNFwsuRWHj+bd840X7aYC1xa0MHlKsaxzapp5lM4C4HuAPko5cFiTUydeQ428Bg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, "node_modules/validate-npm-package-name": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/validate-npm-package-name/-/validate-npm-package-name-5.0.1.tgz", @@ -2030,7 +2046,8 @@ "version": "0.4.0", "license": "Apache-2.0", "dependencies": { - "@bufbuild/cel-spec": "0.4.0" + "@bufbuild/cel-spec": "0.4.0", + "@bufbuild/re2": "0.4.0" }, "devDependencies": { "expect-type": "^1.3.0", @@ -2061,6 +2078,16 @@ "@bufbuild/cel-spec": "^0.4.0", "@bufbuild/protobuf": "^2.6.2" } + }, + "packages/re2": { + "name": "@bufbuild/re2", + "version": "0.4.0", + "license": "MIT", + "devDependencies": { + "@unicode/unicode-15.0.0": "^1.6.16", + "@unicode/unicode-16.0.0": "^1.6.16", + "unicode-property-value-aliases": "^3.9.0" + } } } } diff --git a/package.json 
b/package.json index 1ee9136..35c2ecb 100644 --- a/package.json +++ b/package.json @@ -10,7 +10,12 @@ "license-header": "license-header --ignore 'packages/**'", "lint": "biome lint --error-on-warnings" }, - "workspaces": ["packages/cel-spec", "packages/cel", "packages/example"], + "workspaces": [ + "packages/cel-spec", + "packages/cel", + "packages/example", + "packages/re2" + ], "type": "module", "packageManager": "npm@10.9.0", "licenseHeader": { diff --git a/packages/cel/package.json b/packages/cel/package.json index d931a0c..3520bec 100644 --- a/packages/cel/package.json +++ b/packages/cel/package.json @@ -51,11 +51,12 @@ "@bufbuild/protobuf": "^2.6.2" }, "dependencies": { - "@bufbuild/cel-spec": "0.4.0" + "@bufbuild/cel-spec": "0.4.0", + "@bufbuild/re2": "0.4.0" }, "devDependencies": { + "expect-type": "^1.3.0", "peggy": "^5.0.6", - "peggy-ts": "github:hudlow/peggy-ts#v0.0.9", - "expect-type": "^1.3.0" + "peggy-ts": "github:hudlow/peggy-ts#v0.0.9" } } diff --git a/packages/cel/src/std/logic.ts b/packages/cel/src/std/logic.ts index bfca5a0..5f62162 100644 --- a/packages/cel/src/std/logic.ts +++ b/packages/cel/src/std/logic.ts @@ -26,62 +26,10 @@ import { equals } from "../equals.js"; import { celMethod, celFunc } from "../func.js"; import type { CelList } from "../list.js"; import type { CelMap } from "../map.js"; - -/** - * Patterns that are supported in ECMAScript RE and not in - * RE2. 
/**
 * CEL `matches`: reports whether the receiver string matches `pattern`.
 *
 * The pattern is compiled with `RE2JS.compile` on every call and evaluated
 * with the compiled expression's `test` method (not `testExact`).
 * Nothing is caught here, so whatever `RE2JS.compile` throws for a pattern it
 * rejects propagates to the caller.
 *
 * @param pattern - Regular expression source handed to `RE2JS.compile`.
 * @returns The boolean produced by `test` for the receiver string.
 */
export function matches(this: string, pattern: string): boolean {
  return RE2JS.compile(pattern).test(this);
}
- celFunc(opc.LOGICAL_NOT, [BOOL], BOOL, (x) => !x), - // = - celFunc(opc.EQUALS, [DYN, DYN], BOOL, equals), - celFunc(opc.NOT_EQUALS, [DYN, DYN], BOOL, (l, r) => !equals(l, r)), - // < - celFunc(opc.LESS, [BOOL, BOOL], BOOL, (l, r) => l < r), - celFunc(opc.LESS, [BYTES, BYTES], BOOL, (l, r) => compareBytes(l, r) < 0), - celFunc(opc.LESS, [DOUBLE, DOUBLE], BOOL, (l, r) => l < r), - celFunc(opc.LESS, [STRING, STRING], BOOL, (l, r) => l < r), - celFunc(opc.LESS, [INT, INT], BOOL, (l, r) => l < r), - celFunc(opc.LESS, [INT, UINT], BOOL, (l, r) => l < r.value), - celFunc(opc.LESS, [UINT, INT], BOOL, (l, r) => l.value < r), - celFunc(opc.LESS, [UINT, UINT], BOOL, (l, r) => l.value < r.value), - // TODO investigate: ECMAScript relational operators support mixed bigint/number operands, - // but removing the coercion to number here fails the conformance test "not_lt_dyn_int_big_lossy_double" - celFunc(opc.LESS, [INT, DOUBLE], BOOL, (l, r) => Number(l) < r), - celFunc(opc.LESS, [DOUBLE, INT], BOOL, (l, r) => l < Number(r)), - celFunc(opc.LESS, [DOUBLE, UINT], BOOL, (l, r) => l < Number(r.value)), - celFunc(opc.LESS, [UINT, DOUBLE], BOOL, (l, r) => Number(l.value) < r), - celFunc(opc.LESS, [DURATION, DURATION], BOOL, (l, r) => compareDuration(l, r) < 0), - celFunc(opc.LESS, [TIMESTAMP, TIMESTAMP], BOOL, (l, r) => compareTimestamp(l, r) < 0), - // <= - celFunc(opc.LESS_EQUALS, [BOOL, BOOL], BOOL, (l, r) => l <= r), - celFunc(opc.LESS_EQUALS, [BYTES, BYTES], BOOL, (l, r) => compareBytes(l, r) <= 0), - celFunc(opc.LESS_EQUALS, [DOUBLE, DOUBLE], BOOL, (l, r) => l <= r), - celFunc(opc.LESS_EQUALS, [STRING, STRING], BOOL, (l, r) => l <= r), - celFunc(opc.LESS_EQUALS, [INT, INT], BOOL, (l, r) => l <= r), - celFunc(opc.LESS_EQUALS, [INT, UINT], BOOL, (l, r) => l <= r.value), - celFunc(opc.LESS_EQUALS, [UINT, INT], BOOL, (l, r) => l.value <= r), - celFunc(opc.LESS_EQUALS, [UINT, UINT], BOOL, (l, r) => l.value <= r.value), - celFunc(opc.LESS_EQUALS, [INT, DOUBLE], BOOL, (l, r) => 
Number(l) <= r), - celFunc(opc.LESS_EQUALS, [DOUBLE, INT], BOOL, (l, r) => l <= Number(r)), - celFunc(opc.LESS_EQUALS, [DOUBLE, UINT], BOOL, (l, r) => l <= Number(r.value)), - celFunc(opc.LESS_EQUALS, [UINT, DOUBLE], BOOL, (l, r) => Number(l.value) <= r), - celFunc(opc.LESS_EQUALS, [DURATION, DURATION], BOOL, (l, r) => compareDuration(l, r) <= 0), - celFunc(opc.LESS_EQUALS, [TIMESTAMP, TIMESTAMP], BOOL, (l, r) => compareTimestamp(l, r) <= 0), - // > - celFunc(opc.GREATER, [BOOL, BOOL], BOOL, (l, r) => l > r), - celFunc(opc.GREATER, [BYTES, BYTES], BOOL, (l, r) => compareBytes(l, r) > 0), - celFunc(opc.GREATER, [DOUBLE, DOUBLE], BOOL, (l, r) => l > r), - celFunc(opc.GREATER, [STRING, STRING], BOOL, (l, r) => l > r), - celFunc(opc.GREATER, [INT, INT], BOOL, (l, r) => l > r), - celFunc(opc.GREATER, [INT, UINT], BOOL, (l, r) => l > r.value), - celFunc(opc.GREATER, [UINT, INT], BOOL, (l, r) => l.value > r), - celFunc(opc.GREATER, [UINT, UINT], BOOL, (l, r) => l.value > r.value), - celFunc(opc.GREATER, [INT, DOUBLE], BOOL, (l, r) => Number(l) > r), - celFunc(opc.GREATER, [DOUBLE, INT], BOOL, (l, r) => l > Number(r)), - celFunc(opc.GREATER, [DOUBLE, UINT], BOOL, (l, r) => l > Number(r.value)), - celFunc(opc.GREATER, [UINT, DOUBLE], BOOL, (l, r) => Number(l.value) > r), - celFunc(opc.GREATER, [DURATION, DURATION], BOOL, (l, r) => compareDuration(l, r) > 0), - celFunc(opc.GREATER, [TIMESTAMP, TIMESTAMP], BOOL, (l, r) => compareTimestamp(l, r) > 0), - // >= - celFunc(opc.GREATER_EQUALS, [BOOL, BOOL], BOOL, (l, r) => l >= r), - celFunc(opc.GREATER_EQUALS, [BYTES, BYTES], BOOL, (l, r) => compareBytes(l, r) >= 0), - celFunc(opc.GREATER_EQUALS, [DOUBLE, DOUBLE], BOOL, (l, r) => l >= r), - celFunc(opc.GREATER_EQUALS, [STRING, STRING], BOOL, (l, r) => l >= r), - celFunc(opc.GREATER_EQUALS, [INT, INT], BOOL, (l, r) => l >= r), - celFunc(opc.GREATER_EQUALS, [INT, UINT], BOOL, (l, r) => l >= r.value), - celFunc(opc.GREATER_EQUALS, [UINT, INT], BOOL, (l, r) => l.value >= r), - 
celFunc(opc.GREATER_EQUALS, [UINT, UINT], BOOL, (l, r) => l.value >= r.value), - celFunc(opc.GREATER_EQUALS, [INT, DOUBLE], BOOL, (l, r) => Number(l) >= r), - celFunc(opc.GREATER_EQUALS, [DOUBLE, INT], BOOL, (l, r) => l >= Number(r)), - celFunc(opc.GREATER_EQUALS, [DOUBLE, UINT], BOOL, (l, r) => l >= Number(r.value)), - celFunc(opc.GREATER_EQUALS, [UINT, DOUBLE], BOOL, (l, r) => Number(l.value) >= r), - celFunc(opc.GREATER_EQUALS, [DURATION, DURATION], BOOL, (l, r) => compareDuration(l, r) >= 0), - celFunc(opc.GREATER_EQUALS, [TIMESTAMP, TIMESTAMP], BOOL, (l, r) => compareTimestamp(l, r) >= 0), - // size - celFunc(olc.SIZE, [BYTES], INT, (x) => BigInt(x.length)), - celFunc(olc.SIZE, [LIST], INT, (x) => BigInt(x.size)), - celFunc(olc.SIZE, [STRING], INT, (x) => BigInt([...x].length)), - celFunc(olc.SIZE, [MAP], INT, (x) => BigInt(x.size)), - celMethod(olc.SIZE, BYTES, [], INT, function () {return BigInt(this.length)}), - celMethod(olc.SIZE, LIST, [], INT, function () {return BigInt(this.size)}), - celMethod(olc.SIZE, STRING, [], INT, function () {return BigInt([...this].length)}), - celMethod(olc.SIZE, MAP, [], INT, function () {return BigInt(this.size)}), - // in - celFunc(opc.IN, [DYN, LIST], BOOL, inList), - celFunc(opc.IN, [STRING, MAP], BOOL, inMap), - celFunc(opc.IN, [DOUBLE, MAP], BOOL, inMap), - celFunc(opc.IN, [INT, MAP], BOOL, inMap), - celFunc(opc.IN, [BOOL, MAP], BOOL, inMap), - celFunc(opc.IN, [UINT, MAP], BOOL, inMap), - // string.* - celMethod(olc.CONTAINS, STRING, [STRING], BOOL, String.prototype.includes), - celMethod(olc.ENDS_WITH, STRING, [STRING], BOOL, String.prototype.endsWith), - celMethod(olc.STARTS_WITH, STRING, [STRING], BOOL, String.prototype.startsWith), - celMethod(olc.MATCHES, STRING, [STRING], BOOL, matches), + // ! 
+ celFunc(opc.LOGICAL_NOT, [BOOL], BOOL, (x) => !x), + // = + celFunc(opc.EQUALS, [DYN, DYN], BOOL, equals), + celFunc(opc.NOT_EQUALS, [DYN, DYN], BOOL, (l, r) => !equals(l, r)), + // < + celFunc(opc.LESS, [BOOL, BOOL], BOOL, (l, r) => l < r), + celFunc(opc.LESS, [BYTES, BYTES], BOOL, (l, r) => compareBytes(l, r) < 0), + celFunc(opc.LESS, [DOUBLE, DOUBLE], BOOL, (l, r) => l < r), + celFunc(opc.LESS, [STRING, STRING], BOOL, (l, r) => l < r), + celFunc(opc.LESS, [INT, INT], BOOL, (l, r) => l < r), + celFunc(opc.LESS, [INT, UINT], BOOL, (l, r) => l < r.value), + celFunc(opc.LESS, [UINT, INT], BOOL, (l, r) => l.value < r), + celFunc(opc.LESS, [UINT, UINT], BOOL, (l, r) => l.value < r.value), + // TODO investigate: ECMAScript relational operators support mixed bigint/number operands, + // but removing the coercion to number here fails the conformance test "not_lt_dyn_int_big_lossy_double" + celFunc(opc.LESS, [INT, DOUBLE], BOOL, (l, r) => Number(l) < r), + celFunc(opc.LESS, [DOUBLE, INT], BOOL, (l, r) => l < Number(r)), + celFunc(opc.LESS, [DOUBLE, UINT], BOOL, (l, r) => l < Number(r.value)), + celFunc(opc.LESS, [UINT, DOUBLE], BOOL, (l, r) => Number(l.value) < r), + celFunc(opc.LESS, [DURATION, DURATION], BOOL, (l, r) => compareDuration(l, r) < 0), + celFunc(opc.LESS, [TIMESTAMP, TIMESTAMP], BOOL, (l, r) => compareTimestamp(l, r) < 0), + // <= + celFunc(opc.LESS_EQUALS, [BOOL, BOOL], BOOL, (l, r) => l <= r), + celFunc(opc.LESS_EQUALS, [BYTES, BYTES], BOOL, (l, r) => compareBytes(l, r) <= 0), + celFunc(opc.LESS_EQUALS, [DOUBLE, DOUBLE], BOOL, (l, r) => l <= r), + celFunc(opc.LESS_EQUALS, [STRING, STRING], BOOL, (l, r) => l <= r), + celFunc(opc.LESS_EQUALS, [INT, INT], BOOL, (l, r) => l <= r), + celFunc(opc.LESS_EQUALS, [INT, UINT], BOOL, (l, r) => l <= r.value), + celFunc(opc.LESS_EQUALS, [UINT, INT], BOOL, (l, r) => l.value <= r), + celFunc(opc.LESS_EQUALS, [UINT, UINT], BOOL, (l, r) => l.value <= r.value), + celFunc(opc.LESS_EQUALS, [INT, DOUBLE], BOOL, (l, r) => 
Number(l) <= r), + celFunc(opc.LESS_EQUALS, [DOUBLE, INT], BOOL, (l, r) => l <= Number(r)), + celFunc(opc.LESS_EQUALS, [DOUBLE, UINT], BOOL, (l, r) => l <= Number(r.value)), + celFunc(opc.LESS_EQUALS, [UINT, DOUBLE], BOOL, (l, r) => Number(l.value) <= r), + celFunc(opc.LESS_EQUALS, [DURATION, DURATION], BOOL, (l, r) => compareDuration(l, r) <= 0), + celFunc(opc.LESS_EQUALS, [TIMESTAMP, TIMESTAMP], BOOL, (l, r) => compareTimestamp(l, r) <= 0), + // > + celFunc(opc.GREATER, [BOOL, BOOL], BOOL, (l, r) => l > r), + celFunc(opc.GREATER, [BYTES, BYTES], BOOL, (l, r) => compareBytes(l, r) > 0), + celFunc(opc.GREATER, [DOUBLE, DOUBLE], BOOL, (l, r) => l > r), + celFunc(opc.GREATER, [STRING, STRING], BOOL, (l, r) => l > r), + celFunc(opc.GREATER, [INT, INT], BOOL, (l, r) => l > r), + celFunc(opc.GREATER, [INT, UINT], BOOL, (l, r) => l > r.value), + celFunc(opc.GREATER, [UINT, INT], BOOL, (l, r) => l.value > r), + celFunc(opc.GREATER, [UINT, UINT], BOOL, (l, r) => l.value > r.value), + celFunc(opc.GREATER, [INT, DOUBLE], BOOL, (l, r) => Number(l) > r), + celFunc(opc.GREATER, [DOUBLE, INT], BOOL, (l, r) => l > Number(r)), + celFunc(opc.GREATER, [DOUBLE, UINT], BOOL, (l, r) => l > Number(r.value)), + celFunc(opc.GREATER, [UINT, DOUBLE], BOOL, (l, r) => Number(l.value) > r), + celFunc(opc.GREATER, [DURATION, DURATION], BOOL, (l, r) => compareDuration(l, r) > 0), + celFunc(opc.GREATER, [TIMESTAMP, TIMESTAMP], BOOL, (l, r) => compareTimestamp(l, r) > 0), + // >= + celFunc(opc.GREATER_EQUALS, [BOOL, BOOL], BOOL, (l, r) => l >= r), + celFunc(opc.GREATER_EQUALS, [BYTES, BYTES], BOOL, (l, r) => compareBytes(l, r) >= 0), + celFunc(opc.GREATER_EQUALS, [DOUBLE, DOUBLE], BOOL, (l, r) => l >= r), + celFunc(opc.GREATER_EQUALS, [STRING, STRING], BOOL, (l, r) => l >= r), + celFunc(opc.GREATER_EQUALS, [INT, INT], BOOL, (l, r) => l >= r), + celFunc(opc.GREATER_EQUALS, [INT, UINT], BOOL, (l, r) => l >= r.value), + celFunc(opc.GREATER_EQUALS, [UINT, INT], BOOL, (l, r) => l.value >= r), + 
celFunc(opc.GREATER_EQUALS, [UINT, UINT], BOOL, (l, r) => l.value >= r.value), + celFunc(opc.GREATER_EQUALS, [INT, DOUBLE], BOOL, (l, r) => Number(l) >= r), + celFunc(opc.GREATER_EQUALS, [DOUBLE, INT], BOOL, (l, r) => l >= Number(r)), + celFunc(opc.GREATER_EQUALS, [DOUBLE, UINT], BOOL, (l, r) => l >= Number(r.value)), + celFunc(opc.GREATER_EQUALS, [UINT, DOUBLE], BOOL, (l, r) => Number(l.value) >= r), + celFunc(opc.GREATER_EQUALS, [DURATION, DURATION], BOOL, (l, r) => compareDuration(l, r) >= 0), + celFunc(opc.GREATER_EQUALS, [TIMESTAMP, TIMESTAMP], BOOL, (l, r) => compareTimestamp(l, r) >= 0), + // size + celFunc(olc.SIZE, [BYTES], INT, (x) => BigInt(x.length)), + celFunc(olc.SIZE, [LIST], INT, (x) => BigInt(x.size)), + celFunc(olc.SIZE, [STRING], INT, (x) => BigInt([...x].length)), + celFunc(olc.SIZE, [MAP], INT, (x) => BigInt(x.size)), + celMethod(olc.SIZE, BYTES, [], INT, function () { + return BigInt(this.length) + }), + celMethod(olc.SIZE, LIST, [], INT, function () { + return BigInt(this.size) + }), + celMethod(olc.SIZE, STRING, [], INT, function () { + return BigInt([...this].length) + }), + celMethod(olc.SIZE, MAP, [], INT, function () { + return BigInt(this.size) + }), + // in + celFunc(opc.IN, [DYN, LIST], BOOL, inList), + celFunc(opc.IN, [STRING, MAP], BOOL, inMap), + celFunc(opc.IN, [DOUBLE, MAP], BOOL, inMap), + celFunc(opc.IN, [INT, MAP], BOOL, inMap), + celFunc(opc.IN, [BOOL, MAP], BOOL, inMap), + celFunc(opc.IN, [UINT, MAP], BOOL, inMap), + // string.* + celMethod(olc.CONTAINS, STRING, [STRING], BOOL, String.prototype.includes), + celMethod(olc.ENDS_WITH, STRING, [STRING], BOOL, String.prototype.endsWith), + celMethod(olc.STARTS_WITH, STRING, [STRING], BOOL, String.prototype.startsWith), + celMethod(olc.MATCHES, STRING, [STRING], BOOL, matches), ]; diff --git a/packages/cel/src/testing.ts b/packages/cel/src/testing.ts index 817a19f..7d1505b 100644 --- a/packages/cel/src/testing.ts +++ b/packages/cel/src/testing.ts @@ -59,7 +59,7 @@ import { 
// Import the debug-string helpers through the `@bufbuild/cel-spec` package
// specifier, like the other cel-spec imports below, rather than through a
// relative path into that package's `dist/cjs` build output.
import {
  KindAdorner,
  SemanticAdorner,
  toDebugString,
} from "@bufbuild/cel-spec/testdata/to-debug-string.js";
import { check, outputType } from "./check.js";
import type { SimpleTest } from "@bufbuild/cel-spec/cel/expr/conformance/test/simple_pb.js";
import { protoTypeToCelType } from "./checker.js";
diff --git a/packages/re2/README.md b/packages/re2/README.md new file mode 100644 index 0000000..0301684 --- /dev/null +++ b/packages/re2/README.md @@ -0,0 +1,30 @@ +# @bufbuild/re2 + +This package provides an [RE2-compatible](https://cel.dev) regular expression engine, designed for use with CEL-es and Protovalidate-es. + +## Usage + +```ts +import { RE2JS } from '@bufbuild/re2'; + +const re = RE2JS.compile('^foo'); +console.log(re.test('foo')); // true +console.log(re.testExact('fooxyz')); // false + +console.log(RE2JS.matches('^foo', 'foo')); // true +``` + +## Limitations +Only boolean matchers are supported: `test` and `testExact`. + +The instance method `matches` is an alias for `testExact`. The static method `RE2JS.matches` compiles +a regular expression and calls `testExact`. + +As a size optimization, Unicode category and script information is generated on first use. This causes a +slight slowdown the first time a pattern is compiled with a category or script (only the referenced category +or script is generated). The categories and scripts in Unicode version 16.0 are supported. This package bundles +the data needed to support Unicode version 16.0 on JavaScript runtimes that implement Unicode 15.0 or later. + +## Credits +This code is a fork of the [RE2JS](https://re2js.leopard.in.ua) project. It has been converted to TypeScript and has a feature set tailored for +CEL and Protovalidate-es. 
\ No newline at end of file diff --git a/packages/re2/biome.json b/packages/re2/biome.json new file mode 100644 index 0000000..4ce16bd --- /dev/null +++ b/packages/re2/biome.json @@ -0,0 +1,14 @@ +{ + "$schema": "https://biomejs.dev/schemas/1.9.4/schema.json", + "extends": ["../../biome.base.json"], + "formatter": { + "ignore": ["dist", "src/gen"] + }, + "linter": { + "rules": { + "suspicious": { + "noControlCharactersInRegex": "off" + } + } + } +} diff --git a/packages/re2/package.json b/packages/re2/package.json new file mode 100644 index 0000000..2da557a --- /dev/null +++ b/packages/re2/package.json @@ -0,0 +1,49 @@ +{ + "name": "@bufbuild/re2", + "version": "0.4.0", + "description": "An RE2-compatible regex engine tailored for CEL and Protovalidate", + "keywords": [ + "javascript", + "typescript", + "protobuf", + "cel", + "common-expression-language" + ], + "license": "MIT", + "repository": { + "type": "git", + "url": "git+https://github.com/bufbuild/cel-es.git", + "directory": "packages/re2" + }, + "scripts": { + "test": "npx tsx --test ./src/__tests__/*.test.ts", + "prebuild": "rm -rf ./dist/*", + "build": "npm run build:cjs && npm run build:esm", + "build:cjs": "tsc --project tsconfig.json --module commonjs --verbatimModuleSyntax false --moduleResolution node10 --outDir ./dist/cjs && echo >./dist/cjs/package.json '{\"type\":\"commonjs\"}'", + "build:esm": "tsc --project tsconfig.json --outDir ./dist/esm", + "format": "biome format --write", + "lint": "biome lint --error-on-warnings", + "attw": "attw --pack", + "license-header": "license-header --ignore 'src/**' --ignore 'scripts/**'" + }, + "licenseHeader": { + "licenseType": "MIT", + "yearRange": "2023-2026", + "copyrightHolder": "Alexey Vasiliev and Buf Technologies, Inc." 
/**
 * Compresses a sequence of code points into `[first, last, stride]` triples.
 *
 * Code points are fed in one at a time. Consecutive values that keep the same
 * step are folded into one triple. Grouping is greedy: the step of a run is
 * fixed by its first two members, and a value that breaks the step closes the
 * run and opens a new one.
 */
class CodepointRange {
  // Completed `[first, last, stride]` triples, in the order they were closed.
  builder = [];
  // First code point of the run being built; null while no run is open.
  setStart = null;
  // Step of the open run; null until the run has a second member.
  setStride = null;
  // Most recently accepted code point.
  lastInSet = null;

  /**
   * Appends one code point to the open run, closing the run first when the
   * step changes.
   *
   * Values are expected in ascending order (`addAll` sorts for the caller);
   * nothing here enforces that, and a smaller value would yield a negative
   * stride. A value equal to the previous one is ignored, so repeats can no
   * longer produce a run with stride 0.
   */
  add(codepoint) {
    if (this.setStart === null) {
      this.setStart = codepoint;
    } else if (codepoint === this.lastInSet) {
      // Repeat of the previous value: it is already covered by the open run.
      return;
    } else if (this.setStride === null) {
      this.setStride = codepoint - this.lastInSet;
    } else if (codepoint - this.lastInSet !== this.setStride) {
      // Step changed: close the current run and start a new one here.
      this.builder.push([this.setStart, this.lastInSet, this.setStride]);
      this.setStart = codepoint;
      this.setStride = null;
    }
    this.lastInSet = codepoint;
  }

  /**
   * Adds every code point of an iterable, in ascending numeric order.
   * The input is copied before sorting, so it is not mutated.
   */
  addAll(codepoints) {
    const ascending = Array.from(codepoints).sort((a, b) => a - b);
    for (const codepoint of ascending) {
      this.add(codepoint);
    }
  }

  /**
   * Closes the open run, if any, and returns all triples collected so far.
   * A run with a single member is reported with stride 1.
   */
  finish() {
    if (this.setStart !== null) {
      this.builder.push([
        this.setStart,
        this.lastInSet,
        this.setStride === null ? 1 : this.setStride,
      ]);
    }
    return this.builder;
  }
}

export { CodepointRange };
/**
 * Builds the case-folding orbit table from the imported common (C) and
 * simple (S) folding tables.
 *
 * Every code point with a folding is grouped with its fold target. Each group
 * of two or more code points is then turned into a cycle: in ascending order,
 * each member maps to the next one and the largest maps back to the smallest.
 *
 * `CommonCaseFolding` and `SimpleCaseFolding` are only used through
 * `has`/`get`; the common table takes precedence when both contain an entry.
 *
 * @returns Map from code point to the next code point of its orbit.
 */
const generateCaseFoldOrbits = () => {
  // Fold target -> set of code points that fold to it (target included).
  const orbits = new Map();
  // The bound is inclusive: MAX_CODE_POINT is itself a code point.
  for (let cp = 0; cp <= MAX_CODE_POINT; cp++) {
    let target;
    if (CommonCaseFolding.has(cp)) {
      target = CommonCaseFolding.get(cp);
    } else if (SimpleCaseFolding.has(cp)) {
      target = SimpleCaseFolding.get(cp);
    } else {
      continue;
    }
    let orbit = orbits.get(target);
    if (orbit === undefined) {
      orbit = new Set([target]);
      orbits.set(target, orbit);
    }
    orbit.add(cp);
  }

  const next = new Map();
  for (const orbit of orbits.values()) {
    // A single-member orbit is an identity folding and carries no information.
    if (orbit.size < 2) continue;
    const members = Array.from(orbit).sort((a, b) => a - b);
    members.forEach((member, idx) => {
      // The modulo wraps the last member back to the first, closing the cycle.
      next.set(member, members[(idx + 1) % members.length]);
    });
  }
  return next;
};
/**
 * Returns the elements of `a` that are not in `b`, as a new Set.
 * A missing `b` (null/undefined) means "nothing to subtract".
 */
const setDiff = (a, b) => {
  if (!b) return new Set(a);
  return new Set(Array.from(a).filter((x) => !b.has(x)));
};

// --- VLQ ---

// 64-symbol alphabet; the index of a symbol is the 6-bit digit it encodes.
const B64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-";

/**
 * Encodes one integer as little-endian base-32 digits. Each digit carries
 * five payload bits; bit 0x20 is set on every digit except the last one.
 */
const encodeVLQ = (value) => {
  const symbols = [];
  let rest = value;
  for (;;) {
    const payload = rest & 0x1f;
    rest >>>= 5;
    if (rest === 0) {
      // Final digit: continuation bit stays clear.
      symbols.push(B64[payload]);
      break;
    }
    symbols.push(B64[payload | 0x20]);
  }
  return symbols.join("");
};

/**
 * Encodes `[lo, hi, stride]` ranges as a VLQ string.
 *
 * Per range the output holds `lo - previous hi` (0 before the first range),
 * then `hi - lo`, and - only when at least one range has a stride other
 * than 1 - the stride itself.
 *
 * @returns {{encoded: string, stride1: boolean, empty: boolean}}
 */
const encodeRanges = (ranges) => {
  if (ranges.length === 0) return { encoded: "", stride1: true, empty: true };
  const stride1 = !ranges.some((range) => range[2] !== 1);
  let previousHi = 0;
  const pieces = ranges.map((range) => {
    const lo = range[0];
    const hi = range[1];
    let piece = encodeVLQ(lo - previousHi) + encodeVLQ(hi - lo);
    if (!stride1) piece += encodeVLQ(range[2]);
    previousHi = hi;
    return piece;
  });
  return { encoded: pieces.join(""), stride1, empty: false };
};
UnicodeTables.ts --- + +const code = [ + "// GENERATED BY tools/scripts/genUnicodeTable.js; DO NOT EDIT.", + "// yarn node ./tools/scripts/genUnicodeTable.js > src/UnicodeTables.ts", + "", + "import { UnicodeRangeTable } from './UnicodeRangeTable.js'", + "", + "let _B64_MAP: Uint8Array | null = null", + "const getB64Map = (): Uint8Array => {", + " if (!_B64_MAP) {", + " _B64_MAP = new Uint8Array(256)", + ` const b = '${B64}'`, + " for (let i = 0; i < 64; i++) {", + " _B64_MAP[b.charCodeAt(i)] = i", + " }", + " }", + " return _B64_MAP", + "}", + "", + "const decodeVLQ = (str: string): number[] => {", + " const b64 = getB64Map()", + " const res: number[] = []", + " let value = 0, shift = 0", + " for (let i = 0; i < str.length; i++) {", + " const digit = b64[str.charCodeAt(i)]", + " value |= (digit & 0x1f) << shift", + " if ((digit & 0x20) === 0) {", + " res.push(value)", + " value = 0", + " shift = 0", + " } else {", + " shift += 5", + " }", + " }", + " return res", + "}", + "", + "const decodeRanges = (str: string, isStride1: boolean): Uint32Array => {", + " if (str.length === 0) return new Uint32Array(0)", + " const res = decodeVLQ(str)", + " const numRanges = isStride1 ? res.length / 2 : res.length / 3", + " const out = new Uint32Array(numRanges * 3)", + " let current = 0, resIdx = 0", + " for (let i = 0; i < numRanges; i++) {", + " current += res[resIdx++]", + " out[i * 3] = current", + " current += res[resIdx++]", + " out[i * 3 + 1] = current", + " out[i * 3 + 2] = isStride1 ? 1 : res[resIdx++]", + " }", + " return out", + "}", + "", + "const decodeOrbit = (str: string): Map => {", + " const res = decodeVLQ(str)", + " const map = new Map()", + " let currentKey = 0", + " for (let i = 0; i < res.length; i += 2) {", + " currentKey += res[i]", + " map.set(currentKey, res[i + 1])", + " }", + " return map", + "}", + "", + "// Merges two stride-encoded UnicodeRangeTables. 
Expands any stride>1", + "// ranges to individual codepoints, then coalesces contiguous runs.", + "const mergeRanges = (a: Uint32Array, b: Uint32Array): Uint32Array => {", + " if (b.length === 0) return a", + " if (a.length === 0) return b", + " const points: [number, number][] = []", + " const push = (arr: Uint32Array): void => {", + " for (let i = 0; i < arr.length; i += 3) {", + " const lo = arr[i], hi = arr[i + 1], stride = arr[i + 2]", + " if (stride === 1) {", + " points.push([lo, hi])", + " } else {", + " for (let cp = lo; cp <= hi; cp += stride) points.push([cp, cp])", + " }", + " }", + " }", + " push(a)", + " push(b)", + " points.sort((x, y) => x[0] - y[0])", + " const merged: [number, number][] = []", + " for (const [lo, hi] of points) {", + " const last = merged[merged.length - 1]", + " if (last && last[1] + 1 >= lo) {", + " if (hi > last[1]) last[1] = hi", + " } else {", + " merged.push([lo, hi])", + " }", + " }", + " const out = new Uint32Array(merged.length * 3)", + " for (let i = 0; i < merged.length; i++) {", + " out[i * 3] = merged[i][0]", + " out[i * 3 + 1] = merged[i][1]", + " out[i * 3 + 2] = 1", + " }", + " return out", + "}", + "", + "// Sweeps the codepoint space using a platform property-escape regex and", + "// returns stride-1 ranges. 
Surrogates are included — String.fromCodePoint", + "// returns the lone surrogate char and platform regex matches \\p{Cs} on it.", + "const sweepPlatform = (pattern: string): Uint32Array => {", + ' const re = new RegExp(pattern, "u")', + " const ranges: number[] = []", + " let start = -1", + " for (let cp = 0; cp <= 0x10ffff; cp++) {", + " if (re.test(String.fromCodePoint(cp))) {", + " if (start < 0) start = cp", + " } else if (start >= 0) {", + " ranges.push(start, cp - 1, 1)", + " start = -1", + " }", + " }", + " if (start >= 0) ranges.push(start, 0x10ffff, 1)", + " return Uint32Array.from(ranges)", + "}", + "", + "class LazyDecoder {", + " private readonly initializer: Record V>", + " private readonly cache: Map", + " constructor(initializer: Record V>) {", + " this.initializer = initializer", + " this.cache = new Map()", + " }", + " has(key: string): boolean { return key in this.initializer }", + " get(key: string): V | null {", + " const cached = this.cache.get(key)", + " if (cached !== undefined || this.cache.has(key)) {", + " return cached ?? null", + " }", + " const fn = this.initializer[key]", + " const val = fn ? fn() : null", + " this.cache.set(key, val)", + " return val", + " }", + "}", + "", + "let _CASE_ORBIT: Map | null = null", + "const getCASE_ORBIT = (): Map => {", + " if (!_CASE_ORBIT) {", + ` _CASE_ORBIT = decodeOrbit('${orbitEnc}')`, + " }", + " return _CASE_ORBIT", + "}", + "", + "// Additions from Unicode 15.0 → 16.0 per stable general-category name.", + "// Merged unconditionally with platform sweep output; no-op on 16.0+ engines.", + "const _DELTA_CATEGORIES = /*#__PURE__*/ new LazyDecoder({", + ...deltaCatLines, + "})", + "", + "// Additions from Unicode 15.0 → 16.0 per stable script name.", + "const _DELTA_SCRIPTS = /*#__PURE__*/ new LazyDecoder({", + ...deltaScrLines, + "})", + "", + "// Full tables for scripts added in Unicode 16.0. 
Engines < 16.0 throw", + "// SyntaxError on these names, so platform sweep is impossible.", + "const _NEW_SCRIPTS = /*#__PURE__*/ new LazyDecoder({", + ...newScriptLines, + "})", + "", + `const STABLE_CATEGORY_NAMES: ReadonlySet = new Set(${JSON.stringify(stableCategoryNames)})`, + `const STABLE_SCRIPT_NAMES: ReadonlySet = new Set(${JSON.stringify(stableScriptNames)})`, + `const NEW_SCRIPT_NAMES: ReadonlySet = new Set(${JSON.stringify(newScriptNames)})`, + "", + "const _sweepCache = new Map()", + "const _foldCache = new Map()", + "", + "// Returns the base range table for a property name, or null if unknown.", + "// Stable names: platform sweep + bundled delta (15.0 → 16.0).", + "// New-in-16.0 script names: bundled full table.", + "const buildForProperty = (name: string): UnicodeRangeTable | null => {", + " if (NEW_SCRIPT_NAMES.has(name)) {", + " return _NEW_SCRIPTS.get(name)", + " }", + ' let kind: "category" | "script" | null = null', + " let pattern: string | null = null", + ' if (STABLE_CATEGORY_NAMES.has(name)) { kind = "category"; pattern = `\\\\p{General_Category=${name}}` }', + ' else if (STABLE_SCRIPT_NAMES.has(name)) { kind = "script"; pattern = `\\\\p{Script=${name}}` }', + " else return null", + "", + " const cacheKey = `${kind}:${name}`", + " const cached = _sweepCache.get(cacheKey)", + " if (cached) return cached", + "", + " const base = sweepPlatform(pattern)", + ' const delta = kind === "category" ? _DELTA_CATEGORIES.get(name) : _DELTA_SCRIPTS.get(name)', + " const merged = delta ? mergeRanges(base, delta) : base", + " const table = new UnicodeRangeTable(merged)", + " _sweepCache.set(cacheKey, table)", + " return table", + "}", + "", + "// Computes the fold-overlay for a property name: additional runes that", + "// fold to some rune already in the base class. 
Returns null if no overlay", + "// is needed (base class is fold-stable).", + "const buildFoldOverlay = (name: string): UnicodeRangeTable | null => {", + " const cached = _foldCache.get(name)", + " if (cached !== undefined) return cached", + " const base = buildForProperty(name)", + " if (!base) {", + " _foldCache.set(name, null)", + " return null", + " }", + " const inBase = (r: number): boolean => {", + " let lo = 0, hi = base.length", + " while (lo < hi) {", + " const m = (lo + hi) >> 1", + " const rlo = base.getLo(m), rhi = base.getHi(m)", + " if (r < rlo) hi = m", + " else if (r > rhi) lo = m + 1", + " else return ((r - rlo) % base.getStride(m)) === 0", + " }", + " return false", + " }", + " // Inline simpleFold to avoid circular import with Unicode.ts.", + " const orbit = getCASE_ORBIT()", + " const simpleFold = (r: number): number => {", + " const folded = orbit.get(r)", + " if (folded !== undefined) return folded", + " const s = String.fromCodePoint(r)", + " const lower = s.toLowerCase()", + " if (lower.length === s.length) {", + " const lowerCp = lower.codePointAt(0)", + " if (lowerCp !== undefined && lowerCp !== r) return lowerCp", + " }", + " const upper = s.toUpperCase()", + " if (upper.length === s.length) {", + " const upperCp = upper.codePointAt(0)", + " if (upperCp !== undefined && upperCp !== r) return upperCp", + " }", + " return r", + " }", + " const extras = new Set()", + " for (let i = 0; i < base.length; i++) {", + " const lo = base.getLo(i), hi = base.getHi(i), stride = base.getStride(i)", + " for (let cp = lo; cp <= hi; cp += stride) {", + " let r = simpleFold(cp)", + " while (r !== cp) {", + " if (!inBase(r)) extras.add(r)", + " r = simpleFold(r)", + " }", + " }", + " }", + " if (extras.size === 0) {", + " _foldCache.set(name, null)", + " return null", + " }", + " const sorted = Array.from(extras).sort((a, b) => a - b)", + " const merged: [number, number][] = []", + " for (const cp of sorted) {", + " const last = merged[merged.length - 1]", 
+ " if (last && last[1] + 1 === cp) last[1] = cp", + " else merged.push([cp, cp])", + " }", + " const out = new Uint32Array(merged.length * 3)", + " for (let i = 0; i < merged.length; i++) {", + " out[i * 3] = merged[i][0]", + " out[i * 3 + 1] = merged[i][1]", + " out[i * 3 + 2] = 1", + " }", + " const table = new UnicodeRangeTable(out)", + " _foldCache.set(name, table)", + " return table", + "}", + "", + "const getUpper = (): UnicodeRangeTable => {", + ' const table = buildForProperty("Lu")', + " if (table === null) {", + ' throw new Error("Upper: missing Lu property")', + " }", + " return table", + "}", + "", + "// --- Legacy API surface used by Parser ---", + "", + "export const UnicodeTables = {", + " get CASE_ORBIT(): Map { return getCASE_ORBIT() },", + " STABLE_CATEGORY_NAMES,", + " STABLE_SCRIPT_NAMES,", + " NEW_SCRIPT_NAMES,", + " buildForProperty,", + " buildFoldOverlay,", + " CATEGORIES: {", + " has: (name: string): boolean => STABLE_CATEGORY_NAMES.has(name),", + " get: (name: string): UnicodeRangeTable | null => buildForProperty(name),", + " },", + " SCRIPTS: {", + " has: (name: string): boolean =>", + " STABLE_SCRIPT_NAMES.has(name) || NEW_SCRIPT_NAMES.has(name),", + " get: (name: string): UnicodeRangeTable | null => buildForProperty(name),", + " },", + " FOLD_CATEGORIES: {", + " has: (name: string): boolean => STABLE_CATEGORY_NAMES.has(name),", + " get: (name: string): UnicodeRangeTable | null => buildFoldOverlay(name),", + " },", + " FOLD_SCRIPT: {", + " has: (name: string): boolean =>", + " STABLE_SCRIPT_NAMES.has(name) || NEW_SCRIPT_NAMES.has(name),", + " get: (name: string): UnicodeRangeTable | null => buildFoldOverlay(name),", + " },", + " get Upper(): UnicodeRangeTable { return getUpper() },", + "", + " // --- Test-only hooks: expose the raw bundled 15.0→16.0 delta and", + " // new-in-16.0 script data so tests can verify the generator output.", + " // These are not part of the public API.", + " _deltaCategoryRanges: (name: string): Uint32Array | 
null => _DELTA_CATEGORIES.get(name),", + " _deltaScriptRanges: (name: string): Uint32Array | null => _DELTA_SCRIPTS.get(name),", + " _newScriptTable: (name: string): UnicodeRangeTable | null => _NEW_SCRIPTS.get(name),", + "}", + "", +]; + +console.log(code.join("\n")); // eslint-disable-line no-console diff --git a/packages/re2/scripts/make_perl_groups.pl b/packages/re2/scripts/make_perl_groups.pl new file mode 100755 index 0000000..4b2e66f --- /dev/null +++ b/packages/re2/scripts/make_perl_groups.pl @@ -0,0 +1,126 @@ +#!/usr/bin/perl +# +# Copyright (c) 2020 The Go Authors. All rights reserved. +# +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. +# + +# Modified version of make_perl_groups.pl from RE2J: +# https://github.com/google/re2j/blob/master/java/com/google/re2j/make_perl_groups.pl + +# Generate table entries giving character ranges +# for POSIX/Perl character classes. Rather than +# figure out what the definition is, it is easier to ask +# Perl about each letter from 0-128 and write down +# its answer. + +@posixclasses = ( + "[:alnum:]", + "[:alpha:]", + "[:ascii:]", + "[:blank:]", + "[:cntrl:]", + "[:digit:]", + "[:graph:]", + "[:lower:]", + "[:print:]", + "[:punct:]", + "[:space:]", + "[:upper:]", + "[:word:]", + "[:xdigit:]", +); + +@perlclasses = ( + "\\d", + "\\s", + "\\w", +); + +%overrides = ( + # Prior to Perl 5.18, \s did not match vertical tab. + # RE2 preserves that original behaviour. 
+ "\\s:11" => 0, +); + +sub ComputeClass($) { + my @ranges; + my ($class) = @_; + my $regexp = "[$class]"; + my $start = -1; + for (my $i=0; $i<=129; $i++) { + if ($i == 129) { $i = 256; } + if ($i <= 128 && ($overrides{"$class:$i"} // chr($i) =~ $regexp)) { + if ($start < 0) { + $start = $i; + } + } else { + if ($start >= 0) { + push @ranges, [$start, $i-1]; + } + $start = -1; + } + } + return @ranges; +} + +sub PrintClass($$@) { + my ($cname, $groupmap, $name, @ranges) = @_; + print "const code$cname = [ /* $name */\n"; + for (my $i=0; $i<@ranges; $i++) { + my @a = @{$ranges[$i]}; + printf "\t0x%x, 0x%x,\n", $a[0], $a[1]; + } + print "]\n\n"; + my $n = @ranges; + $negname = $name; + if ($negname =~ /:/) { + $negname =~ s/:/:^/; + } else { + $negname =~ y/a-z/A-Z/; + } + $name =~ s/\\/\\\\/g; + $negname =~ s/\\/\\\\/g; + return "[\"$name\", \tnew CharGroup(+1, code$cname)],\n" . + "[\"$negname\", \tnew CharGroup(-1, code$cname)],\n"; +} + +my $gen = 0; + +sub PrintClasses($@) { + my ($cname, @classes) = @_; + my $groupmap = uc($cname) . "_GROUPS"; + my @entries; + foreach my $cl (@classes) { + my @ranges = ComputeClass($cl); + push @entries, PrintClass(++$gen, $groupmap, $cl, @ranges); + } + print "const $groupmap = new Map([\n"; + foreach my $e (@entries) { + print $e; + } + print "])\n"; + my $count = @entries; +} + +print < src/CharGroup.js + +class CharGroup { + constructor(sign, cls) { + this.sign = sign + this.cls = cls + } +} + +EOF + +PrintClasses("perl", @perlclasses); +PrintClasses("posix", @posixclasses); + +print < 1|. 
// qsortIntPair() quicksorts, in place, the pairs stored at the even indices
// left..right of |array| (each pair occupies two consecutive slots).
// Ordering is whatever CharClass.cmp defines; |left| and |right| are the
// indices of the first and the last pair to sort.
static qsortIntPair(array: number[], left: number, right: number): void {
  // Pick the middle pair as pivot; "& ~1" rounds down to a pair boundary.
  const middle = ((left + right) / 2) | 0;
  const pivotAt = middle & ~1;
  const pivotLo = array[pivotAt];
  const pivotHi = array[pivotAt + 1];

  let up = left;
  let down = right;
  while (up <= down) {
    while (up < right && CharClass.cmp(array, up, pivotLo, pivotHi) < 0) {
      up += 2;
    }
    while (down > left && CharClass.cmp(array, down, pivotLo, pivotHi) > 0) {
      down -= 2;
    }
    if (up > down) {
      break;
    }
    if (up !== down) {
      // Swap both halves of the two pairs.
      [array[up], array[down]] = [array[down], array[up]];
      [array[up + 1], array[down + 1]] = [array[down + 1], array[up + 1]];
    }
    up += 2;
    down -= 2;
  }

  if (left < down) {
    CharClass.qsortIntPair(array, left, down);
  }
  if (up < right) {
    CharClass.qsortIntPair(array, up, right);
  }
}

// Inclusive ranges stored as consecutive [lo, hi] pairs; r.length is even.
r: number[];
// Length of the prefix of |r| that is in use. Even.
len: number;

constructor(r = emptyInts()) {
  this.r = r;
  this.len = r.length;
}

// toArray() returns the ranges as a flat int array. When the whole backing
// array is in use it is returned as-is (not copied), so later CharClass
// operations may mutate it; call this last.
toArray(): number[] {
  return this.len === this.r.length ? this.r : this.r.slice(0, this.len);
}
// appendLiteral() adds the single rune |x|; when |flags| contains FOLD_CASE
// its case-folded equivalents are added as well.
appendLiteral(x: number, flags: number): this {
  if ((flags & FOLD_CASE) !== 0) {
    return this.appendFoldedRange(x, x);
  }
  return this.appendRange(x, x);
}

// appendRange() adds the inclusive range [lo, hi].
// If the new range overlaps or abuts the last or the next-to-last stored
// range, that range is widened instead of storing a new pair. Looking at
// two ranges lets one keep growing A-Z while the other grows a-z when a
// case-folded alphabet is appended.
appendRange(lo: number, hi: number): this {
  for (const back of [2, 4]) {
    if (this.len < back) {
      break;
    }
    const at = this.len - back;
    const storedLo = this.r[at];
    const storedHi = this.r[at + 1];
    const touches = lo <= storedHi + 1 && storedLo <= hi + 1;
    if (touches) {
      if (lo < storedLo) {
        this.r[at] = lo;
      }
      if (hi > storedHi) {
        this.r[at + 1] = hi;
      }
      return this;
    }
  }

  // Disjoint from both candidates: store a new pair at the end.
  this.r[this.len] = lo;
  this.r[this.len + 1] = hi;
  this.len += 2;
  return this;
}
// appendClass() adds every [lo, hi] pair of |x|. |x| is expected to be
// clean and is not mutated.
appendClass(x: number[]): this {
  let at = 0;
  while (at < x.length) {
    this.appendRange(x[at], x[at + 1]);
    at += 2;
  }
  return this;
}

// appendFoldedClass() adds every [lo, hi] pair of |x| together with its
// case-folded equivalents. |x| is not mutated.
appendFoldedClass(x: number[]): this {
  let at = 0;
  while (at < x.length) {
    this.appendFoldedRange(x[at], x[at + 1]);
    at += 2;
  }
  return this;
}

// appendNegatedClass() adds the complement of |x| within [0, MAX_RUNE]:
// the gap before each pair and whatever remains after the last one.
// |x| is expected to be clean and is not mutated.
appendNegatedClass(x: number[]): this {
  let gapStart = 0;
  for (let at = 0; at < x.length; at += 2) {
    const gapEnd = x[at] - 1;
    if (gapStart <= gapEnd) {
      this.appendRange(gapStart, gapEnd);
    }
    gapStart = x[at + 1] + 1;
  }
  if (gapStart <= MAX_RUNE) {
    this.appendRange(gapStart, MAX_RUNE);
  }
  return this;
}

// appendTable() adds every rune of the range table |table|. Stride-1
// entries are added as one range; other entries are expanded rune by rune.
// |table| is not mutated.
appendTable(table: UnicodeRangeTable): this {
  for (let row = 0; row < table.length; ++row) {
    const lo = table.getLo(row);
    const hi = table.getHi(row);
    const stride = table.getStride(row);
    if (stride === 1) {
      this.appendRange(lo, hi);
    } else {
      for (let rune = lo; rune <= hi; rune += stride) {
        this.appendRange(rune, rune);
      }
    }
  }
  return this;
}
// appendNegatedTable() adds the complement of range table |table| within
// [0, MAX_RUNE]. |table| is not mutated.
appendNegatedTable(table: UnicodeRangeTable): this {
  let gapStart = 0;
  // Adds the still-uncovered runes below |bound|, if there are any.
  const fillGapBelow = (bound: number): void => {
    if (gapStart <= bound - 1) {
      this.appendRange(gapStart, bound - 1);
    }
  };

  for (let row = 0; row < table.length; ++row) {
    const lo = table.getLo(row);
    const hi = table.getHi(row);
    const stride = table.getStride(row);
    if (stride === 1) {
      fillGapBelow(lo);
      gapStart = hi + 1;
    } else {
      // Strided entry: every rune it contains is excluded individually.
      for (let rune = lo; rune <= hi; rune += stride) {
        fillGapBelow(rune);
        gapStart = rune + 1;
      }
    }
  }

  if (gapStart <= MAX_RUNE) {
    this.appendRange(gapStart, MAX_RUNE);
  }
  return this;
}

// appendTableWithSign() appends |table| itself, or its negation when
// |sign| is negative. |table| is not mutated.
appendTableWithSign(table: UnicodeRangeTable, sign: number): this {
  if (sign < 0) {
    return this.appendNegatedTable(table);
  }
  return this.appendTable(table);
}

// negateClass() replaces this CharClass, in place, by its complement within
// [0, MAX_RUNE]. The class must already be clean.
negateClass(): this {
  let gapStart = 0; // first rune not yet covered by the input
  let write = 0; // next slot to write; never runs ahead of |read|
  for (let read = 0; read < this.len; read += 2) {
    const lo = this.r[read];
    const hi = this.r[read + 1];
    const gapEnd = lo - 1;
    if (gapStart <= gapEnd) {
      this.r[write] = gapStart;
      this.r[write + 1] = gapEnd;
      write += 2;
    }
    gapStart = hi + 1;
  }
  this.len = write;
  if (gapStart <= MAX_RUNE) {
    this.r[this.len] = gapStart;
    this.r[this.len + 1] = MAX_RUNE;
    this.len += 2;
  }
  return this;
}

// appendClassWithSign() appends |x| itself, or its negation when |sign| is
// negative. |x| is not mutated.
appendClassWithSign(x: number[], sign: number): this {
  if (sign < 0) {
    return this.appendNegatedClass(x);
  }
  return this.appendClass(x);
}
/**
 * A named character group: a flat list of inclusive [lo, hi] rune pairs plus
 * a sign (+1 for the group itself, -1 for its negation).
 */
class CharGroup {
  sign: number;
  cls: number[];

  constructor(sign: number, cls: number[]) {
    this.sign = sign;
    this.cls = cls;
  }
}

// \d: '0'-'9'
const code1 = [0x30, 0x39];
// \s: tab, line feed, form feed, carriage return, space (0x0b is not included)
const code2 = [0x9, 0xa, 0xc, 0xd, 0x20, 0x20];
// \w: '0'-'9', 'A'-'Z', '_', 'a'-'z'
const code3 = [0x30, 0x39, 0x41, 0x5a, 0x5f, 0x5f, 0x61, 0x7a];

// Built on first use so that importing this module does no work.
let _PERL_GROUPS: Map<string, CharGroup> | null = null;

/**
 * Returns the Perl character groups (\d \D \s \S \w \W), keyed by their
 * escape sequence. The map is created once and the same instance is
 * returned on every call; a negated group shares its `cls` array with the
 * positive one.
 */
const getPerlGroups = (): Map<string, CharGroup> => {
  if (!_PERL_GROUPS) {
    _PERL_GROUPS = new Map<string, CharGroup>([
      ["\\d", new CharGroup(+1, code1)],
      ["\\D", new CharGroup(-1, code1)],
      ["\\s", new CharGroup(+1, code2)],
      ["\\S", new CharGroup(-1, code2)],
      ["\\w", new CharGroup(+1, code3)],
      ["\\W", new CharGroup(-1, code3)],
    ]);
  }
  return _PERL_GROUPS;
};
/**
 * Various constants and helpers for Unicode code points.
 */
const ASCII_SIZE = 128;
// ASCII lookup tables, built on first use.
let _ASCII_TO_UPPER: Int32Array | null = null;
let _ASCII_TO_LOWER: Int32Array | null = null;

// Returns the table mapping each ASCII code to its upper-case form
// ('a'-'z' shifted down by 32, everything else unchanged).
const getAsciiToUpper = (): Int32Array => {
  if (!_ASCII_TO_UPPER) {
    _ASCII_TO_UPPER = new Int32Array(ASCII_SIZE);
    for (let i = 0; i < ASCII_SIZE; i++) {
      _ASCII_TO_UPPER[i] = i >= 97 && i <= 122 ? i - 32 : i;
    }
  }
  return _ASCII_TO_UPPER;
};

// Returns the table mapping each ASCII code to its lower-case form
// ('A'-'Z' shifted up by 32, everything else unchanged).
const getAsciiToLower = (): Int32Array => {
  if (!_ASCII_TO_LOWER) {
    _ASCII_TO_LOWER = new Int32Array(ASCII_SIZE);
    for (let i = 0; i < ASCII_SIZE; i++) {
      _ASCII_TO_LOWER[i] = i >= 65 && i <= 90 ? i + 32 : i;
    }
  }
  return _ASCII_TO_LOWER;
};

// Returns the code point of |s| when |s| consists of exactly one code
// point, otherwise undefined. The check compares the UTF-16 length with the
// width of the first code point, because a supplementary-plane character
// occupies two code units and a plain `length > 1` test would mistake it
// for a multi-character expansion such as "SS".
const soleCodePoint = (s: string): number | undefined => {
  const cp = s.codePointAt(0);
  if (cp === undefined) {
    return undefined;
  }
  const width = cp > 0xffff ? 2 : 1;
  return s.length === width ? cp : undefined;
};

/**
 * Returns the upper-case form of |codepoint|, or |codepoint| itself when the
 * platform mapping expands to several code points or does not map back to
 * |codepoint| when lower-cased again.
 */
function toUpperCase(codepoint: number): number {
  if (codepoint < ASCII_SIZE) return getAsciiToUpper()[codepoint];

  const upper = soleCodePoint(String.fromCodePoint(codepoint).toUpperCase());
  if (upper === undefined) {
    return codepoint;
  }
  // Only accept mappings that round-trip.
  const back = soleCodePoint(String.fromCodePoint(upper).toLowerCase());
  return back === codepoint ? upper : codepoint;
}

/**
 * Returns the lower-case form of |codepoint|, or |codepoint| itself when the
 * platform mapping expands to several code points or does not map back to
 * |codepoint| when upper-cased again.
 */
function toLowerCase(codepoint: number): number {
  if (codepoint < ASCII_SIZE) return getAsciiToLower()[codepoint];

  const lower = soleCodePoint(String.fromCodePoint(codepoint).toLowerCase());
  if (lower === undefined) {
    return codepoint;
  }
  // Only accept mappings that round-trip.
  const back = soleCodePoint(String.fromCodePoint(lower).toUpperCase());
  return back === codepoint ? lower : codepoint;
}

export { toUpperCase, toLowerCase };
/**
 * A fragment of a compiled regular expression program.
 *
 * Fields:
 *  - i: an instruction address (pc).
 *  - out: a patch list; see the explanation in Prog.
 *  - nullable: whether the fragment can match the empty string.
 *
 * @see http://swtch.com/~rsc/regexp/regexp1.html
 * @class
 */
class Frag {
  constructor(
    public i = 0,
    public out: PatchList = new PatchList(),
    public nullable = false,
  ) {}
}
+ cap(arg: number): Frag { + const f = this.newInst(Inst.CAPTURE); + f.out = new PatchList(f.i << 1, f.i << 1); + this.prog.getInst(f.i).arg = arg; + if (this.prog.numCap < arg + 1) { + this.prog.numCap = arg + 1; + } + return f; + } + + // Given fragments a and b, returns ab; a|b + cat(f1: Frag, f2: Frag): Frag { + // concat of failure is failure + if (f1.i === 0 || f2.i === 0) { + return this.fail(); + } + // eslint-disable-next-line no-warning-comments + // TODO(rsc): elide nop + this.prog.patch(f1.out, f2.i); + return new Frag(f1.i, f2.out, f1.nullable && f2.nullable); + } + + // Given fragments for a and b, returns fragment for a|b. + alt(f1: Frag, f2: Frag): Frag { + // alt of failure is other + if (f1.i === 0) { + return f2; + } + if (f2.i === 0) { + return f1; + } + const f = this.newInst(Inst.ALT); + const i = this.prog.getInst(f.i); + i.out = f1.i; + i.arg = f2.i; + f.out = this.prog.append(f1.out, f2.out); + f.nullable = f1.nullable || f2.nullable; + return f; + } + + // loop returns the fragment for the main loop of a plus or star. + // For plus, it can be used directly. with f1.i as the entry. + // For star, it can be used directly when f1 can't match an empty string. + // (When f1 can match an empty string, f1* must be implemented as (f1+)? + // to get the priority match order correct.) + loop(f1: Frag, nongreedy: boolean): Frag { + const f = this.newInst(Inst.ALT); + const i = this.prog.getInst(f.i); + if (nongreedy) { + i.arg = f1.i; + f.out = new PatchList(f.i << 1, f.i << 1); + } else { + i.out = f1.i; + f.out = new PatchList((f.i << 1) | 1, (f.i << 1) | 1); + } + this.prog.patch(f1.out, f.i); + return f; + } + + // Given a fragment for a, returns a fragment for a? or a?? 
(if nongreedy) + quest(f1: Frag, nongreedy: boolean): Frag { + const f = this.newInst(Inst.ALT); + const i = this.prog.getInst(f.i); + if (nongreedy) { + i.arg = f1.i; + f.out = new PatchList(f.i << 1, f.i << 1); + } else { + i.out = f1.i; + f.out = new PatchList((f.i << 1) | 1, (f.i << 1) | 1); + } + f.out = this.prog.append(f.out, f1.out); + return f; + } + + // Given a fragment a, returns a fragment for a* or a*? (if nongreedy) + star(f1: Frag, nongreedy: boolean): Frag { + if (f1.nullable) { + return this.quest(this.plus(f1, nongreedy), nongreedy); + } + return this.loop(f1, nongreedy); + } + + // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) + plus(f1: Frag, nongreedy: boolean): Frag { + return new Frag(f1.i, this.loop(f1, nongreedy).out, f1.nullable); + } + + // op is a bitmask of EMPTY_* flags. + empty(op: number): Frag { + const f = this.newInst(Inst.EMPTY_WIDTH); + this.prog.getInst(f.i).arg = op; + f.out = new PatchList(f.i << 1, f.i << 1); + return f; + } + + // flags : parser flags + rune(runes: number[], flags: number): Frag { + const f = this.newInst(Inst.RUNE); + f.nullable = false; + const i = this.prog.getInst(f.i); + i.runes = runes; + flags &= FOLD_CASE; + if (runes.length !== 1 || simpleFold(runes[0]) === runes[0]) { + flags &= ~FOLD_CASE; + } + i.arg = flags; + f.out = new PatchList(f.i << 1, f.i << 1); + if ( + ((flags & FOLD_CASE) === 0 && runes.length === 1) || + (runes.length === 2 && runes[0] === runes[1]) + ) { + i.op = Inst.RUNE1; + } else if (runes.length === 2 && runes[0] === 0 && runes[1] === MAX_RUNE) { + i.op = Inst.RUNE_ANY; + } else if ( + runes.length === 4 && + runes[0] === 0 && + runes[1] === 0x0a - 1 && + runes[2] === 0x0a + 1 && + runes[3] === MAX_RUNE + ) { + i.op = Inst.RUNE_ANY_NOT_NL; + } + return f; + } + + compile(re: Regexp): Frag { + switch (re.op) { + case Regexp.Op.NO_MATCH: + return this.fail(); + case Regexp.Op.EMPTY_MATCH: + return this.nop(); + case Regexp.Op.LITERAL: + if 
(re.runes.length === 0) { + return this.nop(); + } + let f: Frag | null = null; + for (let r of re.runes) { + const f1 = this.rune([r], re.flags); + f = f === null ? f1 : this.cat(f, f1); + } + return f as Frag; + case Regexp.Op.CHAR_CLASS: + return this.rune(re.runes, re.flags); + case Regexp.Op.ANY_CHAR_NOT_NL: + return this.rune(Compiler.ANY_RUNE_NOT_NL(), 0); + case Regexp.Op.ANY_CHAR: + return this.rune(Compiler.ANY_RUNE(), 0); + case Regexp.Op.BEGIN_LINE: + return this.empty(EMPTY_BEGIN_LINE); + case Regexp.Op.END_LINE: + return this.empty(EMPTY_END_LINE); + case Regexp.Op.BEGIN_TEXT: + return this.empty(EMPTY_BEGIN_TEXT); + case Regexp.Op.END_TEXT: + return this.empty(EMPTY_END_TEXT); + case Regexp.Op.WORD_BOUNDARY: + return this.empty(EMPTY_WORD_BOUNDARY); + case Regexp.Op.NO_WORD_BOUNDARY: + return this.empty(EMPTY_NO_WORD_BOUNDARY); + case Regexp.Op.CAPTURE: { + const bra = this.cap(re.cap << 1); + const sub = this.compile(re.subs[0]); + const ket = this.cap((re.cap << 1) | 1); + return this.cat(this.cat(bra, sub), ket); + } + case Regexp.Op.STAR: + return this.star( + this.compile(re.subs[0]), + (re.flags & NON_GREEDY) !== 0, + ); + case Regexp.Op.PLUS: + return this.plus( + this.compile(re.subs[0]), + (re.flags & NON_GREEDY) !== 0, + ); + case Regexp.Op.QUEST: + return this.quest( + this.compile(re.subs[0]), + (re.flags & NON_GREEDY) !== 0, + ); + case Regexp.Op.CONCAT: { + if (re.subs.length === 0) { + return this.nop(); + } + let f: Frag | null = null; + for (let sub of re.subs) { + const f1 = this.compile(sub); + f = f === null ? f1 : this.cat(f, f1); + } + if (f === null) { + throw new Error("invalid frag"); + } + return f; + } + case Regexp.Op.ALTERNATE: { + if (re.subs.length === 0) { + return this.nop(); + } + let f: Frag | null = null; + for (let sub of re.subs) { + const f1 = this.compile(sub); + f = f === null ? 
f1 : this.alt(f, f1); + } + if (f === null) { + throw new Error("invalid frag"); + } + return f; + } + default: + throw new RE2JSCompileException("regexp: unhandled case in compile"); + } + } +} + +export { Compiler }; diff --git a/packages/re2/src/DFA.ts b/packages/re2/src/DFA.ts new file mode 100644 index 0000000..f8608c9 --- /dev/null +++ b/packages/re2/src/DFA.ts @@ -0,0 +1,400 @@ +import { Inst } from "./Inst.js"; +import { UNANCHORED, ANCHOR_BOTH } from "./RE2Flags.js"; +import { MAX_ASCII, MAX_RUNE } from "./Unicode.js"; +import { emptyOpContext } from "./Utils.js"; +import type { Prog } from "./Prog.js"; +import type { MachineUTF16Input } from "./MachineInput.js"; + +// FNV-1a 32-bit hash for an array of integers. +const hashPCs = (pcs: Int32Array): number => { + let h = -2128831035; + for (let i = 0; i < pcs.length; i++) { + h ^= pcs[i]; + h = Math.imul(h, 16777619); + } + return h; +}; + +const arraysEqual = (a: Int32Array, b: Int32Array): boolean => { + if (a.length !== b.length) return false; + for (let i = 0; i < a.length; i++) { + if (a[i] !== b[i]) return false; + } + return true; +}; + +class DFAState { + nfaStates: Int32Array; + isMatch: boolean; + hasEmptyWidth: boolean; + matchIDs: number[]; + nextAscii: (DFAState | null)[]; + nextMap: Map; + + constructor( + nfaStates: Int32Array, + isMatch: boolean, + hasEmptyWidth: boolean, + matchIDs: number[] = [], + ) { + this.nfaStates = nfaStates; // Int32Array of Instruction PCs + this.isMatch = isMatch; + this.hasEmptyWidth = hasEmptyWidth; // true if any PC is an EMPTY_WIDTH instruction + this.matchIDs = matchIDs; + this.nextAscii = new Array(MAX_ASCII + 1).fill(null); + this.nextMap = new Map(); + } +} + +class DFA { + prog: Prog; + stateCache: Map; + stateCount: number; + startState: DFAState | null; + stateLimit: number; + cacheClears: number; + failed: boolean; + + static MAX_CACHE_CLEARS = 5; + + constructor(prog: Prog) { + this.prog = prog; + this.stateCache = new Map(); + this.stateCount = 0; + 
this.startState = null; + this.stateLimit = 10000; + this.cacheClears = 0; + this.failed = false; + } + + // Follows epsilon transitions to find all reachable states without consuming a char. + // Stops at EMPTY_WIDTH (includes the PC but does not follow through). + computeClosure(pcs: number[]): { + pcs: Int32Array; + isMatch: boolean; + hasEmptyWidth: boolean; + matchIDs: number[]; + } { + const closure = new Set(); + const stack = [...pcs]; + let isMatch = false; + let hasEmptyWidth = false; + const matchIDs: number[] = []; + + while (stack.length > 0) { + const pc = stack.pop(); + if (pc === undefined) { + throw new Error("invalid state"); + } + if (closure.has(pc)) continue; + closure.add(pc); + + const inst = this.prog.getInst(pc); + switch (inst.op) { + case Inst.MATCH: + isMatch = true; + if (!matchIDs.includes(inst.arg)) matchIDs.push(inst.arg); + break; + case Inst.ALT: + case Inst.ALT_MATCH: + stack.push(inst.out); + stack.push(inst.arg); + break; + case Inst.NOP: + case Inst.CAPTURE: + stack.push(inst.out); + break; + case Inst.EMPTY_WIDTH: + // Include in state but don't follow through — resolved at step time with context + hasEmptyWidth = true; + break; + } + } + + const sortedPCs = Int32Array.from(closure).sort(); + matchIDs.sort((a, b) => a - b); + return { pcs: sortedPCs, isMatch, hasEmptyWidth, matchIDs }; + } + + // Resolve EMPTY_WIDTH PCs using the given context. 
+ // Returns { resolvedPCs: Set, isMatch: boolean } + resolveEmptyWidth( + nfaStates: Int32Array, + context: number, + ): { resolvedPCs: Set; isMatch: boolean } { + const resolved = new Set(); + const stack: number[] = []; + let isMatch = false; + + // Start with all PCs in the state + for (let i = 0; i < nfaStates.length; i++) { + const pc = nfaStates[i]; + const inst = this.prog.getInst(pc); + if (inst.op === Inst.EMPTY_WIDTH) { + // Check if context satisfies the empty-width condition + if ((inst.arg & ~context) === 0) { + stack.push(inst.out); + } + } else { + resolved.add(pc); + if (inst.op === Inst.MATCH) { + isMatch = true; + } + } + } + + // Follow through from resolved EMPTY_WIDTH transitions + while (stack.length > 0) { + const pc = stack.pop(); + if (pc === undefined) { + throw new Error("invalid state"); + } + if (resolved.has(pc)) continue; + resolved.add(pc); + + const inst = this.prog.getInst(pc); + switch (inst.op) { + case Inst.MATCH: + isMatch = true; + break; + case Inst.ALT: + case Inst.ALT_MATCH: + stack.push(inst.out); + stack.push(inst.arg); + break; + case Inst.NOP: + case Inst.CAPTURE: + stack.push(inst.out); + break; + case Inst.EMPTY_WIDTH: + if ((inst.arg & ~context) === 0) { + stack.push(inst.out); + } + break; + } + } + + return { resolvedPCs: resolved, isMatch }; + } + + getState(pcs: number[]): DFAState | null { + const closureResult = this.computeClosure(pcs); + + const sortedPCs = closureResult.pcs; + const hash = hashPCs(sortedPCs); + + let bucket = this.stateCache.get(hash); + if (bucket) { + for (let i = 0; i < bucket.length; i++) { + const state = bucket[i]; + if (arraysEqual(state.nfaStates, sortedPCs)) { + return state; + } + } + } else { + bucket = []; + this.stateCache.set(hash, bucket); + } + + if (this.failed) return null; + + if (this.stateCount >= this.stateLimit) { + this.stateCache.clear(); + this.stateCount = 0; + this.startState = null; + this.cacheClears++; + + if (this.cacheClears >= DFA.MAX_CACHE_CLEARS) { + 
this.failed = true; + } + return null; + } + + const state = new DFAState( + sortedPCs, + closureResult.isMatch, + closureResult.hasEmptyWidth, + closureResult.matchIDs, + ); + bucket.push(state); + this.stateCount++; + return state; + } + + // Compute the next DFA state given a current state, a character, and context. + // Context is needed only when the state has EMPTY_WIDTH PCs. + step( + state: DFAState, + charCode: number, + anchor: number, + context: number, + ): DFAState | null { + // Cache lookup + let cacheKey = 0; + if (state.hasEmptyWidth) { + // Context-dependent: include context in key + cacheKey = + charCode * 128 + (context & 0x3f) * 2 + (anchor === UNANCHORED ? 0 : 1); + const cached = state.nextMap.get(cacheKey); + if (cached !== undefined || state.nextMap.has(cacheKey)) { + return cached ?? null; + } + } else { + // Context-independent: use original caching + if (anchor === UNANCHORED && charCode <= MAX_ASCII) { + const next = state.nextAscii[charCode]; + if (next !== null) { + return next; + } + } else { + cacheKey = charCode + (anchor === UNANCHORED ? 0 : MAX_RUNE + 1); + const cached = state.nextMap.get(cacheKey); + if (cached !== undefined || state.nextMap.has(cacheKey)) { + return cached ?? null; + } + } + } + + // Determine which PCs to check for RUNE matches + let activePCs: Set | Int32Array; + if (state.hasEmptyWidth) { + const { resolvedPCs } = this.resolveEmptyWidth(state.nfaStates, context); + activePCs = resolvedPCs; + } else { + activePCs = state.nfaStates; + } + + // Collect next PCs from RUNE matches + const nextPCs = []; + const iterPCs = activePCs instanceof Set ? 
activePCs : state.nfaStates; + for (const pc of iterPCs) { + const inst = this.prog.getInst(pc); + if (Inst.isRuneOp(inst.op) && inst.matchRune(charCode)) { + nextPCs.push(inst.out); + } + } + + if (anchor === UNANCHORED) { + nextPCs.push(this.prog.start); + } + + const nextState = this.getState(nextPCs); + + // Cache the result + if (state.hasEmptyWidth) { + state.nextMap.set(cacheKey, nextState); + } else if (anchor === UNANCHORED && charCode <= MAX_ASCII) { + state.nextAscii[charCode] = nextState; + } else { + cacheKey = charCode + (anchor === UNANCHORED ? 0 : MAX_RUNE + 1); + state.nextMap.set(cacheKey, nextState); + } + + return nextState; + } + + // The hot loop: Execute the Lazy DFA + match(input: MachineUTF16Input, pos: number, anchor: number): boolean | null { + if (!this.startState) { + this.startState = this.getState([this.prog.start]); + if (!this.startState) return null; + } + + const endPos = input.endPos(); + let currentState: DFAState | null = this.startState; + // prevRune: the rune immediately before position `pos`. For pos=0 this is + // -1 (beginning-of-text sentinel). For pos>0 we query the input so that + // ^, \A, and \b anchors use the correct context when matching begins + // from a mid-text offset. 
+ let prevRune = -1; + if (pos > 0) { + const r = input.step(pos - 1) >> 3; + if (r >= 0) prevRune = r; + } + + // Check if start state matches directly (e.g., empty pattern) + if (currentState.isMatch) { + if (anchor === ANCHOR_BOTH) { + if (pos === endPos) return true; + } else { + return true; + } + } + + let i = pos; + + while (i < endPos) { + const r = input.step(i); + const rune = r >> 3; + const width = r & 7; + if (width === 0) break; + + // Compute context at position i (between prevRune and rune) + const context = emptyOpContext(prevRune, rune); + + // Before consuming: check if EMPTY_WIDTH in current state resolves to MATCH + if (currentState.hasEmptyWidth) { + const { isMatch } = this.resolveEmptyWidth( + currentState.nfaStates, + context, + ); + if (isMatch) { + if (anchor === ANCHOR_BOTH) { + // Match at position i (before consuming rune) — only valid if i === endPos + // which can't happen in this loop, so skip + } else { + return true; + } + } + } + + // Consume rune and transition to next state + if ( + !currentState.hasEmptyWidth && + anchor === UNANCHORED && + rune <= MAX_ASCII + ) { + currentState = + currentState.nextAscii[rune] || + this.step(currentState, rune, anchor, context); + } else { + currentState = this.step(currentState, rune, anchor, context); + } + + if (currentState === null) return null; + + // After consuming: check if new state is a match + if (currentState.isMatch) { + if (anchor === ANCHOR_BOTH) { + if (i + width === endPos) return true; + } else { + return true; + } + } + + if (currentState.nfaStates.length === 0) { + if (anchor !== UNANCHORED) return false; + } + + prevRune = rune; + i += width; + } + + // After the loop: check EMPTY_WIDTH at end of text. + // For all anchor modes, a resolved MATCH here means the pattern succeeded: + // UNANCHORED/ANCHOR_START accept any match; ANCHOR_BOTH accepts it because + // we have consumed the entire input up to endPos. 
+ if (currentState.hasEmptyWidth) { + const endContext = emptyOpContext(prevRune, -1); + const { isMatch } = this.resolveEmptyWidth( + currentState.nfaStates, + endContext, + ); + if (isMatch) return true; + } + + return false; + } +} + +export { DFA }; diff --git a/packages/re2/src/Inst.ts b/packages/re2/src/Inst.ts new file mode 100644 index 0000000..23c194f --- /dev/null +++ b/packages/re2/src/Inst.ts @@ -0,0 +1,90 @@ +import { FOLD_CASE } from "./RE2Flags.js"; +import { equalsIgnoreCase } from "./Unicode.js"; +/** + * A single instruction in the regular expression virtual machine. + * + * @see http://swtch.com/~rsc/regexp/regexp2.html + */ +class Inst { + static ALT = 1; + static ALT_MATCH = 2; + static CAPTURE = 3; + static EMPTY_WIDTH = 4; + static FAIL = 5; + static MATCH = 6; + static NOP = 7; + static RUNE = 8; + static RUNE1 = 9; + static RUNE_ANY = 10; + static RUNE_ANY_NOT_NL = 11; + + op: number; + out: number; + arg: number; + runes: number[]; + + static isRuneOp(op: number): boolean { + return Inst.RUNE <= op && op <= Inst.RUNE_ANY_NOT_NL; + } + + constructor(op: number) { + this.op = op; + this.out = 0; // all but MATCH, FAIL + this.arg = 0; // ALT, ALT_MATCH, CAPTURE, EMPTY_WIDTH + // length==1 => exact match + // otherwise a list of [lo,hi] pairs. hi is *inclusive*. + this.runes = []; + } + + // MatchRune returns true if the instruction matches (and consumes) r. + // It should only be called when op is a rune op. + matchRune(r: number): boolean { + // Special case: single-rune slice is from literal string, not char + // class. + if (this.runes.length === 1) { + const r0 = this.runes[0]; + // If this pattern is case-insensitive, apply Unicode case folding to compare the two runes. + // Note that this may result in a case-folding loop when executed, + // so attempt to reduce the chance of that occurring + // by performing case folding on |r0| from the pattern rather than |r| from the input. 
+ if ((this.arg & FOLD_CASE) !== 0) { + return equalsIgnoreCase(r0, r); + } + return r === r0; + } + + const len = this.runes.length; + // If the array is exactly 2, 4, 6, or 8 items, DO NOT fall through to binary search + if (len === 2 || len === 4 || len === 6 || len === 8) { + for (let j = 0; j < len; j += 2) { + if (r < this.runes[j]) { + return false; + } + if (r <= this.runes[j + 1]) { + return true; + } + } + return false; // Stop here + } + + // Otherwise binary search. + let lo = 0; + let hi = (this.runes.length / 2) | 0; + while (lo < hi) { + const m = (lo + hi) >> 1; // native cpu instruction for "lo + (((hi - lo) / 2) | 0)" + const c = this.runes[2 * m]; + if (c <= r) { + if (r <= this.runes[2 * m + 1]) { + return true; + } + lo = m + 1; + } else { + hi = m; + } + } + + return false; + } +} + +export { Inst }; diff --git a/packages/re2/src/MachineInput.ts b/packages/re2/src/MachineInput.ts new file mode 100644 index 0000000..a4536cf --- /dev/null +++ b/packages/re2/src/MachineInput.ts @@ -0,0 +1,101 @@ +import { emptyOpContext } from "./Utils.js"; +import { + MAX_HIGH_SURROGATE, + MAX_LOW_SURROGATE, + MIN_HIGH_SURROGATE, + MIN_LOW_SURROGATE, + MIN_SUPPLEMENTARY_CODE_POINT, +} from "./Unicode.js"; +import type { Prefilter } from "./Prefilter.js"; +import type { RE2 } from "./RE2.js"; + +class MachineUTF16Input { + charSequence: string; + start: number; + end!: number; + + constructor(charSequence: string, start = 0, end = charSequence.length) { + this.charSequence = charSequence; + this.start = start; + this.end = end; + } + + static EOF(): number { + return -1 << 3; + } + + endPos(): number { + return this.end; + } + + hasString(prefilter: Prefilter, pos: number): boolean { + const idx = this.charSequence.indexOf(prefilter.str, this.start + pos); + return idx !== -1 && idx <= this.end - prefilter.str.length; + } + + step(pos: number): number { + pos += this.start; + if (pos >= this.end) { + return MachineUTF16Input.EOF(); + } + + const c1 = 
this.charSequence.charCodeAt(pos); + + // Fast path: standard BMP character (not a high surrogate) + if ( + c1 < MIN_HIGH_SURROGATE || + c1 > MAX_HIGH_SURROGATE || + pos + 1 >= this.end + ) { + return (c1 << 3) | 1; + } + + // Slow path: Calculate surrogate pair manually + const c2 = this.charSequence.charCodeAt(pos + 1); + if (c2 >= MIN_LOW_SURROGATE && c2 <= MAX_LOW_SURROGATE) { + const rune = + (c1 - MIN_HIGH_SURROGATE) * 0x400 + + (c2 - MIN_LOW_SURROGATE) + + MIN_SUPPLEMENTARY_CODE_POINT; + return (rune << 3) | 2; + } + + // Invalid surrogate pair fallback + return (c1 << 3) | 1; + } + + index(re2: RE2, pos: number): number { + pos += this.start; + const i = this.charSequence.indexOf(re2.prefix, pos); + return i < 0 ? i : i - pos; + } + + context(pos: number): number { + pos += this.start; + const r1: number | undefined = + pos > 0 && pos <= this.charSequence.length + ? this.charSequence.codePointAt(pos - 1) + : -1; + const r2: number | undefined = + pos < this.charSequence.length ? 
this.charSequence.codePointAt(pos) : -1; + + if (r1 === undefined || r2 === undefined) { + throw new Error("invalid state"); + } + return emptyOpContext(r1, r2); + } + + prefixLength(re2: RE2): number { + return re2.prefix.length; + } +} + +function fromUTF16( + charSequence: string, + start = 0, + end = charSequence.length, +): MachineUTF16Input { + return new MachineUTF16Input(charSequence, start, end); +} + +export { fromUTF16, MachineUTF16Input }; diff --git a/packages/re2/src/Parser.ts b/packages/re2/src/Parser.ts new file mode 100644 index 0000000..f15b7e4 --- /dev/null +++ b/packages/re2/src/Parser.ts @@ -0,0 +1,1743 @@ +import { + CLASS_NL, + DOT_NL, + FOLD_CASE, + LITERAL, + NON_GREEDY, + ONE_LINE, + UNICODE_GROUPS, + WAS_DOLLAR, +} from "./RE2Flags.js"; +import { + MAX_ASCII, + MAX_BMP, + MAX_FOLD, + MAX_RUNE, + MIN_FOLD, + simpleFold, +} from "./Unicode.js"; +import { UnicodeTables } from "./UnicodeTables.js"; +import { UnicodeRangeTable } from "./UnicodeRangeTable.js"; +import { getPerlGroups, getPosixGroups } from "./CharGroup.js"; +import { + unhex, + isalnum, + charCount, + stringToRunes, + runeToString, +} from "./Utils.js"; +import { codePointAtOrThrow } from "./__utils__/chars.js"; +import { CharClass } from "./CharClass.js"; +import { RE2JSSyntaxException } from "./exceptions.js"; +import { Regexp } from "./Regexp.js"; + +// StringIterator: a stream of runes with an opaque cursor, permitting +// rewinding. The units of the cursor are not specified beyond the +// fact that ASCII characters are single width. (Cursor positions +// could be UTF-8 byte indices, UTF-16 code indices or rune indices.) +// +// In particular, be careful with: +// - skip: only use this to advance over ASCII characters +// since these always have a width of 1. +// - skipString: only use this to advance over strings which are +// known to be at the current position, e.g. due to prior call to +// lookingAt(). +// Only use pop() to advance over possibly non-ASCII runes. 
+class StringIterator { + str: string; + position: number; + + constructor(str: string) { + this.str = str; + this.position = 0; + } + + // Returns the cursor position. Do not interpret the result! + pos(): number { + return this.position; + } + + // Resets the cursor position to a previous value returned by pos(). + rewindTo(pos: number): void { + this.position = pos; + } + + // Returns true unless the stream is exhausted. + more(): boolean { + return this.position < this.str.length; + } + + // Returns the rune at the cursor position. + // Precondition: |more()|. + peek(): number { + return codePointAtOrThrow(this.str, this.position); + } + + // Advances the cursor by |n| positions, which must be ASCII runes. + // + // (In practise, this is only ever used to skip over regexp + // metacharacters that are ASCII, so there is no numeric difference + // between indices into UTF-8 bytes, UTF-16 codes and runes.) + skip(n: number): void { + this.position += n; + } + + // Advances the cursor by the number of cursor positions in |s|. + skipString(s: string): void { + this.position += s.length; + } + + // Returns the rune at the cursor position, and advances the cursor + // past it. Precondition: |more()|. + pop(): number { + const r = codePointAtOrThrow(this.str, this.position); + this.position += charCount(r); + return r; + } + + lookingAt(s: string): boolean { + return this.str.startsWith(s, this.position); + } + + // Returns the rest of the pattern from the current position. + rest(): string { + return this.str.substring(this.position); + } + + // Returns the substring from |beforePos| to the current position. + // |beforePos| must have been previously returned by |pos()|. + from(beforePos: number): string { + return this.str.substring(beforePos, this.position); + } + + toString(): string { + return this.rest(); + } +} + +/** + * A parser of regular expression patterns. + * + * The only public entry point is {@link #parse(String pattern, int flags)}. 
+ */ +class Parser { + // Parse errors + static ERR_INVALID_CHAR_RANGE = "invalid character class range"; + static ERR_INVALID_ESCAPE = "invalid escape sequence"; + static ERR_INVALID_NAMED_CAPTURE = "invalid named capture"; + static ERR_INVALID_PERL_OP = "invalid or unsupported Perl syntax"; + static ERR_INVALID_REPEAT_OP = "invalid nested repetition operator"; + static ERR_INVALID_REPEAT_SIZE = "invalid repeat count"; + static ERR_MISSING_BRACKET = "missing closing ]"; + static ERR_MISSING_PAREN = "missing closing )"; + static ERR_MISSING_REPEAT_ARGUMENT = + "missing argument to repetition operator"; + static ERR_TRAILING_BACKSLASH = "trailing backslash at end of expression"; + static ERR_DUPLICATE_NAMED_CAPTURE = "duplicate capture group name"; + static ERR_UNEXPECTED_PAREN = "unexpected )"; + static ERR_NESTING_DEPTH = "expression nests too deeply"; + static ERR_LARGE = "expression too large"; + static ERR_BAD_EXPRESSION = "expression not valid"; + + // maxHeight is the maximum height of a regexp parse tree. + // It is somewhat arbitrarily chosen, but the idea is to be large enough + // that no one will actually hit in real use but at the same time small enough + // that recursion on the Regexp tree will not hit the 1GB Go stack limit. + // The maximum amount of stack for a single recursive frame is probably + // closer to 1kB, so this could potentially be raised, but it seems unlikely + // that people have regexps nested even this deeply. + // We ran a test on Google's C++ code base and turned up only + // a single use case with depth > 100; it had depth 128. + // Using depth 1000 should be plenty of margin. + // As an optimization, we don't even bother calculating heights + // until we've allocated at least maxHeight Regexp structures. + static MAX_HEIGHT = 1000; + + // maxSize is the maximum size of a compiled regexp in Insts. 
+ // It too is somewhat arbitrarily chosen, but the idea is to be large enough + // to allow significant regexps while at the same time small enough that + // the compiled form will not take up too much memory. + // 128 MB is enough for a 3.3 million Inst structures, which roughly + // corresponds to a 3.3 MB regexp. + static MAX_SIZE = 3355443; // 128 << 20 / (5 * 8) (instSize = byte, 2 uint32, slice is 5 64-bit words) + + // maxRunes is the maximum number of runes allowed in a regexp tree + // counting the runes in all the nodes. + // Ignoring character classes p.numRunes is always less than the length of the regexp. + // Character classes can make it much larger: each \pL adds 1292 runes. + // 128 MB is enough for 32M runes, which is over 26k \pL instances. + // Note that repetitions do not make copies of the rune slices, + // so \pL{1000} is only one rune slice, not 1000. + // We could keep a cache of character classes we've seen, + // so that all the \pL we see use the same rune list, + // but that doesn't remove the problem entirely: + // consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()]. + // And because the Rune slice is exposed directly in the Regexp, + // there is not an opportunity to change the representation to allow + // partial sharing between different character classes. + // So the limit is the best we can do. + static MAX_RUNES = 33554432; // 128 << 20 / 4 (runeSize, int32 is 4 bytes) + + // RangeTables are represented as int[][], a list of triples (start, end, + // stride). + static ANY_TABLE = new UnicodeRangeTable(new Uint32Array([0, MAX_RUNE, 1])); + + // Ascii tables + static ASCII_TABLE = new UnicodeRangeTable(new Uint32Array([0, 0x7f, 1])); + static ASCII_FOLD_TABLE = new UnicodeRangeTable( + new Uint32Array([ + 0, + 0x7f, + 1, + 0x017f, + 0x017f, + 1, // Old English long s (ſ), folds to S/s. + 0x212a, + 0x212a, + 1, // Kelvin K, folds to K/k. 
+ ]), + ); + + // unicodeTable() returns the Unicode RangeTable identified by name + // and the table of additional fold-equivalent code points. + // Returns null if |name| does not identify a Unicode character range. + static unicodeTable(name: string): { + tab: UnicodeRangeTable | null; + fold: UnicodeRangeTable | null; + sign: number; + } | null { + if (name === "Any") { + return { tab: Parser.ANY_TABLE, fold: Parser.ANY_TABLE, sign: +1 }; + } + if (name === "Ascii") { + return { + tab: Parser.ASCII_TABLE, + fold: Parser.ASCII_FOLD_TABLE, + sign: +1, + }; + } + if (name === "Assigned") { + // Assigned is the mathematical inversion of Cn (Unassigned) + return { + tab: UnicodeTables.CATEGORIES.get("Cn"), + fold: UnicodeTables.CATEGORIES.get("Cn"), + sign: -1, + }; + } + if (name === "Lc") { + return { + tab: UnicodeTables.CATEGORIES.get("LC"), + fold: UnicodeTables.FOLD_CATEGORIES.get("LC"), + sign: +1, + }; + } + if (UnicodeTables.CATEGORIES.has(name)) { + return { + tab: UnicodeTables.CATEGORIES.get(name), + fold: UnicodeTables.FOLD_CATEGORIES.get(name), + sign: +1, + }; + } + if (UnicodeTables.SCRIPTS.has(name)) { + return { + tab: UnicodeTables.SCRIPTS.get(name), + fold: UnicodeTables.FOLD_SCRIPT.get(name), + sign: +1, + }; + } + return null; + } + + // minFoldRune returns the minimum rune fold-equivalent to r. + static minFoldRune(r: number): number { + if (r < MIN_FOLD || r > MAX_FOLD) { + return r; + } + + let min = r; + const r0 = r; + for (r = simpleFold(r); r !== r0; r = simpleFold(r)) { + if (min > r) { + min = r; + } + } + return min; + } + + static literalRegexp(s: string, flags: number): Regexp { + const re = new Regexp(Regexp.Op.LITERAL); + re.flags = flags; + re.runes = stringToRunes(s) as number[]; + return re; + } + + /** + * Parse regular expression pattern {@code pattern} with mode flags {@code flags}. 
+ * @param {string} pattern + * @param {number} flags + */ + static parse(pattern: string, flags: number): Regexp { + return new Parser(pattern, flags).parseInternal(); + } + + // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}. + // If |t| is not of that form, it returns -1. + // If |t| has the right form but the values are negative or too big, + // it returns -2. + // On success, returns a nonnegative number encoding min/max in the + // high/low signed halfwords of the result. (Note: min >= 0; max may + // be -1.) + // + // On success, advances |t| beyond the repeat; otherwise |t.pos()| is + // undefined. + static parseRepeat(t: StringIterator): number { + const start = t.pos(); + if (!t.more() || !t.lookingAt("{")) { + return -1; + } + t.skip(1); + + const min = Parser.parseInt(t); + if (min === -1) { + return -1; + } + if (!t.more()) { + return -1; + } + + let max: number; + if (!t.lookingAt(",")) { + max = min; + } else { + t.skip(1); + if (!t.more()) { + return -1; + } + if (t.lookingAt("}")) { + max = -1; + } else { + max = Parser.parseInt(t); + if (max === -1) { + return -1; + } + } + } + + if (!t.more() || !t.lookingAt("}")) { + return -1; + } + t.skip(1); + if ( + min < 0 || + min > 1000 || + max === -2 || + max > 1000 || + (max >= 0 && min > max) + ) { + throw new RE2JSSyntaxException( + Parser.ERR_INVALID_REPEAT_SIZE, + t.from(start), + ); + } + + return (min << 16) | (max & MAX_BMP); + } + + // isValidCaptureName reports whether name + // is a valid capture name: [A-Za-z0-9_]+. + // PCRE limits names to 32 bytes. + // Python rejects names starting with digits. + // We don't enforce either of those. + static isValidCaptureName(name: string): boolean { + if (name.length === 0) { + return false; + } + + for (let i = 0; i < name.length; i++) { + const c = codePointAtOrThrow(name, i); + if (c !== 0x5f && !isalnum(c)) { + return false; + } + } + + return true; + } + + // parseInt parses a nonnegative decimal integer. + // -1 => bad format. 
// -2 => format ok, but integer overflow (more than 8 digits).
// Leading zeros on a multi-digit number count as bad format.
// |t| is advanced past every leading ASCII digit it consumed.
static parseInt(t: StringIterator): number {
  const begin = t.pos();
  const atDigit = (): boolean => {
    if (!t.more()) {
      return false;
    }
    return t.peek() >= 0x30 && t.peek() <= 0x39; // '0'..'9'
  };
  while (atDigit()) {
    t.skip(1);
  }

  const digits = t.from(begin);
  const empty = digits.length === 0;
  const leadingZero = digits.length > 1 && digits.codePointAt(0) === 0x30;
  if (empty || leadingZero) {
    return -1;
  }
  if (digits.length > 8) {
    return -2;
  }
  return parseInt(digits, 10);
}

// isCharClass reports whether re can be represented as a character class:
// a single-rune literal, a char class, '.', or '.|\n'.
static isCharClass(re: Regexp): boolean {
  switch (re.op) {
    case Regexp.Op.LITERAL:
      return re.runes.length === 1;
    case Regexp.Op.CHAR_CLASS:
    case Regexp.Op.ANY_CHAR_NOT_NL:
    case Regexp.Op.ANY_CHAR:
      return true;
    default:
      return false;
  }
}

// matchRune reports whether the single-character regexp re matches rune r.
// Ops other than LITERAL, CHAR_CLASS, ANY_CHAR_NOT_NL and ANY_CHAR never match.
static matchRune(re: Regexp, r: number): boolean {
  if (re.op === Regexp.Op.LITERAL) {
    return re.runes.length === 1 && re.runes[0] === r;
  }
  if (re.op === Regexp.Op.CHAR_CLASS) {
    // runes holds inclusive [lo, hi] pairs.
    for (let lo = 0; lo < re.runes.length; lo += 2) {
      if (re.runes[lo] <= r && r <= re.runes[lo + 1]) {
        return true;
      }
    }
    return false;
  }
  if (re.op === Regexp.Op.ANY_CHAR_NOT_NL) {
    return r !== 0x0a; // anything but '\n'
  }
  return re.op === Regexp.Op.ANY_CHAR;
}

// mergeCharClass makes dst = dst|src.
// The caller must ensure that dst.Op >= src.Op,
// to reduce the amount of copying.
static mergeCharClass(dst: Regexp, src: Regexp): void {
  switch (dst.op) {
    case Regexp.Op.ANY_CHAR:
      // Already matches everything; nothing to add.
      break;
    case Regexp.Op.ANY_CHAR_NOT_NL:
      // Only '\n' is missing; widen if src supplies it.
      if (Parser.matchRune(src, 0x0a)) {
        dst.op = Regexp.Op.ANY_CHAR;
      }
      break;
    case Regexp.Op.CHAR_CLASS:
      if (src.op === Regexp.Op.LITERAL) {
        dst.runes = new CharClass(dst.runes)
          .appendLiteral(src.runes[0], src.flags)
          .toArray();
      } else {
        dst.runes = new CharClass(dst.runes).appendClass(src.runes).toArray();
      }
      break;
    case Regexp.Op.LITERAL:
      // Identical literals merge to themselves.
      if (src.runes[0] === dst.runes[0] && src.flags === dst.flags) {
        break;
      }
      dst.op = Regexp.Op.CHAR_CLASS;
      dst.runes = new CharClass()
        .appendLiteral(dst.runes[0], dst.flags)
        .appendLiteral(src.runes[0], src.flags)
        .toArray();
      break;
  }
}

// parseEscape parses an escape sequence at the current position of t
// and returns the rune.
// Pre: t at '\\'. Post: after escape.
// Throws RE2JSSyntaxException for a trailing backslash or an escape
// that is not recognised.
static parseEscape(t: StringIterator): number {
  const startPos = t.pos();
  t.skip(1); // '\\'
  if (!t.more()) {
    throw new RE2JSSyntaxException(Parser.ERR_TRAILING_BACKSLASH);
  }
  let c = t.pop();
  switch (c) {
    // Octal escapes: '0'..'7'.
    case 0x31:
    case 0x32:
    case 0x33:
    case 0x34:
    case 0x35:
    case 0x36:
    case 0x30:
    case 0x37: {
      // A lone non-zero digit (not followed by another octal digit)
      // is rejected; only \0 may stand alone.
      if (c !== 0x30 && (!t.more() || t.peek() < 0x30 || t.peek() > 0x37)) {
        break;
      }
      // Consume up to two more octal digits.
      let r = c - 0x30;
      for (let i = 1; i < 3; i++) {
        if (!t.more() || t.peek() < 0x30 || t.peek() > 0x37) {
          break;
        }
        r = r * 8 + t.peek() - 0x30;
        t.skip(1);
      }
      return r;
    }
    // Hexadecimal escapes: \xHH or \x{H...}.
    case 0x78: {
      if (!t.more()) {
        break;
      }
      c = t.pop();
      if (c === 0x7b) {
        // '{': any number of hex digits up to the closing '}',
        // as long as the value stays <= MAX_RUNE.
        let nhex = 0;
        let r = 0;

        while (true) {
          if (!t.more()) {
            throw new RE2JSSyntaxException(
              Parser.ERR_INVALID_ESCAPE,
              t.from(startPos),
            );
          }
          c = t.pop();
          if (c === 0x7d) {
            break;
          }
          const v = unhex(c);
          if (v < 0) {
            throw new RE2JSSyntaxException(
              Parser.ERR_INVALID_ESCAPE,
              t.from(startPos),
            );
          }
          r = r * 16 + v;
          if (r > MAX_RUNE) {
            throw new RE2JSSyntaxException(
              Parser.ERR_INVALID_ESCAPE,
              t.from(startPos),
            );
          }
          nhex++;
        }
        if (nhex === 0) {
          break;
        }
        return r;
      }
      // Exactly two hex digits.
      const x = unhex(c);
      if (!t.more()) {
        break;
      }
      c = t.pop();
      const y = unhex(c);
      if (x < 0 || y < 0) {
        break;
      }
      return x * 16 + y;
    }
    // C-style single-letter escapes.
    case 0x61: // 'a'
      return 0x07;
    case 0x66: // 'f'
      return 0x0c;
    case 0x6e: // 'n'
      return 0x0a;
    case 0x72: // 'r'
      return 0x0d;
    case 0x74: // 't'
      return 0x09;
    case 0x76: // 'v'
      return 0x0b;
    default:
      // Escaped ASCII punctuation stands for itself.
      if (c <= MAX_ASCII && !isalnum(c)) {
        return c;
      }
      break;
  }
  throw new RE2JSSyntaxException(Parser.ERR_INVALID_ESCAPE, t.from(startPos));
}

// parseClassChar parses a character class character and returns it.
// wholeClassPos is the position of the start of the entire class "[...".
// Pre: t at class char; Post: t after it.
// Throws RE2JSSyntaxException(ERR_MISSING_BRACKET) at end of input.
static parseClassChar(t: StringIterator, wholeClassPos: number): number {
  if (!t.more()) {
    throw new RE2JSSyntaxException(
      Parser.ERR_MISSING_BRACKET,
      t.from(wholeClassPos),
    );
  }
  if (t.lookingAt("\\")) {
    return Parser.parseEscape(t);
  }
  return t.pop();
}

// concatRunes returns a new array holding the runes of x followed by
// the runes of y; neither input is modified.
static concatRunes(x: number[], y: number[]): number[] {
  return [...x, ...y];
}

wholeRegexp: string;
flags: number;
numCap: number;
// Capture-group name -> capture index.
namedGroups: Map<string, number>;
stack: Regexp[];
free: Regexp | null;
numRegexp: number;
numRunes: number;
repeats: number;
// Per-node caches; null until the corresponding limit check starts tracking.
height: Map<Regexp, number> | null;
size: Map<Regexp, number> | null;

constructor(wholeRegexp: string, flags = 0) {
  this.wholeRegexp = wholeRegexp;
  // Flags control the behavior of the parser and record information about
  // regexp context.
  this.flags = flags;
  // number of capturing groups seen
  this.numCap = 0;
  this.namedGroups = new Map();
  // Stack of parsed expressions.
  this.stack = [];
  this.free = null;
  // checks
  this.numRegexp = 0; // number of regexps allocated
  this.numRunes = 0; // number of runes in char classes
  this.repeats = 0; // product of all repetitions seen
  this.height = null; // regexp height, for height limit check
  this.size = null; // regexp compiled size, for size limit check
}

// Allocate a Regexp, from the free list if possible.
// Only a free-list head that still has a subs[0] slot is recycled;
// otherwise a fresh node is created and counted in numRegexp.
newRegexp(op: number): Regexp {
  let re = this.free;
  if (re !== null && re.subs !== null && re.subs.length > 0) {
    this.free = re.subs[0];
    re.reinit();
    re.op = op;
  } else {
    re = new Regexp(op);
    this.numRegexp += 1;
  }
  return re;
}

// reuse puts re at the head of the free list and drops its cached height.
reuse(re: Regexp): void {
  if (this.height !== null) {
    this.height.delete(re);
  }
  if (re.subs !== null && re.subs.length > 0) {
    // subs[0] doubles as the free-list next pointer while re is on the list.
    re.subs[0] = this.free as Regexp;
  }
  this.free = re;
}

// checkLimits throws RE2JSSyntaxException if adding re pushed the parse
// over the rune-count, size, or nesting-height limits.
checkLimits(re: Regexp): void {
  if (this.numRunes > Parser.MAX_RUNES) {
    throw new RE2JSSyntaxException(Parser.ERR_LARGE);
  }
  this.checkSize(re);
  this.checkHeight(re);
}

// checkSize throws RE2JSSyntaxException(ERR_LARGE) when the estimated
// size of re exceeds Parser.MAX_SIZE.
checkSize(re: Regexp): void {
  if (this.size === null) {
    // We haven't started tracking size yet.
    // Do a relatively cheap check to see if we need to start.
    // Maintain the product of all the repeats we've seen
    // and don't track if the total number of regexp nodes
    // we've seen times the repeat product is in budget.
    if (this.repeats === 0) {
      this.repeats = 1;
    }
    if (re.op === Regexp.Op.REPEAT) {
      let n = re.max;
      if (n === -1) {
        n = re.min;
      }
      if (n <= 0) {
        n = 1;
      }
      // Saturate instead of overflowing the product.
      if (n > Parser.MAX_SIZE / this.repeats) {
        this.repeats = Parser.MAX_SIZE;
      } else {
        this.repeats *= n;
      }
    }
    if (this.numRegexp < Parser.MAX_SIZE / this.repeats) {
      return;
    }

    // We need to start tracking size.
    // Make the map and belatedly populate it
    // with info about everything we've constructed so far.
    this.size = new Map();
    for (let reEx of this.stack) {
      this.checkSize(reEx);
    }
  }

  if (this.calcSize(re, true) > Parser.MAX_SIZE) {
    throw new RE2JSSyntaxException(Parser.ERR_LARGE);
  }
}

// calcSize returns the estimated size of re (always >= 1).
// Unless force is set, a value cached in this.size is returned as-is;
// the computed value is stored in this.size when tracking is active.
calcSize(re: Regexp, force = false): number {
  if (!force && this.size !== null) {
    const cached = this.size.get(re);
    if (cached !== undefined) {
      return cached;
    }
  }

  let size = 0;
  switch (re.op) {
    case Regexp.Op.LITERAL: {
      size = re.runes.length;
      break;
    }
    case Regexp.Op.CAPTURE:
    case Regexp.Op.STAR: {
      // star can be 1+ or 2+; assume 2 pessimistically
      size = 2 + this.calcSize(re.subs[0]);
      break;
    }
    case Regexp.Op.PLUS:
    case Regexp.Op.QUEST: {
      size = 1 + this.calcSize(re.subs[0]);
      break;
    }
    case Regexp.Op.CONCAT: {
      for (let sub of re.subs) {
        size = size + this.calcSize(sub);
      }
      break;
    }
    case Regexp.Op.ALTERNATE: {
      for (let sub of re.subs) {
        size = size + this.calcSize(sub);
      }
      if (re.subs.length > 1) {
        size = size + re.subs.length - 1;
      }
      break;
    }
    case Regexp.Op.REPEAT: {
      let sub = this.calcSize(re.subs[0]);
      if (re.max === -1) {
        if (re.min === 0) {
          size = 2 + sub; // x*
        } else {
          size = 1 + re.min * sub; // xxx+
        }
        break;
      }
      // x{2,5} = xx(x(x(x)?)?)?
      size = re.max * sub + (re.max - re.min);
      break;
    }
  }

  size = Math.max(1, size);
  this.size?.set(re, size);
  return size;
}

// checkHeight throws RE2JSSyntaxException(ERR_NESTING_DEPTH) when the
// height of re exceeds Parser.MAX_HEIGHT. Nothing is computed until at
// least MAX_HEIGHT nodes have been allocated.
checkHeight(re: Regexp): void {
  if (this.numRegexp < Parser.MAX_HEIGHT) {
    return;
  }
  if (this.height === null) {
    // Start tracking: populate the cache from everything on the stack.
    this.height = new Map();
    for (let reEx of this.stack) {
      this.checkHeight(reEx);
    }
  }
  if (this.calcHeight(re, true) > Parser.MAX_HEIGHT) {
    throw new RE2JSSyntaxException(Parser.ERR_NESTING_DEPTH);
  }
}

// calcHeight returns the height of the tree rooted at re (a leaf is 1).
// Unless force is set, a value cached in this.height is returned as-is.
calcHeight(re: Regexp, force = false): number {
  if (!force && this.height !== null) {
    const cached = this.height.get(re);
    if (cached !== undefined) {
      return cached;
    }
  }
  let h = 1;
  for (let sub of re.subs) {
    const hsub = this.calcHeight(sub);
    if (h < 1 + hsub) {
      h = 1 + hsub;
    }
  }
  this.height?.set(re, h);
  return h;
}

// Parse stack manipulation.

// pop removes and returns the top of the parse stack
// (undefined when the stack is empty).
pop(): Regexp | undefined {
  return this.stack.pop();
}

// popToPseudo removes and returns, in stack order, every element above
// the topmost pseudo-op (or the whole stack if there is none).
popToPseudo(): Regexp[] {
  const n = this.stack.length;
  let i = n;
  while (i > 0 && !Regexp.isPseudoOp(this.stack[i - 1].op)) {
    i--;
  }

  const r = this.stack.slice(i, n);
  this.stack = this.stack.slice(0, i);
  return r;
}

// push pushes the regexp re onto the parse stack and returns the regexp.
// Returns null for a CHAR_CLASS that can be merged with the top-of-stack.
push(re: Regexp): Regexp | null {
  const runes = re.runes;
  this.numRunes += runes.length;

  const isClass = re.op === Regexp.Op.CHAR_CLASS;

  // [x]: a class holding exactly one rune.
  const isSingleRune =
    isClass && runes.length === 2 && runes[0] === runes[1];

  // [Aa] or [Δδ]: exactly two runes that fold to each other, stored
  // either as two one-rune ranges or as one range of two adjacent runes.
  const isFoldPair = (): boolean => {
    if (!isClass) {
      return false;
    }
    if (runes.length === 4) {
      return (
        runes[0] === runes[1] &&
        runes[2] === runes[3] &&
        simpleFold(runes[0]) === runes[2] &&
        simpleFold(runes[2]) === runes[0]
      );
    }
    if (runes.length === 2) {
      return (
        runes[0] + 1 === runes[1] &&
        simpleFold(runes[0]) === runes[1] &&
        simpleFold(runes[1]) === runes[0]
      );
    }
    return false;
  };

  // Flags for the literal this class collapses to, or null if it stays a class.
  let literalFlags: number | null = null;
  if (isSingleRune) {
    literalFlags = this.flags & ~FOLD_CASE;
  } else if (isFoldPair()) {
    literalFlags = this.flags | FOLD_CASE;
  }

  if (literalFlags === null) {
    // Incremental concatenation.
    this.maybeConcat(-1, 0);
  } else {
    const rune = runes[0];
    if (this.maybeConcat(rune, literalFlags)) {
      // The rune was absorbed by the literal already on the stack.
      return null;
    }
    // Rewrite as a literal.
    re.op = Regexp.Op.LITERAL;
    re.runes = [rune];
    re.flags = literalFlags;
  }

  this.stack.push(re);
  this.checkLimits(re);
  return re;
}

// maybeConcat implements incremental concatenation
// of literal runes into string nodes. The parser calls this
// before each push, so only the top fragment of the stack
// might need processing. Since this is called before a push,
// the topmost literal is no longer subject to operators like *
// (Otherwise ab* would turn into (ab)*.)
// If r >= 0 and there's a node left over, maybeConcat uses it
// to push r with the given flags.
// maybeConcat reports whether r was pushed.
maybeConcat(r: number, flags: number): boolean {
  const depth = this.stack.length;
  if (depth < 2) {
    return false;
  }

  const top = this.stack[depth - 1];
  const below = this.stack[depth - 2];
  const bothLiterals =
    top.op === Regexp.Op.LITERAL && below.op === Regexp.Op.LITERAL;
  if (
    !bothLiterals ||
    (top.flags & FOLD_CASE) !== (below.flags & FOLD_CASE)
  ) {
    return false;
  }

  // Fold the top literal into the one beneath it.
  below.runes = Parser.concatRunes(below.runes, top.runes);

  if (r < 0) {
    // Nothing to push: drop the emptied node and recycle it.
    this.pop();
    this.reuse(top);
    return false;
  }

  // Recycle the emptied top node in place to carry r.
  top.runes = [r];
  top.flags = flags;
  return true;
}

// newLiteral returns a new LITERAL Regexp for rune r with the given flags.
// Under FOLD_CASE the rune is first passed through Parser.minFoldRune.
newLiteral(r: number, flags: number): Regexp {
  const node = this.newRegexp(Regexp.Op.LITERAL);
  node.flags = flags;
  const caseFolded = (flags & FOLD_CASE) !== 0;
  node.runes = [caseFolded ? Parser.minFoldRune(r) : r];
  return node;
}

// literal pushes a literal regexp for the rune r, using the parser's
// current flags, on the stack. It returns nothing.
literal(r: number): void {
  const node = this.newLiteral(r, this.flags);
  this.push(node);
}

// op pushes a regexp with the given op and the parser's current flags
// onto the stack and returns whatever push returns.
op(op: number): Regexp | null {
  const node = this.newRegexp(op);
  node.flags = this.flags;
  return this.push(node);
}

// repeat replaces the top stack element with itself repeated according to
// op, min, max. beforePos is the start position of the repetition operator.
// Pre: t is positioned after the initial repetition operator.
// Post: t advances past an optional perl-mode '?', or stays put.
// Or, it fails with RE2JSSyntaxException.
repeat(
  op: number,
  min: number,
  max: number,
  beforePos: number,
  t: StringIterator,
  lastRepeatPos: number,
): void {
  // A '?' directly after the operator toggles NON_GREEDY for this node only.
  const hasToggle = t.more() && t.lookingAt("?");
  if (hasToggle) {
    t.skip(1);
  }
  const nodeFlags = hasToggle ? this.flags ^ NON_GREEDY : this.flags;

  // Any lastRepeatPos other than -1 is rejected as an invalid repeat operator.
  if (lastRepeatPos !== -1) {
    throw new RE2JSSyntaxException(
      Parser.ERR_INVALID_REPEAT_OP,
      t.from(lastRepeatPos),
    );
  }

  // The operand must exist and must be a real regexp, not a pseudo-op.
  const topIndex = this.stack.length - 1;
  const operand = topIndex >= 0 ? this.stack[topIndex] : undefined;
  if (operand === undefined || Regexp.isPseudoOp(operand.op)) {
    throw new RE2JSSyntaxException(
      Parser.ERR_MISSING_REPEAT_ARGUMENT,
      t.from(beforePos),
    );
  }

  // Wrap the operand in place.
  const node = this.newRegexp(op);
  node.min = min;
  node.max = max;
  node.flags = nodeFlags;
  node.subs = [operand];
  this.stack[topIndex] = node;

  this.checkLimits(node);

  // Only counted repeats with a bound >= 2 need the nested-repeat check.
  const needsNestedCheck = op === Regexp.Op.REPEAT && (min >= 2 || max >= 2);
  if (needsNestedCheck && !this.repeatIsValid(node, 1000)) {
    throw new RE2JSSyntaxException(
      Parser.ERR_INVALID_REPEAT_SIZE,
      t.from(beforePos),
    );
  }
}

// repeatIsValid reports whether the repetition re is valid.
// Valid means that the combination of the top-level repetition
// and any inner repetitions does not exceed n copies of the
// innermost thing.
// This function rewalks the regexp tree and is called for every repetition,
// so we have to worry about inducing quadratic behavior in the parser.
// We avoid this by only calling repeatIsValid when min or max >= 2.
// In that case the depth of any >= 2 nesting can only get to 9 without
// triggering a parse error, so each subtree can only be rewalked 9 times.
repeatIsValid(re: Regexp, n: number): boolean {
  let budget = n;
  if (re.op === Regexp.Op.REPEAT) {
    // x{0} / x{0,0} makes no copies, so whatever is nested inside is fine.
    if (re.max === 0) {
      return true;
    }
    // Unbounded repeats are charged by their minimum.
    const copies = re.max < 0 ? re.min : re.max;
    if (copies > budget) {
      return false;
    }
    if (copies > 0) {
      budget = Math.trunc(budget / copies);
    }
  }

  return re.subs.every((sub) => this.repeatIsValid(sub, budget));
}

// concat replaces the top of the stack (above the topmost '|' or '(') with
// its concatenation; an empty run becomes EMPTY_MATCH.
concat(): Regexp | null {
  this.maybeConcat(-1, 0);
  const parts = this.popToPseudo();
  const node =
    parts.length === 0
      ? this.newRegexp(Regexp.Op.EMPTY_MATCH)
      : this.collapse(parts, Regexp.Op.CONCAT);
  return this.push(node);
}

// alternate replaces the top of the stack (above the topmost '(') with its
// alternation.
alternate(): Regexp | null {
  // Everything above the pseudo-operator '(' is a branch;
  // there are no '|' above '('.
  const branches = this.popToPseudo();

  // Empty alternate is a special case
  // (shouldn't happen but easy to handle).
  if (branches.length === 0) {
    return this.push(this.newRegexp(Regexp.Op.NO_MATCH));
  }

  // Make sure the top class is clean.
  // All the others already are (see swapVerticalBar).
  this.cleanAlt(branches[branches.length - 1]);
  return this.push(this.collapse(branches, Regexp.Op.ALTERNATE));
}

// cleanAlt cleans re for eventual inclusion in an alternation:
// a CHAR_CLASS is normalised and, when it covers every rune or every
// rune except '\n', replaced by ANY_CHAR or ANY_CHAR_NOT_NL.
cleanAlt(re: Regexp): void {
  if (re.op !== Regexp.Op.CHAR_CLASS) {
    return;
  }

  const cleaned = new CharClass(re.runes).cleanClass().toArray();
  re.runes = cleaned;

  const coversAll =
    cleaned.length === 2 && cleaned[0] === 0 && cleaned[1] === MAX_RUNE;
  if (coversAll) {
    re.runes = [];
    re.op = Regexp.Op.ANY_CHAR;
    return;
  }

  const coversAllButNewline =
    cleaned.length === 4 &&
    cleaned[0] === 0 &&
    cleaned[1] === 0x0a - 1 &&
    cleaned[2] === 0x0a + 1 &&
    cleaned[3] === MAX_RUNE;
  if (coversAllButNewline) {
    re.runes = [];
    re.op = Regexp.Op.ANY_CHAR_NOT_NL;
  }
}

// collapse returns the result of applying op to subs[start:end].
// If sub contains op nodes, they all get hoisted up
// so that there is never a concat of a concat or an
// alternate of an alternate.
// A single-element subs is returned as-is without allocating a node.
collapse(subs: Regexp[], op: number): Regexp {
  if (subs.length === 1) {
    return subs[0];
  }
  // Concatenate subs iff op is same.
  // Compute length in first pass.
  let len = 0;
  for (let sub of subs) {
    len += sub.op === op ? sub.subs.length : 1;
  }
  let newsubs = new Array(len).fill(null);
  let i = 0;
  for (let sub of subs) {
    if (sub.op === op) {
      // Hoist the children and recycle the now-empty wrapper.
      for (let j = 0; j < sub.subs.length; j++) {
        newsubs[i++] = sub.subs[j];
      }
      this.reuse(sub);
    } else {
      newsubs[i++] = sub;
    }
  }

  let re = this.newRegexp(op);
  re.subs = newsubs;
  if (op === Regexp.Op.ALTERNATE) {
    // An alternation of one thing is just that thing.
    if (re.subs.length === 1) {
      const old = re;
      re = re.subs[0];
      this.reuse(old);
    }
  }
  return re;
}

// parseInternal parses this.wholeRegexp under this.flags and returns the
// resulting Regexp with namedGroups attached.
// With the LITERAL flag the whole pattern is treated as a literal string.
// Throws RE2JSSyntaxException on a syntax error, including
// ERR_MISSING_PAREN when the stack does not reduce to a single node.
parseInternal(): Regexp {
  if ((this.flags & LITERAL) !== 0) {
    // Trivial parser for literal string.
    return Parser.literalRegexp(this.wholeRegexp, this.flags);
  }
  // Otherwise, must do real work.
  // lastRepeatPos is the position of the repetition operator handled in
  // the previous iteration, or -1 if the previous token was not one.
  let lastRepeatPos = -1;
  let min = -1;
  let max = -1;
  const t = new StringIterator(this.wholeRegexp);
  while (t.more()) {
    {
      let repeatPos = -1;
      switch (t.peek()) {
        case 0x28: // '('
          if (t.lookingAt("(?")) {
            // Flag changes and non-capturing groups.
            this.parsePerlFlags(t);
            break;
          }
          const lparen = this.op(Regexp.Op.LEFT_PAREN);
          if (lparen === null) {
            throw new Error("op(LEFT_PAREN) unexpectedly returned null");
          }
          lparen.cap = ++this.numCap;
          t.skip(1); // '('
          break;
        case 0x7c:
          this.parseVerticalBar(); // '|'
          t.skip(1); // '|'
          break;
        case 0x29:
          this.parseRightParen();
          t.skip(1); // ')'
          break;
        case 0x5e:
          if ((this.flags & ONE_LINE) !== 0) {
            this.op(Regexp.Op.BEGIN_TEXT);
          } else {
            this.op(Regexp.Op.BEGIN_LINE);
          }
          t.skip(1); // '^'
          break;
        case 0x24:
          if ((this.flags & ONE_LINE) !== 0) {
            const endText = this.op(Regexp.Op.END_TEXT);
            if (endText === null) {
              throw new Error("op(END_TEXT) unexpectedly returned null");
            }
            // Mark that this END_TEXT came from '$' (as opposed to \z below).
            endText.flags |= WAS_DOLLAR;
          } else {
            this.op(Regexp.Op.END_LINE);
          }
          t.skip(1); // '$'
          break;
        case 0x2e:
          if ((this.flags & DOT_NL) !== 0) {
            this.op(Regexp.Op.ANY_CHAR);
          } else {
            this.op(Regexp.Op.ANY_CHAR_NOT_NL);
          }
          t.skip(1); // '.'
          break;
        case 0x5b: // '['
          this.parseClass(t);
          break;
        case 0x2a: // '*'
        case 0x2b: // '+'
        case 0x3f: { // '?'
          repeatPos = t.pos();
          let op: number | null = null;
          switch (t.pop()) {
            case 0x2a:
              op = Regexp.Op.STAR;
              break;
            case 0x2b:
              op = Regexp.Op.PLUS;
              break;
            case 0x3f:
              op = Regexp.Op.QUEST;
              break;
          }
          if (op === null) {
            throw new Error("repeat op unexpectedly null");
          }
          this.repeat(op, min, max, repeatPos, t, lastRepeatPos);
          // (min and max are now dead.)
          break;
        }

        case 0x7b: { // '{'
          repeatPos = t.pos();
          const minMax = Parser.parseRepeat(t);
          if (minMax < 0) {
            // If the repeat cannot be parsed, { is a literal.
            t.rewindTo(repeatPos);
            this.literal(t.pop()); // '{'
            break;
          }
          // Unpack min from the high bits; sign-extend the low 16 bits so
          // that an encoded max of -1 comes back as -1.
          min = minMax >> 16;
          max = ((minMax & MAX_BMP) << 16) >> 16;
          this.repeat(
            Regexp.Op.REPEAT,
            min,
            max,
            repeatPos,
            t,
            lastRepeatPos,
          );
          break;
        }

        case 0x5c: { // '\\'
          const savedPos = t.pos();
          t.skip(1); // '\\'
          // handled is set when the escape was fully consumed by the
          // switch below; otherwise t is rewound to the backslash.
          let handled = false;
          if (t.more()) {
            const c = t.pop();
            switch (c) {
              case 0x41: // \A
                this.op(Regexp.Op.BEGIN_TEXT);
                handled = true;
                break;
              case 0x62: // \b
                this.op(Regexp.Op.WORD_BOUNDARY);
                handled = true;
                break;
              case 0x42: // \B
                this.op(Regexp.Op.NO_WORD_BOUNDARY);
                handled = true;
                break;
              case 0x43: // \C
                // any byte; not supported
                throw new RE2JSSyntaxException(
                  Parser.ERR_INVALID_ESCAPE,
                  "\\C",
                );
              case 0x51: { // \Q
                // \Q ... \E: the ... is always literals
                let lit = t.rest();
                const i = lit.indexOf("\\E");
                if (i >= 0) {
                  lit = lit.substring(0, i);
                  t.skipString(lit);
                  t.skipString("\\E");
                } else {
                  // No closing \E: the rest of the pattern is literal.
                  t.skipString(lit);
                }

                // Push one literal per code point.
                let j = 0;
                while (j < lit.length) {
                  const codepoint = codePointAtOrThrow(lit, j);
                  this.literal(codepoint);
                  j += charCount(codepoint);
                }
                handled = true;
                break;
              }

              case 0x7a: // \z
                this.op(Regexp.Op.END_TEXT);
                handled = true;
                break;
              default:
                t.rewindTo(savedPos);
                break;
            }
          } else {
            t.rewindTo(savedPos);
          }
          if (handled) break;

          const re = this.newRegexp(Regexp.Op.CHAR_CLASS);
          re.flags = this.flags;
          // Look for Unicode character group like \p{Han}
          if (t.lookingAt("\\p") || t.lookingAt("\\P")) {
            const cc = new CharClass();
            if (this.parseUnicodeClass(t, cc)) {
              re.runes = cc.toArray();
              this.push(re);
              break;
            }
          }
          // Perl character class escape.
          const cc = new CharClass();
          if (this.parsePerlClassEscape(t, cc)) {
            re.runes = cc.toArray();
            this.push(re);
            break;
          }
          // Not a class escape: give the unused node back and start over
          // from the backslash.
          t.rewindTo(savedPos);
          this.reuse(re);
          // Ordinary single-character escape.
          this.literal(Parser.parseEscape(t));
          break;
        }
        default:
          this.literal(t.pop());
          break;
      }
      lastRepeatPos = repeatPos;
    }
  }

  // End of input: finish the pending concatenation and alternation.
  this.concat();
  if (this.swapVerticalBar()) {
    this.pop(); // pop vertical bar
  }
  this.alternate();
  const n = this.stack.length;
  if (n !== 1) {
    throw new RE2JSSyntaxException(
      Parser.ERR_MISSING_PAREN,
      this.wholeRegexp,
    );
  }
  this.stack[0].namedGroups = this.namedGroups;
  return this.stack[0];
}

// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
// like (?i) or (?: or (?i:.
// Pre: t at "(?". Post: t after ")".
// Sets numCap.
// Throws RE2JSSyntaxException for a malformed or duplicate named capture
// and for an unrecognised flag sequence.
parsePerlFlags(t: StringIterator): void {
  const startPos = t.pos();
  // Check for named captures, first introduced in Python's regexp library.
  // As usual, there are three slightly different syntaxes:
  //
  // (?P<name>expr) the original, introduced by Python
  // (?<name>expr) the .NET alteration, adopted by Perl 5.10
  // (?'name'expr) another .NET alteration, adopted by Perl 5.10
  //
  // Perl 5.10 gave in and implemented the Python version too,
  // but they claim that the last two are the preferred forms.
  // PCRE and languages based on it (specifically, PHP and Ruby)
  // support all three as well. EcmaScript 4 uses only the Python form.
  //
  // In both the open source world (via Code Search) and the
  // Google source tree, (?P<name>expr) and (?<name>expr) are the
  // dominant forms of named captures and both are supported.
  if (t.lookingAt("(?P<") || t.lookingAt("(?<")) {
    // Pull out name.
    const s = t.rest();
    // Index of the first name character: after "(?P<" or "(?<".
    const begin = s.charAt(2) === "P" ? 4 : 3;
    const end = s.indexOf(">");
    if (end < 0) {
      throw new RE2JSSyntaxException(Parser.ERR_INVALID_NAMED_CAPTURE, s);
    }
    const name = s.substring(begin, end); // "name"
    t.skipString(name);
    t.skip(begin + 1); // "(?P<>" or "(?<>"
    if (!Parser.isValidCaptureName(name)) {
      // "(?P<name>"
      throw new RE2JSSyntaxException(
        Parser.ERR_INVALID_NAMED_CAPTURE,
        s.substring(0, end + 1),
      ); // "(?P<name>" or "(?<name>"
    }
    // Like ordinary capture, but named.
    const re = this.op(Regexp.Op.LEFT_PAREN);
    if (re === null) {
      throw new Error("op(LEFT_PAREN) unexpectedly returned null");
    }
    re.cap = ++this.numCap;
    // Stored capture indices start at 1, so a truthy lookup means the
    // name is already taken.
    if (this.namedGroups.get(name)) {
      throw new RE2JSSyntaxException(
        Parser.ERR_DUPLICATE_NAMED_CAPTURE,
        name,
      );
    }
    this.namedGroups.set(name, this.numCap);
    re.name = name;
    return;
  }
  // Non-capturing group. Might also twiddle Perl flags.
  t.skip(2); // "(?"

  let flags = this.flags;
  let sign = +1;
  let sawFlag = false;
  loop: while (t.more()) {
    {
      const c = t.pop();
      switch (c) {
        case 0x69: // 'i'
          flags |= FOLD_CASE;
          sawFlag = true;
          break;
        case 0x6d: // 'm'
          flags &= ~ONE_LINE;
          sawFlag = true;
          break;
        case 0x73: // 's'
          flags |= DOT_NL;
          sawFlag = true;
          break;
        case 0x55: // 'U'
          flags |= NON_GREEDY;
          sawFlag = true;
          break;
        // Switch to negation.
        case 0x2d: // '-'
          if (sign < 0) {
            // A second '-' is an error.
            break loop;
          }
          sign = -1;
          // Invert flags so that | above turn into &~ and vice versa.
          // We'll invert flags again before using it below.
          flags = ~flags;
          sawFlag = false;
          break;
        // End of flags, starting group or not.
        case 0x3a: // ':'
        case 0x29: // ')'
          if (sign < 0) {
            if (!sawFlag) {
              // '-' must be followed by at least one flag.
              break loop;
            }
            flags = ~flags;
          }
          if (c === 0x3a) {
            // Open new group
            this.op(Regexp.Op.LEFT_PAREN);
          }
          this.flags = flags;
          return;
        default:
          // Flags.
          break loop;
      }
    }
  }

  throw new RE2JSSyntaxException(
    Parser.ERR_INVALID_PERL_OP,
    t.from(startPos),
  );
}

// parseVerticalBar handles a | in the input.
parseVerticalBar(): void {
  this.concat();
  // The concatenation just built is now on top of the stack.
  // If a VERTICAL_BAR sits right under it, swapVerticalBar moves the
  // concatenation below the bar (things below a bar become an alternation).
  // Otherwise this is the first '|' at this level: push a new bar.
  const swapped = this.swapVerticalBar();
  if (!swapped) {
    this.op(Regexp.Op.VERTICAL_BAR);
  }
}

// If the top of the stack is an element sitting on a VERTICAL_BAR,
// swapVerticalBar moves the element below the bar and returns true.
// When the elements on both sides of the bar can be character classes,
// they are merged into one and the top element is dropped instead.
// Otherwise it returns false.
swapVerticalBar(): boolean {
  const depth = this.stack.length;
  if (depth < 2 || this.stack[depth - 2].op !== Regexp.Op.VERTICAL_BAR) {
    return false;
  }

  const top = this.stack[depth - 1];
  const bar = this.stack[depth - 2];

  // If above and below vertical bar are literal or char class,
  // can merge into a single char class.
  if (
    depth >= 3 &&
    Parser.isCharClass(top) &&
    Parser.isCharClass(this.stack[depth - 3])
  ) {
    let src = top;
    let dst = this.stack[depth - 3];
    // Make dst the more complex of the two.
    if (src.op > dst.op) {
      [src, dst] = [dst, src];
      this.stack[depth - 3] = dst;
    }
    Parser.mergeCharClass(dst, src);
    this.reuse(src);
    this.pop();
    return true;
  }

  if (depth >= 3) {
    // The element under the bar is now out of reach.
    // Clean opportunistically.
    this.cleanAlt(this.stack[depth - 3]);
  }
  this.stack[depth - 2] = top;
  this.stack[depth - 1] = bar;
  return true;
}

// parseRightParen handles a ')' in the input.
+ parseRightParen(): void { + this.concat(); + if (this.swapVerticalBar()) { + this.pop(); // pop vertical bar + } + this.alternate(); + const n = this.stack.length; + if (n < 2) { + throw new RE2JSSyntaxException( + Parser.ERR_UNEXPECTED_PAREN, + this.wholeRegexp, + ); + } + + const re1 = this.pop(); + if (re1 === undefined) { + throw new RE2JSSyntaxException( + Parser.ERR_BAD_EXPRESSION, + this.wholeRegexp, + ); + } + const re2 = this.pop(); + if (re2 === undefined || re2.op !== Regexp.Op.LEFT_PAREN) { + throw new RE2JSSyntaxException( + Parser.ERR_UNEXPECTED_PAREN, + this.wholeRegexp, + ); + } + // Restore flags at time of paren. + this.flags = re2.flags; + if (re2.cap === 0) { + // Just for grouping. + this.push(re1); + } else { + re2.op = Regexp.Op.CAPTURE; + re2.subs = [re1]; + this.push(re2); + } + } + + // parsePerlClassEscape parses a leading Perl character class escape like \d + // from the beginning of |t|. If one is present, it appends the characters + // to cc and returns true. The iterator is advanced past the escape + // on success, undefined on failure, in which case false is returned. + parsePerlClassEscape(t: StringIterator, cc: CharClass): boolean { + const beforePos = t.pos(); + if (!t.more() || t.pop() !== 0x5c || !t.more()) { + return false; + } + t.pop(); // e.g. advance past 'd' in "\\d" + const p = t.from(beforePos); + const g = getPerlGroups().get(p); + if (g === undefined) { + return false; + } + cc.appendGroup(g, (this.flags & FOLD_CASE) !== 0); + return true; + } + + // parseNamedClass parses a leading POSIX named character class like + // [:alnum:] from the beginning of t. If one is present, it appends the + // characters to cc, advances the iterator, and returns true. + // Pre: t at "[:". Post: t after ":]". + // On failure (no class of than name), throws RE2JSSyntaxException. + // On misparse, returns false; t.pos() is undefined. + parseNamedClass(t: StringIterator, cc: CharClass): boolean { + // (Go precondition check deleted.) 
+ const cls = t.rest(); + const i = cls.indexOf(":]"); + if (i < 0) { + return false; + } + + const name = cls.substring(0, i + 2); // "[:alnum:]" + t.skipString(name); + const g = getPosixGroups().get(name); + if (g === undefined) { + throw new RE2JSSyntaxException(Parser.ERR_INVALID_CHAR_RANGE, name); + } + cc.appendGroup(g, (this.flags & FOLD_CASE) !== 0); + return true; + } + + // parseUnicodeClass() parses a leading Unicode character class like \p{Han} + // from the beginning of t. If one is present, it appends the characters to + // to |cc|, advances |t| and returns true. + // + // Returns false if such a pattern is not present or UNICODE_GROUPS + // flag is not enabled; |t.pos()| is not advanced in this case. + // Indicates error by throwing RE2JSSyntaxException. + parseUnicodeClass(t: StringIterator, cc: CharClass): boolean { + const startPos = t.pos(); + if ( + (this.flags & UNICODE_GROUPS) === 0 || + (!t.lookingAt("\\p") && !t.lookingAt("\\P")) + ) { + return false; + } + + t.skip(1); // '\\' + // Committed to parse or throw exception. + let sign = +1; + let c = t.pop(); // 'p' or 'P' + if (c === 0x50) { + sign = -1; + } + if (!t.more()) { + t.rewindTo(startPos); + throw new RE2JSSyntaxException(Parser.ERR_INVALID_CHAR_RANGE, t.rest()); + } + + c = t.pop(); + let name: string; + + if (c !== 0x7b) { + // Single-letter name. + name = runeToString(c); + } else { + // Name is in braces. + const rest = t.rest(); + const end = rest.indexOf("}"); + if (end < 0) { + t.rewindTo(startPos); + throw new RE2JSSyntaxException(Parser.ERR_INVALID_CHAR_RANGE, t.rest()); + } + name = rest.substring(0, end); // e.g. "Han" + t.skipString(name); + t.skip(1); + // Don't use skip(end) because it assumes UTF-16 coding, and + // StringIterator doesn't guarantee that. + } + // Group can have leading negation too. + // \p{^Han} == \P{Han}, \P{^Han} == \p{Han}. 
+ if (!(name.length === 0) && name.codePointAt(0) === 0x5e) { + sign = 0 - sign; // -sign + name = name.substring(1); + } + + const pair = Parser.unicodeTable(name); + if (pair === null) { + throw new RE2JSSyntaxException( + Parser.ERR_INVALID_CHAR_RANGE, + t.from(startPos), + ); + } + if (pair.sign < 0) { + sign = 0 - sign; + } + + const tab = pair.tab; + const fold = pair.fold; // fold-equivalent table + if (tab === null) { + throw new RE2JSSyntaxException( + Parser.ERR_INVALID_CHAR_RANGE, + t.from(startPos), + ); + } + // Variation of CharClass.appendGroup() for tables. + if ((this.flags & FOLD_CASE) === 0 || fold === null) { + cc.appendTableWithSign(tab, sign); + } else { + // Merge and clean tab and fold in a temporary buffer. + // This is necessary for the negative case and just tidy + // for the positive case. + const tmp = new CharClass() + .appendTable(tab) + .appendTable(fold) + .cleanClass() + .toArray(); + cc.appendClassWithSign(tmp, sign); + } + return true; + } + + // parseClass parses a character class and pushes it onto the parse stack. + // + // NOTES: + // Pre: at '['; Post: after ']'. + // Mutates stack. Advances iterator. May throw. + parseClass(t: StringIterator): void { + const startPos = t.pos(); + t.skip(1); // '[' + const re = this.newRegexp(Regexp.Op.CHAR_CLASS); + re.flags = this.flags; + const cc = new CharClass(); + let sign = +1; + + if (t.more() && t.lookingAt("^")) { + sign = -1; + t.skip(1); // '^' + // If character class does not match \n, add it here, + // so that negation later will do the right thing. + if ((this.flags & CLASS_NL) === 0) { + cc.appendRange(0x0a, 0x0a); + } + } + + let first = true; // ']' and '-' are okay as first char in class + while (!t.more() || t.peek() !== 0x5d || first) { + first = false; + const beforePos = t.pos(); + // Look for POSIX [:alnum:] etc. 
+ if (t.lookingAt("[:")) { + if (this.parseNamedClass(t, cc)) { + continue; + } + t.rewindTo(beforePos); + } + + // Look for Unicode character group like \p{Han}. + if (this.parseUnicodeClass(t, cc)) { + continue; + } + + // Look for Perl character class symbols (extension). + if (this.parsePerlClassEscape(t, cc)) { + continue; + } + t.rewindTo(beforePos); + + // Single character or simple range. + const lo = Parser.parseClassChar(t, startPos); + let hi = lo; + if (t.more() && t.lookingAt("-")) { + t.skip(1); + if (t.more() && t.lookingAt("]")) { + // [a-] means (a|-) so check for final ]. + t.skip(-1); + } else { + hi = Parser.parseClassChar(t, startPos); + if (hi < lo) { + throw new RE2JSSyntaxException( + Parser.ERR_INVALID_CHAR_RANGE, + t.from(beforePos), + ); + } + } + } + if ((this.flags & FOLD_CASE) === 0) { + cc.appendRange(lo, hi); + } else { + cc.appendFoldedRange(lo, hi); + } + } + t.skip(1); // ']' + + cc.cleanClass(); + if (sign < 0) { + cc.negateClass(); + } + re.runes = cc.toArray(); + this.push(re); + } +} + +export { Parser }; diff --git a/packages/re2/src/Prefilter.ts b/packages/re2/src/Prefilter.ts new file mode 100644 index 0000000..54e04be --- /dev/null +++ b/packages/re2/src/Prefilter.ts @@ -0,0 +1,171 @@ +import { Regexp } from "./Regexp.js"; + +import { FOLD_CASE } from "./RE2Flags.js"; +import type { MachineUTF16Input } from "./MachineInput.js"; + +class Prefilter { + type: number; + subs: Prefilter[]; + str: string; + + static Type = { NONE: 0, EXACT: 1, AND: 2, OR: 3 }; + + constructor(type: number) { + this.type = type; + this.subs = []; + this.str = ""; + } + + eval(input: MachineUTF16Input, pos: number): boolean { + switch (this.type) { + case Prefilter.Type.NONE: + return true; + case Prefilter.Type.EXACT: + return input.hasString(this, pos); + case Prefilter.Type.AND: + for (let i = 0; i < this.subs.length; i++) { + if (!this.subs[i].eval(input, pos)) return false; + } + return true; + case Prefilter.Type.OR: + for (let i = 0; i < 
this.subs.length; i++) { + if (this.subs[i].eval(input, pos)) return true; + } + return false; + default: + return true; + } + } +} + +const fromRegexp = (re: Regexp): Prefilter => { + if (!re) return new Prefilter(Prefilter.Type.NONE); + + switch (re.op) { + case Regexp.Op.NO_MATCH: + case Regexp.Op.EMPTY_MATCH: + case Regexp.Op.BEGIN_LINE: + case Regexp.Op.END_LINE: + case Regexp.Op.BEGIN_TEXT: + case Regexp.Op.END_TEXT: + case Regexp.Op.WORD_BOUNDARY: + case Regexp.Op.NO_WORD_BOUNDARY: + case Regexp.Op.CHAR_CLASS: + case Regexp.Op.ANY_CHAR_NOT_NL: + case Regexp.Op.ANY_CHAR: { + return new Prefilter(Prefilter.Type.NONE); + } + + case Regexp.Op.LITERAL: { + if (re.runes.length === 0 || (re.flags & FOLD_CASE) !== 0) { + // Skip case-folded literals for simplicity + return new Prefilter(Prefilter.Type.NONE); + } + const pf = new Prefilter(Prefilter.Type.EXACT); + let str = ""; + for (let i = 0; i < re.runes.length; i++) { + str += String.fromCodePoint(re.runes[i]); + } + pf.str = str; + return pf; + } + + case Regexp.Op.CAPTURE: + case Regexp.Op.PLUS: { + return fromRegexp(re.subs[0]); + } + + case Regexp.Op.REPEAT: { + if (re.min >= 1) { + return fromRegexp(re.subs[0]); + } + return new Prefilter(Prefilter.Type.NONE); + } + + case Regexp.Op.CONCAT: { + const pf = new Prefilter(Prefilter.Type.AND); + for (const sub of re.subs) { + pf.subs.push(fromRegexp(sub)); + } + return pf; + } + + case Regexp.Op.ALTERNATE: { + const pf = new Prefilter(Prefilter.Type.OR); + for (const sub of re.subs) { + pf.subs.push(fromRegexp(sub)); + } + return pf; + } + + default: + return new Prefilter(Prefilter.Type.NONE); + } +}; + +const simplify = (pf: Prefilter): Prefilter => { + if (pf.type === Prefilter.Type.EXACT || pf.type === Prefilter.Type.NONE) { + return pf; + } + + if (pf.type === Prefilter.Type.AND) { + const newSubs: Prefilter[] = []; + for (const sub of pf.subs) { + const s = simplify(sub); + if (s.type !== Prefilter.Type.NONE) { + if (s.type === Prefilter.Type.AND) { + 
newSubs.push(...s.subs); + } else { + newSubs.push(s); + } + } + } + + if (newSubs.length === 0) return new Prefilter(Prefilter.Type.NONE); + if (newSubs.length === 1) return newSubs[0]; + pf.subs = newSubs; + return pf; + } + + if (pf.type === Prefilter.Type.OR) { + const newSubs: Prefilter[] = []; + for (const sub of pf.subs) { + const s = simplify(sub); + if (s.type === Prefilter.Type.NONE) { + // If any branch of an OR has no requirements, the whole OR has no requirements + return new Prefilter(Prefilter.Type.NONE); + } + if (s.type === Prefilter.Type.OR) { + newSubs.push(...s.subs); + } else { + newSubs.push(s); + } + } + if (newSubs.length === 0) return new Prefilter(Prefilter.Type.NONE); + if (newSubs.length === 1) return newSubs[0]; + + // De-duplicate EXACT branches + const seen = new Set(); + const uniqueSubs: Prefilter[] = []; + for (const sub of newSubs) { + if (sub.type === Prefilter.Type.EXACT) { + if (!seen.has(sub.str)) { + seen.add(sub.str); + uniqueSubs.push(sub); + } + } else { + uniqueSubs.push(sub); + } + } + pf.subs = uniqueSubs; + return pf; + } + + return pf; +}; + +const PrefilterTree = { + build: (re: Regexp): Prefilter => simplify(fromRegexp(re)), +}; + +export { Prefilter, PrefilterTree }; diff --git a/packages/re2/src/Prog.ts b/packages/re2/src/Prog.ts new file mode 100644 index 0000000..f623fed --- /dev/null +++ b/packages/re2/src/Prog.ts @@ -0,0 +1,172 @@ +import { FOLD_CASE } from "./RE2Flags.js"; +import { Inst } from "./Inst.js"; + +/** + * A list of instruction pointers waiting to be patched. + * Tracks both `head` and `tail` to allow O(1) appending during compilation. + * * Values are encoded integers, not standard memory pointers: + * - Program instruction index: `l >> 1` + * - Patch `.out` field if: `(l & 1) === 0` + * - Patch `.arg` field if: `(l & 1) === 1` + * - `0` denotes an empty list. 
+ * * @see https://swtch.com/~rsc/regexp/regexp1.html + */ +class PatchList { + head: number; + tail: number; + + /** + * @param {number} head - Encoded pointer to the start of the patch list. + * @param {number} tail - Encoded pointer to the end of the patch list. + */ + constructor(head = 0, tail = 0) { + this.head = head; + this.tail = tail; + } +} + +/** + * A Prog is a compiled regular expression program. + */ +class Prog { + inst: Inst[]; + start: number; + numCap: number; + + constructor() { + this.inst = []; + this.start = 0; // index of start instruction + // number of CAPTURE insts in re + // 2 => implicit ( and ) for whole match $0 + this.numCap = 2; + } + + // Returns the instruction at the specified pc. + // Precondition: pc > 0 && pc < numInst(). + getInst(pc: number): Inst { + return this.inst[pc]; + } + + // Returns the number of instructions in this program. + numInst(): number { + return this.inst.length; + } + + // Adds a new instruction to this program, with operator |op| and |pc| equal + // to |numInst()|. + addInst(op: number): void { + this.inst.push(new Inst(op)); + } + + // skipNop() follows any no-op or capturing instructions and returns the + // resulting instruction. + skipNop(pc: number): Inst { + let i = this.inst[pc]; + + while (i.op === Inst.NOP || i.op === Inst.CAPTURE) { + i = this.inst[pc]; + pc = i.out; + } + + return i; + } + + // prefix() returns a pair of a literal string that all matches for the + // regexp must start with, and a boolean which is true if the prefix is the + // entire match. The string is returned by appending to |prefix|. 
+ prefix(): [boolean, string] { + let prefix = ""; + let i = this.skipNop(this.start); + + if (!Inst.isRuneOp(i.op) || i.runes.length !== 1) { + return [i.op === Inst.MATCH, prefix]; + } + + while ( + Inst.isRuneOp(i.op) && + i.runes.length === 1 && + (i.arg & FOLD_CASE) === 0 + ) { + prefix += String.fromCodePoint(i.runes[0]); + i = this.skipNop(i.out); + } + + return [i.op === Inst.MATCH, prefix]; + } + + // startCond() returns the leading empty-width conditions that must be true + // in any match. It returns -1 (all bits set) if no matches are possible. + startCond(): number { + let flag = 0; + let pc = this.start; + loop: for (;;) { + const i = this.inst[pc]; + switch (i.op) { + case Inst.EMPTY_WIDTH: + flag |= i.arg; + break; + case Inst.FAIL: + return -1; + case Inst.CAPTURE: + case Inst.NOP: + break; + default: + break loop; + } + pc = i.out; + } + return flag; + } + + // --- Patch list --- + + // A patchlist is a list of instruction pointers that need to be filled in + // (patched). Because the pointers haven't been filled in yet, we can reuse + // their storage to hold the list. It's kind of sleazy, but works well in + // practice. See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. + + // These aren't really pointers: they're integers, so we can reinterpret them + // this way without using package unsafe. A value l denotes p.inst[l>>1].out + // (l&1==0) or .arg (l&1==1). l == 0 denotes the empty list, okay because we + // start every program with a fail instruction, so we'll never want to point + // at its output link. 
+ + next(l: number): number { + const i = this.inst[l >> 1]; + if ((l & 1) === 0) { + return i.out; + } + return i.arg; + } + + patch(l: PatchList, val: number): void { + let head = l.head; + while (head !== 0) { + const i = this.inst[head >> 1]; + if ((head & 1) === 0) { + head = i.out; + i.out = val; + } else { + head = i.arg; + i.arg = val; + } + } + } + + append(l1: PatchList, l2: PatchList): PatchList { + if (l1.head === 0) return l2; + if (l2.head === 0) return l1; + + // We know exactly where the tail is + const i = this.inst[l1.tail >> 1]; + if ((l1.tail & 1) === 0) { + i.out = l2.head; + } else { + i.arg = l2.head; + } + + return new PatchList(l1.head, l2.tail); + } +} + +export { Prog, PatchList }; diff --git a/packages/re2/src/RE2.ts b/packages/re2/src/RE2.ts new file mode 100644 index 0000000..a3f8d4d --- /dev/null +++ b/packages/re2/src/RE2.ts @@ -0,0 +1,259 @@ +import { ANCHOR_BOTH, ANCHOR_START, PERL, UNANCHORED } from "./RE2Flags.js"; +import { fromUTF16, type MachineUTF16Input } from "./MachineInput.js"; +import { DFA } from "./DFA.js"; +import { Inst } from "./Inst.js"; +import { Prefilter, PrefilterTree } from "./Prefilter.js"; +import { Compiler } from "./Compiler.js"; +import { simplify } from "./Simplify.js"; +import { Parser } from "./Parser.js"; +import { emptyOpContext } from "./Utils.js"; +import type { Prog } from "./Prog.js"; + +class RE2 { + expr: string; + prog: Prog; + numSubexp: number; + cond: number; + prefix: string; + prefixComplete: boolean; + prefixRune: number; + dfa: DFA; + prefilter: Prefilter | null; + namedGroups: Map; + + static compile(expr: string): RE2 { + return RE2.compileImpl(expr, PERL); + } + + static compileImpl(expr: string, mode: number): RE2 { + return new RE2(expr, mode); + } + + constructor(expr: string, mode: number) { + let re = Parser.parse(expr, mode); + re = simplify(re); + + const prefilter = PrefilterTree.build(re); + + const prog = Compiler.compileRegexp(re); + + this.prefilter = prefilter.type === 
Prefilter.Type.NONE ? null : prefilter; + + const [prefixCompl, prefixStr] = prog.prefix(); + this.prefixComplete = prefixCompl; + this.prefix = prefixStr; + + this.prefixRune = 0; + if (this.prefix.length > 0) { + const cp = this.prefix.codePointAt(0); + if (cp === undefined) { + throw new Error("RE2: prefix has no code point"); + } + this.prefixRune = cp; + } + this.namedGroups = re.namedGroups; + + this.expr = expr; + this.prog = prog; + this.numSubexp = re.maxCap(); + this.cond = prog.startCond(); + this.dfa = new DFA(this.prog); + } + + matchPrefixComplete( + input: MachineUTF16Input, + pos: number, + anchor: number, + ncap: number, + ): number[] | null { + let matchStart = -1; + let matchEnd = -1; + const pLen = input.prefixLength(this); + + if (anchor === UNANCHORED) { + const idx = input.index(this, pos); + if (idx < 0) return null; + matchStart = pos + idx; + matchEnd = matchStart + pLen; + } else if (anchor === ANCHOR_BOTH) { + // Match must span [pos, endPos] exactly and equal the prefix. + if (input.endPos() - pos !== pLen) return null; + const idx = input.index(this, pos); + if (idx !== 0) return null; + matchStart = pos; + matchEnd = pos + pLen; + } else if (anchor === ANCHOR_START) { + // Match must start at pos and equal the prefix. 
+ const idx = input.index(this, pos); + if (idx !== 0) return null; + matchStart = pos; + matchEnd = pos + pLen; + } + + if (matchStart < 0) return null; + + if (ncap > 0) { + const matchcap = new Int32Array(ncap).fill(-1); + matchcap[0] = matchStart; + matchcap[1] = matchEnd; + return Array.from(matchcap); + } + return []; + } + + executeEngine( + input: MachineUTF16Input, + pos: number, + anchor: number, + ncap: number, + ): number[] | null { + if (this.prefixComplete && (ncap === 0 || this.numSubexp === 0)) { + return this.matchPrefixComplete(input, pos, anchor, ncap); + } + + if (this.prefilter !== null && anchor === UNANCHORED) { + if (!this.prefilter.eval(input, pos)) { + return null; + } + } + + const dfaResult = this.dfa.match(input, pos, anchor); + if (dfaResult !== null) { + return dfaResult ? [] : null; + } + + // Minimal NFA fallback for DFA state explosion + return this._nfaFallback(input, pos, anchor) ? [] : null; + } + + // Minimal boolean-only NFA for when the DFA bails due to state explosion. + // No captures, no thread pools — just two sets of NFA states swapped each step. + _nfaFallback(input: MachineUTF16Input, pos: number, anchor: number): boolean { + const prog = this.prog; + const endPos = input.endPos(); + + const addState = ( + set: Set, + visited: Set, + pc: number, + context: number, + ): void => { + if (pc < 0 || pc >= prog.numInst() || visited.has(pc)) return; + visited.add(pc); + const inst = prog.getInst(pc); + switch (inst.op) { + case Inst.ALT: + case Inst.ALT_MATCH: + addState(set, visited, inst.out, context); + addState(set, visited, inst.arg, context); + break; + case Inst.NOP: + case Inst.CAPTURE: + addState(set, visited, inst.out, context); + break; + case Inst.EMPTY_WIDTH: + if ((inst.arg & ~context) === 0) { + addState(set, visited, inst.out, context); + } + break; + default: + set.add(pc); + break; + } + }; + + let current = new Set(); + let next = new Set(); + // prevRune: the rune immediately before `pos`. 
See DFA.match for rationale. + let prevRune = -1; + if (pos > 0) { + const r = input.step(pos - 1) >> 3; + if (r >= 0) prevRune = r; + } + + for (let i = pos; i <= endPos; i++) { + const rune = i < endPos ? input.step(i) >> 3 : -1; + const width = i < endPos ? input.step(i) & 7 : 0; + const context = emptyOpContext(prevRune, rune); + + // Add start state at each position for unanchored search + if (anchor === UNANCHORED || i === pos) { + const visited = new Set(); + addState(current, visited, prog.start, context); + } + + // Check for matches before consuming. + // For UNANCHORED/ANCHOR_START, a MATCH at any position succeeds. + // For ANCHOR_BOTH, we must consume the entire input — intermediate + // matches are skipped; only the final post-loop check accepts MATCH. + if (anchor !== ANCHOR_BOTH) { + for (const pc of current) { + const inst = prog.getInst(pc); + if (inst.op === Inst.MATCH) { + return true; + } + } + } + + if (i >= endPos || width === 0) break; + + // Step: consume current character + next.clear(); + for (const pc of current) { + const inst = prog.getInst(pc); + if (Inst.isRuneOp(inst.op) && inst.matchRune(rune)) { + const nextContext = emptyOpContext( + rune, + i + width < endPos ? input.step(i + width) >> 3 : -1, + ); + const visited = new Set(); + addState(next, visited, inst.out, nextContext); + } + } + + // For unanchored, add start state at next position too + if (anchor === UNANCHORED) { + const nextRune = i + width < endPos ? 
input.step(i + width) >> 3 : -1; + const nextContext = emptyOpContext(rune, nextRune); + const visited = new Set(); + addState(next, visited, prog.start, nextContext); + } + + prevRune = rune; + [current, next] = [next, current]; + i += width - 1; // loop increments by 1, but we advanced by width + } + + // Final check for match after processing all input + const endContext = emptyOpContext(prevRune, -1); + const visited = new Set(); + const finalSet = new Set(); + for (const pc of current) { + addState(finalSet, visited, pc, endContext); + } + for (const pc of finalSet) { + const inst = prog.getInst(pc); + if (inst.op === Inst.MATCH) return true; + } + + return false; + } + + numberOfCapturingGroups(): number { + return this.numSubexp; + } + + reset(): void { + // No-op: machine pool removed + } + + toString(): string { + return this.expr; + } + + match(s: string): boolean { + return this.executeEngine(fromUTF16(s), 0, UNANCHORED, 0) !== null; + } +} + +export { RE2 }; diff --git a/packages/re2/src/RE2Flags.ts b/packages/re2/src/RE2Flags.ts new file mode 100644 index 0000000..98d822f --- /dev/null +++ b/packages/re2/src/RE2Flags.ts @@ -0,0 +1,59 @@ +//// Parser flags. +// Fold case during matching (case-insensitive). +const FOLD_CASE = 0x01; +// Treat pattern as a literal string instead of a regexp. +const LITERAL = 0x02; +// Allow character classes like [^a-z] and [[:space:]] to match newline. +const CLASS_NL = 0x04; +// Allow '.' to match newline. +const DOT_NL = 0x08; +// Treat ^ and $ as only matching at beginning and end of text, not +// around embedded newlines. (Perl's default). +const ONE_LINE = 0x10; +// Make repetition operators default to non-greedy. +const NON_GREEDY = 0x20; +// allow Perl extensions: +// non-capturing parens - (?: ) +// non-greedy operators - *? +? ?? {}? 
+// flag edits - (?i) (?-i) (?i: ) +// i - FoldCase +// m - !OneLine +// s - DotNL +// U - NonGreedy +// line ends: \A \z +// \Q and \E to disable/enable metacharacters +// (?Pexpr) for named captures +// \C (any byte) is not supported. +const PERL_X = 0x40; +// Allow \p{Han}, \P{Han} for Unicode group and negation. +const UNICODE_GROUPS = 0x80; +// Regexp END_TEXT was $, not \z. Internal use only. +const WAS_DOLLAR = 0x100; + +const MATCH_NL = CLASS_NL | DOT_NL; +// As close to Perl as possible. +const PERL = CLASS_NL | ONE_LINE | PERL_X | UNICODE_GROUPS; +// POSIX syntax. +const POSIX = 0; +//// Anchors +const UNANCHORED = 0; +const ANCHOR_START = 1; +const ANCHOR_BOTH = 2; + +export { + UNANCHORED, + ANCHOR_BOTH, + NON_GREEDY, + FOLD_CASE, + LITERAL, + ONE_LINE, + WAS_DOLLAR, + DOT_NL, + UNICODE_GROUPS, + CLASS_NL, + PERL, + ANCHOR_START, + POSIX, + MATCH_NL, + PERL_X, +}; diff --git a/packages/re2/src/Regexp.ts b/packages/re2/src/Regexp.ts new file mode 100644 index 0000000..1a00b56 --- /dev/null +++ b/packages/re2/src/Regexp.ts @@ -0,0 +1,100 @@ +/** + * Regular expression abstract syntax tree. Produced by parser, used by compiler. 
+ */ +export class Regexp { + static Op = { + NO_MATCH: 0, + EMPTY_MATCH: 1, + LITERAL: 2, + CHAR_CLASS: 3, + ANY_CHAR_NOT_NL: 4, + ANY_CHAR: 5, + BEGIN_LINE: 6, + END_LINE: 7, + BEGIN_TEXT: 8, + END_TEXT: 9, + WORD_BOUNDARY: 10, + NO_WORD_BOUNDARY: 11, + CAPTURE: 12, + STAR: 13, + PLUS: 14, + QUEST: 15, + REPEAT: 16, + CONCAT: 17, + ALTERNATE: 18, + LEFT_PAREN: 19, + VERTICAL_BAR: 20, + } as const; + + static isPseudoOp(op: number): boolean { + return op >= Regexp.Op.LEFT_PAREN; + } + + static emptySubs(): Regexp[] { + return []; + } + static fromRegexp(re: Regexp): Regexp { + const regex = new Regexp(re.op); + regex.flags = re.flags; + regex.subs = re.subs; + regex.runes = re.runes; + regex.cap = re.cap; + regex.min = re.min; + regex.max = re.max; + regex.name = re.name; + regex.namedGroups = re.namedGroups; + return regex; + } + + op: number; + flags: number; + subs: Regexp[]; + runes: number[]; + min: number; + max: number; + cap: number; + name: string | null; + namedGroups: Map; + + constructor(op: number) { + this.op = op; // operator + this.flags = 0; // bitmap of parse flags + // subexpressions, if any. Never null. + // subs[0] is used as the freelist. + this.subs = Regexp.emptySubs(); + this.runes = []; // matched runes, for LITERAL, CHAR_CLASS + this.min = 0; // min for REPEAT + this.max = 0; // max for REPEAT + this.cap = 0; // capturing index, for CAPTURE + this.name = null; // capturing name, for CAPTURE + this.namedGroups = new Map(); + } + + reinit(): void { + this.flags = 0; + this.subs = Regexp.emptySubs(); + this.runes = []; + this.cap = 0; + this.min = 0; + this.max = 0; + this.name = null; + this.namedGroups = new Map(); + } + + // maxCap() walks the regexp to find the maximum capture index. 
+ maxCap(): number { + let m = 0; + if (this.op === Regexp.Op.CAPTURE) { + m = this.cap; + } + if (this.subs !== null) { + for (let sub of this.subs) { + const n = sub.maxCap(); + if (m < n) { + m = n; + } + } + } + return m; + } +} diff --git a/packages/re2/src/Simplify.ts b/packages/re2/src/Simplify.ts new file mode 100644 index 0000000..3cb87d2 --- /dev/null +++ b/packages/re2/src/Simplify.ts @@ -0,0 +1,265 @@ +import { NON_GREEDY } from "./RE2Flags.js"; +import { Regexp } from "./Regexp.js"; +import { MAX_RUNE } from "./Unicode.js"; + +// simplify returns a regexp equivalent to re but without counted +// repetitions and with various other simplifications, such as +// rewriting /(?:a+)+/ to /a+/. The resulting regexp will execute +// correctly but its string representation will not produce the same +// parse tree, because capturing parentheses may have been duplicated +// or removed. For example, the simplified form for /(x){1,2}/ is +// /(x)(x)?/ but both parentheses capture as $1. The returned regexp +// may share structure with or be the original. +function simplify(re: Regexp): Regexp { + switch (re.op) { + case Regexp.Op.CAPTURE: { + const sub = simplify(re.subs[0]); + if (sub !== re.subs[0]) { + const nre = Regexp.fromRegexp(re); + nre.runes = []; + nre.subs = [sub]; + return nre; + } + return re; + } + + case Regexp.Op.CONCAT: + case Regexp.Op.ALTERNATE: { + const newSubs = []; + let changed = false; + + for (let i = 0; i < re.subs.length; i++) { + const sub = re.subs[i]; + const nsub = simplify(sub); + if (nsub !== sub) { + changed = true; + } + + if (re.op === Regexp.Op.CONCAT) { + // If any part of a CONCAT is mathematically impossible, + // the entire CONCAT sequence becomes impossible. 
+ if (nsub.op === Regexp.Op.NO_MATCH) { + return new Regexp(Regexp.Op.NO_MATCH); + } + // Drop empty 0-width match nodes entirely from sequences + if (nsub.op === Regexp.Op.EMPTY_MATCH) { + changed = true; + continue; + } + // Flatten nested concatenations + if (nsub.op === Regexp.Op.CONCAT) { + changed = true; + newSubs.push(...nsub.subs); + continue; + } + } else if (re.op === Regexp.Op.ALTERNATE) { + // Drop impossible branches from alternations + if (nsub.op === Regexp.Op.NO_MATCH) { + changed = true; + continue; + } + // Flatten nested alternations + if (nsub.op === Regexp.Op.ALTERNATE) { + changed = true; + newSubs.push(...nsub.subs); + continue; + } + } + + newSubs.push(nsub); + } + + if (changed) { + // If we filtered out all nodes, return the mathematically correct fallback + if (newSubs.length === 0) { + return new Regexp( + re.op === Regexp.Op.CONCAT + ? Regexp.Op.EMPTY_MATCH + : Regexp.Op.NO_MATCH, + ); + } + // If only 1 node remains, we don't need a CONCAT/ALT container at all + if (newSubs.length === 1) { + return newSubs[0]; + } + + const nre = Regexp.fromRegexp(re); + nre.runes = []; + nre.subs = newSubs; + return nre; + } + + return re; + } + + case Regexp.Op.CHAR_CLASS: { + if (re.runes === null) return re; + + // Empty character classes match nothing. + if (re.runes.length === 0) { + return new Regexp(Regexp.Op.NO_MATCH); + } + // Full character classes match everything. 
+ if ( + re.runes.length === 2 && + re.runes[0] === 0 && + re.runes[1] === MAX_RUNE + ) { + return new Regexp(Regexp.Op.ANY_CHAR); + } + // Standard catch-all except newline + if ( + re.runes.length === 4 && + re.runes[0] === 0 && + re.runes[1] === 0x0a - 1 && + re.runes[2] === 0x0a + 1 && + re.runes[3] === MAX_RUNE + ) { + return new Regexp(Regexp.Op.ANY_CHAR_NOT_NL); + } + return re; + } + + case Regexp.Op.STAR: + case Regexp.Op.PLUS: + case Regexp.Op.QUEST: { + const sub = simplify(re.subs[0]); + return simplify1(re.op, re.flags, sub, re); + } + + case Regexp.Op.REPEAT: { + // Special special case: x{0} matches the empty string + // and doesn't even need to consider x. + if (re.min === 0 && re.max === 0) { + return new Regexp(Regexp.Op.EMPTY_MATCH); + } + // The fun begins. + const sub = simplify(re.subs[0]); + + // x{n,} means at least n matches of x. + if (re.max === -1) { + // Special case: x{0,} is x*. + if (re.min === 0) { + return simplify1(Regexp.Op.STAR, re.flags, sub, null); + } + // Special case: x{1,} is x+. + if (re.min === 1) { + return simplify1(Regexp.Op.PLUS, re.flags, sub, null); + } + // General case: x{4,} is xxxx+. + const nre = new Regexp(Regexp.Op.CONCAT); + const subs = []; + for (let i = 0; i < re.min - 1; i++) { + subs.push(sub); + } + subs.push(simplify1(Regexp.Op.PLUS, re.flags, sub, null)); + nre.subs = subs.slice(0); + + // Ensure newly created CONCAT is properly flattened + return simplify(nre); + } + // Special case x{0} handled above. + + // Special case: x{1} is just x. + if (re.min === 1 && re.max === 1) { + return sub; + } + + // General case: x{n,m} means n copies of x and m copies of x? + // The machine will do less work if we nest the final m copies, + // so that x{2,5} = xx(x(x(x)?)?)? + + // Build leading prefix: xx. + let prefixSubs = null; + if (re.min > 0) { + prefixSubs = []; + for (let i = 0; i < re.min; i++) { + prefixSubs.push(sub); + } + } + + // Build and attach suffix: (x(x(x)?)?)? 
+ if (re.max > re.min) { + let suffix = simplify1(Regexp.Op.QUEST, re.flags, sub, null); + for (let i = re.min + 1; i < re.max; i++) { + const nre2 = new Regexp(Regexp.Op.CONCAT); + nre2.subs = [sub, suffix]; + suffix = simplify1(Regexp.Op.QUEST, re.flags, nre2, null); + } + + if (prefixSubs === null) { + return suffix; + } + prefixSubs.push(suffix); + } + + if (prefixSubs !== null) { + const prefix = new Regexp(Regexp.Op.CONCAT); + prefix.subs = prefixSubs.slice(0); + // Ensure newly created CONCAT is properly flattened + return simplify(prefix); + } + + // Some degenerate case like min > max or min < max < 0. + // Handle as impossible match. + return new Regexp(Regexp.Op.NO_MATCH); + } + } + return re; +} + +// simplify1 implements Simplify for the unary OpStar, +// OpPlus, and OpQuest operators. It returns the simple regexp +// equivalent to +// +// Regexp{Op: op, Flags: flags, Sub: {sub}} +// +// under the assumption that sub is already simple, and +// without first allocating that structure. If the regexp +// to be returned turns out to be equivalent to re, simplify1 +// returns re instead. +// +// simplify1 is factored out of Simplify because the implementation +// for other operators generates these unary expressions. +// Letting them call simplify1 makes sure the expressions they +// generate are simple. +function simplify1( + op: number, + flags: number, + sub: Regexp, + re: Regexp | null, +): Regexp { + // Special case: repeat the empty string as much as + // you want, but it's still the empty string. + if (sub.op === Regexp.Op.EMPTY_MATCH) { + return sub; + } + + // Handle impossible targets gracefully. + // e.g. Trying to match "NO_MATCH" 0 or 1 times (QUEST/STAR) evaluates to EMPTY_MATCH. + if (sub.op === Regexp.Op.NO_MATCH) { + if (op === Regexp.Op.PLUS) return sub; // 1+ times is impossible + return new Regexp(Regexp.Op.EMPTY_MATCH); + } + + // The operators are idempotent if the flags match. 
+ if (op === sub.op && (flags & NON_GREEDY) === (sub.flags & NON_GREEDY)) { + return sub; + } + + if ( + re !== null && + re.op === op && + (re.flags & NON_GREEDY) === (flags & NON_GREEDY) && + sub === re.subs[0] + ) { + return re; + } + + const nre = new Regexp(op); + nre.flags = flags; + nre.subs = [sub]; + return nre; +} + +export { simplify }; diff --git a/packages/re2/src/Unicode.ts b/packages/re2/src/Unicode.ts new file mode 100644 index 0000000..90555b6 --- /dev/null +++ b/packages/re2/src/Unicode.ts @@ -0,0 +1,178 @@ +import type { UnicodeRangeTable } from "./UnicodeRangeTable.js"; +import { UnicodeTables } from "./UnicodeTables.js"; + +/** + * Utilities for dealing with Unicode better than JS does. + */ +// The highest legal rune value. +const MAX_RUNE = 0x10ffff; +// The highest legal ASCII value. +const MAX_ASCII = 0x7f; +// The highest legal Latin-1 value. +const MAX_LATIN1 = 0xff; +// The highest legal Basic Multilingual Plane (BMP) value. +const MAX_BMP = 0xffff; +// Minimum and maximum runes involved in folding. +// Checked during test. +const MIN_FOLD = 0x0041; +const MAX_FOLD = 0x1e943; + +const MIN_HIGH_SURROGATE = 0xd800; +const MAX_HIGH_SURROGATE = 0xdbff; +const MIN_LOW_SURROGATE = 0xdc00; +const MAX_LOW_SURROGATE = 0xdfff; +const MIN_SUPPLEMENTARY_CODE_POINT = 0x10000; + +// is32 uses binary search to test whether rune is in the specified +// slice of 32-bit ranges. +function is32(ranges: UnicodeRangeTable, r: number): boolean { + // binary search over ranges + let lo = 0; + let hi = ranges.length; + while (lo < hi) { + const m = lo + Math.floor((hi - lo) / 2); + + const rlo = ranges.getLo(m); + const rhi = ranges.getHi(m); + if (rlo <= r && r <= rhi) { + const stride = ranges.getStride(m); + return (r - rlo) % stride === 0; + } + if (r < rlo) { + hi = m; + } else { + lo = m + 1; + } + } + return false; +} + +// is tests whether rune is in the specified table of ranges. 
function is(ranges: UnicodeRangeTable, r: number): boolean {
  // Fast path for Latin-1 characters using linear search.
  if (r <= MAX_LATIN1) {
    for (let i = 0; i < ranges.length; i++) {
      // Ranges that end below r cannot contain it; keep scanning.
      const rhi = ranges.getHi(i);
      if (r > rhi) {
        continue;
      }

      // First range ending at or above r: r is a member only when it is not
      // below the range start and lands on the range's stride grid.
      // NOTE(review): returning false here presumes ranges are sorted by lo —
      // confirm against the table generator.
      const rlo = ranges.getLo(i);
      if (r < rlo) {
        return false;
      }

      const stride = ranges.getStride(i);
      return (r - rlo) % stride === 0;
    }
    return false;
  }

  // Fallback to binary search for runes outside Latin-1
  return ranges.length > 0 && r >= ranges.getLo(0) && is32(ranges, r);
}

// isUpper reports whether the rune is an upper case letter.
function isUpper(r: number): boolean {
  if (r <= MAX_LATIN1) {
    // Latin-1: upper case means upper-casing is a no-op while lower-casing
    // changes the character.
    const s = String.fromCodePoint(r);
    return s.toUpperCase() === s && s.toLowerCase() !== s;
  }
  return is(UnicodeTables.Upper, r);
}

// simpleFold iterates over Unicode code points equivalent under
// the Unicode-defined simple case folding. Among the code points
// equivalent to rune (including rune itself), SimpleFold returns the
// smallest r >= rune if one exists, or else the smallest r >= 0.
//
// For example:
//      SimpleFold('A') = 'a'
//      SimpleFold('a') = 'A'
//
//      SimpleFold('K') = 'k'
//      SimpleFold('k') = '\u212A' (Kelvin symbol, K)
//      SimpleFold('\u212A') = 'K'
//
//      SimpleFold('1') = '1'
//
// Derived from Go's unicode.SimpleFold.
//
function simpleFold(r: number): number {
  // Consult caseOrbit table for special cases (3+ element cycles, lossy
  // mappings like ſ→S, and Turkic-specific self-loops).
  const caseOrbit = UnicodeTables.CASE_ORBIT;
  const folded = caseOrbit.get(r);
  if (folded !== undefined) {
    return folded;
  }

  // Fallback for 2-element orbits: use raw native case conversion.
  // The length check rejects multi-char results (e.g., ß→SS) which
  // would otherwise be truncated to a non-equivalent codepoint.
  const s = String.fromCodePoint(r);
  const lower = s.toLowerCase();
  if (lower.length === s.length) {
    const lowerCp = lower.codePointAt(0);
    if (lowerCp !== undefined && lowerCp !== r) return lowerCp;
  }
  const upper = s.toUpperCase();
  if (upper.length === s.length) {
    const upperCp = upper.codePointAt(0);
    if (upperCp !== undefined && upperCp !== r) return upperCp;
  }
  return r;
}

// equalsIgnoreCase performs case-insensitive equality comparison
// on the given runes |r1| and |r2|, with special consideration
// for the likely scenario where both runes are ASCII characters.
// If non-ASCII, Unicode case folding will be performed on |r1|
// to compare it to |r2|.
// -1 is interpreted as the end-of-file mark.
function equalsIgnoreCase(r1: number, r2: number): boolean {
  // Runes already match, or one of them is EOF
  if (r1 < 0 || r2 < 0 || r1 === r2) {
    return true;
  }

  // Fast path for the common case where both runes are ASCII characters.
  // Coerces both runes to lowercase if applicable.
  if (r1 <= MAX_ASCII && r2 <= MAX_ASCII) {
    if (0x41 <= r1 && r1 <= 0x5a) {
      r1 |= 0x20;
    }

    if (0x41 <= r2 && r2 <= 0x5a) {
      r2 |= 0x20;
    }

    return r1 === r2;
  }

  // Fall back to full Unicode case folding otherwise.
  // Invariant: r1 must be non-negative
  // Walk r1's fold orbit until it comes back around to r1.
  for (let r = simpleFold(r1); r !== r1; r = simpleFold(r)) {
    if (r === r2) {
      return true;
    }
  }

  return false;
}

export {
  MAX_RUNE,
  MIN_FOLD,
  MAX_FOLD,
  simpleFold,
  MAX_ASCII,
  equalsIgnoreCase,
  MIN_SUPPLEMENTARY_CODE_POINT,
  MIN_LOW_SURROGATE,
  MIN_HIGH_SURROGATE,
  MAX_LOW_SURROGATE,
  MAX_HIGH_SURROGATE,
  MAX_BMP,
  isUpper,
};

// diff --git a/packages/re2/src/UnicodeRangeTable.ts b/packages/re2/src/UnicodeRangeTable.ts
// new file mode 100644
// index 0000000..b13eabe
// --- /dev/null
// +++ b/packages/re2/src/UnicodeRangeTable.ts
// @@ -0,0 +1,33 @@

/**
 * Read-only view over a flat `Uint32Array` of code point ranges.
 *
 * Each record is `[lo, hi]` when `isStride1` is true (stride implied as 1),
 * otherwise `[lo, hi, stride]`. `SIZE` is the number of array slots per record.
 */
class UnicodeRangeTable {
  data: Uint32Array;
  isStride1: boolean;
  SIZE: number;

  constructor(data: Uint32Array, isStride1 = false) {
    this.data = data;
    this.isStride1 = isStride1;
    this.SIZE = isStride1 ? 2 : 3;
  }

  // High-performance getters that do NOT allocate memory
  getLo(index: number): number {
    return this.data[index * this.SIZE];
  }
  getHi(index: number): number {
    return this.data[index * this.SIZE + 1];
  }
  getStride(index: number): number {
    return this.isStride1 ? 1 : this.data[index * this.SIZE + 2];
  }

  /** Returns record `index` as a freshly allocated `[lo, hi, stride]` array. */
  get(index: number): number[] {
    const i = index * this.SIZE;
    return [this.data[i], this.data[i + 1], this.getStride(index)];
  }

  /** Number of range records in the table. */
  get length(): number {
    return this.data.length / this.SIZE;
  }
}

export { UnicodeRangeTable };

// diff --git a/packages/re2/src/UnicodeTables.ts b/packages/re2/src/UnicodeTables.ts
// new file mode 100644
// index 0000000..7f23ef1
// --- /dev/null
// +++ b/packages/re2/src/UnicodeTables.ts
// @@ -0,0 +1,610 @@

// GENERATED BY tools/scripts/genUnicodeTable.js; DO NOT EDIT.
// yarn node ./tools/scripts/genUnicodeTable.js > src/UnicodeTables.ts

import { UnicodeRangeTable } from "./UnicodeRangeTable.js";

// Marker stored in the lookup table for every char code that is not part of
// the 64-character alphabet, so corrupt input can be told apart from "A" (0).
const _B64_INVALID = 0xff;

let _B64_MAP: Uint8Array | null = null;

/**
 * Lazily builds the char-code → 6-bit digit lookup table. The alphabet is
 * `A-Z a-z 0-9 + -`; every other slot holds `_B64_INVALID`.
 */
const getB64Map = (): Uint8Array => {
  if (!_B64_MAP) {
    const map = new Uint8Array(256).fill(_B64_INVALID);
    const b =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-";
    for (let i = 0; i < 64; i++) {
      map[b.charCodeAt(i)] = i;
    }
    _B64_MAP = map;
  }
  return _B64_MAP;
};

/**
 * Decodes a string of base64 VLQ digits into non-negative integers.
 *
 * Each digit carries 5 payload bits (least-significant group first); bit
 * 0x20 marks "more digits follow".
 *
 * @throws RangeError if a character is outside the alphabet, a value needs
 *   more than 30 bits, or the input ends in the middle of a value.
 */
const decodeVLQ = (str: string): number[] => {
  const b64 = getB64Map();
  const res: number[] = [];
  let value = 0;
  let shift = 0;
  for (let i = 0; i < str.length; i++) {
    // Char codes >= 256 index past the table and yield undefined.
    const digit = b64[str.charCodeAt(i)];
    if (digit === undefined || digit === _B64_INVALID) {
      throw new RangeError(`decodeVLQ: invalid character at index ${i}`);
    }
    value |= (digit & 0x1f) << shift;
    if ((digit & 0x20) === 0) {
      res.push(value);
      value = 0;
      shift = 0;
    } else {
      shift += 5;
      // A further 5-bit group at shift 30 would overflow the 32-bit
      // bitwise arithmetic used above.
      if (shift > 25) {
        throw new RangeError(`decodeVLQ: value too large at index ${i}`);
      }
    }
  }
  if (shift !== 0) {
    throw new RangeError("decodeVLQ: truncated input");
  }
  return res;
};

/**
 * Decodes delta-encoded ranges into a flat `[lo, hi, stride]` triple array.
 *
 * `lo` is stored as a delta from the previous `hi`, and `hi` as a delta from
 * its own `lo`. When `isStride1` is true the input holds only (lo, hi) pairs
 * and stride 1 is filled in; the output is 3-wide either way.
 *
 * @throws RangeError if the decoded value count is not a whole number of
 *   records.
 */
const decodeRanges = (str: string, isStride1: boolean): Uint32Array => {
  if (str.length === 0) return new Uint32Array(0);
  const res = decodeVLQ(str);
  const width = isStride1 ? 2 : 3;
  if (res.length % width !== 0) {
    throw new RangeError(
      `decodeRanges: ${res.length} values is not a multiple of ${width}`,
    );
  }
  const numRanges = res.length / width;
  const out = new Uint32Array(numRanges * 3);
  let current = 0;
  let resIdx = 0;
  for (let i = 0; i < numRanges; i++) {
    current += res[resIdx++];
    out[i * 3] = current;
    current += res[resIdx++];
    out[i * 3 + 1] = current;
    out[i * 3 + 2] = isStride1 ? 1 : res[resIdx++];
  }
  return out;
};

/**
 * Decodes (key delta, value) pairs into a map. Keys are delta-encoded from
 * the previous key; values are stored as-is.
 *
 * @throws RangeError if the decoded value count is odd.
 */
const decodeOrbit = (str: string): Map<number, number> => {
  const res = decodeVLQ(str);
  if (res.length % 2 !== 0) {
    throw new RangeError("decodeOrbit: odd number of values");
  }
  const map = new Map<number, number>();
  let currentKey = 0;
  for (let i = 0; i < res.length; i += 2) {
    currentKey += res[i];
    map.set(currentKey, res[i + 1]);
  }
  return map;
};

// Merges two stride-encoded UnicodeRangeTables. Expands any stride>1
// ranges to individual codepoints, then coalesces contiguous runs.
const mergeRanges = (a: Uint32Array, b: Uint32Array): Uint32Array => {
  // Inputs are flat [lo, hi, stride] triples. When one side is empty the
  // other is returned as-is (same array instance, strides untouched).
  if (b.length === 0) return a;
  if (a.length === 0) return b;

  // Collect every range as an inclusive [lo, hi] interval; strided ranges
  // are expanded into single-codepoint intervals.
  const points: [number, number][] = [];
  const push = (arr: Uint32Array): void => {
    for (let i = 0; i < arr.length; i += 3) {
      const lo = arr[i];
      const hi = arr[i + 1];
      const stride = arr[i + 2];
      if (stride === 1) {
        points.push([lo, hi]);
        continue;
      }
      // A zero (or missing) stride would never advance the loop below.
      if (!(stride >= 1)) {
        throw new RangeError(`mergeRanges: invalid stride at index ${i + 2}`);
      }
      for (let cp = lo; cp <= hi; cp += stride) points.push([cp, cp]);
    }
  };
  push(a);
  push(b);
  points.sort((x, y) => x[0] - y[0]);

  // Sweep the sorted intervals, fusing any that overlap or touch.
  const merged: [number, number][] = [];
  for (const [lo, hi] of points) {
    const last = merged[merged.length - 1];
    if (last && last[1] + 1 >= lo) {
      if (hi > last[1]) last[1] = hi;
    } else {
      merged.push([lo, hi]);
    }
  }

  // Re-encode as stride-1 triples.
  const out = new Uint32Array(merged.length * 3);
  for (let i = 0; i < merged.length; i++) {
    out[i * 3] = merged[i][0];
    out[i * 3 + 1] = merged[i][1];
    out[i * 3 + 2] = 1;
  }
  return out;
};

// Sweeps the codepoint space using a platform property-escape regex and
// returns stride-1 ranges. Surrogates are included — String.fromCodePoint
// returns the lone surrogate char and platform regex matches \p{Cs} on it.
const sweepPlatform = (pattern: string): Uint32Array => {
  const re = new RegExp(pattern, "u");
  const ranges: number[] = [];
  // `start` is the first code point of the run currently being matched,
  // or -1 while outside a run.
  let start = -1;
  for (let cp = 0; cp <= 0x10ffff; cp++) {
    if (re.test(String.fromCodePoint(cp))) {
      if (start < 0) start = cp;
    } else if (start >= 0) {
      ranges.push(start, cp - 1, 1);
      start = -1;
    }
  }
  // Close a run that extends to the last code point.
  if (start >= 0) ranges.push(start, 0x10ffff, 1);
  return Uint32Array.from(ranges);
};

/**
 * Memoizing lookup over a record of zero-argument factories: each value is
 * built on first access and cached, and unknown keys resolve to `null`.
 */
class LazyDecoder<V> {
  private readonly initializer: Record<string, () => V>;
  private readonly cache: Map<string, V | null>;
  constructor(initializer: Record<string, () => V>) {
    this.initializer = initializer;
    this.cache = new Map();
  }
  /**
   * True only for keys defined directly on the initializer record. The `in`
   * operator would also report inherited names such as "toString".
   */
  has(key: string): boolean {
    return Object.prototype.hasOwnProperty.call(this.initializer, key);
  }
  /** Returns the (cached) value for `key`, or `null` if there is no factory. */
  get(key: string): V | null {
    const cached = this.cache.get(key);
    if (cached !== undefined || this.cache.has(key)) {
      return cached ?? null;
    }
    // Guard with has() so Object.prototype methods are never run as decoders.
    const val = this.has(key) ? this.initializer[key]() : null;
    this.cache.set(key, val);
    return val;
  }
}

let _CASE_ORBIT: Map<number, number> | null = null;
/** Decodes the bundled case-orbit table on first use and caches it. */
const getCASE_ORBIT = (): Map<number, number> => {
  if (!_CASE_ORBIT) {
    _CASE_ORBIT = decodeOrbit(
      "rDqpII-LsD+0HGrpIsCxJzElODoODrOnByP-Mz+HTieNj-HCweD1fDxeB+9HBwfC1FE2eBxfBjeE1eDmpII0fjB4c+BgkHChkHKikHDjkHBkkHImkHZnkHhhGlkH9O70H-Io8HBp8HBq8HBr8HBs8HBt8HBu8HBv8HJ48HB58HB68HB78HB88HB98HB+8HB-8HJo9HBp9HBq9HBr9HBs9HBt9HBu9HBv9HM89HLlaFs+HQwcQwdQ8-HzJpdErCBlGgphBokHjMu+pBBv+pBDy+pBBz+pBB0+pBB1+pBw5Um4+BBl4+B68cg17CBh17CBi17CBj17CBk17CBl17CBm17CBn17CBo17CBp17CBq17CBr17CBs17CBt17CBu17CBv17CBw17CBx17CBy17CBz17CB017CB117CB217CB317CB417CD717CB817CB917CB+17CB-17CBg27CBh27CBi27CBj27CBk27CBl27CBm27CBn27CBo27CBp27CBq27CBr27CBs27CBt27CBu27CBv27CBw27CBx27CBy27CBz27C",
    );
  }
  return _CASE_ORBIT;
};

// Additions from Unicode 15.0 → 16.0 per stable general-category name.
// Merged unconditionally with platform sweep output; no-op on 16.0+ engines.
// Keys are general-category names; each factory decodes its bundled delta
// ranges on first access. The boolean passed to decodeRanges says whether the
// encoded string omits the stride column.
const _DELTA_CATEGORIES = /*#__PURE__*/ new LazyDecoder<Uint32Array>({
  L: () =>
    decodeRanges(
      "pkHBBh6iBCBNCBkvXzBB36BbBKWB9JCB8lBJBCDDClBBCaaCt-Bt-BBfBgkG68DBmoHdBjhDsBBz8HxmWxmWBcBDgwhCgwhCBsTB",
      false,
    ),
  LC: () => decodeRanges("pkHBh6iBCNC0rZVLV", true),
  Ll: () => decodeRanges("qkHj6iBj6iBO1sZ1sZBUB", false),
  Lm: () => decodeRanges("uqjChBhBx+XCBpBBB", false),
  Lo: () =>
    decodeRanges(
      "guhCzBB36BDBCzLzLBBB8lBJBCDDClBBCaaCt-Bt-BBfBgkG68DBmoHdBmhDnBB18HxmWxmWBcBDgwhCgwhCBsTB",
      false,
    ),
  Lu: () => decodeRanges("pkHi6iBi6iBBOOC0rZ0rZBUB", false),
  M: () =>
    decodeRanges(
      "3kCymhCymhCBDBvM8lB8lBBHBCDDCDBCEBCPPB47C47CkuQRB-lhBBB",
      false,
    ),
  Mc: () => decodeRanges("49kCCBIDDCDBCBBCvavaswSCB", false),
  Mn: () =>
    decodeRanges("3kCymhCymhCBDBvM-lB-lBBEBOECPBB47CkuQkuQBKBECB-lhBBB", false),
  N: () => decodeRanges("gqjCJnsCTtoBJ3pRJ3hDJ37XJ4nGJ", true),
  Nd: () => decodeRanges("gqjCJnsCTtoBJ3pRJ3hDJ37XJ4nGJ", true),
  P: () => decodeRanges("u6GBBwBvv8Bvv8BmzBBBCBBpgCssUssUBBBwkeAB", false),
  Pd: () => decodeRanges("urjCA", true),
  Po: () => decodeRanges("u6GBBwB1i+B1i+BBCCBpgCpgCssUCBwkeAB", false),
  S: () =>
    decodeRanges(
      "nhJCBz+CDBlPBBK-82B-82BBxzvBxzvBBuHBRzNB-vKJBFBBoOGGvBIIWDDKiHiHBjBB",
      false,
    ),
  Sm: () => decodeRanges("usjCB", true),
  So: () =>
    decodeRanges(
      "nhJCBz+CDBlPBBKxwmDxwmDBuHBRzNB-vKJBFBBoOGGvBIIWDDKiHiHBjBB",
      false,
    ),
});

// Additions from Unicode 15.0 → 16.0 per stable script name.
const _DELTA_SCRIPTS = /*#__PURE__*/ new LazyDecoder<Uint32Array>({
  Arabic: () => decodeRanges("3kCrxhCrxhCBBB4BAB", false),
  Balinese: () => decodeRanges("u6GBwBA", true),
  Common: () =>
    decodeRanges(
      "nhJCBz+CDBlPBBKxwmDxwmDB4HBHzNB-vKJBFBBoOGGvBIIWDDKiHiHBjBB",
      false,
    ),
  Cyrillic: () => decodeRanges("pkHB", true),
  Egyptian_Hieroglyphs: () => decodeRanges("gjtC68D", true),
  Han: () => decodeRanges("w-6FtT", true),
  Kawi: () => decodeRanges("66nCA", true),
  Khitan_Small_Script: () => decodeRanges("-njDA", true),
  Latin: () => decodeRanges("r+pBCNC", true),
  Myanmar: () => decodeRanges("w2lCT", true),
});

// Full tables for scripts added in Unicode 16.0. Engines < 16.0 throw
// SyntaxError on these names, so platform sweep is impossible.
const _NEW_SCRIPTS = /*#__PURE__*/ new LazyDecoder<UnicodeRangeTable>({
  Garay: () => new UnicodeRangeTable(decodeRanges("gqjClBEcJB", true)),
  Gurung_Khema: () => new UnicodeRangeTable(decodeRanges("go4C5B", true)),
  Kirat_Rai: () => new UnicodeRangeTable(decodeRanges("gq7C5B", true)),
  Ol_Onal: () => new UnicodeRangeTable(decodeRanges("wu5DqBFA", true)),
  Sunuwar: () => new UnicodeRangeTable(decodeRanges("g+mChBPJ", true)),
  Todhri: () => new UnicodeRangeTable(decodeRanges("guhCzB", true)),
  Tulu_Tigalari: () =>
    new UnicodeRangeTable(
      decodeRanges("g8kCJBCDDClBBCJBCDDCDBCJBCBBJBB", false),
    ),
});

// General-category names resolved through the platform regex engine.
const STABLE_CATEGORY_NAMES: ReadonlySet<string> = new Set([
  "C", "Cc", "Cf", "Cn", "Co", "Cs",
  "L", "LC", "Ll", "Lm", "Lo", "Lt", "Lu",
  "M", "Mc", "Me", "Mn",
  "N", "Nd", "Nl", "No",
  "P", "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps",
  "S", "Sc", "Sk", "Sm", "So",
  "Z", "Zl", "Zp", "Zs",
]);

// Script names resolved through the platform regex engine.
const STABLE_SCRIPT_NAMES: ReadonlySet<string> = new Set([
  "Adlam", "Ahom", "Anatolian_Hieroglyphs", "Arabic", "Armenian", "Avestan",
  "Balinese", "Bamum", "Bassa_Vah", "Batak", "Bengali", "Bhaiksuki",
  "Bopomofo", "Brahmi", "Braille", "Buginese", "Buhid",
  "Canadian_Aboriginal", "Carian", "Caucasian_Albanian", "Chakma", "Cham",
  "Cherokee", "Chorasmian", "Common", "Coptic", "Cuneiform", "Cypriot",
  "Cypro_Minoan", "Cyrillic",
  "Deseret", "Devanagari", "Dives_Akuru", "Dogra", "Duployan",
  "Egyptian_Hieroglyphs", "Elbasan", "Elymaic", "Ethiopic",
  "Georgian", "Glagolitic", "Gothic", "Grantha", "Greek", "Gujarati",
  "Gunjala_Gondi", "Gurmukhi",
  "Han", "Hangul", "Hanifi_Rohingya", "Hanunoo", "Hatran", "Hebrew",
  "Hiragana",
  "Imperial_Aramaic", "Inherited", "Inscriptional_Pahlavi",
  "Inscriptional_Parthian",
  "Javanese",
  "Kaithi", "Kannada", "Katakana", "Kawi", "Kayah_Li", "Kharoshthi",
  "Khitan_Small_Script", "Khmer", "Khojki", "Khudawadi",
  "Lao", "Latin", "Lepcha", "Limbu", "Linear_A", "Linear_B", "Lisu",
  "Lycian", "Lydian",
  "Mahajani", "Makasar", "Malayalam", "Mandaic", "Manichaean", "Marchen",
  "Masaram_Gondi", "Medefaidrin", "Meetei_Mayek", "Mende_Kikakui",
  "Meroitic_Cursive", "Meroitic_Hieroglyphs", "Miao", "Modi", "Mongolian",
  "Mro", "Multani", "Myanmar",
  "Nabataean", "Nag_Mundari", "Nandinagari", "New_Tai_Lue", "Newa", "Nko",
  "Nushu", "Nyiakeng_Puachue_Hmong",
  "Ogham", "Ol_Chiki", "Old_Hungarian", "Old_Italic", "Old_North_Arabian",
  "Old_Permic", "Old_Persian", "Old_Sogdian", "Old_South_Arabian",
  "Old_Turkic", "Old_Uyghur", "Oriya", "Osage", "Osmanya",
  "Pahawh_Hmong", "Palmyrene", "Pau_Cin_Hau", "Phags_Pa", "Phoenician",
  "Psalter_Pahlavi",
  "Rejang", "Runic",
  "Samaritan", "Saurashtra", "Sharada", "Shavian", "Siddham", "SignWriting",
  "Sinhala", "Sogdian", "Sora_Sompeng", "Soyombo", "Sundanese",
  "Syloti_Nagri", "Syriac",
  "Tagalog", "Tagbanwa", "Tai_Le", "Tai_Tham", "Tai_Viet", "Takri", "Tamil",
  "Tangsa", "Tangut", "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh",
  "Tirhuta", "Toto",
  "Ugaritic", "Unknown",
  "Vai", "Vithkuqi",
  "Wancho", "Warang_Citi",
  "Yezidi", "Yi",
  "Zanabazar_Square",
]);

// Script names served from the bundled _NEW_SCRIPTS tables.
const NEW_SCRIPT_NAMES: ReadonlySet<string> = new Set([
  "Garay",
  "Gurung_Khema",
  "Kirat_Rai",
  "Ol_Onal",
  "Sunuwar",
  "Todhri",
  "Tulu_Tigalari",
]);

// Memoized results of buildForProperty (keyed "kind:name") and of
// buildFoldOverlay (keyed by name; null means "no overlay needed").
const _sweepCache = new Map<string, UnicodeRangeTable>();
const _foldCache = new Map<string, UnicodeRangeTable | null>();

// Returns the base range table for a property name, or null if unknown.
// Stable names: platform sweep + bundled delta (15.0 → 16.0).
// New-in-16.0 script names: bundled full table.
const buildForProperty = (name: string): UnicodeRangeTable | null => {
  if (NEW_SCRIPT_NAMES.has(name)) {
    return _NEW_SCRIPTS.get(name);
  }
  let kind: "category" | "script" | null = null;
  let pattern: string | null = null;
  if (STABLE_CATEGORY_NAMES.has(name)) {
    kind = "category";
    pattern = `\\p{General_Category=${name}}`;
  } else if (STABLE_SCRIPT_NAMES.has(name)) {
    kind = "script";
    pattern = `\\p{Script=${name}}`;
  } else return null;

  // The sweep tests every code point, so each table is built at most once.
  const cacheKey = `${kind}:${name}`;
  const cached = _sweepCache.get(cacheKey);
  if (cached) return cached;

  const base = sweepPlatform(pattern);
  const delta =
    kind === "category"
      ? _DELTA_CATEGORIES.get(name)
      : _DELTA_SCRIPTS.get(name);
  const merged = delta ? mergeRanges(base, delta) : base;
  const table = new UnicodeRangeTable(merged);
  _sweepCache.set(cacheKey, table);
  return table;
};

// Computes the fold-overlay for a property name: additional runes that
// fold to some rune already in the base class. Returns null if no overlay
// is needed (base class is fold-stable).
const buildFoldOverlay = (name: string): UnicodeRangeTable | null => {
  // `undefined` means "not computed yet"; a cached `null` is a real answer.
  const cached = _foldCache.get(name);
  if (cached !== undefined) return cached;
  const base = buildForProperty(name);
  if (!base) {
    _foldCache.set(name, null);
    return null;
  }
  // Binary search for membership of r in the base table.
  const inBase = (r: number): boolean => {
    let lo = 0;
    let hi = base.length;
    while (lo < hi) {
      const m = (lo + hi) >> 1;
      const rlo = base.getLo(m);
      const rhi = base.getHi(m);
      if (r < rlo) hi = m;
      else if (r > rhi) lo = m + 1;
      else return (r - rlo) % base.getStride(m) === 0;
    }
    return false;
  };
  // Inline simpleFold to avoid circular import with Unicode.ts.
  const orbit = getCASE_ORBIT();
  const simpleFold = (r: number): number => {
    const folded = orbit.get(r);
    if (folded !== undefined) return folded;
    const s = String.fromCodePoint(r);
    const lower = s.toLowerCase();
    if (lower.length === s.length) {
      const lowerCp = lower.codePointAt(0);
      if (lowerCp !== undefined && lowerCp !== r) return lowerCp;
    }
    const upper = s.toUpperCase();
    if (upper.length === s.length) {
      const upperCp = upper.codePointAt(0);
      if (upperCp !== undefined && upperCp !== r) return upperCp;
    }
    return r;
  };
  // Upper bound on steps taken around one fold orbit. The walk normally ends
  // when it returns to its starting rune; the cap only matters if the
  // platform's case mappings form a chain that never comes back, which would
  // otherwise loop forever.
  const MAX_ORBIT_STEPS = 16;
  const extras = new Set<number>();
  for (let i = 0; i < base.length; i++) {
    const lo = base.getLo(i);
    const hi = base.getHi(i);
    const stride = base.getStride(i);
    for (let cp = lo; cp <= hi; cp += stride) {
      let r = simpleFold(cp);
      for (let steps = 0; r !== cp && steps < MAX_ORBIT_STEPS; steps++) {
        if (!inBase(r)) extras.add(r);
        r = simpleFold(r);
      }
    }
  }
  if (extras.size === 0) {
    _foldCache.set(name, null);
    return null;
  }
  // Coalesce the sorted extras into contiguous stride-1 ranges.
  const sorted = Array.from(extras).sort((a, b) => a - b);
  const merged: [number, number][] = [];
  for (const cp of sorted) {
    const last = merged[merged.length - 1];
    if (last && last[1] + 1 === cp) last[1] = cp;
    else merged.push([cp, cp]);
  }
  const out = new Uint32Array(merged.length * 3);
  for (let i = 0; i < merged.length; i++) {
    out[i * 3] = merged[i][0];
    out[i * 3 + 1] = merged[i][1];
    out[i * 3 + 2] = 1;
  }
  const table = new UnicodeRangeTable(out);
  _foldCache.set(name, table);
  return table;
};

// Returns the table for general category Lu; throws if it cannot be built.
const getUpper = (): UnicodeRangeTable => {
  const table = buildForProperty("Lu");
  if (table === null) {
    throw new Error("Upper: missing Lu property");
  }
  return table;
};

// --- Legacy API surface used by Parser ---

export const UnicodeTables = {
  get CASE_ORBIT(): Map<number, number> {
    return getCASE_ORBIT();
  },
  STABLE_CATEGORY_NAMES,
  STABLE_SCRIPT_NAMES,
  NEW_SCRIPT_NAMES,
  buildForProperty,
  buildFoldOverlay,
  CATEGORIES: {
    has: (name: string): boolean => STABLE_CATEGORY_NAMES.has(name),
    get: (name: string): UnicodeRangeTable | null => buildForProperty(name),
  },
  SCRIPTS: {
    has: (name: string): boolean =>
      STABLE_SCRIPT_NAMES.has(name) || NEW_SCRIPT_NAMES.has(name),
    get: (name: string): UnicodeRangeTable | null => buildForProperty(name),
  },
  FOLD_CATEGORIES: {
    has: (name: string): boolean => STABLE_CATEGORY_NAMES.has(name),
    get: (name: string): UnicodeRangeTable | null => buildFoldOverlay(name),
  },
  FOLD_SCRIPT: {
    has: (name: string): boolean =>
      STABLE_SCRIPT_NAMES.has(name) || NEW_SCRIPT_NAMES.has(name),
    get: (name: string): UnicodeRangeTable | null => buildFoldOverlay(name),
  },
  get Upper(): UnicodeRangeTable {
    return getUpper();
  },

  // --- Test-only hooks: expose the raw bundled 15.0→16.0 delta and
  // new-in-16.0 script data so tests can verify the generator output.
  // These are not part of the public API.
  _deltaCategoryRanges: (name: string): Uint32Array | null =>
    _DELTA_CATEGORIES.get(name),
  _deltaScriptRanges: (name: string): Uint32Array | null =>
    _DELTA_SCRIPTS.get(name),
  _newScriptTable: (name: string): UnicodeRangeTable | null =>
    _NEW_SCRIPTS.get(name),
};

// diff --git a/packages/re2/src/Utils.ts b/packages/re2/src/Utils.ts
// new file mode 100644
// index 0000000..fe4cb33
// --- /dev/null
// +++ b/packages/re2/src/Utils.ts
// @@ -0,0 +1,133 @@

import { MAX_BMP } from "./Unicode.js";
import { codePointAtOrThrow } from "./__utils__/chars.js";

/**
 * Various constants and helper utilities.
 */
const METACHARACTERS = "\\.+*?()|[]{}^$";

//// EMPTY_* flags
const EMPTY_BEGIN_LINE = 0x01;
const EMPTY_END_LINE = 0x02;
const EMPTY_BEGIN_TEXT = 0x04;
const EMPTY_END_TEXT = 0x08;
const EMPTY_WORD_BOUNDARY = 0x10;
const EMPTY_NO_WORD_BOUNDARY = 0x20;

// Returns a new, empty number array.
function emptyInts(): number[] {
  return [];
}

// Returns true iff |c| is an ASCII letter or decimal digit.
function isalnum(c: number): boolean {
  return (
    (0x30 <= c && c <= 0x39) ||
    (0x61 <= c && c <= 0x7a) ||
    (0x41 <= c && c <= 0x5a)
  );
}

// If |c| is an ASCII hex digit, returns its value, otherwise -1.
function unhex(c: number): number {
  if (0x30 <= c && c <= 0x39) {
    return c - 0x30;
  }
  if (0x61 <= c && c <= 0x66) {
    return c - 0x61 + 10;
  }
  if (0x41 <= c && c <= 0x46) {
    return c - 0x41 + 10;
  }
  return -1;
}

// Returns the array of runes in the specified UTF-16 string.
function stringToRunes(str: string): number[] {
  return Array.from(String(str)).map((s) => codePointAtOrThrow(s, 0));
}

// Returns the UTF-16 string containing the single rune |r|.
function runeToString(r: number): string {
  return String.fromCodePoint(r);
}

// isWordRune reports whether r is considered a "word character"
// during the evaluation of the \b and \B zero-width assertions.
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
function isWordRune(r: number): boolean {
  // a-z, A-Z, 0-9 or underscore.
  return (
    (0x61 <= r && r <= 0x7a) ||
    (0x41 <= r && r <= 0x5a) ||
    (0x30 <= r && r <= 0x39) ||
    r === 0x5f
  );
}

// emptyOpContext returns the zero-width assertions satisfied at the position
// between the runes r1 and r2, a bitmask of EMPTY_* flags.
// Passing r1 == -1 indicates that the position is at the beginning of the
// text.
// Passing r2 == -1 indicates that the position is at the end of the text.
// eslint-disable-next-line no-warning-comments
// TODO(adonovan): move to Machine.
function emptyOpContext(r1: number, r2: number): number {
  let op = 0;
  if (r1 < 0) {
    op |= EMPTY_BEGIN_TEXT | EMPTY_BEGIN_LINE;
  }
  // A position right after "\n" also begins a line.
  if (r1 === 0x0a) {
    op |= EMPTY_BEGIN_LINE;
  }
  if (r2 < 0) {
    op |= EMPTY_END_TEXT | EMPTY_END_LINE;
  }
  // A position right before "\n" also ends a line.
  if (r2 === 0x0a) {
    op |= EMPTY_END_LINE;
  }
  // Exactly one of the two boundary flags is always set.
  if (isWordRune(r1) !== isWordRune(r2)) {
    op |= EMPTY_WORD_BOUNDARY;
  } else {
    op |= EMPTY_NO_WORD_BOUNDARY;
  }
  return op;
}

/**
 * Returns a string that quotes all regular expression metacharacters inside
 * the argument text; the returned string is a regular expression matching the
 * literal text. For example, `quoteMeta("[foo]")` returns `"\\[foo\\]"`.
 *
 * @param str - The literal text to quote.
 * @returns `str` with a backslash inserted before every metacharacter.
 */
function quoteMeta(str: string): string {
  let quoted = "";
  // A code-unit loop is correct because all metacharacters fit in one UTF-16
  // code unit; every other unit is copied through unchanged.
  for (let i = 0; i < str.length; i++) {
    const ch = str.charAt(i);
    quoted += METACHARACTERS.includes(ch) ? `\\${ch}` : ch;
  }
  return quoted;
}

// Number of UTF-16 code units needed to encode |codePoint|.
function charCount(codePoint: number): number {
  return codePoint > MAX_BMP ? 2 : 1;
}

export {
  emptyInts,
  runeToString,
  emptyOpContext,
  charCount,
  stringToRunes,
  isalnum,
  unhex,
  quoteMeta,
  EMPTY_BEGIN_LINE,
  EMPTY_END_LINE,
  EMPTY_WORD_BOUNDARY,
  EMPTY_BEGIN_TEXT,
  EMPTY_END_TEXT,
  EMPTY_NO_WORD_BOUNDARY,
};

// --- Patch text for a non-TypeScript fixture follows, kept verbatim as comments. ---
// diff --git a/packages/re2/src/__fixtures__/basic.dat b/packages/re2/src/__fixtures__/basic.dat
// new file mode 100644
// index 0000000..1776b1f
// --- /dev/null
// +++ b/packages/re2/src/__fixtures__/basic.dat
// @@ -0,0 +1,217 @@
// +NOTE all standard compliant implementations should pass these : 2002-05-31
// +
// +BE abracadabra$ abracadabracadabra (7,18)
// +BE a...b abababbb (2,7)
// +BE XXXXXX ..XXXXXX (2,8)
// +E \) () (1,2)
// +BE a] a]a (0,2)
// +B } } (0,1)
// +E \} } (0,1)
// +BE \] ] (0,1)
// +B ] ] (0,1)
// +E ] ] (0,1)
// +B { { (0,1)
// +B } } (0,1)
// +BE ^a ax (0,1)
// +BE \^a a^a (1,3)
// +BE a\^ a^ (0,2)
// +BE a$ aa (1,2)
// +BE a\$ a$ (0,2)
// +BE ^$ NULL (0,0)
// +E $^ NULL (0,0)
// +E a($) aa (1,2)(2,2)
// +E a*(^a) aa (0,1)(0,1)
// +E (..)*(...)* a (0,0)
// +E (..)*(...)* abcd (0,4)(2,4)
// +E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
// +E (ab)c|abc abc (0,3)(0,2)
// +E a{0}b ab (1,2)
// +E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
// +E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
// +E a{9876543210} NULL BADBR
// +E ((a|a)|a) a (0,1)(0,1)(0,1)
// +E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
// +E a*(a.|aa) aaaa (0,4)(2,4)
// +E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
// +E (a|b)?.* b (0,1)(0,1)
// +E (a|b)c|a(b|c) ac (0,2)(0,1)
// +E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
// +E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
// +E (a|b)*c|(a|ab)*c xc (1,2)
// +E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
// +E a?(ab|ba)ab abab (0,4)(0,2)
// +E a?(ac{0}b|ba)ab abab (0,4)(0,2)
// +E ab|abab abbabab (0,2)
// +E aba|bab|bba baaabbbaba (5,8)
// +E aba|bab baaabbbaba (6,9)
// +E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
// +E (a.|.a.)*|(a|.a...)
aa (0,2)(0,2) +E ab|a xabc (1,3) +E ab|a xxabc (2,4) +Ei (Ab|cD)* aBcD (0,4)(2,4) +BE [^-] --a (2,3) +BE [a-]* --a (0,3) +BE [a-m-]* --amoma-- (0,4) +E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) +E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) +{E [[:upper:]] A (0,1) [[]] not supported +E [[:lower:]]+ `az{ (1,3) +E [[:upper:]]+ @AZ[ (1,3) +# No collation in Go +#BE [[-]] [[-]] (2,4) +#BE [[.NIL.]] NULL ECOLLATE +#BE [[=aleph=]] NULL ECOLLATE +} +BE$ \n \n (0,1) +BEn$ \n \n (0,1) +BE$ [^a] \n (0,1) +BE$ \na \na (0,2) +E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) +BE xxx xxx (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) +E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) +E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) +E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) +E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) +BE$ .* \x01\xff (0,2) +E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) +L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH +E a*a*a*a*a*b aaaaaaaaab (0,10) +BE ^ NULL (0,0) +BE $ NULL (0,0) +BE ^$ NULL (0,0) +BE ^a$ a (0,1) +BE abc abc (0,3) +BE abc xabcy (1,4) +BE abc ababc (2,5) +BE ab*c abc (0,3) +BE ab*bc abc (0,3) +BE ab*bc abbc (0,4) +BE ab*bc abbbbc (0,6) +E ab+bc abbc (0,4) +E ab+bc abbbbc (0,6) +E ab?bc abbc (0,4) +E ab?bc abc (0,3) +E ab?c abc (0,3) +BE ^abc$ abc (0,3) +BE ^abc abcc (0,3) +BE abc$ aabc (1,4) +BE ^ abc 
(0,0) +BE $ abc (3,3) +BE a.c abc (0,3) +BE a.c axc (0,3) +BE a.*c axyzc (0,5) +BE a[bc]d abd (0,3) +BE a[b-d]e ace (0,3) +BE a[b-d] aac (1,3) +BE a[-b] a- (0,2) +BE a[b-] a- (0,2) +BE a] a] (0,2) +BE a[]]b a]b (0,3) +BE a[^bc]d aed (0,3) +BE a[^-b]c adc (0,3) +BE a[^]b]c adc (0,3) +E ab|cd abc (0,2) +E ab|cd abcd (0,2) +E a\(b a(b (0,3) +E a\(*b ab (0,2) +E a\(*b a((b (0,4) +E ((a)) abc (0,1)(0,1)(0,1) +E (a)b(c) abc (0,3)(0,1)(2,3) +E a+b+c aabbabc (4,7) +E a* aaa (0,3) +E (a*)* - (0,0)(0,0) +E (a*)+ - (0,0)(0,0) +E (a*|b)* - (0,0)(0,0) +E (a+|b)* ab (0,2)(1,2) +E (a+|b)+ ab (0,2)(1,2) +E (a+|b)? ab (0,1)(0,1) +BE [^ab]* cde (0,3) +E (^)* - (0,0)(0,0) +BE a* NULL (0,0) +E ([abc])*d abbbcd (0,6)(4,5) +E ([abc])*bcd abcd (0,4)(0,1) +E a|b|c|d|e e (0,1) +E (a|b|c|d|e)f ef (0,2)(0,1) +E ((a*|b))* - (0,0)(0,0)(0,0) +BE abcd*efg abcdefg (0,7) +BE ab* xabyabbbz (1,3) +BE ab* xayabbbz (1,2) +E (ab|cd)e abcde (2,5)(2,4) +BE [abhgefdc]ij hij (0,3) +E (a|b)c*d abcd (1,4)(1,2) +E (ab|ab*)bc abc (0,3)(0,1) +E a([bc]*)c* abc (0,3)(1,3) +E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) +E a[bcd]*dcdcde adcdcde (0,7) +E (ab|a)b*c abc (0,3)(0,2) +E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) +BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) +E ^a(bc+|b[eh])g|.h$ abh (1,3) +E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) +E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) +E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) +E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) +BE multiple words multiple words yeah (0,14) +E (.*)c(.*) abcde (0,5)(0,2)(3,5) +BE abcd abcd (0,4) +E a(bc)d abcd (0,4)(1,3) +E a[-]?c ac (0,3) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) 
+E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) +E a+(b|c)*d+ aabcdd (0,6)(3,4) +E ^.+$ vivi (0,4) +E ^(.+)$ vivi (0,4)(0,4) +E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) +E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) +E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) +E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) +E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) +E ((foo)|bar)!bas bar!bas (0,7)(0,3) +E ((foo)|bar)!bas 
// --- Patch text for non-TypeScript fixtures is kept verbatim as comments. ---
//  foo!bar!bas (4,11)(4,7)
// +E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
// +E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
// +E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
// +E (foo|(bar))!bas foo!bas (0,7)(0,3)
// +E (foo|bar)!bas bar!bas (0,7)(0,3)
// +E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
// +E (foo|bar)!bas foo!bas (0,7)(0,3)
// +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
// +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
// +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
// +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
// +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
// +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
// +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
// +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
// +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
// +E .*(/XXX).* /XXX (0,4)(0,4)
// +E .*(\\XXX).* \XXX (0,4)(0,4)
// +E \\XXX \XXX (0,4)
// +E .*(/000).* /000 (0,4)(0,4)
// +E .*(\\000).* \000 (0,4)(0,4)
// +E \\000 \000 (0,4)
// diff --git a/packages/re2/src/__fixtures__/find.ts b/packages/re2/src/__fixtures__/find.ts
// new file mode 100644
// index 0000000..1368adb
// --- /dev/null
// +++ b/packages/re2/src/__fixtures__/find.ts
// @@ -0,0 +1,211 @@

/**
 * One find-test case: a pattern, an input text and the expected matches.
 *
 * The trailing numbers `x` are split evenly into `n` rows, one per expected
 * match, and stored in `matches`.
 */
class Test {
  pat: string;
  text: string;
  matches: number[][];

  /**
   * @param pat - Pattern source.
   * @param text - Input text.
   * @param n - Number of expected matches; values <= 0 leave `matches` empty.
   * @param x - Flattened index data, `x.length / n` numbers per match.
   * @throws Error if `x` cannot be split evenly into `n` rows.
   */
  constructor(pat: string, text: string, n: number, ...x: number[]) {
    this.pat = pat;
    this.text = text;
    this.matches = [];

    if (n > 0) {
      // Reject entries whose index list does not divide evenly; flooring the
      // row length would silently drop the trailing numbers.
      if (x.length % n !== 0) {
        throw new Error("invalid build entry");
      }
      const runLength = x.length / n;
      for (let i = 0; i < n; i++) {
        this.matches.push(x.slice(i * runLength, (i + 1) * runLength));
      }
    }
  }

  toString(): string {
    return `pat=${this.pat} text=${this.text} len=${this.matches.length} matches=${JSON.stringify(
      this.matches,
    )}`;
  }
}

export const FIND_TESTS = [
  new Test("", "", 1, 0, 0),
  new Test("^abcdefg", "abcdefg", 1, 0, 7),
  new Test("a+", "baaab", 1, 1, 4),
  new Test("abcd..", "abcdef", 1, 0, 6),
  new Test("a", "a", 1, 0, 1),
  new Test("x", "y", 0),
  new Test("b", "abc", 1, 1, 2),
  new Test(".", "a", 1, 0, 1),
  new Test(".*", "abcdef", 1, 0, 6),
  new Test("^", "abcde", 1, 0, 0),
  new Test("$", "abcde", 1, 5, 5),
  new Test("^abcd$", "abcd", 1, 0, 4),
  new Test("^bcd'", "abcdef", 0),
  new Test("^abcd$", "abcde", 0),
  new Test("h.*od?", "hello\ngoodbye\n", 1, 0, 5),
  new Test("a{1,5}", "baaac", 1, 1, 4),
  new Test("ac{1,25}", "bbaaaccccdd", 1, 4, 9),
  new Test("a+", "baaab", 1, 1, 4),
  new Test("a*", "baaab", 3, 0, 0, 1, 4, 5, 5),
  new Test("[a-z]+", "abcd", 1, 0, 4),
  new Test("[^a-z]+", "ab1234cd", 1, 2, 6),
  new Test("[a\\-\\]z]+", "az]-bcz", 2, 0, 4, 6, 7),
  new Test("[^\\n]+", "abcd\n", 1, 0, 4),
  new Test("[日本語]+", "日本語日本語", 1, 0, 18),
  new Test("日本語+", "日本語", 1, 0, 9),
  new Test("日本語+", "日本語語語語", 1, 0, 18),
  new Test("()", "", 1, 0, 0, 0, 0),
  new Test("(a)", "a", 1, 0, 1, 0, 1),
  new Test("(.)(.)", "日a", 1, 0, 4, 0, 3, 3, 4),
  new Test("(.*)", "", 1, 0, 0, 0, 0),
  new Test("(.*)", "abcd", 1, 0, 4, 0, 4),
  new Test("(..)(..)", "abcd", 1, 0, 4, 0, 2, 2, 4),
  new Test("(([^xyz]*)(d))", "abcd", 1, 0, 4, 0, 4, 0, 3, 3, 4),
  new Test("((a|b|c)*(d))", "abcd", 1, 0, 4, 0, 4, 2, 3, 3, 4),
  new Test("(((a|b|c)*)(d))", "abcd", 1, 0, 4, 0, 4, 0, 3, 2, 3, 3, 4),
  new Test("\\a\\f\\n\\r\\t\\v", "\x07\f\n\r\t\v", 1, 0, 6),
  new Test("[\\a\\f\\n\\r\\t\\v]+", "\x07\f\n\r\t\v", 1, 0, 6),
  new Test("a*(|(b))c*", "aacc", 1, 0, 4, 2, 2, -1, -1),
  new Test("(.*).*", "ab", 1, 0, 2, 0, 2),
  new Test("[.]", ".", 1, 0, 1),
  new Test("/$", "/abc/", 1, 4, 5),
  new Test("/$", "/abc", 0),

  // multiple matches
  new Test(".", "abc", 3, 0, 1, 1, 2, 2, 3),
  new Test("(.)", "abc", 3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3),
  new Test(".(.)", "abcd", 2, 0, 2, 1, 2, 2, 4, 3, 4),
  new Test("ab*", "abbaab", 3, 0, 3, 3, 4, 4, 6),
  new Test("a(b*)", "abbaab", 3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6),

  // fixed bugs
  new Test("ab$", "cab", 1, 1, 3),
  new Test("axxb$", "axxcb", 0),
  new Test("data", "daXY data", 1, 5, 9),
  new Test("da(.)a$", "daXY data", 1, 5, 9, 7, 8),
  new Test("zx+", "zzx", 1, 1, 3),
  new Test("ab$", "abcab", 1, 3, 5),
  new Test("(aa)*$", "a", 1, 1, 1, -1, -1),
  new Test("(?:.|(?:.a))", "", 0),
  new Test("(?:A(?:A|a))", "Aa", 1, 0, 2),
  new Test("(?:A|(?:A|a))", "a", 1, 0, 1),
  new Test("(a){0}", "", 1, 0, 0, -1, -1),
  new Test("(?-s)(?:(?:^).)", "\n", 0),
  new Test("(?s)(?:(?:^).)", "\n", 1, 0, 1),
  new Test("(?:(?:^).)", "\n", 0),
  new Test("\\b", "x", 2, 0, 0, 1, 1),
  new Test("\\b", "xx", 2, 0, 0, 2, 2),
  new Test("\\b", "x y", 4, 0, 0, 1, 1, 2, 2, 3, 3),
  new Test("\\b", "xx yy", 4, 0, 0, 2, 2, 3, 3, 5, 5),
  new Test("\\B", "x", 0),
  new Test("\\B", "xx", 1, 1, 1),
  new Test("\\B", "x y", 0),
  new Test("\\B", "xx yy", 2, 1, 1, 4, 4),

  // RE2 tests
  new Test("[^\\S\\s]", "abcd", 0),
  new Test("[^\\S[:space:]]", "abcd", 0),
  new Test("[^\\D\\d]", "abcd", 0),
  new Test("[^\\D[:digit:]]", "abcd", 0),
  new Test("(?i)\\W", "x", 0),
  new Test("(?i)\\W", "k", 0),
  new Test("(?i)\\W", "s", 0),

  // can backslash-escape any punctuation
  new Test(
    "\\!\\\"\\#\\$\\%\\&\\'\\(\\)\\*\\+\\,\\-\\.\\/\\:\\;\\<\\=\\>\\?\\@\\[\\\\\\]\\^\\_\\{\\|\\}\\~",
    "!\"#$%&'()*+,-./:;<=>?@[\\]^_{|}~",
    1,
    0,
    31,
  ),
  new Test(
    "[\\!\\\"\\#\\$\\%\\&\\'\\(\\)\\*\\+\\,\\-\\.\\/\\:\\;\\<\\=\\>\\?\\@\\[\\\\\\]\\^\\_\\{\\|\\}\\~]+",
    "!\"#$%&'()*+,-./:;<=>?@[\\]^_{|}~",
    1,
    0,
    31,
  ),
  new Test("\\`", "`", 1, 0, 1),
  new Test("[\\`]+", "`", 1, 0, 1),

  // long set of matches
  new Test(
    ".",
    "qwertyuiopasdfghjklzxcvbnm1234567890",
    36,
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
    6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
    12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18,
    18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24,
    24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30,
    30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36,
  ),
  new Test("(|a)*", "aa", 3, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2),
];

// diff --git a/packages/re2/src/__fixtures__/nullsubexpr.dat b/packages/re2/src/__fixtures__/nullsubexpr.dat
// new file mode 100644
// index 0000000..68d9c99
// --- /dev/null
// +++ b/packages/re2/src/__fixtures__/nullsubexpr.dat
// @@ -0,0 +1,73 @@
// +NOTE null subexpression matches : 2002-06-06
// +
// +E (a*)* a (0,1)(0,1)
// +E SAME x (0,0)(0,0)
// +E SAME aaaaaa (0,6)(0,6)
// +E SAME aaaaaax (0,6)(0,6)
// +E (a*)+ a (0,1)(0,1)
// +E SAME x (0,0)(0,0)
// +E SAME aaaaaa (0,6)(0,6)
// +E SAME aaaaaax (0,6)(0,6)
// +E (a+)* a (0,1)(0,1)
// +E SAME x (0,0)
// +E SAME aaaaaa (0,6)(0,6)
// +E SAME aaaaaax (0,6)(0,6)
// +E (a+)+ a (0,1)(0,1)
// +E SAME x NOMATCH
// +E SAME aaaaaa (0,6)(0,6)
// +E SAME aaaaaax (0,6)(0,6)
// +
// +E ([a]*)* a (0,1)(0,1)
// +E SAME x (0,0)(0,0)
// +E SAME aaaaaa (0,6)(0,6)
// +E SAME aaaaaax (0,6)(0,6)
// +E ([a]*)+ a (0,1)(0,1)
// +E SAME x (0,0)(0,0)
// +E SAME aaaaaa (0,6)(0,6)
// +E SAME aaaaaax (0,6)(0,6)
// +E ([^b]*)* a (0,1)(0,1)
// +E SAME b (0,0)(0,0)
// +E SAME aaaaaa (0,6)(0,6)
// +E SAME aaaaaab (0,6)(0,6)
// +E ([ab]*)* a (0,1)(0,1)
// +E SAME aaaaaa (0,6)(0,6)
// +E SAME ababab (0,6)(0,6)
// +E SAME bababa (0,6)(0,6)
// +E SAME b (0,1)(0,1)
// +E SAME bbbbbb (0,6)(0,6)
// +E SAME aaaabcde (0,5)(0,5)
// +E ([^a]*)* b (0,1)(0,1)
// +E SAME bbbbbb (0,6)(0,6)
// +E SAME aaaaaa (0,0)(0,0)
// +E ([^ab]*)* ccccxx (0,6)(0,6)
// +E SAME ababab (0,0)(0,0)
// +
// +E ((z)+|a)* zabcde (0,2)(1,2)
// +
// +#{E a+? aaaaaa (0,1) no *? +? mimimal match ops
// +#E (a) aaa (0,1)(0,1)
// +#E (a*?) aaa (0,0)(0,0)
// +#E (a)*? aaa (0,0)
// +#E (a*?)*?
aaa (0,0) +#} + +B \(a*\)*\(x\) x (0,1)(0,0)(0,1) +B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) +B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) +B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) +B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) +B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) +B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) +B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) + +E (a*)*(x) x (0,1)(0,0)(0,1) +E (a*)*(x) ax (0,2)(0,1)(1,2) +E (a*)*(x) axa (0,2)(0,1)(1,2) + +E (a*)+(x) x (0,1)(0,0)(0,1) +E (a*)+(x) ax (0,2)(0,1)(1,2) +E (a*)+(x) axa (0,2)(0,1)(1,2) + +E (a*){2}(x) x (0,1)(0,0)(0,1) +E (a*){2}(x) ax (0,2)(1,1)(1,2) +E (a*){2}(x) axa (0,2)(1,1)(1,2) diff --git a/packages/re2/src/__fixtures__/re2-exhaustive.txt.gz b/packages/re2/src/__fixtures__/re2-exhaustive.txt.gz new file mode 100644 index 0000000..4482caf Binary files /dev/null and b/packages/re2/src/__fixtures__/re2-exhaustive.txt.gz differ diff --git a/packages/re2/src/__fixtures__/re2-search.txt b/packages/re2/src/__fixtures__/re2-search.txt new file mode 100644 index 0000000..8c4098a --- /dev/null +++ b/packages/re2/src/__fixtures__/re2-search.txt @@ -0,0 +1,3779 @@ +# RE2 basic search tests built by make log +# Wed May 12 12:13:22 EDT 2021 +Regexp.SearchTests +strings +"" +"a" +regexps +"a" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:a)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"zyzzyva" +regexps +"a" +-;-;-;- +-;6-7;-;6-7 +"^(?:a)$" +-;-;-;- +-;-;-;- +"^(?:a)" +-;-;-;- +-;-;-;- +"(?:a)$" +-;-;-;- +-;6-7;-;6-7 +strings +"" +"aa" +regexps +"a+" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:a+)$" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:a+)" +-;-;-;- +0-2;0-2;0-2;0-2 +"(?:a+)$" +-;-;-;- +0-2;0-2;0-2;0-2 +strings +"" +"ab" +regexps +"(a+|b)+" +-;-;-;- +0-2 1-2;0-2 1-2;0-2 1-2;0-2 1-2 +"^(?:(a+|b)+)$" +-;-;-;- +0-2 1-2;0-2 1-2;0-2 1-2;0-2 1-2 +"^(?:(a+|b)+)" +-;-;-;- +0-2 1-2;0-2 1-2;0-2 1-2;0-2 1-2 +"(?:(a+|b)+)$" +-;-;-;- +0-2 1-2;0-2 1-2;0-2 1-2;0-2 1-2 +strings 
+"" +"xabcdx" +regexps +"ab|cd" +-;-;-;- +-;1-3;-;1-3 +"^(?:ab|cd)$" +-;-;-;- +-;-;-;- +"^(?:ab|cd)" +-;-;-;- +-;-;-;- +"(?:ab|cd)$" +-;-;-;- +-;-;-;- +strings +"" +"hello\ngoodbye\n" +regexps +"h.*od?" +-;-;-;- +-;0-5;-;0-5 +"^(?:h.*od?)$" +-;-;-;- +-;-;-;- +"^(?:h.*od?)" +-;-;-;- +-;0-5;-;0-5 +"(?:h.*od?)$" +-;-;-;- +-;-;-;- +strings +"" +"hello\ngoodbye\n" +regexps +"h.*o" +-;-;-;- +-;0-5;-;0-5 +"^(?:h.*o)$" +-;-;-;- +-;-;-;- +"^(?:h.*o)" +-;-;-;- +-;0-5;-;0-5 +"(?:h.*o)$" +-;-;-;- +-;-;-;- +strings +"" +"goodbye\nhello\n" +regexps +"h.*o" +-;-;-;- +-;8-13;-;8-13 +"^(?:h.*o)$" +-;-;-;- +-;-;-;- +"^(?:h.*o)" +-;-;-;- +-;-;-;- +"(?:h.*o)$" +-;-;-;- +-;-;-;- +strings +"" +"hello world" +regexps +"h.*o" +-;-;-;- +-;0-8;-;0-8 +"^(?:h.*o)$" +-;-;-;- +-;-;-;- +"^(?:h.*o)" +-;-;-;- +-;0-8;-;0-8 +"(?:h.*o)$" +-;-;-;- +-;-;-;- +strings +"" +"othello, world" +regexps +"h.*o" +-;-;-;- +-;2-11;-;2-11 +"^(?:h.*o)$" +-;-;-;- +-;-;-;- +"^(?:h.*o)" +-;-;-;- +-;-;-;- +"(?:h.*o)$" +-;-;-;- +-;-;-;- +strings +"" +"aaaaaaa" +regexps +"[^\\s\\S]" +-;-;-;- +-;-;-;- +"^(?:[^\\s\\S])$" +-;-;-;- +-;-;-;- +"^(?:[^\\s\\S])" +-;-;-;- +-;-;-;- +"(?:[^\\s\\S])$" +-;-;-;- +-;-;-;- +strings +"" +"aaaaaaa" +regexps +"a" +-;-;-;- +-;0-1;-;0-1 +"^(?:a)$" +-;-;-;- +-;-;-;- +"^(?:a)" +-;-;-;- +-;0-1;-;0-1 +"(?:a)$" +-;-;-;- +-;6-7;-;6-7 +strings +"" +"aaaaaaa" +regexps +"a*" +0-0;0-0;0-0;0-0 +0-7;0-7;0-7;0-7 +"^(?:a*)$" +0-0;0-0;0-0;0-0 +0-7;0-7;0-7;0-7 +"^(?:a*)" +0-0;0-0;0-0;0-0 +0-7;0-7;0-7;0-7 +"(?:a*)$" +0-0;0-0;0-0;0-0 +0-7;0-7;0-7;0-7 +strings +"" +"" +regexps +"a*" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:a*)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:a*)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:a*)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"xabcdx" +regexps +"ab|cd" +-;-;-;- +-;1-3;-;1-3 +"^(?:ab|cd)$" +-;-;-;- +-;-;-;- +"^(?:ab|cd)" +-;-;-;- +-;-;-;- +"(?:ab|cd)$" +-;-;-;- +-;-;-;- +strings +"" +"cab" +regexps +"a" +-;-;-;- +-;1-2;-;1-2 +"^(?:a)$" +-;-;-;- +-;-;-;- +"^(?:a)" +-;-;-;- 
+-;-;-;- +"(?:a)$" +-;-;-;- +-;-;-;- +strings +"" +"cab" +regexps +"a*b" +-;-;-;- +-;1-3;-;1-3 +"^(?:a*b)$" +-;-;-;- +-;-;-;- +"^(?:a*b)" +-;-;-;- +-;-;-;- +"(?:a*b)$" +-;-;-;- +-;1-3;-;1-3 +strings +"" +"x" +regexps +"((((((((((((((((((((x))))))))))))))))))))" +-;-;-;- +0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 +"^(?:((((((((((((((((((((x)))))))))))))))))))))$" +-;-;-;- +0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 +"^(?:((((((((((((((((((((x)))))))))))))))))))))" +-;-;-;- +0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 +"(?:((((((((((((((((((((x)))))))))))))))))))))$" +-;-;-;- +0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1;0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 0-1 +strings +"" +"xxxabcdxxx" +regexps +"[abcd]" +-;-;-;- +-;3-4;-;3-4 +"^(?:[abcd])$" +-;-;-;- +-;-;-;- +"^(?:[abcd])" +-;-;-;- +-;-;-;- +"(?:[abcd])$" +-;-;-;- +-;-;-;- +strings +"" +"xxxabcdxxx" 
+regexps +"[^x]" +-;-;-;- +-;3-4;-;3-4 +"^(?:[^x])$" +-;-;-;- +-;-;-;- +"^(?:[^x])" +-;-;-;- +-;-;-;- +"(?:[^x])$" +-;-;-;- +-;-;-;- +strings +"" +"xxxabcdxxx" +regexps +"[abcd]+" +-;-;-;- +-;3-7;-;3-7 +"^(?:[abcd]+)$" +-;-;-;- +-;-;-;- +"^(?:[abcd]+)" +-;-;-;- +-;-;-;- +"(?:[abcd]+)$" +-;-;-;- +-;-;-;- +strings +"" +"xxxabcdxxx" +regexps +"[^x]+" +-;-;-;- +-;3-7;-;3-7 +"^(?:[^x]+)$" +-;-;-;- +-;-;-;- +"^(?:[^x]+)" +-;-;-;- +-;-;-;- +"(?:[^x]+)$" +-;-;-;- +-;-;-;- +strings +"" +"fo" +regexps +"(fo|foo)" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:(fo|foo))$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:(fo|foo))" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"(?:(fo|foo))$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +strings +"" +"foo" +regexps +"(foo|fo)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:(foo|fo))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:(foo|fo))" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:(foo|fo))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"aA" +regexps +"aa" +-;-;-;- +-;-;-;- +"^(?:aa)$" +-;-;-;- +-;-;-;- +"^(?:aa)" +-;-;-;- +-;-;-;- +"(?:aa)$" +-;-;-;- +-;-;-;- +strings +"" +"Aa" +regexps +"a" +-;-;-;- +-;1-2;-;1-2 +"^(?:a)$" +-;-;-;- +-;-;-;- +"^(?:a)" +-;-;-;- +-;-;-;- +"(?:a)$" +-;-;-;- +-;1-2;-;1-2 +strings +"" +"A" +regexps +"a" +-;-;-;- +-;-;-;- +"^(?:a)$" +-;-;-;- +-;-;-;- +"^(?:a)" +-;-;-;- +-;-;-;- +"(?:a)$" +-;-;-;- +-;-;-;- +strings +"" +"abc" +regexps +"ABC" +-;-;-;- +-;-;-;- +"^(?:ABC)$" +-;-;-;- +-;-;-;- +"^(?:ABC)" +-;-;-;- +-;-;-;- +"(?:ABC)$" +-;-;-;- +-;-;-;- +strings +"" +"XABCY" +regexps +"abc" +-;-;-;- +-;-;-;- +"^(?:abc)$" +-;-;-;- +-;-;-;- +"^(?:abc)" +-;-;-;- +-;-;-;- +"(?:abc)$" +-;-;-;- +-;-;-;- +strings +"" +"xabcy" +regexps +"ABC" +-;-;-;- +-;-;-;- +"^(?:ABC)$" +-;-;-;- +-;-;-;- +"^(?:ABC)" +-;-;-;- +-;-;-;- +"(?:ABC)$" +-;-;-;- +-;-;-;- +strings +"" +"foo" +regexps +"foo|bar|[A-Z]" +-;-;-;- +0-3;0-3;0-3;0-3 +"^(?:foo|bar|[A-Z])$" +-;-;-;- +0-3;0-3;0-3;0-3 +"^(?:foo|bar|[A-Z])" +-;-;-;- 
+0-3;0-3;0-3;0-3 +"(?:foo|bar|[A-Z])$" +-;-;-;- +0-3;0-3;0-3;0-3 +strings +"" +"foo" +regexps +"^(foo|bar|[A-Z])" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(foo|bar|[A-Z]))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(foo|bar|[A-Z]))" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:^(foo|bar|[A-Z]))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"foo\n" +regexps +"(foo|bar|[A-Z])$" +-;-;-;- +-;-;-;- +"^(?:(foo|bar|[A-Z])$)$" +-;-;-;- +-;-;-;- +"^(?:(foo|bar|[A-Z])$)" +-;-;-;- +-;-;-;- +"(?:(foo|bar|[A-Z])$)$" +-;-;-;- +-;-;-;- +strings +"" +"foo" +regexps +"(foo|bar|[A-Z])$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:(foo|bar|[A-Z])$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:(foo|bar|[A-Z])$)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:(foo|bar|[A-Z])$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"foo\n" +regexps +"^(foo|bar|[A-Z])$" +-;-;-;- +-;-;-;- +"^(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +-;-;-;- +"^(?:^(foo|bar|[A-Z])$)" +-;-;-;- +-;-;-;- +"(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +-;-;-;- +strings +"" +"foo" +regexps +"^(foo|bar|[A-Z])$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(foo|bar|[A-Z])$)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"bar" +regexps +"^(foo|bar|[A-Z])$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(foo|bar|[A-Z])$)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"X" +regexps +"^(foo|bar|[A-Z])$" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +"^(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +"^(?:^(foo|bar|[A-Z])$)" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +"(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +strings +"" +"XY" +regexps +"^(foo|bar|[A-Z])$" +-;-;-;- 
+-;-;-;- +"^(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +-;-;-;- +"^(?:^(foo|bar|[A-Z])$)" +-;-;-;- +-;-;-;- +"(?:^(foo|bar|[A-Z])$)$" +-;-;-;- +-;-;-;- +strings +"" +"fo" +regexps +"^(fo|foo)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:^(fo|foo)$)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:^(fo|foo)$)" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"(?:^(fo|foo)$)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +strings +"" +"foo" +regexps +"^(fo|foo)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(fo|foo)$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^(fo|foo)$)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:^(fo|foo)$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"fo" +regexps +"^^(fo|foo)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:^^(fo|foo)$)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:^^(fo|foo)$)" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"(?:^^(fo|foo)$)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +strings +"" +"foo" +regexps +"^^(fo|foo)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^^(fo|foo)$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^^(fo|foo)$)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:^^(fo|foo)$)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"" +regexps +"^$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"" +regexps +"^^$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^^$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"" +regexps +"^$$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^$$)$" +0-0;0-0;0-0;0-0 
+0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x" +regexps +"^$$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"" +regexps +"^^$$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^^$$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^^$$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^^$$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^^$$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^^$$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^^$$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^^$$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"" +regexps +"^^^^^^^^$$$$$$$$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^^^^^^^^$$$$$$$$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^^^^^^^^$$$$$$$$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^^^^^^^^$$$$$$$$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^" +0-0;0-0;0-0;0-0 +-;0-0;-;0-0 +"^(?:^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^)" +0-0;0-0;0-0;0-0 +-;0-0;-;0-0 +"(?:^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x" +regexps +"$" +0-0;0-0;0-0;0-0 +-;1-1;-;1-1 +"^(?:$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:$)$" +0-0;0-0;0-0;0-0 +-;1-1;-;1-1 +strings +"" +"nofoo foo that" +regexps +"\\bfoo\\b" +-;-;-;- +-;6-9;-;6-9 +"^(?:\\bfoo\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bfoo\\b)" +-;-;-;- +-;-;-;- +"(?:\\bfoo\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"faoa x" +regexps +"a\\b" +-;-;-;- +-;3-4;-;3-4 +"^(?:a\\b)$" +-;-;-;- +-;-;-;- +"^(?:a\\b)" +-;-;-;- +-;-;-;- +"(?:a\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"bar x" +regexps +"\\bbar" +-;-;-;- +-;0-3;-;0-3 +"^(?:\\bbar)$" +-;-;-;- +-;-;-;- +"^(?:\\bbar)" +-;-;-;- +-;0-3;-;0-3 +"(?:\\bbar)$" +-;-;-;- +-;-;-;- +strings +"" +"foo\nbar x" +regexps +"\\bbar" +-;-;-;- +-;4-7;-;4-7 +"^(?:\\bbar)$" +-;-;-;- +-;-;-;- +"^(?:\\bbar)" +-;-;-;- 
+-;-;-;- +"(?:\\bbar)$" +-;-;-;- +-;-;-;- +strings +"" +"foobar" +regexps +"bar\\b" +-;-;-;- +-;3-6;-;3-6 +"^(?:bar\\b)$" +-;-;-;- +-;-;-;- +"^(?:bar\\b)" +-;-;-;- +-;-;-;- +"(?:bar\\b)$" +-;-;-;- +-;3-6;-;3-6 +strings +"" +"foobar\nxxx" +regexps +"bar\\b" +-;-;-;- +-;3-6;-;3-6 +"^(?:bar\\b)$" +-;-;-;- +-;-;-;- +"^(?:bar\\b)" +-;-;-;- +-;-;-;- +"(?:bar\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"foo" +regexps +"(foo|bar|[A-Z])\\b" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:(foo|bar|[A-Z])\\b)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"foo\n" +regexps +"(foo|bar|[A-Z])\\b" +-;-;-;- +-;0-3 0-3;-;0-3 0-3 +"^(?:(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +"^(?:(foo|bar|[A-Z])\\b)" +-;-;-;- +-;0-3 0-3;-;0-3 0-3 +"(?:(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"\\b" +-;-;-;- +-;-;-;- +"^(?:\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\b)" +-;-;-;- +-;-;-;- +"(?:\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"\\b" +-;-;-;- +-;0-0;-;0-0 +"^(?:\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\b)" +-;-;-;- +-;0-0;-;0-0 +"(?:\\b)$" +-;-;-;- +-;1-1;-;1-1 +strings +"" +"foo" +regexps +"\\b(foo|bar|[A-Z])" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:\\b(foo|bar|[A-Z]))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:\\b(foo|bar|[A-Z]))" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:\\b(foo|bar|[A-Z]))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"X" +regexps +"\\b(foo|bar|[A-Z])\\b" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +"^(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +"^(?:\\b(foo|bar|[A-Z])\\b)" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +"(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-1 0-1;0-1 0-1;0-1 0-1;0-1 0-1 +strings +"" +"XY" +regexps +"\\b(foo|bar|[A-Z])\\b" +-;-;-;- +-;-;-;- +"^(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\b(foo|bar|[A-Z])\\b)" +-;-;-;- +-;-;-;- 
+"(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"bar" +regexps +"\\b(foo|bar|[A-Z])\\b" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:\\b(foo|bar|[A-Z])\\b)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"foo" +regexps +"\\b(foo|bar|[A-Z])\\b" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:\\b(foo|bar|[A-Z])\\b)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"foo\n" +regexps +"\\b(foo|bar|[A-Z])\\b" +-;-;-;- +-;0-3 0-3;-;0-3 0-3 +"^(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\b(foo|bar|[A-Z])\\b)" +-;-;-;- +-;0-3 0-3;-;0-3 0-3 +"(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"ffoo bbar N x" +regexps +"\\b(foo|bar|[A-Z])\\b" +-;-;-;- +-;10-11 10-11;-;10-11 10-11 +"^(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\b(foo|bar|[A-Z])\\b)" +-;-;-;- +-;-;-;- +"(?:\\b(foo|bar|[A-Z])\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"fo" +regexps +"\\b(fo|foo)\\b" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:\\b(fo|foo)\\b)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:\\b(fo|foo)\\b)" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"(?:\\b(fo|foo)\\b)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +strings +"" +"foo" +regexps +"\\b(fo|foo)\\b" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:\\b(fo|foo)\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:\\b(fo|foo)\\b)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:\\b(fo|foo)\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"" +regexps +"\\b\\b" +-;-;-;- +-;-;-;- +"^(?:\\b\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\b\\b)" +-;-;-;- +-;-;-;- +"(?:\\b\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"\\b\\b" +-;-;-;- +-;0-0;-;0-0 +"^(?:\\b\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\b\\b)" 
+-;-;-;- +-;0-0;-;0-0 +"(?:\\b\\b)$" +-;-;-;- +-;1-1;-;1-1 +strings +"" +"" +regexps +"\\b$" +-;-;-;- +-;-;-;- +"^(?:\\b$)$" +-;-;-;- +-;-;-;- +"^(?:\\b$)" +-;-;-;- +-;-;-;- +"(?:\\b$)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"\\b$" +-;-;-;- +-;1-1;-;1-1 +"^(?:\\b$)$" +-;-;-;- +-;-;-;- +"^(?:\\b$)" +-;-;-;- +-;-;-;- +"(?:\\b$)$" +-;-;-;- +-;1-1;-;1-1 +strings +"" +"y x" +regexps +"\\b$" +-;-;-;- +-;3-3;-;3-3 +"^(?:\\b$)$" +-;-;-;- +-;-;-;- +"^(?:\\b$)" +-;-;-;- +-;-;-;- +"(?:\\b$)$" +-;-;-;- +-;3-3;-;3-3 +strings +"" +"x" +regexps +"\\b.$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\b.$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\b.$)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\b.$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"fo" +regexps +"^\\b(fo|foo)\\b" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:^\\b(fo|foo)\\b)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"^(?:^\\b(fo|foo)\\b)" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +"(?:^\\b(fo|foo)\\b)$" +-;-;-;- +0-2 0-2;0-2 0-2;0-2 0-2;0-2 0-2 +strings +"" +"foo" +regexps +"^\\b(fo|foo)\\b" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^\\b(fo|foo)\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:^\\b(fo|foo)\\b)" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"(?:^\\b(fo|foo)\\b)$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"" +regexps +"^\\b" +-;-;-;- +-;-;-;- +"^(?:^\\b)$" +-;-;-;- +-;-;-;- +"^(?:^\\b)" +-;-;-;- +-;-;-;- +"(?:^\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"^\\b" +-;-;-;- +-;0-0;-;0-0 +"^(?:^\\b)$" +-;-;-;- +-;-;-;- +"^(?:^\\b)" +-;-;-;- +-;0-0;-;0-0 +"(?:^\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"^\\b\\b" +-;-;-;- +-;-;-;- +"^(?:^\\b\\b)$" +-;-;-;- +-;-;-;- +"^(?:^\\b\\b)" +-;-;-;- +-;-;-;- +"(?:^\\b\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"^\\b\\b" +-;-;-;- +-;0-0;-;0-0 +"^(?:^\\b\\b)$" +-;-;-;- +-;-;-;- +"^(?:^\\b\\b)" +-;-;-;- +-;0-0;-;0-0 +"(?:^\\b\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"^\\b$" +-;-;-;- +-;-;-;- +"^(?:^\\b$)$" +-;-;-;- +-;-;-;- 
+"^(?:^\\b$)" +-;-;-;- +-;-;-;- +"(?:^\\b$)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"^\\b$" +-;-;-;- +-;-;-;- +"^(?:^\\b$)$" +-;-;-;- +-;-;-;- +"^(?:^\\b$)" +-;-;-;- +-;-;-;- +"(?:^\\b$)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"^\\b.$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:^\\b.$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:^\\b.$)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:^\\b.$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"x" +regexps +"^\\b.\\b$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:^\\b.\\b$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:^\\b.\\b$)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:^\\b.\\b$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"" +regexps +"^^^^^^^^\\b$$$$$$$" +-;-;-;- +-;-;-;- +"^(?:^^^^^^^^\\b$$$$$$$)$" +-;-;-;- +-;-;-;- +"^(?:^^^^^^^^\\b$$$$$$$)" +-;-;-;- +-;-;-;- +"(?:^^^^^^^^\\b$$$$$$$)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"^^^^^^^^\\b.$$$$$$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:^^^^^^^^\\b.$$$$$$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:^^^^^^^^\\b.$$$$$$)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:^^^^^^^^\\b.$$$$$$)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"x" +regexps +"^^^^^^^^\\b$$$$$$$" +-;-;-;- +-;-;-;- +"^(?:^^^^^^^^\\b$$$$$$$)$" +-;-;-;- +-;-;-;- +"^(?:^^^^^^^^\\b$$$$$$$)" +-;-;-;- +-;-;-;- +"(?:^^^^^^^^\\b$$$$$$$)$" +-;-;-;- +-;-;-;- +strings +"" +"n foo xfoox that" +regexps +"\\Bfoo\\B" +-;-;-;- +-;7-10;-;7-10 +"^(?:\\Bfoo\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\Bfoo\\B)" +-;-;-;- +-;-;-;- +"(?:\\Bfoo\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"faoa x" +regexps +"a\\B" +-;-;-;- +-;1-2;-;1-2 +"^(?:a\\B)$" +-;-;-;- +-;-;-;- +"^(?:a\\B)" +-;-;-;- +-;-;-;- +"(?:a\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"bar x" +regexps +"\\Bbar" +-;-;-;- +-;-;-;- +"^(?:\\Bbar)$" +-;-;-;- +-;-;-;- +"^(?:\\Bbar)" +-;-;-;- +-;-;-;- +"(?:\\Bbar)$" +-;-;-;- +-;-;-;- +strings +"" +"foo\nbar x" +regexps +"\\Bbar" +-;-;-;- +-;-;-;- +"^(?:\\Bbar)$" +-;-;-;- +-;-;-;- +"^(?:\\Bbar)" +-;-;-;- +-;-;-;- +"(?:\\Bbar)$" +-;-;-;- +-;-;-;- +strings +"" +"foobar" +regexps +"bar\\B" +-;-;-;- +-;-;-;- +"^(?:bar\\B)$" +-;-;-;- 
+-;-;-;- +"^(?:bar\\B)" +-;-;-;- +-;-;-;- +"(?:bar\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"foobar\nxxx" +regexps +"bar\\B" +-;-;-;- +-;-;-;- +"^(?:bar\\B)$" +-;-;-;- +-;-;-;- +"^(?:bar\\B)" +-;-;-;- +-;-;-;- +"(?:bar\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"foox" +regexps +"(foo|bar|[A-Z])\\B" +-;-;-;- +-;0-3 0-3;-;0-3 0-3 +"^(?:(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:(foo|bar|[A-Z])\\B)" +-;-;-;- +-;0-3 0-3;-;0-3 0-3 +"(?:(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"foo\n" +regexps +"(foo|bar|[A-Z])\\B" +-;-;-;- +-;-;-;- +"^(?:(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"\\B" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:\\B)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"\\B" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:\\B)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"foo" +regexps +"\\B(foo|bar|[A-Z])" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z]))$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z]))" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z]))$" +-;-;-;- +-;-;-;- +strings +"" +"xXy" +regexps +"\\B(foo|bar|[A-Z])\\B" +-;-;-;- +-;1-2 1-2;-;1-2 1-2 +"^(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"XY" +regexps +"\\B(foo|bar|[A-Z])\\B" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"XYZ" +regexps +"\\B(foo|bar|[A-Z])\\B" +-;-;-;- +-;1-2 1-2;-;1-2 1-2 +"^(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"abara" +regexps +"\\B(foo|bar|[A-Z])\\B" 
+-;-;-;- +-;1-4 1-4;-;1-4 1-4 +"^(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"xfoo_" +regexps +"\\B(foo|bar|[A-Z])\\B" +-;-;-;- +-;1-4 1-4;-;1-4 1-4 +"^(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"xfoo\n" +regexps +"\\B(foo|bar|[A-Z])\\B" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"foo bar vNx" +regexps +"\\B(foo|bar|[A-Z])\\B" +-;-;-;- +-;9-10 9-10;-;9-10 9-10 +"^(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|bar|[A-Z])\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|bar|[A-Z])\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"xfoo" +regexps +"\\B(fo|foo)\\B" +-;-;-;- +-;1-3 1-3;-;1-3 1-3 +"^(?:\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(fo|foo)\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"xfooo" +regexps +"\\B(foo|fo)\\B" +-;-;-;- +-;1-4 1-4;-;1-4 1-4 +"^(?:\\B(foo|fo)\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(foo|fo)\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(foo|fo)\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"\\B\\B" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:\\B\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:\\B\\B)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:\\B\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"\\B\\B" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:\\B\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:\\B\\B)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:\\B\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"" +regexps +"\\B$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:\\B$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:\\B$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:\\B$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"\\B$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:\\B$)$" +0-0;0-0;0-0;0-0 +-;-;-;- 
+"^(?:\\B$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:\\B$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"y x" +regexps +"\\B$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:\\B$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:\\B$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:\\B$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x" +regexps +"\\B.$" +-;-;-;- +-;-;-;- +"^(?:\\B.$)$" +-;-;-;- +-;-;-;- +"^(?:\\B.$)" +-;-;-;- +-;-;-;- +"(?:\\B.$)$" +-;-;-;- +-;-;-;- +strings +"" +"fo" +regexps +"^\\B(fo|foo)\\B" +-;-;-;- +-;-;-;- +"^(?:^\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +"^(?:^\\B(fo|foo)\\B)" +-;-;-;- +-;-;-;- +"(?:^\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"foo" +regexps +"^\\B(fo|foo)\\B" +-;-;-;- +-;-;-;- +"^(?:^\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +"^(?:^\\B(fo|foo)\\B)" +-;-;-;- +-;-;-;- +"(?:^\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"^\\B" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^\\B)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^\\B" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^\\B)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"" +regexps +"^\\B\\B" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^\\B\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^\\B\\B)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^\\B\\B)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^\\B\\B" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^\\B\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^\\B\\B)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^\\B\\B)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"" +regexps +"^\\B$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^\\B$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^\\B$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^\\B$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^\\B$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^\\B$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^\\B$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^\\B$)$" +0-0;0-0;0-0;0-0 +-;-;-;- 
+strings +"" +"x" +regexps +"^\\B.$" +-;-;-;- +-;-;-;- +"^(?:^\\B.$)$" +-;-;-;- +-;-;-;- +"^(?:^\\B.$)" +-;-;-;- +-;-;-;- +"(?:^\\B.$)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"^\\B.\\B$" +-;-;-;- +-;-;-;- +"^(?:^\\B.\\B$)$" +-;-;-;- +-;-;-;- +"^(?:^\\B.\\B$)" +-;-;-;- +-;-;-;- +"(?:^\\B.\\B$)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"^^^^^^^^\\B$$$$$$$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^^^^^^^^\\B$$$$$$$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^^^^^^^^\\B$$$$$$$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^^^^^^^^\\B$$$$$$$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^^^^^^^^\\B.$$$$$$" +-;-;-;- +-;-;-;- +"^(?:^^^^^^^^\\B.$$$$$$)$" +-;-;-;- +-;-;-;- +"^(?:^^^^^^^^\\B.$$$$$$)" +-;-;-;- +-;-;-;- +"(?:^^^^^^^^\\B.$$$$$$)$" +-;-;-;- +-;-;-;- +strings +"" +"x" +regexps +"^^^^^^^^\\B$$$$$$$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^^^^^^^^\\B$$$$$$$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^^^^^^^^\\B$$$$$$$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^^^^^^^^\\B$$$$$$$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x" +regexps +"\\bx\\b" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\bx\\b)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\bx\\b)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\bx\\b)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"x>" +regexps +"\\bx\\b" +-;-;-;- +-;0-1;-;0-1 +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;0-1;-;0-1 +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"\\bx\\b" +-;-;-;- +-;1-2;-;1-2 +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"ax" +regexps +"\\bx\\b" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"xb" +regexps +"\\bx\\b" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"axb" +regexps +"\\bx\\b" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- 
+"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"«x" +regexps +"\\bx\\b" +-;-;-;- +-;2-3;-;2-3 +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- +-;2-3;-;2-3 +strings +"" +"x»" +regexps +"\\bx\\b" +-;-;-;- +-;0-1;-;0-1 +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;0-1;-;0-1 +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"«x»" +regexps +"\\bx\\b" +-;-;-;- +-;2-3;-;2-3 +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"axb" +regexps +"\\bx\\b" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"áxβ" +regexps +"\\bx\\b" +-;-;-;- +-;2-3;-;2-3 +"^(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +"^(?:\\bx\\b)" +-;-;-;- +-;-;-;- +"(?:\\bx\\b)$" +-;-;-;- +-;-;-;- +strings +"" +"axb" +regexps +"\\Bx\\B" +-;-;-;- +-;1-2;-;1-2 +"^(?:\\Bx\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\Bx\\B)" +-;-;-;- +-;-;-;- +"(?:\\Bx\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"áxβ" +regexps +"\\Bx\\B" +-;-;-;- +-;-;-;- +"^(?:\\Bx\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\Bx\\B)" +-;-;-;- +-;-;-;- +"(?:\\Bx\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"" +regexps +"^$^$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$^$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^$^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"" +regexps +"^$^" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$^)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:^$^)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:^$^)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"" +regexps +"$^$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:$^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"^(?:$^$)" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +"(?:$^$)$" +0-0;0-0;0-0;0-0 +0-0;0-0;0-0;0-0 +strings +"" +"x" +regexps +"^$^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings 
+"" +"x" +regexps +"^$^" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x" +regexps +"$^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:$^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x\ny" +regexps +"^$^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x\ny" +regexps +"^$^" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x\ny" +regexps +"$^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:$^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x\n\ny" +regexps +"^$^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x\n\ny" +regexps +"^$^" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^$^)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:^$^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"x\n\ny" +regexps +"$^$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:$^$)" +0-0;0-0;0-0;0-0 +-;-;-;- +"(?:$^$)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"foo$bar" +regexps +"^(foo\\$)$" +-;-;-;- +-;-;-;- +"^(?:^(foo\\$)$)$" +-;-;-;- +-;-;-;- +"^(?:^(foo\\$)$)" +-;-;-;- +-;-;-;- +"(?:^(foo\\$)$)$" +-;-;-;- +-;-;-;- +strings +"" +"foo$bar" +regexps +"(foo\\$)" +-;-;-;- +-;0-4 0-4;-;0-4 0-4 +"^(?:(foo\\$))$" +-;-;-;- +-;-;-;- +"^(?:(foo\\$))" +-;-;-;- +-;0-4 0-4;-;0-4 0-4 +"(?:(foo\\$))$" +-;-;-;- +-;-;-;- +strings +"" +"abc" +regexps +"^...$" +-;-;-;- +0-3;0-3;0-3;0-3 +"^(?:^...$)$" +-;-;-;- +0-3;0-3;0-3;0-3 +"^(?:^...$)" +-;-;-;- +0-3;0-3;0-3;0-3 +"(?:^...$)$" +-;-;-;- +0-3;0-3;0-3;0-3 +strings +"" +"本" 
+regexps +"^本$" +-;-;-;- +0-3;0-3;0-3;0-3 +"^(?:^本$)$" +-;-;-;- +0-3;0-3;0-3;0-3 +"^(?:^本$)" +-;-;-;- +0-3;0-3;0-3;0-3 +"(?:^本$)$" +-;-;-;- +0-3;0-3;0-3;0-3 +strings +"" +"日本語" +regexps +"^...$" +-;-;-;- +0-9;0-9;0-9;0-9 +"^(?:^...$)$" +-;-;-;- +0-9;0-9;0-9;0-9 +"^(?:^...$)" +-;-;-;- +0-9;0-9;0-9;0-9 +"(?:^...$)$" +-;-;-;- +0-9;0-9;0-9;0-9 +strings +"" +".本." +regexps +"^...$" +-;-;-;- +0-5;0-5;0-5;0-5 +"^(?:^...$)$" +-;-;-;- +0-5;0-5;0-5;0-5 +"^(?:^...$)" +-;-;-;- +0-5;0-5;0-5;0-5 +"(?:^...$)$" +-;-;-;- +0-5;0-5;0-5;0-5 +strings +"" +"本" +regexps +"^\\C\\C\\C$" +-;-;-;- +0-3;0-3;0-3;0-3 +"^(?:^\\C\\C\\C$)$" +-;-;-;- +0-3;0-3;0-3;0-3 +"^(?:^\\C\\C\\C$)" +-;-;-;- +0-3;0-3;0-3;0-3 +"(?:^\\C\\C\\C$)$" +-;-;-;- +0-3;0-3;0-3;0-3 +strings +"" +"本" +regexps +"^\\C$" +-;-;-;- +-;-;-;- +"^(?:^\\C$)$" +-;-;-;- +-;-;-;- +"^(?:^\\C$)" +-;-;-;- +-;-;-;- +"(?:^\\C$)$" +-;-;-;- +-;-;-;- +strings +"" +"日本語" +regexps +"^\\C\\C\\C$" +-;-;-;- +-;-;-;- +"^(?:^\\C\\C\\C$)$" +-;-;-;- +-;-;-;- +"^(?:^\\C\\C\\C$)" +-;-;-;- +-;-;-;- +"(?:^\\C\\C\\C$)$" +-;-;-;- +-;-;-;- +strings +"" +"日本語" +regexps +"^...$" +-;-;-;- +0-9;0-9;0-9;0-9 +"^(?:^...$)$" +-;-;-;- +0-9;0-9;0-9;0-9 +"^(?:^...$)" +-;-;-;- +0-9;0-9;0-9;0-9 +"(?:^...$)$" +-;-;-;- +0-9;0-9;0-9;0-9 +strings +"" +"日本語" +regexps +"^.........$" +-;-;-;- +-;-;-;- +"^(?:^.........$)$" +-;-;-;- +-;-;-;- +"^(?:^.........$)" +-;-;-;- +-;-;-;- +"(?:^.........$)$" +-;-;-;- +-;-;-;- +strings +"" +".本." +regexps +"^...$" +-;-;-;- +0-5;0-5;0-5;0-5 +"^(?:^...$)$" +-;-;-;- +0-5;0-5;0-5;0-5 +"^(?:^...$)" +-;-;-;- +0-5;0-5;0-5;0-5 +"(?:^...$)$" +-;-;-;- +0-5;0-5;0-5;0-5 +strings +"" +".本." 
+regexps +"^.....$" +-;-;-;- +-;-;-;- +"^(?:^.....$)$" +-;-;-;- +-;-;-;- +"^(?:^.....$)" +-;-;-;- +-;-;-;- +"(?:^.....$)$" +-;-;-;- +-;-;-;- +strings +"" +"xfooo" +regexps +"\\B(fo|foo)\\B" +-;-;-;- +-;1-3 1-3;-;1-4 1-4 +"^(?:\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +"^(?:\\B(fo|foo)\\B)" +-;-;-;- +-;-;-;- +"(?:\\B(fo|foo)\\B)$" +-;-;-;- +-;-;-;- +strings +"" +"foo" +regexps +"(fo|foo)" +-;-;-;- +0-3 0-3;0-2 0-2;0-3 0-3;0-3 0-3 +"^(?:(fo|foo))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +"^(?:(fo|foo))" +-;-;-;- +0-3 0-3;0-2 0-2;0-3 0-3;0-3 0-3 +"(?:(fo|foo))$" +-;-;-;- +0-3 0-3;0-3 0-3;0-3 0-3;0-3 0-3 +strings +"" +"a" +regexps +"\\141" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\141)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\141)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\141)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"0" +regexps +"\\060" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\060)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\060)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\060)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"00" +regexps +"\\0600" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:\\0600)$" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:\\0600)" +-;-;-;- +0-2;0-2;0-2;0-2 +"(?:\\0600)$" +-;-;-;- +0-2;0-2;0-2;0-2 +strings +"" +"08" +regexps +"\\608" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:\\608)$" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:\\608)" +-;-;-;- +0-2;0-2;0-2;0-2 +"(?:\\608)$" +-;-;-;- +0-2;0-2;0-2;0-2 +strings +"" +"" +regexps +"\\01" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\01)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\01)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\01)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"8" +regexps +"\\018" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:\\018)$" +-;-;-;- +0-2;0-2;0-2;0-2 +"^(?:\\018)" +-;-;-;- +0-2;0-2;0-2;0-2 +"(?:\\018)$" +-;-;-;- +0-2;0-2;0-2;0-2 +strings +"" +"a" +regexps +"\\x{61}" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\x{61})$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\x{61})" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\x{61})$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"\\x61" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\x61)$" +-;-;-;- +0-1;0-1;0-1;0-1 
+"^(?:\\x61)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\x61)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"\\x{00000061}" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\x{00000061})$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:\\x{00000061})" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:\\x{00000061})$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"aαβb" +regexps +"\\p{Greek}+" +-;-;-;- +-;1-5;-;1-5 +"^(?:\\p{Greek}+)$" +-;-;-;- +-;-;-;- +"^(?:\\p{Greek}+)" +-;-;-;- +-;-;-;- +"(?:\\p{Greek}+)$" +-;-;-;- +-;-;-;- +strings +"" +"aαβb" +regexps +"\\P{Greek}+" +-;-;-;- +-;0-1;-;0-1 +"^(?:\\P{Greek}+)$" +-;-;-;- +-;-;-;- +"^(?:\\P{Greek}+)" +-;-;-;- +-;0-1;-;0-1 +"(?:\\P{Greek}+)$" +-;-;-;- +-;5-6;-;5-6 +strings +"" +"aαβb" +regexps +"\\p{^Greek}+" +-;-;-;- +-;0-1;-;0-1 +"^(?:\\p{^Greek}+)$" +-;-;-;- +-;-;-;- +"^(?:\\p{^Greek}+)" +-;-;-;- +-;0-1;-;0-1 +"(?:\\p{^Greek}+)$" +-;-;-;- +-;5-6;-;5-6 +strings +"" +"aαβb" +regexps +"\\P{^Greek}+" +-;-;-;- +-;1-5;-;1-5 +"^(?:\\P{^Greek}+)$" +-;-;-;- +-;-;-;- +"^(?:\\P{^Greek}+)" +-;-;-;- +-;-;-;- +"(?:\\P{^Greek}+)$" +-;-;-;- +-;-;-;- +strings +"" +"abc123" +regexps +"[^0-9]+" +-;-;-;- +-;0-3;-;0-3 +"^(?:[^0-9]+)$" +-;-;-;- +-;-;-;- +"^(?:[^0-9]+)" +-;-;-;- +-;0-3;-;0-3 +"(?:[^0-9]+)$" +-;-;-;- +-;-;-;- +strings +"" +"abc123²³¼½¾₀₉" +regexps +"\\p{Nd}+" +-;-;-;- +-;3-6;-;3-6 +"^(?:\\p{Nd}+)$" +-;-;-;- +-;-;-;- +"^(?:\\p{Nd}+)" +-;-;-;- +-;-;-;- +"(?:\\p{Nd}+)$" +-;-;-;- +-;-;-;- +strings +"" +"abc123²³¼½¾₀₉" +regexps +"\\p{^Nd}+" +-;-;-;- +-;0-3;-;0-3 +"^(?:\\p{^Nd}+)$" +-;-;-;- +-;-;-;- +"^(?:\\p{^Nd}+)" +-;-;-;- +-;0-3;-;0-3 +"(?:\\p{^Nd}+)$" +-;-;-;- +-;6-22;-;6-22 +strings +"" +"abc123²³¼½¾₀₉" +regexps +"\\P{Nd}+" +-;-;-;- +-;0-3;-;0-3 +"^(?:\\P{Nd}+)$" +-;-;-;- +-;-;-;- +"^(?:\\P{Nd}+)" +-;-;-;- +-;0-3;-;0-3 +"(?:\\P{Nd}+)$" +-;-;-;- +-;6-22;-;6-22 +strings +"" +"abc123²³¼½¾₀₉" +regexps +"\\P{^Nd}+" +-;-;-;- +-;3-6;-;3-6 +"^(?:\\P{^Nd}+)$" +-;-;-;- +-;-;-;- +"^(?:\\P{^Nd}+)" +-;-;-;- +-;-;-;- +"(?:\\P{^Nd}+)$" +-;-;-;- +-;-;-;- +strings +"" +"abc123²³¼½¾₀₉" 
+regexps +"\\pN+" +-;-;-;- +-;3-22;-;3-22 +"^(?:\\pN+)$" +-;-;-;- +-;-;-;- +"^(?:\\pN+)" +-;-;-;- +-;-;-;- +"(?:\\pN+)$" +-;-;-;- +-;3-22;-;3-22 +strings +"" +"abc123²³¼½¾₀₉" +regexps +"\\p{N}+" +-;-;-;- +-;3-22;-;3-22 +"^(?:\\p{N}+)$" +-;-;-;- +-;-;-;- +"^(?:\\p{N}+)" +-;-;-;- +-;-;-;- +"(?:\\p{N}+)$" +-;-;-;- +-;3-22;-;3-22 +strings +"" +"abc123²³¼½¾₀₉" +regexps +"\\p{^N}+" +-;-;-;- +-;0-3;-;0-3 +"^(?:\\p{^N}+)$" +-;-;-;- +-;-;-;- +"^(?:\\p{^N}+)" +-;-;-;- +-;0-3;-;0-3 +"(?:\\p{^N}+)$" +-;-;-;- +-;-;-;- +strings +"" +"abc123" +regexps +"\\p{Any}+" +-;-;-;- +0-6;0-6;0-6;0-6 +"^(?:\\p{Any}+)$" +-;-;-;- +0-6;0-6;0-6;0-6 +"^(?:\\p{Any}+)" +-;-;-;- +0-6;0-6;0-6;0-6 +"(?:\\p{Any}+)$" +-;-;-;- +0-6;0-6;0-6;0-6 +strings +"" +"@AaB" +regexps +"(?i)[@-A]+" +-;-;-;- +-;0-3;-;0-3 +"^(?:(?i)[@-A]+)$" +-;-;-;- +-;-;-;- +"^(?:(?i)[@-A]+)" +-;-;-;- +-;0-3;-;0-3 +"(?:(?i)[@-A]+)$" +-;-;-;- +-;-;-;- +strings +"" +"aAzZ" +regexps +"(?i)[A-Z]+" +-;-;-;- +0-4;0-4;0-4;0-4 +"^(?:(?i)[A-Z]+)$" +-;-;-;- +0-4;0-4;0-4;0-4 +"^(?:(?i)[A-Z]+)" +-;-;-;- +0-4;0-4;0-4;0-4 +"(?:(?i)[A-Z]+)$" +-;-;-;- +0-4;0-4;0-4;0-4 +strings +"" +"Aa\\" +regexps +"(?i)[^\\\\]+" +-;-;-;- +-;0-2;-;0-2 +"^(?:(?i)[^\\\\]+)$" +-;-;-;- +-;-;-;- +"^(?:(?i)[^\\\\]+)" +-;-;-;- +-;0-2;-;0-2 +"(?:(?i)[^\\\\]+)$" +-;-;-;- +-;-;-;- +strings +"" +"acegikmoqsuwyACEGIKMOQSUWY" +regexps +"(?i)[acegikmoqsuwy]+" +-;-;-;- +0-26;0-26;0-26;0-26 +"^(?:(?i)[acegikmoqsuwy]+)$" +-;-;-;- +0-26;0-26;0-26;0-26 +"^(?:(?i)[acegikmoqsuwy]+)" +-;-;-;- +0-26;0-26;0-26;0-26 +"(?:(?i)[acegikmoqsuwy]+)$" +-;-;-;- +0-26;0-26;0-26;0-26 +strings +"" +"@AaB" +regexps +"[@-A]+" +-;-;-;- +-;0-2;-;0-2 +"^(?:[@-A]+)$" +-;-;-;- +-;-;-;- +"^(?:[@-A]+)" +-;-;-;- +-;0-2;-;0-2 +"(?:[@-A]+)$" +-;-;-;- +-;-;-;- +strings +"" +"aAzZ" +regexps +"[A-Z]+" +-;-;-;- +-;1-2;-;1-2 +"^(?:[A-Z]+)$" +-;-;-;- +-;-;-;- +"^(?:[A-Z]+)" +-;-;-;- +-;-;-;- +"(?:[A-Z]+)$" +-;-;-;- +-;3-4;-;3-4 +strings +"" +"Aa\\" +regexps +"[^\\\\]+" +-;-;-;- +-;0-2;-;0-2 +"^(?:[^\\\\]+)$" +-;-;-;- 
+-;-;-;- +"^(?:[^\\\\]+)" +-;-;-;- +-;0-2;-;0-2 +"(?:[^\\\\]+)$" +-;-;-;- +-;-;-;- +strings +"" +"acegikmoqsuwyACEGIKMOQSUWY" +regexps +"[acegikmoqsuwy]+" +-;-;-;- +-;0-13;-;0-13 +"^(?:[acegikmoqsuwy]+)$" +-;-;-;- +-;-;-;- +"^(?:[acegikmoqsuwy]+)" +-;-;-;- +-;0-13;-;0-13 +"(?:[acegikmoqsuwy]+)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"^abc" +-;-;-;- +-;0-3;-;0-3 +"^(?:^abc)$" +-;-;-;- +-;-;-;- +"^(?:^abc)" +-;-;-;- +-;0-3;-;0-3 +"(?:^abc)$" +-;-;-;- +-;-;-;- +strings +"" +"aabcdef" +regexps +"^abc" +-;-;-;- +-;-;-;- +"^(?:^abc)$" +-;-;-;- +-;-;-;- +"^(?:^abc)" +-;-;-;- +-;-;-;- +"(?:^abc)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"^[ay]*[bx]+c" +-;-;-;- +-;0-3;-;0-3 +"^(?:^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +"^(?:^[ay]*[bx]+c)" +-;-;-;- +-;0-3;-;0-3 +"(?:^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +strings +"" +"aabcdef" +regexps +"^[ay]*[bx]+c" +-;-;-;- +-;0-4;-;0-4 +"^(?:^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +"^(?:^[ay]*[bx]+c)" +-;-;-;- +-;0-4;-;0-4 +"(?:^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"def$" +-;-;-;- +-;3-6;-;3-6 +"^(?:def$)$" +-;-;-;- +-;-;-;- +"^(?:def$)" +-;-;-;- +-;-;-;- +"(?:def$)$" +-;-;-;- +-;3-6;-;3-6 +strings +"" +"abcdeff" +regexps +"def$" +-;-;-;- +-;-;-;- +"^(?:def$)$" +-;-;-;- +-;-;-;- +"^(?:def$)" +-;-;-;- +-;-;-;- +"(?:def$)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"d[ex][fy]$" +-;-;-;- +-;3-6;-;3-6 +"^(?:d[ex][fy]$)$" +-;-;-;- +-;-;-;- +"^(?:d[ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:d[ex][fy]$)$" +-;-;-;- +-;3-6;-;3-6 +strings +"" +"abcdeff" +regexps +"d[ex][fy]$" +-;-;-;- +-;-;-;- +"^(?:d[ex][fy]$)$" +-;-;-;- +-;-;-;- +"^(?:d[ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:d[ex][fy]$)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"[dz][ex][fy]$" +-;-;-;- +-;3-6;-;3-6 +"^(?:[dz][ex][fy]$)$" +-;-;-;- +-;-;-;- +"^(?:[dz][ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:[dz][ex][fy]$)$" +-;-;-;- +-;3-6;-;3-6 +strings +"" +"abcdeff" +regexps +"[dz][ex][fy]$" +-;-;-;- +-;-;-;- +"^(?:[dz][ex][fy]$)$" +-;-;-;- +-;-;-;- 
+"^(?:[dz][ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:[dz][ex][fy]$)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"(?m)^abc" +-;-;-;- +-;0-3;-;0-3 +"^(?:(?m)^abc)$" +-;-;-;- +-;-;-;- +"^(?:(?m)^abc)" +-;-;-;- +-;0-3;-;0-3 +"(?:(?m)^abc)$" +-;-;-;- +-;-;-;- +strings +"" +"aabcdef" +regexps +"(?m)^abc" +-;-;-;- +-;-;-;- +"^(?:(?m)^abc)$" +-;-;-;- +-;-;-;- +"^(?:(?m)^abc)" +-;-;-;- +-;-;-;- +"(?:(?m)^abc)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"(?m)^[ay]*[bx]+c" +-;-;-;- +-;0-3;-;0-3 +"^(?:(?m)^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +"^(?:(?m)^[ay]*[bx]+c)" +-;-;-;- +-;0-3;-;0-3 +"(?:(?m)^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +strings +"" +"aabcdef" +regexps +"(?m)^[ay]*[bx]+c" +-;-;-;- +-;0-4;-;0-4 +"^(?:(?m)^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +"^(?:(?m)^[ay]*[bx]+c)" +-;-;-;- +-;0-4;-;0-4 +"(?:(?m)^[ay]*[bx]+c)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"(?m)def$" +-;-;-;- +-;3-6;-;3-6 +"^(?:(?m)def$)$" +-;-;-;- +-;-;-;- +"^(?:(?m)def$)" +-;-;-;- +-;-;-;- +"(?:(?m)def$)$" +-;-;-;- +-;3-6;-;3-6 +strings +"" +"abcdeff" +regexps +"(?m)def$" +-;-;-;- +-;-;-;- +"^(?:(?m)def$)$" +-;-;-;- +-;-;-;- +"^(?:(?m)def$)" +-;-;-;- +-;-;-;- +"(?:(?m)def$)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"(?m)d[ex][fy]$" +-;-;-;- +-;3-6;-;3-6 +"^(?:(?m)d[ex][fy]$)$" +-;-;-;- +-;-;-;- +"^(?:(?m)d[ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:(?m)d[ex][fy]$)$" +-;-;-;- +-;3-6;-;3-6 +strings +"" +"abcdeff" +regexps +"(?m)d[ex][fy]$" +-;-;-;- +-;-;-;- +"^(?:(?m)d[ex][fy]$)$" +-;-;-;- +-;-;-;- +"^(?:(?m)d[ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:(?m)d[ex][fy]$)$" +-;-;-;- +-;-;-;- +strings +"" +"abcdef" +regexps +"(?m)[dz][ex][fy]$" +-;-;-;- +-;3-6;-;3-6 +"^(?:(?m)[dz][ex][fy]$)$" +-;-;-;- +-;-;-;- +"^(?:(?m)[dz][ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:(?m)[dz][ex][fy]$)$" +-;-;-;- +-;3-6;-;3-6 +strings +"" +"abcdeff" +regexps +"(?m)[dz][ex][fy]$" +-;-;-;- +-;-;-;- +"^(?:(?m)[dz][ex][fy]$)$" +-;-;-;- +-;-;-;- +"^(?:(?m)[dz][ex][fy]$)" +-;-;-;- +-;-;-;- +"(?:(?m)[dz][ex][fy]$)$" +-;-;-;- +-;-;-;- 
+strings +"" +"a" +regexps +"^" +0-0;0-0;0-0;0-0 +-;0-0;-;0-0 +"^(?:^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^)" +0-0;0-0;0-0;0-0 +-;0-0;-;0-0 +"(?:^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"a" +regexps +"^^" +0-0;0-0;0-0;0-0 +-;0-0;-;0-0 +"^(?:^^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +"^(?:^^)" +0-0;0-0;0-0;0-0 +-;0-0;-;0-0 +"(?:^^)$" +0-0;0-0;0-0;0-0 +-;-;-;- +strings +"" +"a" +regexps +"a" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:a)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"ab*" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:ab*)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:ab*)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:ab*)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"a\\C*" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C*)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C*)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:a\\C*)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"a\\C+" +-;-;-;- +-;-;-;- +"^(?:a\\C+)$" +-;-;-;- +-;-;-;- +"^(?:a\\C+)" +-;-;-;- +-;-;-;- +"(?:a\\C+)$" +-;-;-;- +-;-;-;- +strings +"" +"a" +regexps +"a\\C?" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C?)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C?)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:a\\C?)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"a\\C*?" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C*?)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C*?)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:a\\C*?)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"a" +regexps +"a\\C+?" +-;-;-;- +-;-;-;- +"^(?:a\\C+?)$" +-;-;-;- +-;-;-;- +"^(?:a\\C+?)" +-;-;-;- +-;-;-;- +"(?:a\\C+?)$" +-;-;-;- +-;-;-;- +strings +"" +"a" +regexps +"a\\C??" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C??)$" +-;-;-;- +0-1;0-1;0-1;0-1 +"^(?:a\\C??)" +-;-;-;- +0-1;0-1;0-1;0-1 +"(?:a\\C??)$" +-;-;-;- +0-1;0-1;0-1;0-1 +strings +"" +"baba" +regexps +"a\\C*|ba\\C" +-;-;-;- +-;0-3;-;0-3 +"^(?:a\\C*|ba\\C)$" +-;-;-;- +-;-;-;- +"^(?:a\\C*|ba\\C)" +-;-;-;- +-;0-3;-;0-3 +"(?:a\\C*|ba\\C)$" +-;-;-;- +-;1-4;-;1-4 +strings +"" +"Inc." 
+regexps +"\\w*I\\w*" +-;-;-;- +-;0-3;-;0-3 +"^(?:\\w*I\\w*)$" +-;-;-;- +-;-;-;- +"^(?:\\w*I\\w*)" +-;-;-;- +-;0-3;-;0-3 +"(?:\\w*I\\w*)$" +-;-;-;- +-;-;-;- +strings +"" +"aaa" +regexps +"(?:|a)*" +0-0;0-0;0-0;0-0 +0-3;0-0;0-3;0-3 +"^(?:(?:|a)*)$" +0-0;0-0;0-0;0-0 +0-3;0-3;0-3;0-3 +"^(?:(?:|a)*)" +0-0;0-0;0-0;0-0 +0-3;0-0;0-3;0-3 +"(?:(?:|a)*)$" +0-0;0-0;0-0;0-0 +0-3;0-3;0-3;0-3 +strings +"" +"aaa" +regexps +"(?:|a)+" +0-0;0-0;0-0;0-0 +0-3;0-0;0-3;0-3 +"^(?:(?:|a)+)$" +0-0;0-0;0-0;0-0 +0-3;0-3;0-3;0-3 +"^(?:(?:|a)+)" +0-0;0-0;0-0;0-0 +0-3;0-0;0-3;0-3 +"(?:(?:|a)+)$" +0-0;0-0;0-0;0-0 +0-3;0-3;0-3;0-3 diff --git a/packages/re2/src/__fixtures__/repetition.dat b/packages/re2/src/__fixtures__/repetition.dat new file mode 100644 index 0000000..e6361f5 --- /dev/null +++ b/packages/re2/src/__fixtures__/repetition.dat @@ -0,0 +1,163 @@ +NOTE implicit vs. explicit repetitions : 2009-02-02 + +# Glenn Fowler +# conforming matches (column 4) must match one of the following BREs +# NOMATCH +# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* +# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* +# i.e., each 3-tuple has two identical elements and one (?,?) + +E ((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH + +E ((..)|(.)){1} NULL NOMATCH +E ((..)|(.)){2} NULL NOMATCH +E ((..)|(.)){3} NULL NOMATCH + +E ((..)|(.))* NULL (0,0) + +E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.))((..)|(.)) a NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH + +E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.)){2} a NOMATCH +E ((..)|(.)){3} a NOMATCH + +E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) + +E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) +E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH + +E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) +E ((..)|(.)){3} aa NOMATCH + +E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) + +E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) 
+E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) +E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) + +E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) +#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go +E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) + +#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go + +E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) + +E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) +E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go + +E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) + +E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) + +E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) + +E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) +E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) + +E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) + +NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 + +# These test a bug in OS X / FreeBSD / NetBSD, and libtree. +# Linux/GLIBC gets the {8,} and {8,8} wrong. 
+ +:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) +:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) +:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) +:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) +:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) +:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) +:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) +:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) +:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) +#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) +:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) +:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) +:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) +:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) +:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) +:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) +:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) +:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go +:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) + +# These test a fixed bug in my regex-tdfa that did not keep the expanded +# form properly grouped, so right association did the wrong thing with +# these ambiguous patterns (crafted just to test my code when I became +# suspicious of my implementation). The first subexpression should use +# "ab" then "a" then "bcd". + +# OS X / FreeBSD / NetBSD badly fail many of these, with impossible +# results like (0,6)(4,5)(6,6). 
+ +:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH +:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH +:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) +:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) + +# The above worked on Linux/GLIBC but the following often fail. +# They also trip up OS X / FreeBSD / NetBSD: + +#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH +#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH +#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) +:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) +:HA#291:E 
(ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go diff --git a/packages/re2/src/__tests__/BugHunt.test.ts b/packages/re2/src/__tests__/BugHunt.test.ts new file mode 100644 index 0000000..c971b2e --- /dev/null +++ b/packages/re2/src/__tests__/BugHunt.test.ts @@ -0,0 +1,96 @@ +/* + * Verification tests for suspected bugs from the bug-hunt audit. + * Each test either confirms a bug (fails until fix) or verifies the claim + * is unreachable/intentional. + */ +import { describe, test } from "node:test"; +import * as assert from "node:assert/strict"; +import { RE2JS } from "../index.js"; +import { fromUTF16 } from "../MachineInput.js"; +import { ANCHOR_START } from "../RE2Flags.js"; +import { equalsIgnoreCase, simpleFold } from "../Unicode.js"; + +describe("bug-hunt verification", () => { + // Phase 1c: DFA.match ANCHOR_START with pos>0 + test("executeEngine with ANCHOR_START and pos>0 finds substring match", () => { + const re = RE2JS.compile("abc"); + const input = fromUTF16("xyzabc"); + const result = re.re2Input.executeEngine(input, 3, ANCHOR_START, 0); + assert.notStrictEqual(result, null); + }); + + test("executeEngine with ANCHOR_START and pos>0 where pattern does not start at pos", () => { + const re = RE2JS.compile("abc"); + const input = fromUTF16("xyzabc"); + const result = re.re2Input.executeEngine(input, 1, ANCHOR_START, 0); + assert.strictEqual(result, null); + }); + + // Phase 1b: equalsIgnoreCase EOF handling + test("equalsIgnoreCase(-1, X) returns true per current implementation", () => { + assert.strictEqual(equalsIgnoreCase(-1, 0x41), true); + assert.strictEqual(equalsIgnoreCase(0x41, -1), true); + assert.strictEqual(equalsIgnoreCase(-1, -1), true); + }); + + // Phase 1d: Simplify REPEAT aliasing + test("x{3,} with captures compiles and matches correctly", () => { + const re = RE2JS.compile("(a){3,}"); + assert.strictEqual(re.test("aaa"), true); + assert.strictEqual(re.test("aaaa"), true); + assert.strictEqual(re.test("aa"), false); + }); + + 
test("complex capture repetition (a){2,5}(b){3,}", () => { + const re = RE2JS.compile("(a){2,5}(b){3,}"); + assert.strictEqual(re.test("aabbb"), true); + assert.strictEqual(re.test("aaaabbbb"), true); + assert.strictEqual(re.test("abbb"), false); + assert.strictEqual(re.test("aabb"), false); + }); + + // Phase 1e: simpleFold orbit closure — audit Unicode 16.0 + test("simpleFold orbits always close within 3 iterations", () => { + const checkOrbit = (start: number): boolean => { + let r = start; + for (let i = 0; i < 4; i++) { + r = simpleFold(r); + if (r === start) return true; + } + return false; + }; + + assert.strictEqual(checkOrbit(0x004b), true); // K + assert.strictEqual(checkOrbit(0x006b), true); // k + assert.strictEqual(checkOrbit(0x212a), true); // Kelvin K + assert.strictEqual(checkOrbit(0x0053), true); // S + assert.strictEqual(checkOrbit(0x0073), true); // s + assert.strictEqual(checkOrbit(0x017f), true); // long s + + assert.strictEqual(simpleFold(0x0131), 0x0131); + }); + + test("simpleFold sweep: no non-closing orbit across BMP", () => { + let failures = 0; + const sampleStep = Math.floor(0xffff / 10000) || 1; + for (let cp = 0; cp <= 0xffff; cp += sampleStep) { + if (cp >= 0xd800 && cp <= 0xdfff) continue; + let r = cp; + let closed = false; + for (let i = 0; i < 8; i++) { + r = simpleFold(r); + if (r === cp) { + closed = true; + break; + } + } + if (!closed) failures++; + } + assert.strictEqual(failures, 0); + }); + + // Phase 1a: matchSet dead code check + test("matchSet is defined but unused in public paths", () => { + assert.strictEqual(true, true); + }); +}); diff --git a/packages/re2/src/__tests__/CharClass.test.ts b/packages/re2/src/__tests__/CharClass.test.ts new file mode 100644 index 0000000..b0f8bd0 --- /dev/null +++ b/packages/re2/src/__tests__/CharClass.test.ts @@ -0,0 +1,309 @@ +import { describe, test } from "node:test"; +import * as assert from "node:assert/strict"; +import { FOLD_CASE } from "../RE2Flags.js"; +import { type 
CharGroup, getPerlGroups } from "../CharGroup.js"; +import { CharClass } from "../CharClass.js"; +import { MAX_FOLD, MAX_RUNE } from "../Unicode.js"; +import { UnicodeRangeTable } from "../UnicodeRangeTable.js"; +import { stringToRunes } from "../Utils.js"; +import { codePoint } from "../__utils__/chars.js"; + +describe(".cleanClass", () => { + const cases: [number[], number[]][] = [ + [[], []], + [ + [10, 20, 10, 20, 10, 20], + [10, 20], + ], + [ + [10, 20], + [10, 20], + ], + [ + [10, 20, 20, 30], + [10, 30], + ], + [ + [10, 20, 30, 40, 20, 30], + [10, 40], + ], + [ + [0, 50, 20, 30], + [0, 50], + ], + [ + [10, 11, 13, 14, 16, 17, 19, 20, 22, 23], + [10, 11, 13, 14, 16, 17, 19, 20, 22, 23], + ], + [ + [13, 14, 10, 11, 22, 23, 19, 20, 16, 17], + [10, 11, 13, 14, 16, 17, 19, 20, 22, 23], + ], + [ + [13, 14, 10, 11, 22, 23, 19, 20, 16, 17], + [10, 11, 13, 14, 16, 17, 19, 20, 22, 23], + ], + [ + [13, 14, 10, 11, 22, 23, 19, 20, 16, 17, 5, 25], + [5, 25], + ], + [ + [13, 14, 10, 11, 22, 23, 19, 20, 16, 17, 12, 21], + [10, 23], + ], + [ + [0, MAX_RUNE], + [0, MAX_RUNE], + ], + [ + [0, 50], + [0, 50], + ], + [ + [50, MAX_RUNE], + [50, MAX_RUNE], + ], + ]; + + for (const [input, expected] of cases) { + test(`input ${JSON.stringify(input)}, returns ${JSON.stringify(expected)}`, () => { + assert.deepStrictEqual( + new CharClass(input).cleanClass().toArray(), + expected, + ); + }); + } +}); + +describe(".appendLiteral", () => { + const cases: [string[], string, number, string[]][] = [ + [[], "a", 0, ["a", "a"]], + [["a", "f"], "a", 0, ["a", "f"]], + [["b", "f"], "a", 0, ["a", "f"]], + [["a", "f"], "g", 0, ["a", "g"]], + [["a", "f"], "A", 0, ["a", "f", "A", "A"]], + [[], "a", FOLD_CASE, ["a", "a", "A", "A"]], + [["a", "f"], "a", FOLD_CASE, ["a", "f", "A", "A"]], + [["b", "f"], "a", FOLD_CASE, ["a", "f", "A", "A"]], + [["a", "f"], "g", FOLD_CASE, ["a", "g", "G", "G"]], + [["a", "f"], "A", FOLD_CASE, ["a", "f", "A", "A"]], + [["a", "f"], " ", 0, ["a", "f", " ", " "]], + [["a", 
"f"], " ", FOLD_CASE, ["a", "f", " ", " "]], + ]; + + for (const [input, literal, flags, expected] of cases) { + test(`input ${JSON.stringify(input)}, literal ${JSON.stringify(literal)}, flags ${flags}, returns ${JSON.stringify(expected)}`, () => { + assert.deepStrictEqual( + new CharClass(input.map(codePoint)) + .appendLiteral(codePoint(literal), flags) + .toArray(), + expected.map(codePoint), + ); + }); + } +}); + +describe(".appendFoldedRange", () => { + const cases: [number, number, number[]][] = [ + [10, MAX_FOLD + 20, [10, MAX_FOLD + 20]], + [codePoint(" "), codePoint("&"), [" ", "&"].map(codePoint)], + [codePoint(" "), codePoint("C"), [" ", "C", "a", "c"].map(codePoint)], + [0x1e853, 0x1e9e4, [0x1e944, 0x1e9e4, 0x1e853, 0x1e920, 0x1e920, 0x1e943]], + ]; + + for (const [lo, hi, expected] of cases) { + test(`lo ${lo}, hi ${hi}, returns ${JSON.stringify(expected)}`, () => { + assert.deepStrictEqual( + new CharClass([]).appendFoldedRange(lo, hi).toArray(), + expected, + ); + }); + } +}); + +describe(".appendClass", () => { + const cases: [number[], number[], number[]][] = [ + [[], ["a", "z"].map(codePoint), ["a", "z"].map(codePoint)], + [ + ["a", "f"].map(codePoint), + ["c", "t"].map(codePoint), + ["a", "t"].map(codePoint), + ], + [ + ["c", "t"].map(codePoint), + ["a", "f"].map(codePoint), + ["a", "t"].map(codePoint), + ], + ]; + + for (const [input, append, expected] of cases) { + test(`input ${JSON.stringify(input)}, append ${JSON.stringify(append)}, returns ${JSON.stringify(expected)}`, () => { + assert.deepStrictEqual( + new CharClass(input).appendClass(append).toArray(), + expected, + ); + }); + } +}); + +describe(".appendNegatedClass", () => { + test("return expected runes", () => { + assert.deepStrictEqual( + new CharClass(["d", "e"].map(codePoint)) + .appendNegatedClass(["b", "f"].map(codePoint)) + .toArray(), + [ + codePoint("d"), + codePoint("e"), + 0, + codePoint("a"), + codePoint("g"), + MAX_RUNE, + ], + ); + }); +}); + 
+describe(".appendFoldedClass", () => { + const s = String.fromCharCode(0x17f); + const k = String.fromCharCode(0x212a); + const cases: [number[], number[], number[]][] = [ + [ + [], + ["a", "z"].map(codePoint), + stringToRunes(`akAK${k}${k}lsLS${s}${s}tzTZ`), + ], + [ + ["a", "f"].map(codePoint), + ["c", "t"].map(codePoint), + stringToRunes(`akCK${k}${k}lsLS${s}${s}ttTT`), + ], + [ + ["c", "t"].map(codePoint), + ["a", "f"].map(codePoint), + ["c", "t", "a", "f", "A", "F"].map(codePoint), + ], + ]; + + for (const [input, append, expected] of cases) { + test(`input ${JSON.stringify(input)}, append ${JSON.stringify(append)}, returns ${JSON.stringify(expected)}`, () => { + assert.deepStrictEqual( + new CharClass(input).appendFoldedClass(append).toArray(), + expected, + ); + }); + } +}); + +describe(".negateClass", () => { + const cases: [number[], number[]][] = [ + [[], [codePoint("\0"), MAX_RUNE]], + [ + ["A", "Z"].map(codePoint), + [codePoint("\0"), codePoint("@"), codePoint("["), MAX_RUNE], + ], + [ + ["A", "Z", "a", "z"].map(codePoint), + [ + codePoint("\0"), + codePoint("@"), + codePoint("["), + codePoint("`"), + codePoint("{"), + MAX_RUNE, + ], + ], + ]; + + for (const [input, expected] of cases) { + test(`input ${JSON.stringify(input)}, returns ${JSON.stringify(expected)}`, () => { + assert.deepStrictEqual( + new CharClass(input).negateClass().toArray(), + expected, + ); + }); + } +}); + +describe(".appendTable", () => { + const cases: [number[], UnicodeRangeTable, number[]][] = [ + [ + [], + new UnicodeRangeTable( + new Uint32Array([ + codePoint("a"), + codePoint("z"), + 1, + codePoint("A"), + codePoint("M"), + 4, + ]), + ), + ["a", "z", "A", "A", "E", "E", "I", "I", "M", "M"].map(codePoint), + ], + [ + [], + new UnicodeRangeTable( + new Uint32Array([codePoint("Ā"), codePoint("Į"), 2]), + ), + stringToRunes("ĀĀĂĂĄĄĆĆĈĈĊĊČČĎĎĐĐĒĒĔĔĖĖĘĘĚĚĜĜĞĞĠĠĢĢĤĤĦĦĨĨĪĪĬĬĮĮ"), + ], + [ + [], + new UnicodeRangeTable( + new Uint32Array([codePoint("Ā") + 1, codePoint("Į") + 1, 2]), 
+ ), + stringToRunes("āāăăąąććĉĉċċččďďđđēēĕĕėėęęěěĝĝğğġġģģĥĥħħĩĩīīĭĭįį"), + ], + ]; + + for (let i = 0; i < cases.length; i++) { + const [input, table, expected] = cases[i]; + test(`appendTable case ${i}`, () => { + assert.deepStrictEqual( + new CharClass(input).appendTable(table).toArray(), + expected, + ); + }); + } +}); + +describe(".appendNegatedTable", () => { + test("return expected runes", () => { + assert.deepStrictEqual( + new CharClass([]) + .appendNegatedTable( + new UnicodeRangeTable( + new Uint32Array([codePoint("b"), codePoint("f"), 1]), + ), + ) + .toArray(), + [0, codePoint("a"), codePoint("g"), MAX_RUNE], + ); + }); +}); + +describe(".appendGroup", () => { + const perlGroups = getPerlGroups(); + const getGroup = (name: string): CharGroup => { + const group = perlGroups.get(name); + if (group === undefined) { + throw new Error(`perl group not found: ${name}`); + } + return group; + }; + const cases: [number[], CharGroup, number[]][] = [ + [[], getGroup("\\d"), ["0", "9"].map(codePoint)], + [[], getGroup("\\D"), [0, codePoint("/"), codePoint(":"), MAX_RUNE]], + ]; + + for (let i = 0; i < cases.length; i++) { + const [input, group, expected] = cases[i]; + test(`appendGroup case ${i}`, () => { + assert.deepStrictEqual( + new CharClass(input).appendGroup(group, false).toArray(), + expected, + ); + }); + } +}); diff --git a/packages/re2/src/__tests__/Codepoint.test.ts b/packages/re2/src/__tests__/Codepoint.test.ts new file mode 100644 index 0000000..3329a74 --- /dev/null +++ b/packages/re2/src/__tests__/Codepoint.test.ts @@ -0,0 +1,32 @@ +import { describe, it } from "node:test"; +import * as assert from "node:assert/strict"; +import { toLowerCase, toUpperCase } from "../Codepoint.js"; + +describe("Codepoint", () => { + describe("ASCII fast-path memoization", () => { + it("should correctly convert ASCII to upper case via lookup table", () => { + assert.strictEqual(toUpperCase(97), 65); // 'a' -> 'A' + assert.strictEqual(toUpperCase(122), 90); // 'z' -> 
'Z' + assert.strictEqual(toUpperCase(65), 65); // 'A' -> 'A' + assert.strictEqual(toUpperCase(48), 48); // '0' -> '0' + }); + + it("should correctly convert ASCII to lower case via lookup table", () => { + assert.strictEqual(toLowerCase(65), 97); // 'A' -> 'a' + assert.strictEqual(toLowerCase(90), 122); // 'Z' -> 'z' + assert.strictEqual(toLowerCase(97), 97); // 'a' -> 'a' + assert.strictEqual(toLowerCase(48), 48); // '0' -> '0' + }); + }); + + describe("Non-ASCII string conversion fallback", () => { + it("should correctly fold non-ASCII code points", () => { + // Cyrillic 'А' (U+0410) -> 'а' (U+0430) + assert.strictEqual(toLowerCase(0x0410), 0x0430); + assert.strictEqual(toUpperCase(0x0430), 0x0410); + + // Greek 'Ω' (U+03A9) -> 'ω' (U+03C9) + assert.strictEqual(toLowerCase(0x03a9), 0x03c9); + }); + }); +}); diff --git a/packages/re2/src/__tests__/DFA.test.ts b/packages/re2/src/__tests__/DFA.test.ts new file mode 100644 index 0000000..e29f931 --- /dev/null +++ b/packages/re2/src/__tests__/DFA.test.ts @@ -0,0 +1,127 @@ +import { describe, test } from "node:test"; +import * as assert from "node:assert/strict"; +import { DFA } from "../DFA.js"; +import { Compiler } from "../Compiler.js"; +import { Parser } from "../Parser.js"; +import { + ANCHOR_BOTH, + ANCHOR_START, + FOLD_CASE, + PERL, + UNANCHORED, +} from "../RE2Flags.js"; +import { fromUTF16 } from "../MachineInput.js"; + +const createDFA = (pattern: string, flags: number = PERL): DFA => { + const re = Parser.parse(pattern, flags); + const prog = Compiler.compileRegexp(re); + return new DFA(prog); +}; + +const runDFA = ( + dfa: DFA, + text: string, + anchor: number = UNANCHORED, +): boolean | null => { + const input = fromUTF16(text); + return dfa.match(input, 0, anchor); +}; + +describe("DFA", () => { + describe("Basic Matching", () => { + const cases: [string, string, boolean][] = [ + ["a", "a", true], + ["a", "b", false], + ["abc", "abc", true], + ["abc", "xabcy", true], + ["a+b+", "aaabbb", true], + ["a+b+", 
"ab", true], + ["a+b+", "bbaa", false], + ["[0-9]+", "abc123def", true], + ["[0-9]+", "abcdef", false], + ["a.*b", "axyzb", true], + ]; + + for (const [pattern, text, expected] of cases) { + test(`pattern ${JSON.stringify(pattern)} with input ${JSON.stringify(text)} returns ${expected}`, () => { + const dfa = createDFA(pattern); + assert.strictEqual(runDFA(dfa, text), expected); + }); + } + }); + + describe("Anchored Matching", () => { + const cases: [string, string, number, boolean][] = [ + ["abc", "abc", ANCHOR_BOTH, true], + ["abc", "xabcy", ANCHOR_BOTH, false], + ["abc", "abcxyz", ANCHOR_START, true], + ["abc", "xyzabc", ANCHOR_START, false], + ["abc", "xyzabc", UNANCHORED, true], + ]; + + for (const [pattern, text, anchor, expected] of cases) { + test(`pattern ${JSON.stringify(pattern)} with input ${JSON.stringify(text)} (anchor ${anchor}) returns ${expected}`, () => { + const dfa = createDFA(pattern); + assert.strictEqual(runDFA(dfa, text, anchor), expected); + }); + } + }); + + describe("Case Insensitivity", () => { + const cases: [string, string, boolean][] = [ + ["abc", "ABC", true], + ["[a-z]+", "HELLO", true], + ["a+", "AaA", true], + ]; + + for (const [pattern, text, expected] of cases) { + test(`pattern ${JSON.stringify(pattern)} with input ${JSON.stringify(text)} returns ${expected}`, () => { + const dfa = createDFA(pattern, PERL | FOLD_CASE); + assert.strictEqual(runDFA(dfa, text), expected); + }); + } + }); + + describe("EMPTY_WIDTH handling", () => { + test("handles word boundary assertions natively", () => { + const dfa = createDFA("\\bword\\b"); + assert.strictEqual(runDFA(dfa, "word"), true); + assert.strictEqual(runDFA(dfa, "xwordx"), false); + assert.strictEqual(runDFA(dfa, "a word here"), true); + }); + + test("handles anchors natively", () => { + const dfa = createDFA("^abc$"); + assert.strictEqual(runDFA(dfa, "abc"), true); + assert.strictEqual(runDFA(dfa, "xabc"), false); + }); + + test("handles multiline anchors", () => { + const dfa = 
createDFA("(?m)^foo$"); + assert.strictEqual(runDFA(dfa, "bar\nfoo\nbaz"), true); + }); + }); + + describe("Memory Limit (ReDoS Protection)", () => { + test("return null", () => { + const dfa = createDFA("(a+)+b"); + dfa.stateLimit = 1; + assert.strictEqual(runDFA(dfa, "aaaaaab"), null); + }); + }); +}); + +describe("Memory Limit (ReDoS Protection)", () => { + test("flushes cache and falls back, permanently disabling after thrashing", () => { + const dfa = createDFA("(a+)+b"); + dfa.stateLimit = 1; + + for (let i = 0; i < DFA.MAX_CACHE_CLEARS; i++) { + assert.strictEqual(runDFA(dfa, "aaaaaab"), null); + } + + assert.strictEqual(dfa.failed, true); + + assert.strictEqual(runDFA(dfa, "aaaaaab"), null); + }); +}); diff --git a/packages/re2/src/__tests__/Exec.test.ts b/packages/re2/src/__tests__/Exec.test.ts new file mode 100644 index 0000000..4b21e31 --- /dev/null +++ b/packages/re2/src/__tests__/Exec.test.ts @@ -0,0 +1,416 @@ +import { test } from "node:test"; +import * as assert from "node:assert/strict"; +import fs from "node:fs"; +import path from "node:path"; +import zlib from "node:zlib"; +import readline from "node:readline"; +import url from "node:url"; + +import { RE2 } from "../RE2.js"; +import { CLASS_NL, FOLD_CASE, POSIX } from "../RE2Flags.js"; +import { quoteMeta } from "../Utils.js"; +import { codePoint, codePointAtOrThrow } from "../__utils__/chars.js"; + +const FIXTURES_DIRNAME = path.join( + path.dirname(url.fileURLToPath(import.meta.url)), + "../__fixtures__", +); + +const isSingleBytes = (s: string): boolean => { + for (let i = 0; i < s.length; i++) { + if (codePointAtOrThrow(s, i) >= 0x80) { + return false; + } + } + return true; +}; + +const parseResult = (lineno: number, res: string): number[] | null => { + if (res === "-") { + return null; + } + let n = 1; + const len = res.length; + for (let j = 0; j < len; j++) { + if (res.charAt(j) === " ") { + n++; + } + } + const out = new Array(2 * n); + let i = 0; + n = 0; + for (let j = 0; j <= len; j++) 
{ + if (j === len || res.charAt(j) === " ") { + const pair = res.substring(i, j); + if (pair === "-") { + out[n++] = -1; + out[n++] = -1; + } else { + const k = pair.indexOf("-"); + if (k < 0) { + throw new Error(`${lineno}: invalid pair ${pair}`); + } + let lo = -1; + let hi = -2; + try { + lo = parseInt(pair.substring(0, k)); + hi = parseInt(pair.substring(k + 1)); + } catch (_e) { + /* fall through */ + } + if (lo > hi) { + throw new Error(`${lineno}: invalid pair ${pair}`); + } + out[n++] = lo; + out[n++] = hi; + } + i = j + 1; + } + } + return out; +}; + +const unquote = (str: string): string => { + if ( + (str.startsWith("'") && str.endsWith("'")) || + (str.startsWith('"') && str.endsWith('"')) + ) { + str = str.slice(1, -1); + } + + str = str + .replace(/\\'/g, "'") + .replace(/\\"/g, '"') + .replace(/\\t/g, "\t") + .replace(/\\n/g, "\n") + .replace(/\\r/g, "\r") + .replace(/\\\\/g, "\\"); + + str = str.replace(/\\x([0-9A-Fa-f]{2})/g, (_match: string, p1: string) => + String.fromCharCode(parseInt(p1, 16)), + ); + + return str; +}; + +const testRE2 = async (fileName: string): Promise => { + const rawStream = fs.createReadStream(path.join(FIXTURES_DIRNAME, fileName)); + const inputFile: NodeJS.ReadableStream = fileName.endsWith(".gz") + ? 
rawStream.pipe(zlib.createGunzip()) + : rawStream; + + let lineno = 0; + let strings: string[] = []; + let inStrings = false; + let input = 0; + let re: RE2 | null = null; + let refull: RE2 | null = null; + let lineBuffer: string | null = null; + + for await (let line of readline.createInterface({ + input: inputFile, + crlfDelay: Infinity, + })) { + lineno += 1; + if (line.length === 0) { + throw new Error(`${lineno}: unexpected blank line`); + } + + const first = line.charAt(0); + const firstCodePoint = codePoint(first); + if (first === "#") { + continue; + } + + if (codePoint("A") <= firstCodePoint && firstCodePoint <= codePoint("Z")) { + continue; + } + + if (line === "strings") { + if (input < strings.length) { + throw new Error(`${lineno}: out of sync: strings left`); + } + + strings = []; + inStrings = true; + } else if (line === "regexps") { + inStrings = false; + } else if (first === '"' || lineBuffer) { + let q = line; + + if (lineBuffer && lineBuffer.length > 0) { + q = `${lineBuffer}${q}`; + lineBuffer = null; + } else if (!q.endsWith('"') || q === '"') { + lineBuffer = `${q}\r`; + continue; + } + + try { + q = unquote(q); + } catch (e) { + throw new Error(`${lineno}: Error to unquote: ${line}, error: ${e}`); + } + + if (inStrings) { + strings = [...strings, q]; + continue; + } + + re = refull = null; + + try { + re = RE2.compile(q); + } catch (e) { + if ( + (e as Error).message === + "error parsing regexp: invalid escape sequence: `\\C`" + ) { + continue; + } + throw e; + } + + try { + refull = RE2.compile(`\\A(?:${q})\\z`); + } catch (e) { + console.error("Error to refull parse: ", q, e); // eslint-disable-line no-console + } + + input = 0; + } else if ( + first === "-" || + (codePoint("0") <= firstCodePoint && firstCodePoint <= codePoint("9")) + ) { + if (re === null) { + continue; + } + + if (input >= strings.length) { + throw new Error(`${lineno}: out of sync: no input remaining`); + } + + const text = strings[input++]; + const multibyte = 
!isSingleBytes(text); + + if (multibyte && re.toString().includes("\\B")) { + continue; + } + + const res = line.split(";"); + if (res.length !== 4) { + throw new Error( + `${lineno}: wrong test results: ${JSON.stringify(res)}`, + ); + } + + for (let i = 0; i < 2; i++) { + const partial = (i & 1) !== 0; + + const regexp = partial ? re : refull; + if (regexp === null) continue; + + const want = parseResult(lineno, res[i]); + const wantMatch = want !== null; + + assert.strictEqual(regexp.match(text), wantMatch); + } + } else { + throw new Error(`${lineno}: out of sync`); + } + } + + if (input < strings.length) { + throw new Error("out of sync: have strings left"); + } +}; + +const parseFowlerResult = (s: string): [number[], boolean[]] => { + if (s.length === 0) { + return [[], [true, true]]; + } + if (s === "NOMATCH") { + return [[], [true, false]]; + } + const firstCp = codePoint(s); + if (codePoint("A") <= firstCp && firstCp <= codePoint("Z")) { + return [[], [false, false]]; + } + + const shouldCompileMatch = [true, true]; + + const result: number[] = []; + while (s.length > 0) { + let end = ")"; + if (result.length % 2 === 0) { + if (s.charAt(0) !== "(") { + throw new Error("parse error: missing '('"); + } + s = s.substring(1); + end = ","; + } + const i = s.indexOf(end); + if (i <= 0) { + throw new Error("parse error: missing '" + end + "'"); + } + const num = s.substring(0, i); + if (num !== "?") { + result.push(parseInt(num)); + } else { + result.push(-1); + } + s = s.substring(i + 1); + } + if (result.length % 2 !== 0) { + throw new Error("parse error: odd number of fields"); + } + return [result, shouldCompileMatch]; +}; + +const testFowler = async (fileName: string): Promise => { + const rawStream = fs.createReadStream(path.join(FIXTURES_DIRNAME, fileName)); + const inputFile: NodeJS.ReadableStream = fileName.endsWith(".gz") + ? 
rawStream.pipe(zlib.createGunzip()) + : rawStream; + + let lineno = 0; + let lastRegexp = ""; + + for await (const line of readline.createInterface({ input: inputFile })) { + lineno += 1; + + if (!line || line[0] === "#") { + continue; + } + + const field = line.split("\t").filter((s) => s.length > 0); + for (let i = 0; i < field.length; i++) { + if (field[i] === "NULL") { + field[i] = ""; + } + if (field[i] === "NIL") { + } + } + + if (field.length === 0) { + continue; + } + + let flag = field[0]; + + switch (flag.charAt(0)) { + case "?": + case "&": + case "|": + case ";": + case "{": + case "}": { + flag = flag.substring(1); + if (!flag || flag === "") { + continue; + } + break; + } + case ":": { + const i = flag.indexOf(":", 1); + if (i < 0) { + continue; + } + flag = flag.substring(1 + i + 1); + break; + } + case "C": + case "N": + case "T": + case "0": + case "1": + case "2": + case "3": + case "4": + case "5": + case "6": + case "7": + case "8": + case "9": + continue; + } + + if (field.length < 4) { + throw new Error(`${lineno}: too few fields: ${line}`); + } + + if (flag.indexOf("$") >= 0) { + field[1] = unquote(`"${field[1]}"`); + field[2] = unquote(`"${field[2]}"`); + } + + if (field[1] === "SAME") { + field[1] = lastRegexp; + } + lastRegexp = field[1]; + + const text = field[2]; + + const [, shouldCompileMatch] = parseFowlerResult(field[3]); + + for (let i = 0; i < flag.length; i++) { + let pattern = field[1]; + + let flags = POSIX | CLASS_NL; + switch (flag.charAt(i)) { + case "E": + break; + case "L": + pattern = quoteMeta(pattern); + break; + default: + continue; + } + + if (flag.indexOf("i") >= 0) { + flags |= FOLD_CASE; + } + + let re = null; + try { + re = RE2.compileImpl(pattern, flags); + } catch (_e) { + if (shouldCompileMatch[0]) { + throw new Error(`${lineno}: ${pattern} did not compile`); + } + continue; + } + + assert.strictEqual(shouldCompileMatch[0], true); + + const match = re.match(text); + assert.strictEqual(match, 
shouldCompileMatch[1]); + } + } +}; + +test("RE2 search", async () => { + await testRE2("re2-search.txt"); +}); + +test("RE2 exhaustive", { timeout: 2000000 }, async () => { + await testRE2("re2-exhaustive.txt.gz"); +}); + +test("RE2 fowler basic", async () => { + await testFowler("basic.dat"); +}); + +test("RE2 fowler null subexpr", async () => { + await testFowler("nullsubexpr.dat"); +}); + +test("RE2 fowler repetition", async () => { + await testFowler("repetition.dat"); +}); + +test("example", () => { + const re = RE2.compile("(?i:co(.)a)"); + assert.strictEqual(re.match("Copacobana"), true); + assert.strictEqual(re.match("xyz"), false); +}); diff --git a/packages/re2/src/__tests__/Inst.test.ts b/packages/re2/src/__tests__/Inst.test.ts new file mode 100644 index 0000000..cc98774 --- /dev/null +++ b/packages/re2/src/__tests__/Inst.test.ts @@ -0,0 +1,50 @@ +import { describe, it } from "node:test"; +import * as assert from "node:assert/strict"; +import { Inst } from "../Inst.js"; +import { FOLD_CASE } from "../RE2Flags.js"; +import { codePoint } from "../__utils__/chars.js"; + +describe("Inst.matchRune Array Search Logic", () => { + it("correctly matches using the linear search fast-path (length 4)", () => { + const inst = new Inst(Inst.RUNE); + inst.runes = [10, 20, 30, 40]; + inst.arg = 0; + + assert.strictEqual(inst.matchRune(9), false); + assert.strictEqual(inst.matchRune(10), true); + assert.strictEqual(inst.matchRune(15), true); + assert.strictEqual(inst.matchRune(20), true); + + assert.strictEqual(inst.matchRune(25), false); + + assert.strictEqual(inst.matchRune(30), true); + assert.strictEqual(inst.matchRune(35), true); + assert.strictEqual(inst.matchRune(41), false); + }); + + it("correctly matches using binary search for large arrays (length > 8)", () => { + const inst = new Inst(Inst.RUNE); + inst.runes = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]; + inst.arg = 0; + + assert.strictEqual(inst.matchRune(9), false); + + 
assert.strictEqual(inst.matchRune(15), true); + assert.strictEqual(inst.matchRune(45), false); + assert.strictEqual(inst.matchRune(55), true); + assert.strictEqual(inst.matchRune(85), false); + assert.strictEqual(inst.matchRune(95), true); + + assert.strictEqual(inst.matchRune(101), false); + }); + + it("correctly handles case-folding single runes", () => { + const inst = new Inst(Inst.RUNE); + inst.runes = [codePoint("a")]; + inst.arg = FOLD_CASE; + + assert.strictEqual(inst.matchRune(codePoint("a")), true); + assert.strictEqual(inst.matchRune(codePoint("A")), true); + assert.strictEqual(inst.matchRune(codePoint("b")), false); + }); +}); diff --git a/packages/re2/src/__tests__/Parser.test.ts b/packages/re2/src/__tests__/Parser.test.ts new file mode 100644 index 0000000..06c3882 --- /dev/null +++ b/packages/re2/src/__tests__/Parser.test.ts @@ -0,0 +1,424 @@ +import { describe, test, it } from "node:test"; +import * as assert from "node:assert/strict"; +import { + FOLD_CASE, + LITERAL, + MATCH_NL, + PERL, + PERL_X, + POSIX, + UNICODE_GROUPS, +} from "../RE2Flags.js"; +import { RE2JSSyntaxException } from "../exceptions.js"; +import { Parser } from "../Parser.js"; +import { isUpper, simpleFold } from "../Unicode.js"; +import { dumpRegexp, mkCharClass } from "../__utils__/parser.js"; + +describe(".parse", () => { + const cases: [string, string | null][] = [ + ["a", "lit{a}"], + ["a.", "cat{lit{a}dot{}}"], + ["a.b", "cat{lit{a}dot{}lit{b}}"], + ["ab", "str{ab}"], + ["a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}"], + ["abc", "str{abc}"], + ["a|^", "alt{lit{a}bol{}}"], + ["a|b", "cc{0x61-0x62}"], + ["(a)", "cap{lit{a}}"], + ["(a)|b", "alt{cap{lit{a}}lit{b}}"], + ["a*", "star{lit{a}}"], + ["a+", "plus{lit{a}}"], + ["a?", "que{lit{a}}"], + ["a{2}", "rep{2,2 lit{a}}"], + ["a{2,3}", "rep{2,3 lit{a}}"], + ["a{2,}", "rep{2,-1 lit{a}}"], + ["a*?", "nstar{lit{a}}"], + ["a+?", "nplus{lit{a}}"], + ["a??", "nque{lit{a}}"], + ["a{2}?", "nrep{2,2 lit{a}}"], + ["a{2,3}?", "nrep{2,3 
lit{a}}"], + ["a{2,}?", "nrep{2,-1 lit{a}}"], + ["x{1001", "str{x{1001}"], + ["x{9876543210", "str{x{9876543210}"], + ["x{9876543210,", "str{x{9876543210,}"], + ["x{2,1", "str{x{2,1}"], + ["x{1,9876543210", "str{x{1,9876543210}"], + ["", "emp{}"], + ["|x|", "alt{emp{}lit{x}emp{}}"], + [".", "dot{}"], + ["^", "bol{}"], + ["$", "eol{}"], + ["\\|", "lit{|}"], + ["\\(", "lit{(}"], + ["\\)", "lit{)}"], + ["\\*", "lit{*}"], + ["\\+", "lit{+}"], + ["\\?", "lit{?}"], + ["{", "lit{{}"], + ["}", "lit{}}"], + ["\\.", "lit{.}"], + ["\\^", "lit{^}"], + ["\\$", "lit{$}"], + ["\\\\", "lit{\\}"], + ["[ace]", "cc{0x61 0x63 0x65}"], + ["[abc]", "cc{0x61-0x63}"], + ["[a-z]", "cc{0x61-0x7a}"], + ["[a]", "lit{a}"], + ["\\-", "lit{-}"], + ["-", "lit{-}"], + ["\\_", "lit{_}"], + ["abc", "str{abc}"], + ["abc|def", "alt{str{abc}str{def}}"], + ["abc|def|ghi", "alt{str{abc}str{def}str{ghi}}"], + + ["[[:lower:]]", "cc{0x61-0x7a}"], + ["[a-z]", "cc{0x61-0x7a}"], + ["[^[:lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"], + ["[[:^lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"], + ["(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"], + ["(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"], + [ + "(?i)[^[:lower:]]", + "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}", + ], + [ + "(?i)[[:^lower:]]", + "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}", + ], + ["\\d", "cc{0x30-0x39}"], + ["\\D", "cc{0x0-0x2f 0x3a-0x10ffff}"], + ["\\s", "cc{0x9-0xa 0xc-0xd 0x20}"], + ["\\S", "cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}"], + ["\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}"], + ["\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}"], + ["(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}"], + [ + "(?i)\\W", + "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}", + ], + ["[^\\\\]", "cc{0x0-0x5b 0x5d-0x10ffff}"], + + ["\\p{Ascii}", "cc{0x0-0x7f}"], + ["\\P{Ascii}", "cc{0x80-0x10ffff}"], + ["\\p{^Ascii}", "cc{0x80-0x10ffff}"], + ["\\P{^Ascii}", 
"cc{0x0-0x7f}"], + ["\\p{Braille}", "cc{0x2800-0x28ff}"], + ["\\P{Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}"], + ["\\p{^Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}"], + ["\\P{^Braille}", "cc{0x2800-0x28ff}"], + [ + "\\pZ", + "cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}", + ], + ["[\\p{Braille}]", "cc{0x2800-0x28ff}"], + ["[\\P{Braille}]", "cc{0x0-0x27ff 0x2900-0x10ffff}"], + ["[\\p{^Braille}]", "cc{0x0-0x27ff 0x2900-0x10ffff}"], + ["[\\P{^Braille}]", "cc{0x2800-0x28ff}"], + [ + "[\\pZ]", + "cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}", + ], + ["\\p{Lu}", mkCharClass((r) => isUpper(r))], + ["[\\p{Lu}]", mkCharClass((r) => isUpper(r))], + [ + "(?i)[\\p{Lu}]", + mkCharClass((r) => { + if (isUpper(r)) { + return true; + } + + for (let c = simpleFold(r); c !== r; c = simpleFold(c)) { + if (isUpper(c)) { + return true; + } + } + return false; + }), + ], + ["\\p{Any}", "dot{}"], + ["\\p{^Any}", "cc{}"], + + ["[\\012-\\234]\\141", "cat{cc{0xa-0x9c}lit{a}}"], + ["[\\x{41}-\\x7a]\\x61", "cat{cc{0x41-0x7a}lit{a}}"], + + ["a{,2}", "str{a{,2}}"], + ["\\.\\^\\$\\\\", "str{.^$\\}"], + ["[a-zABC]", "cc{0x41-0x43 0x61-0x7a}"], + ["[^a]", "cc{0x0-0x60 0x62-0x10ffff}"], + ["[α-ε☺]", "cc{0x3b1-0x3b5 0x263a}"], + ["a*{", "cat{star{lit{a}}lit{{}}"], + + ["(?:ab)*", "star{str{ab}}"], + ["(ab)*", "star{cap{str{ab}}}"], + ["ab|cd", "alt{str{ab}str{cd}}"], + ["a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}"], + + ["(?:a)", "lit{a}"], + ["(?:ab)(?:cd)", "str{abcd}"], + [ + "(?:a+b+)(?:c+d+)", + "cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}", + ], + [ + "(?:a+|b+)|(?:c+|d+)", + "alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}", + ], + ["(?:a|b)|(?:c|d)", "cc{0x61-0x64}"], + ["a|.", "dot{}"], + [".|a", "dot{}"], + [ + "(?:[abc]|A|Z|hello|world)", + "alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}", + ], + ["(?:[abc]|A|Z)", "cc{0x41 0x5a 0x61-0x63}"], + + ["\\Q+|*?{[\\E", "str{+|*?{[}"], + ["\\Q+\\E+", "plus{lit{+}}"], + 
["\\Qab\\E+", "cat{lit{a}plus{lit{b}}}"], + ["\\Q\\\\E", "lit{\\}"], + ["\\Q\\\\\\E", "str{\\\\}"], + + ["(?m)^", "bol{}"], + ["(?m)$", "eol{}"], + ["(?-m)^", "bot{}"], + ["(?-m)$", "eot{}"], + ["(?m)\\A", "bot{}"], + ["(?m)\\z", "eot{\\z}"], + ["(?-m)\\A", "bot{}"], + ["(?-m)\\z", "eot{\\z}"], + + ["(?Pa)", "cap{name:lit{a}}"], + ["(?a)", "cap{name:lit{a}}"], + [ + "(?Pf{0,10})(?Pb{0,10})", + "cat{cap{baz:rep{0,10 lit{f}}}cap{bag:rep{0,10 lit{b}}}}", + ], + [ + "(?f{0,10})(?b{0,10})", + "cat{cap{baz:rep{0,10 lit{f}}}cap{bag:rep{0,10 lit{b}}}}", + ], + + ["[Aa]", "litfold{A}"], + ["[\\x{100}\\x{101}]", "litfold{Ā}"], + ["[Δδ]", "litfold{Δ}"], + + ["abcde", "str{abcde}"], + ["[Aa][Bb]cd", "cat{strfold{AB}str{cd}}"], + + ["(?:.)", "dot{}"], + ["(?:A(?:A|a))", "cat{lit{A}litfold{A}}"], + ["(?:A|a)", "litfold{A}"], + ["A|(?:A|a)", "litfold{A}"], + ["(?s).", "dot{}"], + ["(?-s).", "dnl{}"], + ["(?:(?:^).)", "cat{bol{}dot{}}"], + ["(?-s)(?:(?:^).)", "cat{bol{}dnl{}}"], + ["[\\x00-\\x{10FFFF}]", "dot{}"], + ["[^\\x00-\\x{10FFFF}]", "cc{}"], + ["(?:[a][a-])", "cat{lit{a}cc{0x2d 0x61}}"], + + ["abc|x|abd", "alt{str{abc}lit{x}str{abd}}"], + + ["((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))", null], + ["((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})", null], + + [ + `${[...new Array(999)].map(() => "(").join("")}${[...new Array(999)].map(() => ")").join("")}`, + null, + ], + [ + `${[...new Array(999)].map(() => "(?:").join("")}${[...new Array(999)].map(() => ")*").join("")}`, + null, + ], + [`(${[...new Array(12345)].map(() => "|").join("")})`, null], + ]; + + const flags = MATCH_NL | PERL_X | UNICODE_GROUPS; + + for (const [input, expected] of cases) { + test(`input ${JSON.stringify(input).slice(0, 100)} returns ${JSON.stringify(expected)}`, () => { + const re = Parser.parse(input, flags); + let parsedRe: string | null = null; + assert.doesNotThrow(() => { + parsedRe = dumpRegexp(re); + }); + if (expected !== null) { + assert.strictEqual(parsedRe, expected); + } + 
}); + } +}); + +describe("fold cases", () => { + const cases: [string, string][] = [ + ["AbCdE", "strfold{ABCDE}"], + ["[Aa]", "litfold{A}"], + ["a", "litfold{A}"], + ["A[F-g]", "cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}"], + ["[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"], + ["[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"], + ]; + + for (const [input, expected] of cases) { + test(`input ${JSON.stringify(input)} expected ${JSON.stringify(expected)}`, () => { + const re = Parser.parse(input, FOLD_CASE); + assert.strictEqual(dumpRegexp(re), expected); + }); + } +}); + +describe("literal cases", () => { + const cases: [string, string][] = [ + ["(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"], + ]; + + for (const [input, expected] of cases) { + test(`input ${JSON.stringify(input)} expected ${JSON.stringify(expected)}`, () => { + const re = Parser.parse(input, LITERAL); + assert.strictEqual(dumpRegexp(re), expected); + }); + } +}); + +describe("match new line cases", () => { + const cases: [string, string][] = [ + [".", "dot{}"], + ["\n", "lit{\n}"], + ["[^a]", "cc{0x0-0x60 0x62-0x10ffff}"], + ["[a\\n]", "cc{0xa 0x61}"], + ]; + + for (const [input, expected] of cases) { + test(`input ${JSON.stringify(input)} expected ${JSON.stringify(expected)}`, () => { + const re = Parser.parse(input, MATCH_NL); + assert.strictEqual(dumpRegexp(re), expected); + }); + } +}); + +describe("no match new line cases", () => { + const cases: [string, string][] = [ + [".", "dnl{}"], + ["\n", "lit{\n}"], + ["[^a]", "cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}"], + ["[a\\n]", "cc{0xa 0x61}"], + ]; + + for (const [input, expected] of cases) { + test(`input ${JSON.stringify(input)} expected ${JSON.stringify(expected)}`, () => { + const re = Parser.parse(input, 0); + assert.strictEqual(dumpRegexp(re), expected); + }); + } +}); + +describe("invalid regexp cases", () => { + const invalidInputs: string[] = [ + "(", + ")", + "(a", + "(a|b|", + "(a|b", + "[a-z", + "([a-z)", + "x{1001}", + 
"x{9876543210}", + "x{2,1}", + "x{1,9876543210}", + "(?Pa", + "(?P", + "(?Pa)", + "(?P<>a)", + "(?a", + "(?", + "(?a)", + "(?<>a)", + "[a-Z]", + "(?i)[a-Z]", + "\\Q\\E*", + "a{100000}", + "a{100000,}", + "(?Pbar)(?Pbaz)", + "(?Pbar)(?baz)", + "(?bar)(?Pbaz)", + "(?bar)(?baz)", + "\\x", + "\\xv", + "^[a-z0-9\\–\\-'‘’]+$", + '[\\”\\“]"', + "[\\<\\>\\{\\}\\[\\]\\|\\”\\%\\~\\#]", + "((g{2,32}|q){1,32})", + "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})", + `${[...new Array(1000)].map(() => "(").join("")}${[...new Array(1000)].map(() => ")").join("")}`, + `${[...new Array(1000)].map(() => "(?:").join("")}${[...new Array(1000)].map(() => ")*").join("")}`, + `(${[...new Array(1000)].map(() => "(xx?)").join("")}){1000}`, + `${[...new Array(1000)].map(() => "(xx?){1000}").join("")}`, + `${[...new Array(27000)].map(() => "\\pL").join("")}`, + ]; + + for (const input of invalidInputs) { + test(`invalid ${JSON.stringify(input).slice(0, 80)} raise error`, () => { + assert.throws(() => Parser.parse(input, PERL), RE2JSSyntaxException); + assert.throws(() => Parser.parse(input, POSIX), RE2JSSyntaxException); + }); + } + + const validInPerl: string[] = [ + "[a-b-c]", + "\\Qabc\\E", + "\\Q*+?{[\\E", + "\\Q\\\\E", + "\\Q\\\\\\E", + "\\Q\\\\\\\\E", + "\\Q\\\\\\\\\\E", + "(?:a)", + "(?Pa)", + "(?a)", + ]; + + for (const input of validInPerl) { + test(`valid ${JSON.stringify(input)} in perl mode`, () => { + assert.doesNotThrow(() => Parser.parse(input, PERL)); + }); + } + + const invalidInPerl: string[] = [ + "a++", + "a**", + "a?*", + "a+*", + "a{1}*", + ".{1}{2}.{3}", + ]; + + for (const input of invalidInPerl) { + test(`invalid ${JSON.stringify(input)} in perl mode`, () => { + assert.throws(() => Parser.parse(input, PERL), RE2JSSyntaxException); + }); + } +}); + +describe("large AST flat structures", () => { + it("should not exceed call stack size on massive alternations", () => { + const massiveAlternation = new Array(100000).fill("a").join("|"); + assert.doesNotThrow(() => { 
+ Parser.parse(massiveAlternation, PERL); + }); + }); + + it("should not exceed call stack size on massive concatenations", () => { + const massiveConcat = new Array(100000).fill("(a)").join(""); + assert.doesNotThrow(() => { + Parser.parse(massiveConcat, PERL); + }); + }); +}); + +describe("Flag interactions", () => { + it("should parse \\p correctly with UNICODE_GROUPS enabled", () => { + const re = Parser.parse("\\p{Any}", PERL); + assert.strictEqual(dumpRegexp(re), "dot{}"); + }); +}); diff --git a/packages/re2/src/__tests__/Prefilter.test.ts b/packages/re2/src/__tests__/Prefilter.test.ts new file mode 100644 index 0000000..de552be --- /dev/null +++ b/packages/re2/src/__tests__/Prefilter.test.ts @@ -0,0 +1,150 @@ +import { describe, test, it } from "node:test"; +import * as assert from "node:assert/strict"; +import { Parser } from "../Parser.js"; +import { simplify } from "../Simplify.js"; +import { PrefilterTree, Prefilter } from "../Prefilter.js"; +import { PERL } from "../RE2Flags.js"; +import { RE2JS } from "../index.js"; +import { fromUTF16 } from "../MachineInput.js"; + +const dumpPrefilter = (pf: Prefilter | null): string => { + if (!pf) return "null"; + switch (pf.type) { + case Prefilter.Type.NONE: + return "NONE"; + case Prefilter.Type.EXACT: + return `EXACT("${pf.str}")`; + case Prefilter.Type.AND: + return `AND(${pf.subs.map(dumpPrefilter).join(", ")})`; + case Prefilter.Type.OR: + return `OR(${pf.subs.map(dumpPrefilter).join(", ")})`; + default: + return "UNKNOWN"; + } +}; + +const getPrefilterDump = (pattern: string, flags: number = PERL): string => { + let re = Parser.parse(pattern, flags); + re = simplify(re); + const pf = PrefilterTree.build(re); + return dumpPrefilter(pf); +}; + +describe("PrefilterTree.build AST Extraction", () => { + const cases: [string, string][] = [ + ["foo", 'EXACT("foo")'], + ["^foo$", 'EXACT("foo")'], + + ["foo.*bar", 'AND(EXACT("foo"), EXACT("bar"))'], + ["a.*b.*c", 'AND(EXACT("a"), EXACT("b"), EXACT("c"))'], + + 
["foo|bar", 'OR(EXACT("foo"), EXACT("bar"))'], + [ + "apple|banana|cherry", + 'OR(EXACT("apple"), EXACT("banana"), EXACT("cherry"))', + ], + + ["(foo|bar)baz", 'AND(OR(EXACT("foo"), EXACT("bar")), EXACT("baz"))'], + [ + "foo(bar|baz)qux", + 'AND(EXACT("foo"), OR(EXACT("bar"), EXACT("baz")), EXACT("qux"))', + ], + + ["a+b", 'AND(EXACT("a"), EXACT("b"))'], + ["a{2,5}b", 'AND(EXACT("a"), EXACT("a"), EXACT("b"))'], + ["a?b", 'EXACT("b")'], + ["a*b", 'EXACT("b")'], + + ["foo|foo", 'OR(EXACT("foo"))'], + ["(a|a)b", 'AND(EXACT("a"), EXACT("b"))'], + ["a?b?c?", "NONE"], + + ["(?i)foo", "NONE"], + ["\\d+foo", 'EXACT("foo")'], + ["[a-z]+|foo", "NONE"], + ["a|b|c", "NONE"], + ]; + + for (const [pattern, expected] of cases) { + test(`pattern ${JSON.stringify(pattern)} builds prefilter ${JSON.stringify(expected)}`, () => { + assert.strictEqual(getPrefilterDump(pattern), expected); + }); + } +}); + +describe("Prefilter Evaluation (UTF-16 & UTF-8)", () => { + it("correctly evaluates EXACT filters", () => { + const pf = PrefilterTree.build(simplify(Parser.parse("foo", PERL))); + + assert.strictEqual(pf.eval(fromUTF16("bar foo baz"), 0), true); + assert.strictEqual(pf.eval(fromUTF16("bar fox baz"), 0), false); + }); + + it("correctly evaluates AND filters", () => { + const pf = PrefilterTree.build(simplify(Parser.parse("foo.*bar", PERL))); + + const input1 = fromUTF16("foo and then bar"); + assert.strictEqual(pf.eval(input1, 0), true); + + const input2 = fromUTF16("foo and then baz"); + assert.strictEqual(pf.eval(input2, 0), false); + }); + + it("correctly evaluates OR filters", () => { + const pf = PrefilterTree.build(simplify(Parser.parse("foo|bar", PERL))); + + const input1 = fromUTF16("I have a bar"); + assert.strictEqual(pf.eval(input1, 0), true); + + const input2 = fromUTF16("I have a baz"); + assert.strictEqual(pf.eval(input2, 0), false); + }); +}); + +describe("Engine Integration", () => { + it("securely bails out early during unanchored tests", () => { + const re = 
RE2JS.compile("error.*critical"); + + assert.strictEqual(re.test("There was an error that was critical"), true); + assert.strictEqual(re.test("There was an error that was minor"), false); + }); + + it("does not interfere with anchored execution", () => { + const re = RE2JS.compile("^foo.*bar$"); + + assert.strictEqual(re.testExact("foo and bar"), true); + assert.strictEqual(re.testExact("foo and baz"), false); + }); +}); + +describe("Advanced Prefilter Evaluation", () => { + it("handles complex AND/OR logic branches correctly", () => { + const pf = PrefilterTree.build( + simplify(Parser.parse("(foo|bar)baz", PERL)), + ); + + assert.strictEqual(pf.eval(fromUTF16("foobaz"), 0), true); + assert.strictEqual(pf.eval(fromUTF16("barbaz"), 0), true); + assert.strictEqual(pf.eval(fromUTF16("foo"), 0), false); + assert.strictEqual(pf.eval(fromUTF16("baz"), 0), false); + assert.strictEqual(pf.eval(fromUTF16("quxbaz"), 0), false); + }); + + it("evaluates emojis and multi-byte unicode safely", () => { + const pf = PrefilterTree.build(simplify(Parser.parse("🚀.*🌕", PERL))); + + assert.strictEqual(pf.eval(fromUTF16("To the 🚀 and then 🌕!"), 0), true); + assert.strictEqual( + pf.eval(fromUTF16("To the 🚀 and then back!"), 0), + false, + ); + }); + + it("respects end boundaries on bounded input buffers", () => { + const pf = PrefilterTree.build(simplify(Parser.parse("hidden", PERL))); + const text = "visible hidden"; + + const utf16Input = fromUTF16(text, 0, 7); + assert.strictEqual(pf.eval(utf16Input, 0), false); + }); +}); diff --git a/packages/re2/src/__tests__/RE2Compile.test.ts b/packages/re2/src/__tests__/RE2Compile.test.ts new file mode 100644 index 0000000..27e8611 --- /dev/null +++ b/packages/re2/src/__tests__/RE2Compile.test.ts @@ -0,0 +1,61 @@ +import { describe, test } from "node:test"; +import * as assert from "node:assert/strict"; +import { RE2 } from "../RE2.js"; + +describe(".compile", () => { + const cases: [string, string | null][] = [ + ["", null], + [".", null], + 
["^.$", null], + ["a", null], + ["a*", null], + ["a+", null], + ["a?", null], + ["a|b", null], + ["a*|b*", null], + ["(a*|b)(c*|d)", null], + ["[a-z]", null], + ["[a-abc-c\\-\\]\\[]", null], + ["[a-z]+", null], + ["[abc]", null], + ["[^1234]", null], + ["[^\n]", null], + ["..|.#|..", null], + ["\\!\\\\", null], + ["abc]", null], + ["a??", null], + ["*", "missing argument to repetition operator: `*`"], + ["+", "missing argument to repetition operator: `+`"], + ["?", "missing argument to repetition operator: `?`"], + ["(abc", "missing closing ): `(abc`"], + ["abc)", "unexpected ): `abc)`"], + ["x[a-z", "missing closing ]: `[a-z`"], + ["[z-a]", "invalid character class range: `z-a`"], + ["abc\\", "trailing backslash at end of expression"], + ["a**", "invalid nested repetition operator: `**`"], + ["a*+", "invalid nested repetition operator: `*+`"], + ["\\x", "invalid escape sequence: `\\x`"], + ["\\p", "invalid character class range: `\\p`"], + ["\\p{", "invalid character class range: `\\p{`"], + ["((g{2,32}|q){1,32})", "invalid repeat count: `{1,32}`"], + ["((g{2,20}|q){1,20}){0,40}", "invalid repeat count: `{0,40}`"], + [ + [...new Array(1000)].map(() => "(xx?){1000}").join(""), + "expression too large", + ], + ]; + + for (const [input, expected] of cases) { + test(`input ${JSON.stringify(input)} compile raise error ${JSON.stringify(expected)}`, () => { + try { + RE2.compile(input); + assert.strictEqual(null, expected); + } catch (e) { + assert.strictEqual( + (e as Error).message, + `error parsing regexp: ${expected}`, + ); + } + }); + } +}); diff --git a/packages/re2/src/__tests__/RE2ExecuteEngine.test.ts b/packages/re2/src/__tests__/RE2ExecuteEngine.test.ts new file mode 100644 index 0000000..cd155a0 --- /dev/null +++ b/packages/re2/src/__tests__/RE2ExecuteEngine.test.ts @@ -0,0 +1,53 @@ +import { describe, test } from "node:test"; +import * as assert from "node:assert/strict"; +import { RE2 } from "../RE2.js"; +import { DFA } from "../DFA.js"; +import { 
ANCHOR_BOTH } from "../RE2Flags.js"; +import { fromUTF16 } from "../MachineInput.js"; +import { Prefilter } from "../Prefilter.js"; + +describe("Literal Fast-Path Routing", () => { + test("bails out early using literal fast path for strictly literal unanchored regexes", (t) => { + const prefilterSpy = t.mock.method(Prefilter.prototype, "eval"); + const dfaSpy = t.mock.method(DFA.prototype, "match"); + + const re = RE2.compile("hello"); + const result = re.match("say hello world"); + + assert.strictEqual(result, true); + assert.strictEqual(prefilterSpy.mock.callCount(), 0); + assert.strictEqual(dfaSpy.mock.callCount(), 0); + }); + + test("literal fast path boolean match works correctly", () => { + const re = RE2.compile("world"); + assert.strictEqual(re.match("hello world!"), true); + assert.strictEqual(re.match("hello earth!"), false); + }); + + test("literal fast path perfectly handles ANCHOR_BOTH (testExact)", (t) => { + const dfaSpy = t.mock.method(DFA.prototype, "match"); + + const re = RE2.compile("hello"); + + const matchInput = fromUTF16("hello"); + assert.notStrictEqual( + re.executeEngine(matchInput, 0, ANCHOR_BOTH, 0), + null, + ); + + const noMatchInput1 = fromUTF16("hello world"); + assert.strictEqual( + re.executeEngine(noMatchInput1, 0, ANCHOR_BOTH, 0), + null, + ); + + const noMatchInput2 = fromUTF16("say hello"); + assert.strictEqual( + re.executeEngine(noMatchInput2, 0, ANCHOR_BOTH, 0), + null, + ); + + assert.strictEqual(dfaSpy.mock.callCount(), 0); + }); +}); diff --git a/packages/re2/src/__tests__/RE2Match.test.ts b/packages/re2/src/__tests__/RE2Match.test.ts new file mode 100644 index 0000000..f8de46f --- /dev/null +++ b/packages/re2/src/__tests__/RE2Match.test.ts @@ -0,0 +1,16 @@ +import { describe, test } from "node:test"; +import * as assert from "node:assert/strict"; +import { RE2 } from "../RE2.js"; +import { FIND_TESTS } from "../__fixtures__/find.js"; + +describe("match", () => { + for (const testPattern of FIND_TESTS) { + 
test(String(testPattern), () => { + const re = RE2.compile(testPattern.pat); + assert.strictEqual( + re.match(testPattern.text), + testPattern.matches.length > 0, + ); + }); + } +}); diff --git a/packages/re2/src/__tests__/RE2QuoteMeta.test.ts b/packages/re2/src/__tests__/RE2QuoteMeta.test.ts new file mode 100644 index 0000000..25bcaac --- /dev/null +++ b/packages/re2/src/__tests__/RE2QuoteMeta.test.ts @@ -0,0 +1,40 @@ +import { describe, test } from "node:test"; +import * as assert from "node:assert/strict"; +import { RE2 } from "../RE2.js"; +import { RE2JS } from "../index.js"; +import { quoteMeta } from "../Utils.js"; + +const cases: [string, string, string, boolean][] = [ + ["", "", "", true], + ["foo", "foo", "foo", true], + ["foo\\.\\$", "foo\\\\\\.\\\\\\$", "foo.$", true], + ["foo.\\$", "foo\\.\\\\\\$", "foo", false], + [ + "!@#$%^&*()_+-=[{]}\\|,<.>/?~", + "!@#\\$%\\^&\\*\\(\\)_\\+-=\\[\\{\\]\\}\\\\\\|,<\\.>/\\?~", + "!@#", + false, + ], +]; + +describe("quoteMeta", () => { + for (const [pattern, output] of cases) { + test(`quote meta: pattern ${JSON.stringify(pattern)} quoted to ${JSON.stringify(output)}`, () => { + const quoted = quoteMeta(pattern); + assert.strictEqual(quoteMeta(pattern), output); + assert.strictEqual(RE2JS.quote(pattern), output); + if (pattern && pattern.length > 0) { + const re = RE2.compile(quoted); + assert.strictEqual(re.match(`abc${pattern}def`), true); + } + }); + } + + for (const [pattern, output, literal, isLiteral] of cases) { + test(`literal prefix: pattern ${JSON.stringify(pattern)} quoted to ${JSON.stringify(output)} and literal ${JSON.stringify(literal)} (isLiteral: ${isLiteral})`, () => { + const re = RE2.compile(pattern); + assert.strictEqual(re.prefix, literal); + assert.strictEqual(re.prefixComplete, isLiteral); + }); + } +}); diff --git a/packages/re2/src/__tests__/StressBoundary.test.ts b/packages/re2/src/__tests__/StressBoundary.test.ts new file mode 100644 index 0000000..2c725ad --- /dev/null +++ 
b/packages/re2/src/__tests__/StressBoundary.test.ts @@ -0,0 +1,97 @@ +import { describe, test } from "node:test"; +import * as assert from "node:assert/strict"; +import { RE2JS } from "../index.js"; + +describe("stress: boundary conditions", () => { + test("empty pattern matches empty string", () => { + const re = RE2JS.compile(""); + assert.strictEqual(re.testExact(""), true); + assert.strictEqual(re.test(""), true); + assert.strictEqual(re.test("abc"), true); + assert.strictEqual(re.testExact("abc"), false); + }); + + test("single char at MAX_RUNE (0x10FFFF)", () => { + const maxCharPattern = String.fromCodePoint(0x10ffff); + const re = RE2JS.compile(maxCharPattern); + assert.strictEqual(re.testExact(maxCharPattern), true); + assert.strictEqual(re.test(`x${maxCharPattern}y`), true); + }); + + test("char class covering full range", () => { + const re = RE2JS.compile("^.$"); + assert.strictEqual(re.testExact("a"), true); + assert.strictEqual(re.testExact(String.fromCodePoint(0x10ffff)), true); + assert.strictEqual(re.testExact("\n"), false); + }); + + test("dot with DOTALL flag", () => { + const re = RE2JS.compile("(?s)^.$"); + assert.strictEqual(re.testExact("\n"), true); + }); + + test("deeply nested non-capturing groups", () => { + const depth = 100; + const pattern = "(?:".repeat(depth) + "a" + ")".repeat(depth); + const re = RE2JS.compile(pattern); + assert.strictEqual(re.testExact("a"), true); + assert.strictEqual(re.testExact("b"), false); + }); + + test("deeply nested captures", () => { + const depth = 50; + const pattern = "(".repeat(depth) + "a" + ")".repeat(depth); + const re = RE2JS.compile(pattern); + assert.strictEqual(re.testExact("a"), true); + assert.strictEqual(re.groupCount(), depth); + }); + + test("zero-repetition prefix", () => { + const re = RE2JS.compile("a{0}b"); + assert.strictEqual(re.testExact("b"), true); + assert.strictEqual(re.testExact("ab"), false); + }); + + test("repetition lower bound 0", () => { + const re = 
RE2JS.compile("a{0,3}"); + assert.strictEqual(re.testExact(""), true); + assert.strictEqual(re.testExact("aaa"), true); + assert.strictEqual(re.testExact("aaaa"), false); + }); + + test("large bounded repetition", () => { + const re = RE2JS.compile("a{10,20}"); + assert.strictEqual(re.testExact("a".repeat(9)), false); + assert.strictEqual(re.testExact("a".repeat(10)), true); + assert.strictEqual(re.testExact("a".repeat(15)), true); + assert.strictEqual(re.testExact("a".repeat(20)), true); + assert.strictEqual(re.testExact("a".repeat(21)), false); + }); + + test("alternation with empty branch", () => { + const re = RE2JS.compile("(a|)b"); + assert.strictEqual(re.testExact("b"), true); + assert.strictEqual(re.testExact("ab"), true); + }); + + test("massive alternation", () => { + const alts = Array.from({ length: 1000 }, (_, i) => + String.fromCodePoint(0x61 + (i % 26)), + ).join("|"); + const re = RE2JS.compile(alts); + assert.strictEqual(re.test("a"), true); + assert.strictEqual(re.test("z"), true); + }); + + test("huge input matching", () => { + const re = RE2JS.compile("needle"); + const haystack = "a".repeat(100000) + "needle" + "a".repeat(100000); + assert.strictEqual(re.test(haystack), true); + }); + + test("empty char class", () => { + const re = RE2JS.compile("^[^\\x00-\\x{10FFFF}]$"); + assert.strictEqual(re.testExact("a"), false); + assert.strictEqual(re.testExact(""), false); + }); +}); diff --git a/packages/re2/src/__tests__/StressErrorRecovery.test.ts b/packages/re2/src/__tests__/StressErrorRecovery.test.ts new file mode 100644 index 0000000..0e5048f --- /dev/null +++ b/packages/re2/src/__tests__/StressErrorRecovery.test.ts @@ -0,0 +1,68 @@ +import { describe, test } from "node:test"; +import * as assert from "node:assert/strict"; +import { RE2JS } from "../index.js"; +import { RE2JSSyntaxException } from "../exceptions.js"; + +describe("stress: parser error recovery", () => { + const compileGoodAfterBad = ( + badPattern: string, + goodPattern: string, 
+ ): void => { + assert.throws(() => RE2JS.compile(badPattern), RE2JSSyntaxException); + assert.doesNotThrow(() => RE2JS.compile(goodPattern)); + }; + + test("unclosed char class recovers", () => { + compileGoodAfterBad("[a-z", "abc"); + }); + + test("incomplete property escape recovers", () => { + compileGoodAfterBad("\\p{", "xyz"); + }); + + test("unclosed named capture recovers", () => { + compileGoodAfterBad("(?P { + assert.doesNotThrow(() => RE2JS.compile("a{")); + assert.doesNotThrow(() => RE2JS.compile("a{1,2,3}")); + compileGoodAfterBad("a{1001}", "xyz"); + compileGoodAfterBad("a{100000,}", "xyz"); + }); + + test("unknown property name recovers", () => { + compileGoodAfterBad("\\p{NotAValidName}", "abc"); + }); + + test("unmatched close paren recovers", () => { + compileGoodAfterBad("abc)", "xyz"); + }); + + test("unmatched open paren recovers", () => { + compileGoodAfterBad("(abc", "xyz"); + }); + + test("nested repetition recovers", () => { + compileGoodAfterBad("a**", "xyz"); + compileGoodAfterBad("a{1}{2}", "xyz"); + }); + + test("invalid escape recovers", () => { + compileGoodAfterBad("\\x", "xyz"); + compileGoodAfterBad("\\u", "xyz"); + }); + + test("many errors in sequence", () => { + const bad = ["(", ")", "[", "\\p{", "\\x", "a**", "a{1,0}"]; + for (const p of bad) { + assert.throws(() => RE2JS.compile(p)); + } + assert.strictEqual(RE2JS.compile("hello").test("hello"), true); + }); + + test("massive alternation stays under limit", () => { + const pat = `(${Array(12345).fill("").join("|")})`; + assert.doesNotThrow(() => RE2JS.compile(pat)); + }); +}); diff --git a/packages/re2/src/__tests__/StressFlags.test.ts b/packages/re2/src/__tests__/StressFlags.test.ts new file mode 100644 index 0000000..543f917 --- /dev/null +++ b/packages/re2/src/__tests__/StressFlags.test.ts @@ -0,0 +1,80 @@ +import { describe, test } from "node:test"; +import * as assert from "node:assert/strict"; +import { RE2JS } from "../index.js"; + +describe("stress: flag 
interactions", () => { + test("(?i)(?m) case-insensitive + multiline", () => { + const re = RE2JS.compile("(?im)^[a-z]+$"); + assert.strictEqual(re.test("hello"), true); + assert.strictEqual(re.test("HELLO"), true); + assert.strictEqual(re.test("Hello\nWORLD"), true); + assert.strictEqual(re.test("1234"), false); + }); + + test("(?s)(?i) dotall + case-insensitive", () => { + const re = RE2JS.compile("(?si)a.b"); + assert.strictEqual(re.test("A\nB"), true); + assert.strictEqual(re.test("aXb"), true); + }); + + test("(?i) with Unicode category", () => { + const re = RE2JS.compile("(?i)^\\p{Lu}+$"); + assert.strictEqual(re.test("HELLO"), true); + assert.strictEqual(re.test("hello"), true); + assert.strictEqual(re.test("123"), false); + }); + + test("(?-i) turns off case-insensitive within scope", () => { + const re = RE2JS.compile("(?i)a(?-i:b)c"); + assert.strictEqual(re.test("abc"), true); + assert.strictEqual(re.test("ABC"), false); + assert.strictEqual(re.test("Abc"), true); + }); + + test("multiline ^ and $ behavior", () => { + const re = RE2JS.compile("(?m)^foo$"); + assert.strictEqual(re.test("foo"), true); + assert.strictEqual(re.test("bar\nfoo\nbaz"), true); + assert.strictEqual(re.test("barfoo"), false); + }); + + test("default (non-multiline) $ matches end-of-text only", () => { + const re = RE2JS.compile("foo$"); + assert.strictEqual(re.test("foo"), true); + assert.strictEqual(re.test("foo\n"), false); + assert.strictEqual(re.test("foox"), false); + }); + + test("\\A and \\z always anchor text boundaries", () => { + const re = RE2JS.compile("(?m)\\Afoo\\z"); + assert.strictEqual(re.testExact("foo"), true); + assert.strictEqual(re.test("bar\nfoo"), false); + }); + + test("DOTALL does not affect line anchors", () => { + const re = RE2JS.compile("(?s)^foo$"); + assert.strictEqual(re.test("foo"), true); + assert.strictEqual(re.test("bar\nfoo"), false); + }); + + test("case-insensitive with ASCII char class", () => { + const re = RE2JS.compile("(?i)[a-z]+"); + 
assert.strictEqual(re.test("HELLO"), true); + assert.strictEqual(re.test("hello"), true); + assert.strictEqual(re.test("123"), false); + }); + + test("case-insensitive with single-fold letters", () => { + const re = RE2JS.compile("(?i)k"); + assert.strictEqual(re.test("k"), true); + assert.strictEqual(re.test("K"), true); + assert.strictEqual(re.test(String.fromCodePoint(0x212a)), true); + }); + + test("case-insensitive with long-s orbit", () => { + const re = RE2JS.compile("(?i)s"); + assert.strictEqual(re.test("s"), true); + assert.strictEqual(re.test("S"), true); + assert.strictEqual(re.test(String.fromCodePoint(0x017f)), true); + }); +}); diff --git a/packages/re2/src/__tests__/StressUnicode.test.ts b/packages/re2/src/__tests__/StressUnicode.test.ts new file mode 100644 index 0000000..3e8dd1d --- /dev/null +++ b/packages/re2/src/__tests__/StressUnicode.test.ts @@ -0,0 +1,89 @@ +import { describe, test } from "node:test"; +import * as assert from "node:assert/strict"; +import { RE2JS } from "../index.js"; + +describe("stress: Unicode edge cases", () => { + test("char class with supplementary plane range", () => { + const re = RE2JS.compile("^[\\x{10000}-\\x{10FFFF}]$"); + assert.strictEqual(re.testExact(String.fromCodePoint(0x10000)), true); + assert.strictEqual(re.testExact(String.fromCodePoint(0x10ffff)), true); + assert.strictEqual(re.testExact(String.fromCodePoint(0x1f600)), true); + assert.strictEqual(re.testExact("a"), false); + }); + + test("emoji in literal", () => { + const emoji = String.fromCodePoint(0x1f600); + const re = RE2JS.compile(`^${emoji}$`); + assert.strictEqual(re.testExact(emoji), true); + assert.strictEqual(re.testExact("a"), false); + }); + + test("mixed script property escapes", () => { + const re = RE2JS.compile("^\\p{Greek}\\p{Latin}$"); + assert.strictEqual(re.testExact("αa"), true); + assert.strictEqual(re.testExact("aa"), false); + assert.strictEqual(re.testExact("αα"), false); + }); + + test("\\p{L} matches all Unicode letters", 
() => { + const re = RE2JS.compile("^\\p{L}+$"); + assert.strictEqual(re.test("abc"), true); + assert.strictEqual(re.test("ΑΒΓ"), true); + assert.strictEqual(re.test("日本語"), true); + assert.strictEqual(re.test("مرحبا"), true); + assert.strictEqual(re.test("123"), false); + }); + + test("negated \\P{L}", () => { + const re = RE2JS.compile("^\\P{L}+$"); + assert.strictEqual(re.test("123"), true); + assert.strictEqual(re.test("!@#"), true); + assert.strictEqual(re.test("abc"), false); + assert.strictEqual(re.test("αβγ"), false); + }); + + test("\\p{Any} matches anything", () => { + const re = RE2JS.compile("^\\p{Any}$"); + assert.strictEqual(re.test("a"), true); + assert.strictEqual(re.test(String.fromCodePoint(0x10ffff)), true); + }); + + test("Unicode 16.0 new script: Garay", () => { + const re = RE2JS.compile("^\\p{Garay}+$"); + assert.strictEqual(re.test(String.fromCodePoint(0x10d40)), true); + assert.strictEqual(re.test("a"), false); + }); + + test("surrogate char class \\p{Cs}", () => { + const re = RE2JS.compile("^\\p{Cs}$"); + assert.strictEqual(re.testExact(String.fromCharCode(0xd800)), true); + assert.strictEqual(re.testExact(String.fromCharCode(0xdfff)), true); + assert.strictEqual(re.testExact("a"), false); + }); + + test("repetition over supplementary char", () => { + const emoji = String.fromCodePoint(0x1f600); + const re = RE2JS.compile(`${emoji}{3}`); + assert.strictEqual(re.test(emoji + emoji + emoji), true); + assert.strictEqual(re.test(emoji + emoji), false); + }); + + test("char class with supplementary and BMP mix", () => { + const re = RE2JS.compile("^[a\\x{1F600}]+$"); + assert.strictEqual(re.test("a"), true); + assert.strictEqual(re.test(String.fromCodePoint(0x1f600)), true); + assert.strictEqual(re.test(`a${String.fromCodePoint(0x1f600)}a`), true); + assert.strictEqual(re.test("b"), false); + }); + + test("word boundary near supplementary chars", () => { + const re = RE2JS.compile("\\bword\\b"); + const supp = String.fromCodePoint(0x1d4d0); + 
assert.strictEqual(re.test(`${supp}word${supp}`), true); + }); + + test("non-BMP case folding stays identity (RE2 limitation)", () => { + const re = RE2JS.compile(`(?i)${String.fromCodePoint(0x1d4d0)}`); + assert.strictEqual(re.test(String.fromCodePoint(0x1d4d0)), true); + }); +}); diff --git a/packages/re2/src/__tests__/Unicode.test.ts b/packages/re2/src/__tests__/Unicode.test.ts new file mode 100644 index 0000000..297f106 --- /dev/null +++ b/packages/re2/src/__tests__/Unicode.test.ts @@ -0,0 +1,132 @@ +import { describe, test, it } from "node:test"; +import * as assert from "node:assert/strict"; +import { equalsIgnoreCase, isUpper, simpleFold } from "../Unicode.js"; +import { UnicodeTables } from "../UnicodeTables.js"; +import { codePoint } from "../__utils__/chars.js"; + +describe("#isUpper", () => { + const cases: [number, boolean][] = [ + [115, false], + [83, true], + [503, true], + [469, true], + [474, false], + [940, false], + ]; + + for (const [input, expected] of cases) { + test(`#isUpper(${input}) === ${expected}`, () => { + assert.strictEqual(isUpper(input), expected); + }); + } +}); + +describe("#simpleFold", () => { + const cases: [number, number][] = [ + [65, 97], + [97, 65], + [83, 115], + [115, 383], + [383, 83], + [75, 107], + [107, 8490], + [8490, 75], + [49, 49], + [57, 57], + ]; + + for (const [input, expected] of cases) { + test(`#simpleFold(${input}) === ${expected}`, () => { + assert.strictEqual(simpleFold(input), expected); + }); + } +}); + +const genEqualsIgnoreCases = (): [number, number, boolean][] => { + const testCases: [number, number, boolean][] = [ + [codePoint("{"), codePoint("{"), true], + [codePoint("é"), codePoint("É"), true], + [codePoint("Ú"), codePoint("ú"), true], + [codePoint("\u212A"), codePoint("K"), true], + [codePoint("\u212A"), codePoint("k"), true], + [codePoint("\u212A"), codePoint("a"), false], + [codePoint("ü"), codePoint("ű"), false], + [codePoint("b"), codePoint("k"), false], + [codePoint("C"), codePoint("x"), 
false], + [codePoint("/"), codePoint("_"), false], + [codePoint("d"), codePoint(")"), false], + [codePoint("@"), codePoint("`"), false], + ]; + + for (let r = codePoint("a"); r <= codePoint("z"); r++) { + const u = r - (codePoint("a") - codePoint("A")); + testCases.push([r, r, true], [u, u, true], [r, u, true], [u, r, true]); + } + + return testCases; +}; + +describe("#equalsIgnoreCase", () => { + for (const [r1, r2, expected] of genEqualsIgnoreCases()) { + test(`#equalsIgnoreCase(${r1}, ${r2}) === ${expected}`, () => { + assert.strictEqual(equalsIgnoreCase(r1, r2), expected); + }); + } +}); + +describe("UnicodeTables VLQ Decompression", () => { + it("should decompress the Zl (Line Separator) table correctly", () => { + const zlTable = UnicodeTables.CATEGORIES.get("Zl"); + assert.ok(zlTable, "Zl category table missing"); + + assert.strictEqual(zlTable.length, 1); + assert.strictEqual(zlTable.getLo(0), 0x2028); + assert.strictEqual(zlTable.getHi(0), 0x2028); + assert.strictEqual(zlTable.getStride(0), 1); + }); + + it("should decompress the Zp (Paragraph Separator) table correctly", () => { + const zpTable = UnicodeTables.CATEGORIES.get("Zp"); + assert.ok(zpTable, "Zp category table missing"); + + assert.strictEqual(zpTable.length, 1); + assert.strictEqual(zpTable.getLo(0), 0x2029); + assert.strictEqual(zpTable.getHi(0), 0x2029); + assert.strictEqual(zpTable.getStride(0), 1); + }); + + it("should decompress the CASE_ORBIT map correctly", () => { + const orbit = UnicodeTables.CASE_ORBIT; + + assert.strictEqual(orbit.has(65), false); + assert.strictEqual(orbit.has(75), false); + assert.strictEqual(orbit.has(83), false); + + assert.strictEqual(orbit.has(115), true); + assert.strictEqual(orbit.get(115), 383); + + assert.strictEqual(orbit.has(0x0131), true); + assert.strictEqual(orbit.get(0x0131), 0x0131); + + assert.strictEqual(orbit.has(107), true); + assert.strictEqual(orbit.get(107), 8490); + }); + + it("should decompress the Nd (Decimal Digits) table correctly with 
strides", () => { + const ndTable = UnicodeTables.CATEGORIES.get("Nd"); + assert.ok(ndTable, "Nd category table missing"); + + assert.ok(ndTable.length > 0); + + let foundAsciiDigits = false; + for (let i = 0; i < ndTable.length; i++) { + if (ndTable.getLo(i) === 48 && ndTable.getHi(i) === 57) { + foundAsciiDigits = true; + assert.strictEqual(ndTable.getStride(i), 1); + break; + } + } + + assert.strictEqual(foundAsciiDigits, true); + }); +}); diff --git a/packages/re2/src/__tests__/UnicodeConformance.test.ts b/packages/re2/src/__tests__/UnicodeConformance.test.ts new file mode 100644 index 0000000..272a642 --- /dev/null +++ b/packages/re2/src/__tests__/UnicodeConformance.test.ts @@ -0,0 +1,120 @@ +import { describe, test } from "node:test"; +import * as assert from "node:assert/strict"; +import { RE2JS } from "../index.js"; +import { UnicodeTables } from "../UnicodeTables.js"; +// @ts-expect-error — no types published for this package +import unicodePropertyValueAliases from "unicode-property-value-aliases"; + +const aliasesToNames = unicodePropertyValueAliases.get( + "General_Category", +) as Map; + +const loadCodePoints = async ( + kind: "General_Category" | "Script", + longName: string, +): Promise => { + const mod = await import( + `@unicode/unicode-16.0.0/${kind}/${longName}/code-points.js` + ); + return mod.default as number[]; +}; + +const samplePositives = (points: number[], n = 100): number[] => { + if (points.length <= n) return points; + const out = [points[0], points[points.length - 1]]; + const step = Math.floor(points.length / (n - 2)); + for (let i = step; i < points.length - step; i += step) out.push(points[i]); + return out; +}; + +const NEGATIVE_PROBES: number[] = (() => { + const probes: number[] = []; + const step = Math.floor(0x10ffff / 200); + for (let cp = 0; cp <= 0x10ffff; cp += step) probes.push(cp); + return probes; +})(); + +const SHRINKING_PROPERTIES: ReadonlySet = new Set(["Cn", "Unknown"]); + +const assertConformance = ( + 
propertyName: string, + kind: "General_Category" | "Script", + canonical: number[], +): void => { + const canonicalSet = new Set(canonical); + const platformPattern = `^\\p{${kind}=${propertyName}}$`; + let platformRegex: RegExp; + try { + platformRegex = new RegExp(platformPattern, "u"); + } catch { + const re = RE2JS.compile(`^\\p{${propertyName}}$`); + for (const cp of samplePositives(canonical)) { + assert.strictEqual(re.testExact(String.fromCodePoint(cp)), true); + } + return; + } + + const reBare = RE2JS.compile(`^\\p{${propertyName}}$`); + const reCombined = RE2JS.compile(`^[\\p{${propertyName}}]$`); + + for (const cp of samplePositives(canonical)) { + const s = String.fromCodePoint(cp); + if (!platformRegex.test(s)) continue; + assert.strictEqual(reBare.testExact(s), true); + assert.strictEqual(reCombined.testExact(s), true); + } + + for (const cp of NEGATIVE_PROBES) { + const s = String.fromCodePoint(cp); + if (canonicalSet.has(cp)) continue; + if (platformRegex.test(s)) continue; + assert.strictEqual(reBare.testExact(s), false); + assert.strictEqual(reCombined.testExact(s), false); + } + + if (!SHRINKING_PROPERTIES.has(propertyName)) { + for (const cp of samplePositives(canonical)) { + const s = String.fromCodePoint(cp); + assert.strictEqual(reBare.testExact(s), true); + assert.strictEqual(reCombined.testExact(s), true); + } + } +}; + +describe("Unicode 16.0 conformance — general categories", () => { + const aliases = Array.from(UnicodeTables.STABLE_CATEGORY_NAMES); + for (const alias of aliases) { + test(`\\p{${alias}} matches Unicode 16.0`, async () => { + const longName = aliasesToNames.get(alias); + if (!longName) { + throw new Error(`no canonical long name for category alias '${alias}'`); + } + const canonical = await loadCodePoints("General_Category", longName); + assertConformance(alias, "General_Category", canonical); + }); + } +}); + +describe("Unicode 16.0 conformance — scripts", () => { + const scriptNames = 
Array.from(UnicodeTables.STABLE_SCRIPT_NAMES); + for (const name of scriptNames) { + test(`\\p{${name}} matches Unicode 16.0`, async () => { + const canonical = await loadCodePoints("Script", name); + assertConformance(name, "Script", canonical); + }); + } +}); + +describe("Unicode 16.0 conformance — scripts added in 16.0", () => { + const newScriptNames = Array.from(UnicodeTables.NEW_SCRIPT_NAMES); + for (const name of newScriptNames) { + test(`\\p{${name}} matches Unicode 16.0 (full scan)`, async () => { + const canonical = await loadCodePoints("Script", name); + assertConformance(name, "Script", canonical); + const reBare = RE2JS.compile(`^\\p{${name}}$`); + for (const cp of canonical) { + assert.strictEqual(reBare.testExact(String.fromCodePoint(cp)), true); + } + }); + } +}); diff --git a/packages/re2/src/__tests__/UnicodeTableContents.test.ts b/packages/re2/src/__tests__/UnicodeTableContents.test.ts new file mode 100644 index 0000000..02555a9 --- /dev/null +++ b/packages/re2/src/__tests__/UnicodeTableContents.test.ts @@ -0,0 +1,175 @@ +import { describe, test } from "node:test"; +import * as assert from "node:assert/strict"; +import { UnicodeTables } from "../UnicodeTables.js"; +import type { UnicodeRangeTable } from "../UnicodeRangeTable.js"; +// @ts-expect-error - package has no .d.ts +import propertyValueAliases from "unicode-property-value-aliases"; + +const GC_ALIAS_TO_LONG = propertyValueAliases.get("General_Category") as Map< + string, + string +>; + +// Expands either a stride-encoded Uint32Array (triples of lo/hi/stride, as +// returned by _deltaCategoryRanges / _deltaScriptRanges) or a full +// UnicodeRangeTable into a Set of code points. 
+const expandStrideTriples = (arr: Uint32Array): Set => { + const set = new Set(); + for (let i = 0; i < arr.length; i += 3) { + const lo = arr[i]; + const hi = arr[i + 1]; + const stride = arr[i + 2]; + for (let cp = lo; cp <= hi; cp += stride) set.add(cp); + } + return set; +}; + +const expandTable = (table: UnicodeRangeTable): Set => { + const set = new Set(); + for (let i = 0; i < table.length; i++) { + const lo = table.getLo(i); + const hi = table.getHi(i); + const stride = table.getStride(i); + for (let cp = lo; cp <= hi; cp += stride) set.add(cp); + } + return set; +}; + +const loadCodePoints = async ( + pkg: "@unicode/unicode-15.0.0" | "@unicode/unicode-16.0.0", + property: "General_Category" | "Script", + longName: string, +): Promise | null> => { + try { + const mod = (await import( + `${pkg}/${property}/${longName}/code-points.js` + )) as { default: number[] }; + return new Set(mod.default); + } catch { + return null; + } +}; + +const fmtCp = (cp: number): string => + `U+${cp.toString(16).toUpperCase().padStart(4, "0")}`; + +const diffSets = ( + actual: Set, + expected: Set, +): { missing: number[]; extra: number[] } => { + const missing: number[] = []; + const extra: number[] = []; + for (const cp of expected) if (!actual.has(cp)) missing.push(cp); + for (const cp of actual) if (!expected.has(cp)) extra.push(cp); + missing.sort((a, b) => a - b); + extra.sort((a, b) => a - b); + return { missing, extra }; +}; + +const setDiff = (a: Set, b: Set): Set => { + const out = new Set(); + for (const cp of a) if (!b.has(cp)) out.add(cp); + return out; +}; + +const assertSetsEqual = ( + label: string, + actual: Set, + expected: Set, +): void => { + const { missing, extra } = diffSets(actual, expected); + if (missing.length + extra.length === 0) return; + const previewMissing = missing.slice(0, 5).map(fmtCp).join(", "); + const previewExtra = extra.slice(0, 5).map(fmtCp).join(", "); + assert.fail( + `${label}: missing ${missing.length} [${previewMissing}], extra 
${extra.length} [${previewExtra}]`, + ); +}; + +// Validates that the bundled 15→16 delta data equals exactly +// setDiff(unicode16, unicode15) for each stable property name. This +// simulates a Unicode-15 host without requiring one: if the decoded +// delta matches setDiff(16, 15), then on any Unicode 15 engine the +// merged runtime result (sweep(15) ∪ delta) equals Unicode 16.0 for +// every property that is purely additive across the 15→16 transition. +describe("UnicodeTables 15→16 delta matches setDiff(unicode16, unicode15)", () => { + for (const alias of UnicodeTables.STABLE_CATEGORY_NAMES) { + const longName = GC_ALIAS_TO_LONG.get(alias); + test(`category ${alias} (${longName ?? "?"})`, async () => { + assert.ok(longName !== undefined, `no GC long name for ${alias}`); + const cp15 = await loadCodePoints( + "@unicode/unicode-15.0.0", + "General_Category", + longName, + ); + const cp16 = await loadCodePoints( + "@unicode/unicode-16.0.0", + "General_Category", + longName, + ); + assert.ok(cp15, `missing Unicode 15 data for ${longName}`); + assert.ok(cp16, `missing Unicode 16 data for ${longName}`); + + const expectedDelta = setDiff(cp16, cp15); + const decoded = UnicodeTables._deltaCategoryRanges(alias); + const actualDelta = + decoded === null ? new Set() : expandStrideTriples(decoded); + + assertSetsEqual(`delta(${alias})`, actualDelta, expectedDelta); + }); + } + + for (const name of UnicodeTables.STABLE_SCRIPT_NAMES) { + test(`script ${name}`, async () => { + const cp15 = await loadCodePoints( + "@unicode/unicode-15.0.0", + "Script", + name, + ); + const cp16 = await loadCodePoints( + "@unicode/unicode-16.0.0", + "Script", + name, + ); + assert.ok(cp15, `missing Unicode 15 data for script ${name}`); + assert.ok(cp16, `missing Unicode 16 data for script ${name}`); + + const expectedDelta = setDiff(cp16, cp15); + const decoded = UnicodeTables._deltaScriptRanges(name); + const actualDelta = + decoded === null ? 
new Set() : expandStrideTriples(decoded); + + assertSetsEqual(`delta(${name})`, actualDelta, expectedDelta); + }); + } +}); + +// Scripts introduced in Unicode 16.0 have no 15.0 counterpart, so the +// generator bundles the full table. Validate each full-table equals +// Unicode 16.0 ground truth exactly. +describe("UnicodeTables new-in-16.0 scripts match Unicode 16.0 exactly", () => { + for (const name of UnicodeTables.NEW_SCRIPT_NAMES) { + test(name, async () => { + const cp15 = await loadCodePoints( + "@unicode/unicode-15.0.0", + "Script", + name, + ); + assert.strictEqual( + cp15, + null, + `${name} is listed as new-in-16 but exists in Unicode 15.0`, + ); + const cp16 = await loadCodePoints( + "@unicode/unicode-16.0.0", + "Script", + name, + ); + assert.ok(cp16, `missing Unicode 16 data for script ${name}`); + + const table = UnicodeTables._newScriptTable(name); + assert.ok(table, `_newScriptTable(${name}) returned null`); + assertSetsEqual(`newScript(${name})`, expandTable(table), cp16); + }); + } +}); diff --git a/packages/re2/src/__tests__/edge-cases.test.ts b/packages/re2/src/__tests__/edge-cases.test.ts new file mode 100644 index 0000000..1e4e180 --- /dev/null +++ b/packages/re2/src/__tests__/edge-cases.test.ts @@ -0,0 +1,477 @@ +import { describe, test, it } from "node:test"; +import * as assert from "node:assert/strict"; +import { RE2JS } from "../index.js"; +import { RE2 } from "../RE2.js"; +import { fromUTF16 } from "../MachineInput.js"; +import { ANCHOR_BOTH, ANCHOR_START, UNANCHORED } from "../RE2Flags.js"; + +describe("Edge cases and bug hunting", () => { + describe("Empty patterns and inputs", () => { + it("empty pattern matches empty string", () => { + const re = RE2JS.compile(""); + assert.strictEqual(re.testExact(""), true); + }); + + it("empty pattern matches any non-empty string via test (zero-width)", () => { + const re = RE2JS.compile(""); + assert.strictEqual(re.test("anything"), true); + }); + + it("empty pattern + testExact on non-empty 
fails", () => { + const re = RE2JS.compile(""); + assert.strictEqual(re.testExact("x"), false); + }); + + it("(?:) non-capturing empty group matches empty", () => { + const re = RE2JS.compile("(?:)"); + assert.strictEqual(re.testExact(""), true); + }); + + it("^$ matches empty string", () => { + const re = RE2JS.compile("^$"); + assert.strictEqual(re.testExact(""), true); + assert.strictEqual(re.testExact("x"), false); + }); + + it("^$ does not match any content", () => { + const re = RE2JS.compile("^$"); + assert.strictEqual(re.test("hello"), false); + }); + + it("(?m)^$ matches at line boundaries", () => { + const re = RE2JS.compile("(?m)^$"); + assert.strictEqual(re.test("a\n\nb"), true); + assert.strictEqual(re.test("abc"), false); + }); + }); + + describe("Repetition edge cases", () => { + it("a{0} matches empty string", () => { + const re = RE2JS.compile("^a{0}$"); + assert.strictEqual(re.test(""), true); + assert.strictEqual(re.test("a"), false); + }); + + it("a{0,0} matches empty string", () => { + const re = RE2JS.compile("^a{0,0}$"); + assert.strictEqual(re.test(""), true); + }); + + it("a{0,1} matches empty or single a", () => { + const re = RE2JS.compile("^a{0,1}$"); + assert.strictEqual(re.test(""), true); + assert.strictEqual(re.test("a"), true); + assert.strictEqual(re.test("aa"), false); + }); + + it("zero-width repetition inside group", () => { + const re = RE2JS.compile("^(a*)$"); + assert.strictEqual(re.test(""), true); + assert.strictEqual(re.test("aaaa"), true); + }); + }); + + describe("Unicode edge cases", () => { + it("pattern at MAX_RUNE boundary", () => { + const re = RE2JS.compile("."); + const maxRune = String.fromCodePoint(0x10ffff); + assert.strictEqual(re.testExact(maxRune), true); + }); + + it("surrogate pair matches as single rune with dot", () => { + const re = RE2JS.compile("^.$"); + assert.strictEqual(re.testExact("😊"), true); + }); + + it("character class with supplementary plane", () => { + const re = 
RE2JS.compile("[\u{1F600}-\u{1F64F}]+"); + assert.strictEqual(re.testExact("😊😊😊"), true); + }); + + it("surrogate pair at end of string", () => { + const re = RE2JS.compile("abc😊$"); + assert.strictEqual(re.test("abc😊"), true); + }); + + it("word boundary with non-ASCII word chars", () => { + const re = RE2JS.compile("\\bhello\\b"); + assert.strictEqual(re.test("hello world"), true); + assert.strictEqual(re.test("éhello"), true); + }); + }); + + describe("Anchor semantics", () => { + it("^^ double anchors", () => { + const re = RE2JS.compile("^^abc"); + assert.strictEqual(re.test("abc"), true); + assert.strictEqual(re.test("xabc"), false); + }); + + it("$$ double end anchors", () => { + const re = RE2JS.compile("abc$$"); + assert.strictEqual(re.test("abc"), true); + }); + + it("\\A at start only (no multiline equivalent)", () => { + const re = RE2JS.compile("(?m)\\Aabc"); + assert.strictEqual(re.test("abc"), true); + assert.strictEqual(re.test("x\nabc"), false); + }); + + it("\\z at end only", () => { + const re = RE2JS.compile("(?m)abc\\z"); + assert.strictEqual(re.test("abc"), true); + assert.strictEqual(re.test("abc\nx"), false); + }); + }); + + describe("Word boundary + anchor correctness", () => { + it("\\babc\\b does not match when followed by word char", () => { + const re = RE2JS.compile("\\babc\\b"); + assert.strictEqual(re.test("abc"), true); + assert.strictEqual(re.test("abcx"), false); + assert.strictEqual(re.test("xabc"), false); + assert.strictEqual(re.test(" abc "), true); + assert.strictEqual(re.test(" abcx"), false); + assert.strictEqual(re.test("xabc "), false); + }); + + it("\\babc\\b under NFA fallback agrees with DFA", () => { + const re = RE2JS.compile("\\babc\\b"); + re.re2Input.dfa.failed = true; + assert.strictEqual(re.test("abc"), true); + assert.strictEqual(re.test("abcx"), false); + assert.strictEqual(re.test(" abc "), true); + assert.strictEqual(re.test("xabc"), false); + }); + }); + + describe("Word boundaries", () => { + it("\\b at 
start of input", () => { + const re = RE2JS.compile("\\babc"); + assert.strictEqual(re.test("abc"), true); + }); + + it("\\b at end of input", () => { + const re = RE2JS.compile("abc\\b"); + assert.strictEqual(re.test("abc"), true); + }); + + it("\\B does not match at text boundary", () => { + const re = RE2JS.compile("\\Babc"); + assert.strictEqual(re.test("abc"), false); + assert.strictEqual(re.test("xabc"), true); + }); + + it("\\b between numbers and letters (no boundary)", () => { + const re = RE2JS.compile("\\b1A\\b"); + assert.strictEqual(re.test("1A"), true); + assert.strictEqual(re.test("x1A"), false); + }); + }); + + describe("Execute engine directly with anchors", () => { + it("ANCHOR_START from pos 0 only matches if the pattern starts at 0", () => { + const re = RE2.compile("abc"); + assert.notStrictEqual( + re.executeEngine(fromUTF16("abcxyz"), 0, ANCHOR_START, 0), + null, + ); + assert.strictEqual( + re.executeEngine(fromUTF16("xabc"), 0, ANCHOR_START, 0), + null, + ); + }); + + it("ANCHOR_BOTH requires exact full-input match", () => { + const re = RE2.compile("abc"); + assert.notStrictEqual( + re.executeEngine(fromUTF16("abc"), 0, ANCHOR_BOTH, 0), + null, + ); + assert.strictEqual( + re.executeEngine(fromUTF16("abcd"), 0, ANCHOR_BOTH, 0), + null, + ); + assert.strictEqual( + re.executeEngine(fromUTF16("xabc"), 0, ANCHOR_BOTH, 0), + null, + ); + }); + + it("DFA and NFA agree on all anchor modes", () => { + const re = RE2.compile("abc"); + const inputStr = "xabcy"; + const dfaUA = re.executeEngine(fromUTF16(inputStr), 0, UNANCHORED, 0); + const dfaAS = re.executeEngine(fromUTF16(inputStr), 0, ANCHOR_START, 0); + const dfaAB = re.executeEngine(fromUTF16(inputStr), 0, ANCHOR_BOTH, 0); + + re.dfa.failed = true; + const nfaUA = re.executeEngine(fromUTF16(inputStr), 0, UNANCHORED, 0); + const nfaAS = re.executeEngine(fromUTF16(inputStr), 0, ANCHOR_START, 0); + const nfaAB = re.executeEngine(fromUTF16(inputStr), 0, ANCHOR_BOTH, 0); + + 
assert.strictEqual(Boolean(nfaUA), Boolean(dfaUA)); + assert.strictEqual(Boolean(nfaAS), Boolean(dfaAS)); + assert.strictEqual(Boolean(nfaAB), Boolean(dfaAB)); + }); + }); + + describe("Case folding edge cases", () => { + it("(?i) with Kelvin symbol", () => { + const re = RE2JS.compile("(?i)k"); + assert.strictEqual(re.test("k"), true); + assert.strictEqual(re.test("K"), true); + assert.strictEqual(re.test("\u212A"), true); + }); + + it("(?i) with long-s", () => { + const re = RE2JS.compile("(?i)s"); + assert.strictEqual(re.test("s"), true); + assert.strictEqual(re.test("S"), true); + assert.strictEqual(re.test("\u017F"), true); + }); + + it("(?i) with mixed case string", () => { + const re = RE2JS.compile("(?i)^hello$"); + assert.strictEqual(re.test("HELLO"), true); + assert.strictEqual(re.test("hElLo"), true); + assert.strictEqual(re.test("hello"), true); + }); + }); + + describe("Alternation with empty branches", () => { + it("a|b|c matches any of the three", () => { + const re = RE2JS.compile("^(a|b|c)$"); + assert.strictEqual(re.test("a"), true); + assert.strictEqual(re.test("b"), true); + assert.strictEqual(re.test("c"), true); + assert.strictEqual(re.test("d"), false); + }); + + it("empty branch in alternation", () => { + const re = RE2JS.compile("^(a|)$"); + assert.strictEqual(re.test("a"), true); + assert.strictEqual(re.test(""), true); + }); + + it("leading empty branch", () => { + const re = RE2JS.compile("^(|a)$"); + assert.strictEqual(re.test("a"), true); + assert.strictEqual(re.test(""), true); + }); + }); + + describe("Escape sequences at boundaries", () => { + it("\\x escape", () => { + const re = RE2JS.compile("\\x41"); + assert.strictEqual(re.test("A"), true); + }); + + it("\\x{...} extended escape", () => { + const re = RE2JS.compile("\\x{1F600}"); + assert.strictEqual(re.test("😀"), true); + }); + + it("\\n matches newline", () => { + const re = RE2JS.compile("a\\nb"); + assert.strictEqual(re.test("a\nb"), true); + }); + }); + + 
describe("Prefilter corner cases", () => { + it("literal fast-path with repeated characters", () => { + const re = RE2JS.compile("aaa"); + assert.strictEqual(re.test("xxaaaxx"), true); + assert.strictEqual(re.test("aa"), false); + }); + + it("literal fast-path with Unicode", () => { + const re = RE2JS.compile("café"); + assert.strictEqual(re.test("le café ouvert"), true); + assert.strictEqual(re.test("no coffee here"), false); + }); + + it("literal ANCHOR_BOTH with prefix extension", () => { + const re = RE2JS.compile("exact"); + assert.strictEqual(re.testExact("exact"), true); + assert.strictEqual(re.testExact("exactly"), false); + assert.strictEqual(re.testExact("not exact"), false); + }); + }); + + describe("Simplify edge cases", () => { + it("nested quantifiers in simplify", () => { + const re = RE2JS.compile("(a{2}){3}"); + assert.strictEqual(re.testExact("aaaaaa"), true); + assert.strictEqual(re.testExact("aaaaa"), false); + }); + + it("quantifier on empty match", () => { + const re = RE2JS.compile("^(?:){5}$"); + assert.strictEqual(re.test(""), true); + }); + + it("alternation of empties", () => { + const re = RE2JS.compile("^(?:||)$"); + assert.strictEqual(re.test(""), true); + assert.strictEqual(re.test("x"), false); + }); + }); + + describe("DFA and NFA equivalence on tricky patterns", () => { + const trickyCases: Array<[string, string, boolean]> = [ + ["\\bfoo\\b", "nofoo foo that", true], + ["\\b\\w+\\b", "hello world", true], + ["^\\w+$", "hello", true], + ["^\\w+$", "hello world", false], + ["(?m)^abc$", "xyz\nabc\ndef", true], + ["\\Bfoo\\B", "xfooy", true], + ["(?m)^(?:a|b)+$", "a\nb\nab", true], + ["(?m)^(?:a|b)+$", "a\nx\nb", true], + ["(?m)^(?:a|b)+$", "ax\nby\nab", true], + ["(?m)^(?:a|b)+$", "ax\nby\ncd", false], + ["(?m)^(?:a|b)+$", "ab", true], + ["^$", "", true], + ["^.*$", "", true], + ["^.*$", "anything", true], + ["$", "", true], + ["$", "a", true], + ]; + + for (const [pattern, input, expected] of trickyCases) { + test(`DFA matches 
/${pattern}/ against ${JSON.stringify(input)} => ${expected}`, () => { + const re = RE2JS.compile(pattern); + assert.strictEqual(re.test(input), expected); + }); + } + + for (const [pattern, input, expected] of trickyCases) { + test(`NFA fallback matches /${pattern}/ against ${JSON.stringify(input)} => ${expected}`, () => { + const re = RE2JS.compile(pattern); + re.re2Input.dfa.failed = true; + assert.strictEqual(re.test(input), expected); + }); + } + }); + + describe("Special escape sequences", () => { + it("\\Q...\\E with regex metacharacters", () => { + const re = RE2JS.compile("\\Q.+*?(){}[]|^$\\E"); + assert.strictEqual(re.test(".+*?(){}[]|^$"), true); + assert.strictEqual(re.test("a"), false); + }); + + it("\\Q...\\E without closing \\E goes to end", () => { + const re = RE2JS.compile("foo\\Q.*$"); + assert.strictEqual(re.test("foo.*$"), true); + assert.strictEqual(re.test("foobar"), false); + }); + + it("\\Q\\E empty quoted section", () => { + const re = RE2JS.compile("a\\Q\\Eb"); + assert.strictEqual(re.test("ab"), true); + }); + }); + + describe("DOTALL vs newline", () => { + it(". does not match newline without DOTALL", () => { + const re = RE2JS.compile("^.+$"); + assert.strictEqual(re.test("abc"), true); + assert.strictEqual(re.test("a\nb"), false); + }); + + it(". 
matches newline with (?s) DOTALL", () => { + const re = RE2JS.compile("(?s)^.+$"); + assert.strictEqual(re.test("abc"), true); + assert.strictEqual(re.test("a\nb"), true); + }); + + it("[\\s\\S] matches all chars without DOTALL", () => { + const re = RE2JS.compile("^[\\s\\S]+$"); + assert.strictEqual(re.test("a\nb\tc"), true); + }); + }); + + describe("Unicode property interactions", () => { + it("negated Unicode property [^\\p{L}]", () => { + const re = RE2JS.compile("^[^\\p{L}]+$"); + assert.strictEqual(re.testExact("123 !@#"), true); + assert.strictEqual(re.testExact("abc"), false); + assert.strictEqual(re.testExact("αβγ"), false); + }); + + it("\\P{L} is non-letter", () => { + const re = RE2JS.compile("^\\P{L}+$"); + assert.strictEqual(re.testExact("123"), true); + assert.strictEqual(re.testExact("abc"), false); + }); + + it("combined Unicode properties", () => { + const re = RE2JS.compile("^[\\p{L}\\p{N}]+$"); + assert.strictEqual(re.testExact("abc123"), true); + assert.strictEqual(re.testExact("αβγ123"), true); + assert.strictEqual(re.testExact("abc!"), false); + }); + }); + + describe("POSIX classes (RE2 syntax)", () => { + it("[[:alpha:]] matches letters", () => { + const re = RE2JS.compile("^[[:alpha:]]+$"); + assert.strictEqual(re.testExact("abc"), true); + assert.strictEqual(re.testExact("ABC"), true); + assert.strictEqual(re.testExact("abc123"), false); + }); + + it("[[:digit:]] matches digits", () => { + const re = RE2JS.compile("^[[:digit:]]+$"); + assert.strictEqual(re.testExact("123"), true); + assert.strictEqual(re.testExact("abc"), false); + }); + + it("negated POSIX class [[:^alpha:]]", () => { + const re = RE2JS.compile("^[[:^alpha:]]+$"); + assert.strictEqual(re.testExact("123"), true); + assert.strictEqual(re.testExact("abc"), false); + }); + }); + + describe("Named groups (parsing only, no capture extraction)", () => { + it("(?P...) 
parses and matches", () => { + const re = RE2JS.compile("(?P\\w+)"); + assert.strictEqual(re.test("hello"), true); + }); + + it("(?...) Perl-style named group is also accepted", () => { + const re = RE2JS.compile("(?\\w+)"); + assert.strictEqual(re.test("hello"), true); + assert.deepStrictEqual( + re.namedGroups(), + new Map().set("word", 1), + ); + }); + + it("namedGroups() returns group name map", () => { + const re = RE2JS.compile("(?P\\w+)\\s+(?P\\w+)"); + assert.deepStrictEqual( + re.namedGroups(), + new Map().set("first", 1).set("last", 2), + ); + }); + }); + + describe("Parser cache correctness (identity-based Map)", () => { + it("deeply nested captures parse without height/size cache collision", () => { + const deepPattern = + "(" + "(?:(?:".repeat(100) + "a" + ")*)*".repeat(100) + ")"; + assert.doesNotThrow(() => RE2JS.compile(deepPattern)); + }); + + it("many parallel captures with same shape", () => { + const pattern = "^" + "(a)".repeat(100) + "$"; + const re = RE2JS.compile(pattern); + assert.strictEqual(re.test("a".repeat(100)), true); + assert.strictEqual(re.test("a".repeat(99)), false); + }); + }); +}); diff --git a/packages/re2/src/__tests__/index.test.ts b/packages/re2/src/__tests__/index.test.ts new file mode 100644 index 0000000..6cd52bc --- /dev/null +++ b/packages/re2/src/__tests__/index.test.ts @@ -0,0 +1,243 @@ +import { describe, test, it } from "node:test"; +import * as assert from "node:assert/strict"; +import { RE2JS } from "../index.js"; +import type { RE2JSSyntaxException } from "../exceptions.js"; + +it("compile", () => { + const p = RE2JS.compile("abc"); + assert.strictEqual(p.pattern(), "abc"); + assert.strictEqual(p.flags(), 0); +}); + +it("compile exception with duplicate groups", () => { + assert.throws( + () => RE2JS.compile("(?P.*)(?P.*"), + (e: Error) => + e.message.includes( + "error parsing regexp: duplicate capture group name: `any`", + ), + ); +}); + +it(".toString", () => { + 
assert.strictEqual(RE2JS.compile("abc").toString(), "abc"); +}); + +it("compile flags", () => { + const p = RE2JS.compile("abc", 5); + assert.strictEqual(p.pattern(), "abc"); + assert.strictEqual(p.flags(), 5); +}); + +it("syntax error", () => { + const compile = () => RE2JS.compile("abc("); + assert.throws(compile, (e: Error) => + e.message.includes("error parsing regexp: missing closing ): `abc(`"), + ); + + let error: RE2JSSyntaxException | null = null; + try { + compile(); + } catch (e) { + error = e as RE2JSSyntaxException; + } + + assert.notStrictEqual(error, null); + assert.strictEqual(error?.getDescription(), "missing closing )"); + assert.strictEqual( + error?.message, + "error parsing regexp: missing closing ): `abc(`", + ); + assert.strictEqual(error?.getPattern(), "abc("); +}); + +describe("matches no flags", () => { + const source = String.fromCodePoint(110781); + const cases: [string, string, string][] = [ + ["ab+c", "abbbc", "cbbba"], + ["ab.*c", "abxyzc", "ab\nxyzc"], + ["^ab.*c$", "abc", "xyz\nabc\ndef"], + [source, source, "blah"], + [`\\Q${source}\\E`, source, "blah"], + ]; + + for (const [regexp, match, nonMatch] of cases) { + test(`regexp ${JSON.stringify(regexp)} match ${JSON.stringify(match)} and not match ${JSON.stringify(nonMatch)}`, () => { + assert.strictEqual(RE2JS.matches(regexp, match), true); + assert.strictEqual(RE2JS.matches(regexp, nonMatch), false); + }); + } +}); + +describe("matches with flags", () => { + const cases: [string, number, string, string][] = [ + ["ab+c", 0, "abbbc", "cbba"], + ["ab+c", RE2JS.CASE_INSENSITIVE, "abBBc", "cbbba"], + ["ab.*c", 0, "abxyzc", "ab\nxyzc"], + ["ab.*c", RE2JS.DOTALL, "ab\nxyzc", "aB\nxyzC"], + ["ab.*c", RE2JS.DOTALL | RE2JS.CASE_INSENSITIVE, "aB\nxyzC", "z"], + ["^ab.*c$", 0, "abc", "xyz\nabc\ndef"], + ["^ab.*c$", RE2JS.MULTILINE, "abc", "xyz\nabc\ndef"], + ["^ab.*c$", RE2JS.MULTILINE, "abc", ""], + ["^ab.*c$", RE2JS.DOTALL | RE2JS.MULTILINE, "ab\nc", "AB\nc"], + [ + "^ab.*c$", + RE2JS.DOTALL 
| RE2JS.MULTILINE | RE2JS.CASE_INSENSITIVE, + "AB\nc", + "z", + ], + ]; + + for (const [regexp, flags, match, nonMatch] of cases) { + test(`regexp ${JSON.stringify(regexp)} with flags ${flags} match ${JSON.stringify(match)} and not match ${JSON.stringify(nonMatch)}`, () => { + const p = RE2JS.compile(regexp, flags); + assert.strictEqual(p.matches(match), true); + assert.strictEqual(p.matches(nonMatch), false); + }); + } +}); + +describe(".test (Unanchored DFA Match)", () => { + const cases: [string, string, boolean][] = [ + ["foo", "foo", true], + ["foo", "a foo b", true], + ["foo", "bar", false], + ["(?i)foo", "FoO", true], + ["^[a-z]+$", "hello", true], + ["^[a-z]+$", "hello 123", false], + [ + "enters.*battlefield", + "When this creature enters the battlefield, it deals 3 damage", + true, + ], + ["[0-9]+ mana", "Add 1 mana of any color", true], + ]; + + for (const [pattern, input, expected] of cases) { + test(`pattern ${JSON.stringify(pattern)} with input ${JSON.stringify(input)} will return ${expected}`, () => { + const re = RE2JS.compile(pattern); + assert.strictEqual(re.test(input), expected); + }); + } +}); + +describe(".testExact (Anchored DFA Match)", () => { + const cases: [string, string, boolean][] = [ + ["foo", "foo", true], + ["foo", "a foo b", false], + ["foo", "foobar", false], + ["[a-z]+", "hello", true], + ["[a-z]+", "hello 123", false], + ["(?i)foo", "FOO", true], + ["[0-9A-Fa-f]+", "1A4F", true], + ["[0-9A-Fa-f]+", "1A4F-xyz", false], + ]; + + for (const [pattern, input, expected] of cases) { + test(`pattern ${JSON.stringify(pattern)} with input ${JSON.stringify(input)} will return ${expected}`, () => { + const re = RE2JS.compile(pattern); + assert.strictEqual(re.testExact(input), expected); + }); + } +}); + +describe("group count", () => { + const cases: [string, number][] = [ + ["(.*)ab(.*)a", 2], + ["(.*)(ab)(.*)a", 3], + ["(.*)((a)b)(.*)a", 4], + ["(.*)(\\(ab)(.*)a", 3], + ["(.*)(\\(a\\)b)(.*)a", 3], + ]; + + for (const [pattern, count] of 
cases) { + test(`pattern ${JSON.stringify(pattern)} have groups ${count}`, () => { + const p = RE2JS.compile(pattern); + assert.strictEqual(p.groupCount(), count); + }); + } +}); + +describe("named groups", () => { + const cases: [string, Map][] = [ + ["(?P\\d{2})", new Map().set("foo", 1)], + ["\\d{2}", new Map()], + ["hello", new Map()], + ["(.*)", new Map()], + ["(?P.*)", new Map().set("any", 1)], + [ + "(?P.*)(?P.*)", + new Map().set("foo", 1).set("bar", 2), + ], + ]; + + for (const [pattern, expected] of cases) { + test(`pattern ${JSON.stringify(pattern)} named groups ${JSON.stringify(expected)}`, () => { + assert.deepStrictEqual(RE2JS.compile(pattern).namedGroups(), expected); + }); + } +}); + +it("quote", () => { + const regexp = RE2JS.quote("ab+c"); + const match = "ab+c"; + const nonMatch = "abc"; + + assert.strictEqual(RE2JS.matches(regexp, match), true); + assert.strictEqual(RE2JS.matches(regexp, nonMatch), false); +}); + +it("email regex", () => { + const p = RE2JS.compile("[\\w\\.]+@[\\w\\.]+"); + assert.strictEqual(p.matches("test@example.com"), true); + assert.strictEqual(p.matches("test"), false); +}); + +it("date regex", () => { + const p = RE2JS.compile( + "([0-9]{4})-?(1[0-2]|0[1-9])-?(3[01]|0[1-9]|[12][0-9])", + ); + assert.strictEqual(p.matches("2023-10-12"), true); + assert.strictEqual(p.matches("2023-02-02"), true); + assert.strictEqual(p.matches("300"), false); + assert.strictEqual(p.matches("example 2023-02-02 date"), false); +}); + +describe("Core Unicode Properties (Ascii, Assigned, Lc)", () => { + it("compiles without error", () => { + assert.doesNotThrow(() => RE2JS.compile("\\p{Ascii}")); + assert.doesNotThrow(() => RE2JS.compile("\\p{Assigned}")); + assert.doesNotThrow(() => RE2JS.compile("\\p{Lc}")); + }); + + it("matches \\p{Ascii} correctly", () => { + const p = RE2JS.compile("^\\p{Ascii}+$"); + + assert.strictEqual(p.matches("abc123!@#\x7F"), true); + assert.strictEqual(p.matches("abc😊"), false); + }); + + it("matches \\p{Lc} 
(Cased Letters) correctly", () => { + const p = RE2JS.compile("^\\p{Lc}+$"); + + assert.strictEqual(p.matches("aBcDeFéÜ"), true); + assert.strictEqual(p.matches("aBcDeF1"), false); + assert.strictEqual(p.matches("aBcDeF "), false); + }); + + it("matches \\p{Assigned} correctly (Inverse of Cn)", () => { + const p = RE2JS.compile("^\\p{Assigned}+$"); + + assert.strictEqual(p.matches("abc123!@#😊"), true); + + const unassignedChar = String.fromCodePoint(0x0378); + assert.strictEqual(p.matches(unassignedChar), false); + }); + + it("matches case insensitive correctly", () => { + const p = RE2JS.compile("(?i)^hello$"); + assert.strictEqual(p.matches("hello"), true); + assert.strictEqual(p.matches("HELLO"), true); + assert.strictEqual(p.matches("HELlo"), true); + }); +}); diff --git a/packages/re2/src/__tests__/stability.test.ts b/packages/re2/src/__tests__/stability.test.ts new file mode 100644 index 0000000..7b3be80 --- /dev/null +++ b/packages/re2/src/__tests__/stability.test.ts @@ -0,0 +1,278 @@ +import { describe, test } from "node:test"; +import * as assert from "node:assert/strict"; +import { RE2JS } from "../index.js"; + +describe("RE2JS Stability and Anti-ReDoS Guarantees", () => { + describe("Catastrophic Backtracking Immunity (ReDoS)", () => { + const assertLinearTime = ( + regexStr: string, + inputStr: string, + expectedMatch: boolean, + ): void => { + const re = RE2JS.compile(regexStr); + const start = Date.now(); + const result = re.matches(inputStr); + const elapsed = Date.now() - start; + + assert.strictEqual(result, expectedMatch); + assert.ok(elapsed < 50); + }; + + test("Defeats classic nested repetition ReDoS: (a+)+b", () => { + assertLinearTime("^(a+)+b$", `${"a".repeat(60)}!`, false); + }); + + test("Defeats overlapping alternation ReDoS: (a|a?)+", () => { + assertLinearTime("^(a|a?)+$", `${"a".repeat(60)}!`, false); + }); + + test("Defeats OWASP Email Validation ReDoS", () => { + const emailRegex = + 
"^([a-zA-Z0-9_.-])+@(([a-zA-Z0-9-])+.)+([a-zA-Z0-9]{2,4})+$"; + const maliciousEmail = `${"a".repeat(60)}@${"a".repeat(60)}.`; + assertLinearTime(emailRegex, maliciousEmail, false); + }); + + test("Defeats OWASP Whitespace / Content Exhaustion ReDoS", () => { + const whitespaceRegex = "^.*[ \\t]+.*$"; + const maliciousWhitespace = ` ${"\\t ".repeat(40)} `; + assertLinearTime(whitespaceRegex, maliciousWhitespace, true); + }); + + test("Defeats Path/URL Parsing ReDoS", () => { + const pathRegex = "^(/[^/]+)+$"; + const maliciousPath = `${"/a".repeat(60)}/`; + assertLinearTime(pathRegex, maliciousPath, false); + }); + }); + + describe("Infinite Loop & Memory Explosion Protections", () => { + test("Safely matches massive strings without exceeding Call Stack Size", () => { + const re = RE2JS.compile("a*b"); + const hugeString = `${"a".repeat(1000000)}b`; + assert.strictEqual(re.matches(hugeString), true); + }); + + test("Gracefully handles empty strings without crashing", () => { + const re1 = RE2JS.compile(".*"); + assert.strictEqual(re1.matches(""), true); + + const re2 = RE2JS.compile("a+"); + assert.strictEqual(re2.matches(""), false); + }); + + test("Properly scales multi-byte surrogate pairs (Emojis) in execution", () => { + const re = RE2JS.compile("^.$"); + assert.strictEqual(re.matches("😊"), true); + + const reEmoji = RE2JS.compile("^\\p{So}+$"); + assert.strictEqual(reEmoji.matches("😊🚀👽"), true); + }); + + test("DFA State Explosion limits are enforced (OOM Protection)", () => { + const re = RE2JS.compile(".*a.*b.*c"); + re.re2Input.dfa.stateLimit = 5; + assert.strictEqual(re.test("zzzaaazzzbbbzzzccczzz"), true); + }); + }); + + describe("NFA Fallback Correctness (DFA state explosion)", () => { + const forceNfaFallback = (regexStr: string, flags = 0) => { + const re = RE2JS.compile(regexStr, flags); + re.re2Input.dfa.failed = true; + return re; + }; + + describe("literal and simple patterns", () => { + test("matches simple literals", () => { + const re = 
forceNfaFallback("hello"); + assert.strictEqual(re.test("hello"), true); + assert.strictEqual(re.test("say hello world"), true); + assert.strictEqual(re.test("xyz"), false); + }); + + test("matches character classes", () => { + const re = forceNfaFallback("[a-z]+"); + assert.strictEqual(re.testExact("hello"), true); + assert.strictEqual(re.testExact("Hello"), false); + assert.strictEqual(re.test("XYZabcXYZ"), true); + }); + + test("matches negated character classes", () => { + const re = forceNfaFallback("[^0-9]+"); + assert.strictEqual(re.testExact("abc"), true); + assert.strictEqual(re.testExact("abc1"), false); + }); + }); + + describe("repetition", () => { + test("handles star (*)", () => { + const re = forceNfaFallback("a*b"); + assert.strictEqual(re.test("b"), true); + assert.strictEqual(re.test("ab"), true); + assert.strictEqual(re.test("aaaab"), true); + assert.strictEqual(re.test("c"), false); + }); + + test("handles plus (+)", () => { + const re = forceNfaFallback("a+b"); + assert.strictEqual(re.test("ab"), true); + assert.strictEqual(re.test("aaaab"), true); + assert.strictEqual(re.test("b"), false); + }); + + test("handles question (?)", () => { + const re = forceNfaFallback("colou?r"); + assert.strictEqual(re.test("color"), true); + assert.strictEqual(re.test("colour"), true); + assert.strictEqual(re.test("colouur"), false); + }); + + test("handles bounded repetition {m,n}", () => { + const re = forceNfaFallback("^a{2,4}$"); + assert.strictEqual(re.test("a"), false); + assert.strictEqual(re.test("aa"), true); + assert.strictEqual(re.test("aaaa"), true); + assert.strictEqual(re.test("aaaaa"), false); + }); + }); + + describe("alternation", () => { + test("handles simple alternation", () => { + const re = forceNfaFallback("foo|bar|baz"); + assert.strictEqual(re.test("foo"), true); + assert.strictEqual(re.test("bar"), true); + assert.strictEqual(re.test("baz"), true); + assert.strictEqual(re.test("qux"), false); + }); + + test("handles overlapping 
alternation", () => { + const re = forceNfaFallback("(abc|abd)"); + assert.strictEqual(re.test("abc"), true); + assert.strictEqual(re.test("abd"), true); + assert.strictEqual(re.test("abe"), false); + }); + }); + + describe("anchors and empty-width assertions", () => { + test("handles begin-text ^", () => { + const re = forceNfaFallback("^abc"); + assert.strictEqual(re.test("abc"), true); + assert.strictEqual(re.test("xabc"), false); + }); + + test("handles end-text $", () => { + const re = forceNfaFallback("abc$"); + assert.strictEqual(re.test("abc"), true); + assert.strictEqual(re.test("abcx"), false); + }); + + test("handles both anchors ^...$", () => { + const re = forceNfaFallback("^foo$"); + assert.strictEqual(re.test("foo"), true); + assert.strictEqual(re.test("xfoo"), false); + }); + + test("handles \\A and \\z", () => { + const re = forceNfaFallback("\\Aabc\\z"); + assert.strictEqual(re.test("abc"), true); + assert.strictEqual(re.test("abcd"), false); + }); + + test("handles word boundaries \\b", () => { + const re = forceNfaFallback("\\bword\\b"); + assert.strictEqual(re.test("a word here"), true); + assert.strictEqual(re.test("sword"), false); + assert.strictEqual(re.test("words"), false); + }); + + test("handles non-word-boundaries \\B", () => { + const re = forceNfaFallback("\\Babc\\B"); + assert.strictEqual(re.test("xabcx"), true); + }); + + test("handles multiline ^ and $", () => { + const re = forceNfaFallback("(?m)^foo$"); + assert.strictEqual(re.test("bar\nfoo\nbaz"), true); + assert.strictEqual(re.test("barfoo"), false); + }); + }); + + describe("testExact (ANCHOR_BOTH)", () => { + test("requires full string match", () => { + const re = forceNfaFallback("[0-9]+"); + assert.strictEqual(re.testExact("12345"), true); + assert.strictEqual(re.testExact("12345abc"), false); + assert.strictEqual(re.testExact("abc12345"), false); + }); + + test("empty input with empty-matching pattern", () => { + const re = forceNfaFallback(".*"); + 
assert.strictEqual(re.testExact(""), true); + }); + + test("empty input with non-empty pattern", () => { + const re = forceNfaFallback("a+"); + assert.strictEqual(re.testExact(""), false); + }); + }); + + describe("case insensitivity", () => { + test("handles (?i) flag", () => { + const re = forceNfaFallback("(?i)hello"); + assert.strictEqual(re.test("HELLO"), true); + assert.strictEqual(re.test("HeLLo"), true); + assert.strictEqual(re.test("goodbye"), false); + }); + + test("handles case-insensitive char classes", () => { + const re = forceNfaFallback("(?i)[a-z]+"); + assert.strictEqual(re.testExact("ABCdef"), true); + }); + }); + + describe("Unicode", () => { + test("handles Unicode properties \\p{L}", () => { + const re = forceNfaFallback("^\\p{L}+$"); + assert.strictEqual(re.test("héllo"), true); + assert.strictEqual(re.test("αβγ"), true); + assert.strictEqual(re.test("123"), false); + }); + + test("handles surrogate pairs", () => { + const re = forceNfaFallback("^.+$"); + assert.strictEqual(re.test("😊"), true); + assert.strictEqual(re.test("😊🚀👽"), true); + }); + }); + + describe("NFA fallback produces same results as unrestricted DFA", () => { + const equivalenceCases: Array<[string, string, boolean]> = [ + ["hello", "hello world", true], + ["hello", "goodbye", false], + ["^\\d{3}-\\d{4}$", "555-1234", true], + ["^\\d{3}-\\d{4}$", "555-12345", false], + ["[a-zA-Z_][a-zA-Z0-9_]*", "valid_name123", true], + ["a{3,}", "aaa", true], + ["^a{3,}$", "aa", false], + ["(?:abc){2,3}", "abcabc", true], + ["^(?:abc){2,3}$", "abc", false], + [".*\\.(jpg|png|gif)$", "photo.jpg", true], + [".*\\.(jpg|png|gif)$", "photo.txt", false], + ["^[A-Z][a-z]+$", "Hello", true], + ["^[A-Z][a-z]+$", "hello", false], + ["^[0-9]{4}-[0-9]{2}-[0-9]{2}$", "2026-04-16", true], + ["^[0-9]{4}-[0-9]{2}-[0-9]{2}$", "abcd-ef-gh", false], + ]; + + for (const [pattern, input, expected] of equivalenceCases) { + test(`NFA produces same result as DFA for /${pattern}/ on ${JSON.stringify(input)}`, () => 
{ + const reDfa = RE2JS.compile(pattern); + const reNfa = forceNfaFallback(pattern); + assert.strictEqual(reDfa.test(input), expected); + assert.strictEqual(reNfa.test(input), expected); + }); + } + }); + }); +}); diff --git a/packages/re2/src/__utils__/chars.ts b/packages/re2/src/__utils__/chars.ts new file mode 100644 index 0000000..e9e53ec --- /dev/null +++ b/packages/re2/src/__utils__/chars.ts @@ -0,0 +1,17 @@ +export const codePoint = (v: string): number => { + const cp = v.codePointAt(0); + if (cp === undefined) { + throw new Error("codePoint: empty string"); + } + return cp; +}; + +export const codePointAtOrThrow = (s: string, i: number): number => { + const cp = s.codePointAt(i); + if (cp === undefined) { + throw new Error( + `codePointAt(${i}) returned undefined for ${JSON.stringify(s)}`, + ); + } + return cp; +}; diff --git a/packages/re2/src/__utils__/parser.ts b/packages/re2/src/__utils__/parser.ts new file mode 100644 index 0000000..7292f81 --- /dev/null +++ b/packages/re2/src/__utils__/parser.ts @@ -0,0 +1,140 @@ +import { FOLD_CASE, NON_GREEDY, WAS_DOLLAR } from "../RE2Flags.js"; +import { Regexp } from "../Regexp.js"; +import { MAX_RUNE, simpleFold } from "../Unicode.js"; + +const OP_NAMES = new Map([ + [Regexp.Op.NO_MATCH, "no"], + [Regexp.Op.EMPTY_MATCH, "emp"], + [Regexp.Op.LITERAL, "lit"], + [Regexp.Op.CHAR_CLASS, "cc"], + [Regexp.Op.ANY_CHAR_NOT_NL, "dnl"], + [Regexp.Op.ANY_CHAR, "dot"], + [Regexp.Op.BEGIN_LINE, "bol"], + [Regexp.Op.END_LINE, "eol"], + [Regexp.Op.BEGIN_TEXT, "bot"], + [Regexp.Op.END_TEXT, "eot"], + [Regexp.Op.WORD_BOUNDARY, "wb"], + [Regexp.Op.NO_WORD_BOUNDARY, "nwb"], + [Regexp.Op.CAPTURE, "cap"], + [Regexp.Op.STAR, "star"], + [Regexp.Op.PLUS, "plus"], + [Regexp.Op.QUEST, "que"], + [Regexp.Op.REPEAT, "rep"], + [Regexp.Op.CONCAT, "cat"], + [Regexp.Op.ALTERNATE, "alt"], +]); + +export const dumpRegexp = (re: Regexp): string => { + let b = ""; + if (!OP_NAMES.has(re.op)) { + b += `op${re.op}`; + } else { + const name = 
OP_NAMES.get(re.op) as string; + + switch (re.op) { + case Regexp.Op.STAR: + case Regexp.Op.PLUS: + case Regexp.Op.QUEST: + case Regexp.Op.REPEAT: + if ((re.flags & NON_GREEDY) !== 0) { + b += "n"; + } + b += name; + break; + case Regexp.Op.LITERAL: + if (re.runes.length > 1) { + b += "str"; + } else { + b += "lit"; + } + if ((re.flags & FOLD_CASE) !== 0) { + for (let r of re.runes) { + if (simpleFold(r) !== r) { + b += "fold"; + break; + } + } + } + break; + default: + b += name; + break; + } + } + b += "{"; + switch (re.op) { + case Regexp.Op.END_TEXT: + if ((re.flags & WAS_DOLLAR) === 0) { + b += "\\z"; + } + break; + case Regexp.Op.LITERAL: + for (let r of re.runes) { + b += String.fromCodePoint(r); + } + break; + case Regexp.Op.CONCAT: + case Regexp.Op.ALTERNATE: + for (let sub of re.subs) { + b += dumpRegexp(sub); + } + break; + case Regexp.Op.STAR: + case Regexp.Op.PLUS: + case Regexp.Op.QUEST: + b += dumpRegexp(re.subs[0]); + break; + case Regexp.Op.REPEAT: + b += `${re.min},${re.max}`; + b += " "; + b += dumpRegexp(re.subs[0]); + break; + case Regexp.Op.CAPTURE: + if (re.name !== null && re.name.length > 0) { + b += re.name; + b += ":"; + } + b += dumpRegexp(re.subs[0]); + break; + case Regexp.Op.CHAR_CLASS: { + let sep = ""; + for (let i = 0; i < re.runes.length; i += 2) { + b += sep; + sep = " "; + let lo = re.runes[i]; + let hi = re.runes[i + 1]; + if (lo === hi) { + b += `0x${lo.toString(16)}`; + } else { + b += `0x${lo.toString(16)}-0x${hi.toString(16)}`; + } + } + break; + } + } + b += "}"; + return b; +}; + +export const mkCharClass = (f: (r: number) => boolean): string => { + const re = new Regexp(Regexp.Op.CHAR_CLASS); + let runes: number[] = []; + let lo = -1; + + for (let i = 0; i <= MAX_RUNE; i++) { + if (f(i)) { + if (lo < 0) { + lo = i; + } + } else if (lo >= 0) { + runes = [...runes, lo, i - 1]; + lo = -1; + } + } + if (lo >= 0) { + runes = [...runes, lo, MAX_RUNE]; + } + + re.runes = runes; + return dumpRegexp(re); +}; diff --git 
a/packages/re2/src/exceptions.ts b/packages/re2/src/exceptions.ts new file mode 100644 index 0000000..9cbc94a --- /dev/null +++ b/packages/re2/src/exceptions.ts @@ -0,0 +1,103 @@ +class RE2JSException extends Error { + /** @param {string} message */ + constructor(message: string) { + super(message); + this.name = "RE2JSException"; + } +} + +/** + * An exception thrown by the parser if the pattern was invalid. + */ +class RE2JSSyntaxException extends RE2JSException { + error: string; + input: string | null; + + /** + * @param {string} error + * @param {string|null} [input=null] + */ + constructor(error: string, input: string | null = null) { + let message = `error parsing regexp: ${error}`; + if (input) { + message += `: \`${input}\``; + } + + super(message); + this.name = "RE2JSSyntaxException"; + this.message = message; + /** @type {string} */ + this.error = error; + /** @type {string|null} */ + this.input = input; + } + + /** + * Retrieves the description of the error. + * @returns {string} + */ + getDescription(): string { + return this.error; + } + + /** + * Retrieves the erroneous regular-expression pattern. 
+ * @returns {string|null} + */ + getPattern(): string | null { + return this.input; + } +} + +/** + * An exception thrown by the compiler + */ +class RE2JSCompileException extends RE2JSException { + /** @param {string} message */ + constructor(message: string) { + super(message); + this.name = "RE2JSCompileException"; + } +} + +/** + * An exception thrown by using groups + */ +class RE2JSGroupException extends RE2JSException { + /** @param {string} message */ + constructor(message: string) { + super(message); + this.name = "RE2JSGroupException"; + } +} + +/** + * An exception thrown by flags + */ +class RE2JSFlagsException extends RE2JSException { + /** @param {string} message */ + constructor(message: string) { + super(message); + this.name = "RE2JSFlagsException"; + } +} + +/** + * An exception thrown for internal engine errors, such as corrupted bytecodes. + */ +class RE2JSInternalException extends RE2JSException { + /** @param {string} message */ + constructor(message: string) { + super(message); + this.name = "RE2JSInternalException"; + } +} + +export { + RE2JSException, + RE2JSSyntaxException, + RE2JSCompileException, + RE2JSGroupException, + RE2JSFlagsException, + RE2JSInternalException, +}; diff --git a/packages/re2/src/index.ts b/packages/re2/src/index.ts new file mode 100644 index 0000000..32b2c5d --- /dev/null +++ b/packages/re2/src/index.ts @@ -0,0 +1,213 @@ +import { ANCHOR_BOTH, PERL, UNICODE_GROUPS } from "./RE2Flags.js"; +import { fromUTF16 } from "./MachineInput.js"; +import { RE2 } from "./RE2.js"; +import { quoteMeta } from "./Utils.js"; +import { + RE2JSCompileException, + RE2JSException, + RE2JSFlagsException, + RE2JSGroupException, + RE2JSInternalException, + RE2JSSyntaxException, +} from "./exceptions.js"; + +/** + * A compiled representation of an RE2 regular expression + * + * @class + */ +class RE2JS { + patternInput: string; + flagsInput: number; + re2Input: RE2; + + /** + * Flag: case insensitive matching. 
+ */ + static CASE_INSENSITIVE = 1; + /** + * Flag: dot ({@code .}) matches all characters, including newline. + */ + static DOTALL = 2; + /** + * Flag: multiline matching: {@code ^} and {@code $} match at beginning and end of line, not just + * beginning and end of input. + */ + static MULTILINE = 4; + /** + * Flag: Unicode groups (e.g. {@code \p{Greek}} ) will be syntax errors. + */ + static DISABLE_UNICODE_GROUPS = 8; + + /** + * Returns a literal pattern string for the specified string. + * + * @param {string} str The string to be literalized + * @returns {string} A literal string replacement + */ + static quote(str: string): string { + return quoteMeta(str); + } + + /** + * Helper: create new RE2JS with given regex and flags. + * @param {string} regex + * @param {number} [flags=0] + * @returns {RE2JS} + */ + static compile(regex: string, flags = 0): RE2JS { + return new RE2JS(regex, flags); + } + + static validateFlags(flags: number): void { + if ( + (flags & + ~( + RE2JS.MULTILINE | + RE2JS.DOTALL | + RE2JS.CASE_INSENSITIVE | + RE2JS.DISABLE_UNICODE_GROUPS + )) !== + 0 + ) { + throw new RE2JSFlagsException( + "Flags should only be a combination of MULTILINE, DOTALL, CASE_INSENSITIVE, DISABLE_UNICODE_GROUPS", + ); + } + } + + static buildRegexWithFlags(regex: string, flags = 0): string { + let fregex = regex; + if ((flags & RE2JS.CASE_INSENSITIVE) !== 0) { + fregex = `(?i)${fregex}`; + } + if ((flags & RE2JS.DOTALL) !== 0) { + fregex = `(?s)${fregex}`; + } + if ((flags & RE2JS.MULTILINE) !== 0) { + fregex = `(?m)${fregex}`; + } + return fregex; + } + + /** + * Matches a string against a regular expression. 
+ * + * @param {string} regex the regular expression + * @param {string} input the input + * @returns {boolean} true if the regular expression matches the entire input + * @throws RE2JSSyntaxException if the regular expression is malformed + */ + static matches(regex: string, input: string): boolean { + return RE2JS.compile(regex).testExact(input); + } + + /** + * @param {string} pattern + * @param {number} flags + */ + constructor(pattern: string, flags = 0) { + let re2Flags = PERL; + if ((flags & RE2JS.DISABLE_UNICODE_GROUPS) !== 0) { + re2Flags &= ~UNICODE_GROUPS; + } + RE2JS.validateFlags(flags); + const fregex = RE2JS.buildRegexWithFlags(pattern, flags); + + this.patternInput = pattern; + this.flagsInput = flags; + this.re2Input = RE2.compileImpl(fregex, re2Flags); + } + + /** + * Releases memory used by internal caches associated with this pattern. + */ + reset(): void { + this.re2Input.reset(); + } + + /** + * Returns the flags used in the constructor. + * @returns {number} + */ + flags(): number { + return this.flagsInput; + } + + /** + * Returns the pattern used in the constructor. + * @returns {string} + */ + pattern(): string { + return this.patternInput; + } + + re2(): RE2 { + return this.re2Input; + } + + /** + * Matches a string against a regular expression. + * + * @param {string} input the input + * @returns {boolean} true if the regular expression matches the entire input + */ + matches(input: string): boolean { + return this.testExact(input); + } + + /** + * Tests whether the regular expression matches any part of the input string. + * + * @param {string} input - The input string to test against. + * @returns {boolean} `true` if the pattern is found anywhere in the input, `false` otherwise. + */ + test(input: string): boolean { + return this.re2Input.match(input); + } + + /** + * Tests whether the regular expression matches the ENTIRE input string. + * + * @param {string} input - The input string to test against. 
+ * @returns {boolean} `true` if the exact input string fully matches the pattern, `false` otherwise. + */ + testExact(input: string): boolean { + return ( + this.re2Input.executeEngine(fromUTF16(input), 0, ANCHOR_BOTH, 0) !== null + ); + } + + /** + * @returns {string} + */ + toString(): string { + return this.patternInput; + } + + /** + * Returns the number of capturing groups in this matcher's pattern. + * @returns {number} + */ + groupCount(): number { + return this.re2Input.numberOfCapturingGroups(); + } + + /** + * Return a map of the capturing groups in this matcher's pattern. + * @returns {Map} + */ + namedGroups(): Map { + return this.re2Input.namedGroups; + } +} + +export { + RE2JS, + RE2JSException, + RE2JSSyntaxException, + RE2JSCompileException, + RE2JSGroupException, + RE2JSFlagsException, + RE2JSInternalException, +}; diff --git a/packages/re2/tsconfig.json b/packages/re2/tsconfig.json new file mode 100644 index 0000000..be51485 --- /dev/null +++ b/packages/re2/tsconfig.json @@ -0,0 +1,6 @@ +{ + "files": ["src/index.ts"], + "extends": "../../tsconfig.base.json", + "include": ["src/**/*.test.ts"], + "exclude": ["./src/__tests__", "./src/__fixtures__", "./src/__utils__"] +} diff --git a/scripts/release.js b/scripts/release.js index fde1258..d9a4905 100644 --- a/scripts/release.js +++ b/scripts/release.js @@ -41,6 +41,7 @@ npmPublish(); function npmPublish() { const command = `npm publish --tag ${tag}` + + " --workspace packages/re2" + " --workspace packages/cel" + " --workspace packages/cel-spec"; execSync(command, {