diff --git a/LIMITATIONS.md b/LIMITATIONS.md new file mode 100644 index 0000000..873182a --- /dev/null +++ b/LIMITATIONS.md @@ -0,0 +1,63 @@ +# Generex limitations + +Generex generates strings from a regex, but it doesn't understand the full Java regex dialect. +This is a reference for what won't work, and what to do about it. + +**Rule of thumb**: always validate generated output with +`Pattern.compile(yourPattern).matcher(generated).matches()` before trusting it. If that fails, +your pattern is in one of the categories below. + +--- + +## Patterns that don't work + +| Pattern feature | Example | Workaround | +| --- | --- | --- | +| Lookahead / lookbehind | `(?=...)`, `(?<=...)` | No workaround — restructure the regex without them. | +| Named groups | `(?<name>...)` | Use plain `(...)`. | +| Inline flags | `(?i)`, `(?s)`, `(?m)`, `(?x)`, `(?i:...)` | Encode case-insensitivity by hand: `[Aa][Bb][Cc]`. | +| Unicode property escapes | `\p{L}`, `\p{Digit}`, `\P{...}` | List the characters you actually want explicitly. | +| Word boundary | `\b`, `\B` | Not expressible; restructure. | +| Character-class intersection | `[a-z&&[^aeiou]]` | List the actual characters: `[b-df-hj-np-tv-z]`. | +| Possessive / reluctant quantifiers | `*+`, `++`, `*?`, `+?` | Use the plain forms — generation doesn't care about greediness. | +| Octal / hex / control escapes | `\012`, `\x1F`, `\cX` | Embed the character literally. | + +If your pattern uses any of these, `Generex.isValidPattern(...)` may still return `true` and +generation may still produce output — it'll just be the wrong language. Always round-trip +through `Pattern.matches(...)`. 
+ +--- + +## Characters that are special to Generex even when Java treats them as literals + +Generex parses patterns with a Brics-flavored engine that treats these as operators **outside** +character classes: + +| Character | What Generex does | Escape as | +| --- | --- | --- | +| `&` | Intersection | `\&` | +| `~` | Complement | `\~` | +| `#` | Empty language | `\#` | +| `@` | Any string | `\@` | +| `"..."` | Literal string | `\"...\"` | +| `<10-99>` | Numerical range | `\<10-99\>` | + +If your pattern contains any of these as data, escape them before constructing the `Generex`. + +--- + +## Operational caveats + +- **Infinite regexes (`a*`, `(ab)+`, `\w+`) default to a 50-character cap** when calling + `generex.random()` with no arguments. Pass explicit `random(min, max)` to override. +- **Infinite regexes use a 1000-iteration budget.** If a pattern is structured so that finding a + match would require more, Generex returns the closest partial match it found — which may not + actually match the regex. +- **`matchedStringsSize()` / `getMatchedString(n)` overflow silently** on languages with more + than `Long.MAX_VALUE` matches (`[a-zA-Z0-9]{1,30}` etc.). +- **`getAllMatchedStrings()` materializes the entire language.** Prefer `iterator()` for anything + non-trivial. +- **`\D`, `\S`, `\W` cover the full Unicode BMP** — including control characters, surrogates, + and unassigned codepoints. `[\D]` will happily produce a NUL byte or `￾`. If you need + printable output, list the allowed characters explicitly (e.g. `[a-zA-Z !-.]`). 
diff --git a/src/main/java/com/pkware/generex/Generex.java b/src/main/java/com/pkware/generex/Generex.java index 31de7b5..0566047 100644 --- a/src/main/java/com/pkware/generex/Generex.java +++ b/src/main/java/com/pkware/generex/Generex.java @@ -27,13 +27,11 @@ import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collection; -import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Map.Entry; import java.util.Random; import java.util.Set; import java.util.regex.Matcher; @@ -48,14 +46,6 @@ */ public class Generex implements Iterable { - /** - * The predefined character classes supported by {@code Generex}. - *

- * An immutable map containing as keys the character classes and values the equivalent regular expression syntax. - * - * @see #createRegExp(String) - */ - private static final Map PREDEFINED_CHARACTER_CLASSES; private RegExp regExp; private Automaton automaton; private List matchedStrings = new ArrayList(); @@ -83,17 +73,6 @@ public class Generex implements Iterable { */ public static final int DEFAULT_INFINITE_MAX_LENGTH = 50; - static { - Map characterClasses = new HashMap(); - characterClasses.put("\\\\d", "[0-9]"); - characterClasses.put("\\\\D", "[^0-9]"); - characterClasses.put("\\\\s", "[ \t\n\f\r]"); - characterClasses.put("\\\\S", "[^ \t\n\f\r]"); - characterClasses.put("\\\\w", "[a-zA-Z_0-9]"); - characterClasses.put("\\\\W", "[^a-zA-Z_0-9]"); - PREDEFINED_CHARACTER_CLASSES = Collections.unmodifiableMap(characterClasses); - } - public Generex(String regex) { this(regex, new Random()); } @@ -116,23 +95,16 @@ public Generex(Automaton automaton, Random random) { /** * Creates a {@code RegExp} instance from the given regular expression. - *

- * Predefined character classes are replaced with equivalent regular expression syntax prior creating the instance. * * @param regex the regular expression used to build the {@code RegExp} instance * @return a {@code RegExp} instance for the given regular expression * @throws NullPointerException if the given regular expression is {@code null} * @throws IllegalArgumentException if an error occurred while parsing the given regular expression * @throws StackOverflowError if the regular expression has to many transitions - * @see #PREDEFINED_CHARACTER_CLASSES * @see #isValidPattern(String) */ private static RegExp createRegExp(String regex) { - String finalRegex = convertToBricsRegex(regex); - for (Entry charClass : PREDEFINED_CHARACTER_CLASSES.entrySet()) { - finalRegex = finalRegex.replaceAll(charClass.getKey(), charClass.getValue()); - } - return new RegExp(finalRegex); + return new RegExp(convertToBricsRegex(regex)); } /** @@ -148,6 +120,12 @@ private static RegExp createRegExp(String regex) { * since brics does not support non-capturing group syntax. This is a lossless * transformation because Generex only generates strings and never extracts capture * groups. + *

  • Expands the predefined shorthand classes {@code \d \D \s \S \w \W} into brics-native + * character classes. Inside a {@code [...]} they expand to class-body form (e.g. + * {@code \d -> 0-9}); outside a class they expand to the full bracketed form + * ({@code \d -> [0-9]}). Negated shorthands inside a class expand to an explicit + * complementary Unicode range, so {@code [a\D]} yields {@code [a} plus every + * non-digit Unicode BMP codepoint.
  • * * *

    The conversion is performed in a single pass that tracks escape sequences and character @@ -180,6 +158,14 @@ private static String convertToBricsRegex(@NotNull String regex) { } if (c == '\\') { + if (i + 1 < regex.length()) { + String expansion = expandShorthandClass(regex.charAt(i + 1), inCharClass); + if (expansion != null) { + result.append(expansion); + i++; + continue; + } + } result.append(c); escaped = true; continue; @@ -194,15 +180,17 @@ private static String convertToBricsRegex(@NotNull String regex) { if (c == '[') { inCharClass = true; result.append(c); + // Per regex standard, ] right after [ or [^ is a literal ] inside the class, not + // the closing bracket. Only pre-consume ^ when it pairs with such a literal ]; + // otherwise let the main loop handle ^ on its next iteration so it isn't appended + // twice. int next = i + 1; - // Per regex standard, ] right after [ or [^ is a literal ] inside the class, not the closing bracket. - if (next < regex.length() && regex.charAt(next) == '^') { - result.append('^'); - next++; - } - if (next < regex.length() && regex.charAt(next) == ']') { + boolean hasCaret = next < regex.length() && regex.charAt(next) == '^'; + int closeCandidate = hasCaret ? next + 1 : next; + if (closeCandidate < regex.length() && regex.charAt(closeCandidate) == ']') { + if (hasCaret) result.append('^'); result.append(']'); - i = next; + i = closeCandidate; } continue; } @@ -232,6 +220,31 @@ private static String convertToBricsRegex(@NotNull String regex) { return result.toString(); } + /** + * Returns the brics-compatible expansion of a Java shorthand character class, or {@code null} + * if {@code shorthand} is not one of {@code d D s S w W}. + * + *

    The expansion form depends on whether the caller is currently inside a {@code [...]}: + * inside a class it returns class-body form ({@code 0-9}) so it can be concatenated with + * other class members; outside a class it returns the bracketed form ({@code [0-9]}). + * Negative shorthands inside a class expand to explicit complementary Unicode BMP ranges, + * since brics character classes cannot mix negation with other set members. + */ + private static String expandShorthandClass(char shorthand, boolean inCharClass) { + switch (shorthand) { + case 'd': return inCharClass ? "0-9" : "[0-9]"; + case 'D': return inCharClass ? "-/:-￿" : "[^0-9]"; + case 's': return inCharClass ? " \t\n\f\r" : "[ \t\n\f\r]"; + case 'S': return inCharClass ? "- -!-￿" : "[^ \t\n\f\r]"; + case 'w': return inCharClass ? "a-zA-Z_0-9" : "[a-zA-Z_0-9]"; + // Range \[-\^ covers the contiguous code points 0x5b-0x5e ('[', '\', ']', '^'); each + // endpoint is escaped because both ] and \ would otherwise be parsed by brics as + // structural characters inside the class. + case 'W': return inCharClass ? 
"-/:-@\\[-\\^`{-￿" : "[^a-zA-Z_0-9]"; + default: return null; + } + } + /** * initialize the random instance used with a seed value to generate a * pseudo random suite of strings based on the passed seed and matches the used regular expression diff --git a/src/test/kotlin/com/pkware/generex/KotlinTests.kt b/src/test/kotlin/com/pkware/generex/KotlinTests.kt index 7600639..087a711 100644 --- a/src/test/kotlin/com/pkware/generex/KotlinTests.kt +++ b/src/test/kotlin/com/pkware/generex/KotlinTests.kt @@ -1,11 +1,13 @@ package com.pkware.generex import com.google.common.truth.Truth.assertThat +import org.junit.jupiter.api.Assertions.assertThrows import org.junit.jupiter.api.Test import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.Arguments import org.junit.jupiter.params.provider.MethodSource import org.junit.jupiter.params.provider.ValueSource +import java.util.regex.Pattern import java.util.stream.Stream import kotlin.collections.iterator import kotlin.math.max @@ -283,6 +285,71 @@ class KotlinTests { assertThat(averageMs).isLessThan(100.0) } + @ParameterizedTest + @MethodSource("shorthandInsideBracketsArgs") + fun `shorthand class inside brackets honors Java regex semantics`(pattern: String) { + val generex = Generex(pattern) + repeat(100) { + val result = generex.random() + assertThat(result).matches(pattern) + } + } + + @ParameterizedTest + @ValueSource(strings = ["\\S", "\\W", "\\D"]) + fun `inverse shorthand inside brackets honors Java regex xsemantics`(inverseShorthand: String) { + val generex = Generex("[$inverseShorthand]") + val expected = Pattern.compile(inverseShorthand) + repeat(10_000) { + assertThat(generex.random()).matches(expected) + } + } + + // ─── LIMITATIONS.md regression tests ───────────────────────────────────────────────────── + // + // These lock in the currently-documented behavior for unsupported Java regex constructs. 
+ // If one of them starts failing, either we accidentally fixed a limitation (update the doc) + // or we accidentally regressed (investigate). + + @ParameterizedTest + @ValueSource(strings = [ + "(?abc)", // named group — brics resolves as a named automaton at build time + "\\p{L}", // unicode property — brics tries to parse {L} as a quantifier + "abc\"def", // unescaped double quote — brics treats " as literal-string delimiter + ]) + fun `limitation patterns rejected by Generex constructor`(pattern: String) { + assertThrows(IllegalArgumentException::class.java) { Generex(pattern) } + } + + @ParameterizedTest + @MethodSource("limitationSilentMismatchArgs") + fun `LIMITATIONS broken pattern misbehaves and the workaround fixes it`( + pattern: String, + workaround: String, + ) { + val broken = Generex(pattern) + repeat(100) { + // pattern in known limitations will not produce a valid generation + assertThat(broken.random()).doesNotMatch(pattern) + } + + val fixed = Generex(workaround) + repeat(100) { + // workaround pattern will produce a valid generation + assertThat(fixed.random()).matches(pattern) + } + } + + @ParameterizedTest + @MethodSource("bricsSpecialEscapeArgs") + fun `brics-special characters can be escaped to produce the literal`( + pattern: String, + expected: String, + ) { + val generex = Generex(pattern) + assertThat(generex.random()).isEqualTo(expected) + } + companion object { @JvmStatic @@ -350,5 +417,77 @@ class KotlinTests { Arguments.of("[a-zA-Z0-9]{1,100}"), Arguments.of("[a-zA-Z0-9]{1,200}"), ) + + @JvmStatic + fun shorthandInsideBracketsArgs() = Stream.of( + // Each positive shorthand alone in a class. + Arguments.of("[\\d]"), + Arguments.of("[\\s]"), + Arguments.of("[\\w]"), + // Each negative shorthand alone in a class. Brics treats the inner '^' as a literal, + // so the bug also corrupts these classes (extra '[', '^', ']' characters). 
+ Arguments.of("[\\D]"), + Arguments.of("[\\S]"), + Arguments.of("[\\W]"), + // Shorthand at start, middle, and end of a class with literal neighbors. + Arguments.of("[\\dabc]"), + Arguments.of("[a\\dc]"), + Arguments.of("[abc\\d]"), + // Shorthand combined with explicit ranges. + Arguments.of("[\\dA-F]"), + Arguments.of("[A-F\\d]"), + Arguments.of("[A-Fa-f\\d]"), + // Multiple shorthands in a single class. + Arguments.of("[\\d\\w]"), + Arguments.of("[\\d\\s]"), + Arguments.of("[\\w\\W]"), + // Negated outer class containing a shorthand. + Arguments.of("[^\\d]"), + Arguments.of("[^\\w]"), + Arguments.of("[^a\\d]"), + // Quantified classes — the bug repeats the literal ']' under quantifier. + Arguments.of("[\\d]{1,4}"), + Arguments.of("[A-Fa-f\\d]{1,4}"), + Arguments.of("[\\d\\w]{2,5}"), + // Shorthand inside class adjacent to a literal sequence outside the class. + Arguments.of("prefix[\\d]suffix"), + Arguments.of("([\\d]){2,3}"), + ) + + @JvmStatic + fun limitationSilentMismatchArgs() = Stream.of( + // (brokenPattern, workaround) — workaround must be a brics-acceptable + // regex whose generated output also satisfies the brokenPattern under Java regex. + // + // Lookahead — brics doesn't recognize (?=, so `?=` leaks into output as literals. + Arguments.of("(?=ab)ab", "ab"), + // Backreference — brics treats \1 as literal `1`, not a group reference. + Arguments.of("(a)\\1", "aa"), + // Inline flag — brics doesn't strip (?i), so `?i` leaks into output. + Arguments.of("(?i)abc", "[Aa][Bb][Cc]"), + // Word boundary — brics treats \b as escaped `b`. + Arguments.of("\\babc", "abc"), + // Character-class intersection — brics doesn't honor && inside a class. + Arguments.of("[a-z&&[^aeiou]]", "[b-df-hj-np-tv-z]"), + // Octal escape — brics treats \012 as literal `012`. Workaround: embed the actual char. + Arguments.of("\\012", "\u000A"), + // Hex escape — brics treats \x1F as literal `x1F`. Workaround: embed the actual char. 
+ Arguments.of("\\x1F", "\u001F"), + // Control escape — brics treats \cX as literal `cX`. Workaround: embed the actual char. + Arguments.of("\\cX", "\u0018"), + // Brics operators that Java treats as literals — escape them. + Arguments.of("abc&def", "abc\\&def"), + Arguments.of("abc#def", "abc\\#def"), + Arguments.of("<10-99>", "\\<10-99\\>"), + ) + + @JvmStatic + fun bricsSpecialEscapeArgs() = Stream.of( + // The brics-special operators from LIMITATIONS.md must round-trip when escaped. + Arguments.of("\\&", "&"), + Arguments.of("\\~", "~"), + Arguments.of("\\#", "#"), + Arguments.of("\\@", "@"), + ) } }