pkware · ethan-wrasman-pkware · May 28, 2026 · May 12, 2026 · May 12, 2026
@@ -0,0 +1,63 @@
+# Generex limitations
+
+Generex generates strings from a regex, but it doesn't understand the full Java regex dialect.
+This is a reference for what won't work, and what to do about it.
+
+**Rule of thumb**: always validate generated output with
+`Pattern.compile(yourPattern).matcher(generated).matches()` before trusting it. If that fails,
+your pattern is in one of the categories below.
+
+---
+
+## Patterns that don't work
+
+| Pattern feature | Example | Workaround |
+| --- | --- | --- |
+| Lookahead / lookbehind | `(?=...)`, `(?<!...)` | Restructure the regex to not need zero-width assertions. |
+| Backreferences | `\1`, `\k<name>` | No workaround — restructure the regex without them. |
+| Named groups | `(?<name>...)` | Use plain `(...)`. |
+| Inline flags | `(?i)`, `(?s)`, `(?m)`, `(?x)`, `(?i:...)` | Encode case-insensitivity by hand: `[Aa][Bb][Cc]`. |
+| Unicode property escapes | `\p{L}`, `\p{Digit}`, `\P{...}` | List the characters you actually want explicitly. |
+| Word boundary | `\b`, `\B` | Not expressible; restructure. |
+| Character-class intersection | `[a-z&&[^aeiou]]` | List the actual characters: `[b-df-hj-np-tv-z]`. |
+| Possessive / reluctant quantifiers | `*+`, `++`, `*?`, `+?` | Use the plain forms — generation doesn't care about greediness. |
+| Octal / hex / control escapes | `\012`, `\x1F`, `\cX` | Embed the character literally. |
+
+If your pattern uses any of these, `Generex.isValidPattern(...)` may still return `true` and
+generation may still produce output — it'll just be the wrong language. Always round-trip
+through `Pattern.matches(...)`.
+
+---
+
+## Characters that are special to Generex even when Java treats them as literals
+
+Generex parses patterns with a Brics-flavored engine that treats these as operators **outside**
+character classes:
+
+| Character | What Generex does | Escape as |
+| --- | --- | --- |
+| `&` | Intersection | `\&` |
+| `~` | Complement | `\~` |
+| `#` | Empty language | `\#` |
+| `@` | Any string | `\@` |
+| `"..."` | Literal string | `\"...\"` |
+| `<10-99>` | Numerical range | `\<10-99\>` |
+
+If your pattern contains any of these as data, escape them before constructing the `Generex`.
+
+---
+
+## Operational caveats
+
+- **Infinite regexes (`a*`, `(ab)+`, `\w+`) default to a 50-character cap** when calling
+  `generex.random()` with no arguments. Pass explicit `random(min, max)` to override.
+- **Infinite regexes use a 1000-iteration budget.** If a pattern is structured so that finding a
+  match would require more, Generex returns the closest partial match it found — which may not
+  actually match the regex.
+- **`matchedStringsSize()` / `getMatchedString(n)` overflow silently** on languages with more
+  than `Long.MAX_VALUE` matches (`[a-zA-Z0-9]{1,30}` etc.).
+- **`getAllMatchedStrings()` materializes the entire language.** Prefer `iterator()` for anything
+  non-trivial.
+- **`\D`, `\S`, `\W` cover the full Unicode BMP** — including control characters, surrogates,
+  and unassigned codepoints. `[\D]` will happily produce a NUL byte or `U+FFFF`. If you need
+  printable output, list the allowed characters explicitly (e.g. `[a-zA-Z !-.]`).
@@ -27,13 +27,11 @@
 import java.util.ArrayDeque;
 import java.util.ArrayList;
 import java.util.Collection;
-import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
-import java.util.Map.Entry;
 import java.util.Random;
 import java.util.Set;
 import java.util.regex.Matcher;
@@ -48,14 +46,6 @@
  */
 public class Generex implements Iterable<String> {
 
-    /**
-     * The predefined character classes supported by {@code Generex}.
-     * <p>
-     * An immutable map containing as keys the character classes and values the equivalent regular expression syntax.
-     *
-     * @see #createRegExp(String)
-     */
-    private static final Map<String, String> PREDEFINED_CHARACTER_CLASSES;
     private RegExp regExp;
     private Automaton automaton;
     private List<String> matchedStrings = new ArrayList<String>();
@@ -83,17 +73,6 @@ public class Generex implements Iterable<String> {
      */
     public static final int DEFAULT_INFINITE_MAX_LENGTH = 50;
 
-    static {
-        Map<String, String> characterClasses = new HashMap<String, String>();
-        characterClasses.put("\\\\d", "[0-9]");
-        characterClasses.put("\\\\D", "[^0-9]");
-        characterClasses.put("\\\\s", "[ \t\n\f\r]");
-        characterClasses.put("\\\\S", "[^ \t\n\f\r]");
-        characterClasses.put("\\\\w", "[a-zA-Z_0-9]");
-        characterClasses.put("\\\\W", "[^a-zA-Z_0-9]");
-        PREDEFINED_CHARACTER_CLASSES = Collections.unmodifiableMap(characterClasses);
-    }
-
     /**
      * Creates a {@code Generex} for the given regular expression, delegating to the
      * two-argument constructor with a newly created {@link Random}.
      *
      * @param regex the regular expression to generate strings from
      */
     public Generex(String regex) {
         this(regex, new Random());
     }
@@ -116,23 +95,16 @@ public Generex(Automaton automaton, Random random) {
 
     /**
      * Creates a {@code RegExp} instance from the given regular expression.
-     * <p>
-     * Predefined character classes are replaced with equivalent regular expression syntax prior creating the instance.
      *
      * @param regex the regular expression used to build the {@code RegExp} instance
      * @return a {@code RegExp} instance for the given regular expression
      * @throws NullPointerException     if the given regular expression is {@code null}
      * @throws IllegalArgumentException if an error occurred while parsing the given regular expression
      * @throws StackOverflowError       if the regular expression has too many transitions
-     * @see #PREDEFINED_CHARACTER_CLASSES
      * @see #isValidPattern(String)
      */
     private static RegExp createRegExp(String regex) {
-        String finalRegex = convertToBricsRegex(regex);
-        for (Entry<String, String> charClass : PREDEFINED_CHARACTER_CLASSES.entrySet()) {
-            finalRegex = finalRegex.replaceAll(charClass.getKey(), charClass.getValue());
-        }
-        return new RegExp(finalRegex);
+        return new RegExp(convertToBricsRegex(regex));
     }
 
     /**
@@ -148,6 +120,12 @@ private static RegExp createRegExp(String regex) {
      *       since brics does not support non-capturing group syntax. This is a lossless
      *       transformation because Generex only generates strings and never extracts capture
      *       groups.</li>
+     *   <li>Expands the predefined shorthand classes {@code \d \D \s \S \w \W} into brics-native
+     *       character classes. Inside a {@code [...]} they expand to class-body form (e.g.
+     *       {@code \d -> 0-9}); outside a class they expand to the full bracketed form
+     *       ({@code \d -> [0-9]}). Negated shorthands inside a class expand to an explicit
+     *       complementary Unicode range, so {@code [a\D]} yields {@code [a} plus every
+     *       non-digit Unicode BMP codepoint.</li>
      * </ul>
      *
      * <p>The conversion is performed in a single pass that tracks escape sequences and character
@@ -180,6 +158,14 @@ private static String convertToBricsRegex(@NotNull String regex) {
             }
 
             if (c == '\\') {
+                if (i + 1 < regex.length()) {
+                    String expansion = expandShorthandClass(regex.charAt(i + 1), inCharClass);
+                    if (expansion != null) {
+                        result.append(expansion);
+                        i++;
+                        continue;
+                    }
+                }
                 result.append(c);
                 escaped = true;
                 continue;
@@ -194,15 +180,17 @@ private static String convertToBricsRegex(@NotNull String regex) {
             if (c == '[') {
                 inCharClass = true;
                 result.append(c);
+                // Per regex standard, ] right after [ or [^ is a literal ] inside the class, not
+                // the closing bracket. Only pre-consume ^ when it pairs with such a literal ];
+                // otherwise let the main loop handle ^ on its next iteration so it isn't appended
+                // twice.
                 int next = i + 1;
-                // Per regex standard, ] right after [ or [^ is a literal ] inside the class, not the closing bracket.
-                if (next < regex.length() && regex.charAt(next) == '^') {
-                    result.append('^');
-                    next++;
-                }
-                if (next < regex.length() && regex.charAt(next) == ']') {
+                boolean hasCaret = next < regex.length() && regex.charAt(next) == '^';
+                int closeCandidate = hasCaret ? next + 1 : next;
+                if (closeCandidate < regex.length() && regex.charAt(closeCandidate) == ']') {
+                    if (hasCaret) result.append('^');
                     result.append(']');
-                    i = next;
+                    i = closeCandidate;
                 }
                 continue;
             }
@@ -232,6 +220,31 @@ private static String convertToBricsRegex(@NotNull String regex) {
         return result.toString();
     }
 
+    /**
+     * Returns the brics-compatible expansion of a Java shorthand character class, or {@code null}
+     * if {@code shorthand} is not one of {@code d D s S w W}.
+     *
+     * <p>The expansion form depends on whether the caller is currently inside a {@code [...]}:
+     * inside a class it returns class-body form ({@code 0-9}) so it can be concatenated with
+     * other class members; outside a class it returns the bracketed form ({@code [0-9]}).
+     * Negative shorthands inside a class expand to explicit complementary Unicode BMP ranges,
+     * since brics character classes cannot mix negation with other set members.
+     *
+     * <p>The non-printable range endpoints (U+0000, U+0008, U+000E, U+001F, U+FFFF) are written
+     * as Unicode escapes rather than raw characters, so the literals stay readable and survive
+     * editors, diff viewers and copy/paste that drop invisible code points.
+     *
+     * <p>NOTE(review): the endpoints were reconstructed from the complement of the matching
+     * positive shorthand; confirm them against the intended ranges. The {@code S} expansion
+     * leaves out the whole block U+0009..U+000D, which also omits U+000B (vertical tab), so that
+     * every generated character satisfies Java's {@code \S}.
+     *
+     * @param shorthand   the character that follows the backslash in the source pattern
+     * @param inCharClass {@code true} when the shorthand appears inside a {@code [...]} class
+     * @return the expansion text to append to the brics pattern, or {@code null} when
+     *         {@code shorthand} is not a supported shorthand class letter
+     */
+    private static String expandShorthandClass(char shorthand, boolean inCharClass) {
+        switch (shorthand) {
+            case 'd': return inCharClass ? "0-9" : "[0-9]";
+            // U+0000..'/' and ':'..U+FFFF: everything except the digits 0x30-0x39.
+            case 'D': return inCharClass ? "\u0000-/:-\uFFFF" : "[^0-9]";
+            case 's': return inCharClass ? " \t\n\f\r" : "[ \t\n\f\r]";
+            // U+0000..U+0008, U+000E..U+001F and '!'..U+FFFF: skips U+0009..U+000D and the space.
+            case 'S': return inCharClass ? "\u0000-\u0008\u000E-\u001F!-\uFFFF" : "[^ \t\n\f\r]";
+            case 'w': return inCharClass ? "a-zA-Z_0-9" : "[a-zA-Z_0-9]";
+            // Range \[-\^ covers the contiguous code points 0x5b-0x5e ('[', '\', ']', '^'); each
+            // endpoint is escaped because both ] and \ would otherwise be parsed by brics as
+            // structural characters inside the class. The remaining pieces are U+0000..'/',
+            // ':'..'@', the backtick, and '{'..U+FFFF.
+            case 'W': return inCharClass ? "\u0000-/:-@\\[-\\^`{-\uFFFF" : "[^a-zA-Z_0-9]";
+            default: return null;
+        }
+    }
+
     /**
      * initialize the random instance used with a seed value  to generate a
      * pseudo random suite of strings based on the passed seed and matches the used regular expression

@@ -1,11 +1,13 @@
 package com.pkware.generex
 
 import com.google.common.truth.Truth.assertThat
+import org.junit.jupiter.api.Assertions.assertThrows
 import org.junit.jupiter.api.Test
 import org.junit.jupiter.params.ParameterizedTest
 import org.junit.jupiter.params.provider.Arguments
 import org.junit.jupiter.params.provider.MethodSource
 import org.junit.jupiter.params.provider.ValueSource
+import java.util.regex.Pattern
 import java.util.stream.Stream
 import kotlin.collections.iterator
 import kotlin.math.max
@@ -283,6 +285,71 @@ class KotlinTests {
         assertThat(averageMs).isLessThan(100.0)
     }
 
+    /**
+     * For every pattern supplied by [shorthandInsideBracketsArgs], draws 100 random strings and
+     * checks each one against the very same pattern, so the pattern doubles as its own oracle.
+     */
+    @ParameterizedTest
+    @MethodSource("shorthandInsideBracketsArgs")
+    fun `shorthand class inside brackets honors Java regex semantics`(pattern: String) {
+        val generex = Generex(pattern)
+        repeat(100) {
+            val result = generex.random()
+            // The generated string must match the source pattern as a regex.
+            assertThat(result).matches(pattern)
+        }
+    }
+
+    /**
+     * Wraps each negated shorthand in a character class (e.g. `[\S]`) and checks 10,000 random
+     * generations against the bare shorthand compiled with [Pattern], so the bracketed form may
+     * only ever produce characters the unbracketed Java shorthand accepts.
+     */
+    @ParameterizedTest
+    @ValueSource(strings = ["\\S", "\\W", "\\D"])
+    fun `inverse shorthand inside brackets honors Java regex semantics`(inverseShorthand: String) {
+        val generex = Generex("[$inverseShorthand]")
+        val expected = Pattern.compile(inverseShorthand)
+        repeat(10_000) {
+            assertThat(generex.random()).matches(expected)
+        }
+    }
+
+    // ─── LIMITATIONS.md regression tests ─────────────────────────────────────────────────────
+    //
+    // These lock in the currently-documented behavior for unsupported Java regex constructs.
+    // If one of them starts failing, either we accidentally fixed a limitation (update the doc)
+    // or we accidentally regressed (investigate).
+
+    /**
+     * Each listed pattern is expected to make the `Generex` constructor throw
+     * [IllegalArgumentException] instead of building a generator.
+     */
+    @ParameterizedTest
+    @ValueSource(strings = [
+        "(?<name>abc)",  // named group — brics resolves <name> as a named automaton at build time
+        "\\p{L}",         // unicode property — brics tries to parse {L} as a quantifier
+        "abc\"def",      // unescaped double quote — brics treats " as literal-string delimiter
+    ])
+    fun `limitation patterns rejected by Generex constructor`(pattern: String) {
+        assertThrows(IllegalArgumentException::class.java) { Generex(pattern) }
+    }
+
+    /**
+     * For each (pattern, workaround) pair from [limitationSilentMismatchArgs]:
+     * 100 strings generated from the unsupported `pattern` must each fail to match it, and
+     * 100 strings generated from `workaround` must each match the original `pattern`.
+     */
+    @ParameterizedTest
+    @MethodSource("limitationSilentMismatchArgs")
+    fun `LIMITATIONS broken pattern misbehaves and the workaround fixes it`(
+        pattern: String,
+        workaround: String,
+    ) {
+        val broken = Generex(pattern)
+        repeat(100) {
+            // A pattern in the known limitations constructs fine but never yields a string that
+            // matches it.
+            assertThat(broken.random()).doesNotMatch(pattern)
+        }
+
+        val fixed = Generex(workaround)
+        repeat(100) {
+            // The workaround's output is checked against the original pattern, not the workaround.
+            assertThat(fixed.random()).matches(pattern)
+        }
+    }
+
+    /**
+     * Generates one string from each backslash-escaped pattern supplied by
+     * [bricsSpecialEscapeArgs] and expects it to equal the paired literal exactly.
+     */
+    @ParameterizedTest
+    @MethodSource("bricsSpecialEscapeArgs")
+    fun `brics-special characters can be escaped to produce the literal`(
+        pattern: String,
+        expected: String,
+    ) {
+        val generex = Generex(pattern)
+        assertThat(generex.random()).isEqualTo(expected)
+    }
+
     companion object {
 
         @JvmStatic
@@ -350,5 +417,77 @@ class KotlinTests {
             Arguments.of("[a-zA-Z0-9]{1,100}"),
             Arguments.of("[a-zA-Z0-9]{1,200}"),
         )
+
+        /**
+         * Patterns that place a shorthand class (`\d \D \s \S \w \W`) inside a `[...]` character
+         * class. Each entry is a single-argument case: the pattern itself.
+         */
+        @JvmStatic
+        fun shorthandInsideBracketsArgs() = Stream.of(
+            // Each positive shorthand alone in a class.
+            Arguments.of("[\\d]"),
+            Arguments.of("[\\s]"),
+            Arguments.of("[\\w]"),
+            // Each negative shorthand alone in a class. Brics treats the inner '^' as a literal,
+            // so substituting the bracketed form (e.g. "[^0-9]") inside an existing class
+            // corrupts it with extra '[', '^', ']' characters.
+            Arguments.of("[\\D]"),
+            Arguments.of("[\\S]"),
+            Arguments.of("[\\W]"),
+            // Shorthand at start, middle, and end of a class with literal neighbors.
+            Arguments.of("[\\dabc]"),
+            Arguments.of("[a\\dc]"),
+            Arguments.of("[abc\\d]"),
+            // Shorthand combined with explicit ranges.
+            Arguments.of("[\\dA-F]"),
+            Arguments.of("[A-F\\d]"),
+            Arguments.of("[A-Fa-f\\d]"),
+            // Multiple shorthands in a single class.
+            Arguments.of("[\\d\\w]"),
+            Arguments.of("[\\d\\s]"),
+            Arguments.of("[\\w\\W]"),
+            // Negated outer class containing a shorthand.
+            Arguments.of("[^\\d]"),
+            Arguments.of("[^\\w]"),
+            Arguments.of("[^a\\d]"),
+            // Quantified classes — a stray literal ']' left by a bad expansion would be repeated
+            // under the quantifier.
+            Arguments.of("[\\d]{1,4}"),
+            Arguments.of("[A-Fa-f\\d]{1,4}"),
+            Arguments.of("[\\d\\w]{2,5}"),
+            // Shorthand inside class adjacent to a literal sequence outside the class.
+            Arguments.of("prefix[\\d]suffix"),
+            Arguments.of("([\\d]){2,3}"),
+        )
+
+        /**
+         * Pairs of (brokenPattern, workaround) for constructs that Generex accepts but generates
+         * incorrectly.
+         */
+        @JvmStatic
+        fun limitationSilentMismatchArgs() = Stream.of(
+            // (brokenPattern, workaround) — workaround must be a brics-acceptable regex whose
+            // generated output also satisfies the brokenPattern under Java regex.
+            //
+            // Lookahead — brics doesn't recognize (?=, so `?=` leaks into output as literals.
+            Arguments.of("(?=ab)ab", "ab"),
+            // Backreference — brics treats \1 as literal `1`, not a group reference.
+            Arguments.of("(a)\\1", "aa"),
+            // Inline flag — brics doesn't strip (?i), so `?i` leaks into output.
+            Arguments.of("(?i)abc", "[Aa][Bb][Cc]"),
+            // Word boundary — brics treats \b as escaped `b`.
+            Arguments.of("\\babc", "abc"),
+            // Character-class intersection — brics doesn't honor && inside a class.
+            Arguments.of("[a-z&&[^aeiou]]", "[b-df-hj-np-tv-z]"),
+            // Octal escape — brics treats \012 as literal `012`. Workaround: embed the actual char.
+            Arguments.of("\\012", "\u000A"),
+            // Hex escape — brics treats \x1F as literal `x1F`. Workaround: embed the actual char.
+            Arguments.of("\\x1F", "\u001F"),
+            // Control escape — brics treats \cX as literal `cX`. Workaround: embed the actual char.
+            Arguments.of("\\cX", "\u0018"),
+            // Brics operators that Java treats as literals — escape them.
+            Arguments.of("abc&def", "abc\\&def"),
+            Arguments.of("abc#def", "abc\\#def"),
+            Arguments.of("<10-99>", "\\<10-99\\>"),
+        )
+
+        /**
+         * Pairs of (escapedPattern, expectedLiteral): a backslash-escaped brics operator and the
+         * single character it should generate.
+         */
+        @JvmStatic
+        fun bricsSpecialEscapeArgs() = Stream.of(
+            // The brics-special operators from LIMITATIONS.md must round-trip when escaped.
+            Arguments.of("\\&", "&"),
+            Arguments.of("\\~", "~"),
+            Arguments.of("\\#", "#"),
+            Arguments.of("\\@", "@"),
+        )
     }
 }