diff --git a/src/main/java/com/pkware/generex/Generex.java b/src/main/java/com/pkware/generex/Generex.java
index 81925c3..ef1939f 100644
--- a/src/main/java/com/pkware/generex/Generex.java
+++ b/src/main/java/com/pkware/generex/Generex.java
@@ -22,6 +22,7 @@
 import dk.brics.automaton.RegExp;
 import dk.brics.automaton.State;
 import dk.brics.automaton.Transition;
+import org.jetbrains.annotations.NotNull;
 
 import java.util.ArrayList;
 import java.util.Collection;
@@ -118,13 +119,110 @@ public Generex(Automaton automaton, Random random) {
      * @see #isValidPattern(String)
      */
     private static RegExp createRegExp(String regex) {
-        String finalRegex = regex;
+        String finalRegex = convertToBricsRegex(regex);
         for (Entry<String, String> charClass : PREDEFINED_CHARACTER_CLASSES.entrySet()) {
             finalRegex = finalRegex.replaceAll(charClass.getKey(), charClass.getValue());
         }
         return new RegExp(finalRegex);
     }
 
+    /**
+     * Converts a regex pattern to brics-compatible syntax for use with Generex.
+     *
+     * <p>Performs the following transformations:
+     * <ul>
+     *   <li>Strips a leading {@code ^} anchor when it is the first character of the
+     *       pattern.</li>
+     *   <li>Strips a trailing {@code $} anchor unless it is escaped, i.e. preceded by an odd
+     *       number of backslashes.</li>
+     *   <li>Rewrites non-capturing groups {@code (?:...)} as plain groups {@code (...)} when the
+     *       opening parenthesis is neither escaped nor inside a character class.</li>
+     *   <li>Copies everything else through unchanged, including escape sequences and the
+     *       contents of character classes.</li>
+     * </ul>
+     *
+     * <p>The conversion is performed in a single pass that tracks escape sequences and character
+     * class boundaries to avoid incorrect replacements.
+     *
+     * @param regex The Java regex pattern to convert.
+     * @return the brics-compatible regex.
+     */
+    @NotNull
+    private static String convertToBricsRegex(@NotNull String regex) {
+        if (regex.isEmpty()) return regex;
+
+        StringBuilder result = new StringBuilder(regex.length());
+        boolean escaped = false;
+        boolean inCharClass = false;
+        int start = 0;
+
+        // Strip leading ^ anchor (not escaped since it's the first character)
+        if (regex.charAt(0) == '^') {
+            start = 1;
+        }
+
+        for (int i = start; i < regex.length(); i++) {
+            char c = regex.charAt(i);
+
+            if (escaped) {
+                result.append(c);
+                escaped = false;
+                continue;
+            }
+
+            if (c == '\\') {
+                result.append(c);
+                escaped = true;
+                continue;
+            }
+
+            if (inCharClass) {
+                if (c == ']') inCharClass = false;
+                result.append(c);
+                continue;
+            }
+
+            if (c == '[') {
+                inCharClass = true;
+                result.append(c);
+                int next = i + 1;
+                // Per regex standard, ] right after [ or [^ is a literal ] inside the class, not the closing bracket.
+                if (next < regex.length() && regex.charAt(next) == '^') {
+                    result.append('^');
+                    i = next++; // Consume the caret here so the main loop does not append it a second time.
+                }
+                if (next < regex.length() && regex.charAt(next) == ']') {
+                    result.append(']');
+                    i = next;
+                }
+                continue;
+            }
+
+            // Convert (?:...) to (...) — only outside character classes and not escaped
+            if (c == '(' && i + 2 < regex.length() && regex.charAt(i + 1) == '?' && regex.charAt(i + 2) == ':') {
+                result.append('(');
+                i += 2;
+                continue;
+            }
+
+            result.append(c);
+        }
+
+        // Strip trailing $ anchor if the last character is an unescaped $
+        if (result.length() > 0 && result.charAt(result.length() - 1) == '$') {
+            // Count preceding backslashes — odd means $ is escaped, even means $ is an anchor
+            int backslashes = 0;
+            for (int i = result.length() - 2; i >= 0 && result.charAt(i) == '\\'; i--) {
+                backslashes++;
+            }
+            if (backslashes % 2 == 0) {
+                result.deleteCharAt(result.length() - 1);
+            }
+        }
+
+        return result.toString();
+    }
+
     /**
      * initialize the random instance used with a seed value to generate a
      * pseudo random suite of strings based on the passed seed and matches the used regular expression
@@ -365,11 +463,29 @@ public String random(int minLength, int maxLength) {
             targetLength = actualMinLength + random.nextInt(actualMaxLength - actualMinLength + 1);
         }
 
-        String result = prepareRandom("", automaton.getInitialState(), minLength, maxLength, targetLength);
+        String result = prepareRandom("", automaton.getInitialState(), minLength, maxLength, targetLength, isInfinite() ? new AttemptBudget() : null);
         // Substring in case a length of 'maxLength + 1' is returned, which is possible if a smaller string can't be produced.
         return result.substring(0, Math.min(maxLength, result.length()));
     }
 
+    /**
+     * Mutable counter shared by reference across recursive calls to {@link #prepareRandom},
+     * used to cap the total number of iterations and prevent exponential backtracking
+     * for infinite regexes.
+     */
+    private static class AttemptBudget {
+        private static final int MAX_ATTEMPTS = 1000;
+        int count = 0;
+
+        boolean isExhausted() {
+            return count >= MAX_ATTEMPTS;
+        }
+
+        void increment() {
+            count++;
+        }
+    }
+
     /**
      * Recursive function used to generate a regex as defined by {@link Generex#random(int, int)}.
      *
@@ -377,13 +493,19 @@ public String random(int minLength, int maxLength) {
      * @param state Current state of the regex.
* @param minLength Minimum wanted length of the produced string.
      * @param maxLength Maximum wanted length of produced string.
+     * @param targetLength The desired length of the produced string, pre-selected uniformly from the valid range.
+     * @param budget Shared attempt counter to limit recursion for infinite regexes, or {@code null} for finite regexes (no limit is applied).
      * @return A string built from the accumulation of previous transitions.
      */
-    private String prepareRandom(String currentMatch, State state, int minLength, int maxLength, int targetLength) {
+    private String prepareRandom(String currentMatch, State state, int minLength, int maxLength, int targetLength, AttemptBudget budget) {
         // Return a string of length 'maxLength + 1' to indicate a dead branch.
         if (currentMatch.length() > maxLength || state.getTransitions().isEmpty())
             return currentMatch;
 
+        // For infinite regexes, the automaton has cycles that can cause exponential recursion.
+        // This budget limit caps total recursive iterations to prevent hanging; the partial match is returned as-is, before the accept-state check below.
+        if (budget != null && budget.isExhausted()) return currentMatch;
+
         String returnValue = null;
 
         if (state.isAccept()) {
@@ -400,6 +522,10 @@ private String prepareRandom(String currentMatch, State state, int minLength, in
 
         // Will never start as empty due to the initial if statement in the function.
         while (!possibleTransitions.isEmpty()) {
+            if (budget != null) {
+                budget.increment();
+                if (budget.isExhausted()) break;
+            }
             Transition randomTransition = pickRandomWeightedTransition(possibleTransitions, totalWeightedTransitions);
             int subTransitions = getWeightedTransitions(randomTransition);
 
@@ -407,7 +533,7 @@ private String prepareRandom(String currentMatch, State state, int minLength, in
             possibleTransitions.remove(randomTransition);
 
             char randomChar = (char) (random.nextInt(subTransitions) + randomTransition.getMin());
-            String result = prepareRandom(currentMatch + randomChar, randomTransition.getDest(), minLength, maxLength, targetLength);
+            String result = prepareRandom(currentMatch + randomChar, randomTransition.getDest(), minLength, maxLength, targetLength, budget);
             // Greedily return the first valid result found that is of the wanted length..
             if (result.length() == targetLength) return result;
 
@@ -415,7 +541,9 @@ private String prepareRandom(String currentMatch, State state, int minLength, in
             returnValue = getBestMatch(result, returnValue, minLength, maxLength, targetLength);
         }
 
-        return returnValue;
+        // For infinite regexes, if budget was exhausted before reaching an accept state, return currentMatch
+        // as a fallback instead of null.
+        return returnValue != null ? returnValue : currentMatch;
     }
 
     /**
diff --git a/src/test/kotlin/com/pkware/generex/KotlinTests.kt b/src/test/kotlin/com/pkware/generex/KotlinTests.kt
index 0d6abb7..0dfaf8e 100644
--- a/src/test/kotlin/com/pkware/generex/KotlinTests.kt
+++ b/src/test/kotlin/com/pkware/generex/KotlinTests.kt
@@ -188,6 +188,83 @@ class KotlinTests {
         assertThat(generated.length).isEqualTo(targetLength)
     }
 
+    @ParameterizedTest
+    @MethodSource("infiniteRegexArgs")
+    fun `infinite regex does not hang`(regex: String) {
+        val generex = Generex(regex)
+        repeat(10) {
+            val result = generex.random()
+            assertThat(result).matches(regex)
+        }
+    }
+
+    @Test
+    fun `anchors are stripped from regex`() {
+        val regex = "^[A-Za-z]+$"
+        val generex = Generex(regex)
+        val result = generex.random()
+        assertThat(result).matches(regex)
+        assertThat(result).doesNotContain("^")
+        assertThat(result).doesNotContain("$")
+    }
+
+    @Test
+    fun `non-capturing groups are converted to plain groups`() {
+        val regex = "(?:abc)+"
+        val generex = Generex(regex)
+        val result = generex.random()
+        assertThat(result).matches(regex)
+    }
+
+    @Test
+    fun `escaped dollar sign at end is not stripped`() {
+        val regex = "abc\\$"
+        val generex = Generex(regex)
+        val result = generex.random()
+        assertThat(result).isEqualTo("abc$")
+    }
+
+    @Test
+    fun `escaped caret at start is not stripped`() {
+        val regex = "\\^abc"
+        val generex = Generex(regex)
+        val result = generex.random()
+        assertThat(result).isEqualTo("^abc")
+    }
+
+    @Test
+    fun `non-capturing group conversion skipped when escaped`() {
+        val generex = Generex("\\(?:")
+        assertThat(generex.random()).contains(":")
+    }
+
+    @Test
+    fun `non-capturing group conversion skipped inside character class`() {
+        val generex = Generex("[(?:]")
+        val result = generex.random()
+        assertThat(result).matches("[(?:]")
+    }
+
+    @Test
+    fun `non-capturing group conversion skipped when closing bracket is first char in character class`() {
+        val generex = Generex("[](?:]")
+        val result = generex.random()
+        assertThat(result).matches("[](?:]")
+    }
+
+    @Test
+    fun `non-capturing group conversion skipped when closing bracket is first char in negated character class`() {
+        val generex = Generex("[^](?:]")
+        val result = generex.random()
+        assertThat(result).matches("[^](?:]")
+    }
+
+    @Test
+    fun `escaped backslash before dollar sign is not stripped`() {
+        val generex = Generex("hello\\\\$")
+        assertThat(generex.random()).isEqualTo("hello\\")
+    }
+
     companion object {
 
         @JvmStatic
@@ -234,6 +311,15 @@ class KotlinTests {
             Arguments.of("\\d{1,10}"),
         )
 
+        @JvmStatic
+        fun infiniteRegexArgs() = Stream.of(
+            Arguments.of("^[A-Za-z]+(?:[ '-][A-Za-z]+)*$"),
+            Arguments.of("[A-Za-z]+([ '-][A-Za-z]+)*"),
+            Arguments.of("(\\d{1,3}\\.){1,}\\d{1,3}"),
+            Arguments.of("[A-Z][a-z]*( [A-Z][a-z]*)*"),
+            Arguments.of("(a|b)+(c|d)*"),
+        )
+
         @JvmStatic
         fun regexExceedsColumnValue() = Stream.of(
             Arguments.of("(hi){3,5}", 7),