diff --git a/src/jinja2/lexer.py b/src/jinja2/lexer.py
index e35cd471e..061452c12 100644
--- a/src/jinja2/lexer.py
+++ b/src/jinja2/lexer.py
@@ -26,9 +26,45 @@
 # static regular expressions
 whitespace_re = re.compile(r"\s+")
 newline_re = re.compile(r"(\r\n|\r|\n)")
-string_re = re.compile(
-    r"('([^'\\]*(?:\\.[^'\\]*)*)'" r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S
-)
+
+
+def _match_string(source: str, pos: int) -> str | None:
+    """Return the quoted string literal starting at *pos*, or ``None``.
+
+    Replaces the ``string_re`` regex to avoid catastrophic backtracking
+    on unclosed strings with many escape sequences. The result includes
+    both quotes. ``None`` means *pos* is past the end or not on a quote,
+    or no unescaped closing quote of the same kind follows.
+    """
+    if pos >= len(source):
+        return None
+
+    quote = source[pos]
+
+    if quote not in "'\"":
+        return None
+
+    i = source.find(quote, pos + 1)
+
+    while i != -1:
+        # Count the run of backslashes directly before this candidate quote.
+        bs = 0
+        j = i - 1
+
+        while j >= pos + 1 and source[j] == "\\":  # stop at the opening quote
+            bs += 1
+            j -= 1
+
+        if bs % 2 == 0:
+            # Even run: the backslashes pair up, so this quote closes the string.
+            return source[pos : i + 1]
+
+        # Odd run: the last backslash escapes this quote, so keep searching.
+        i = source.find(quote, i + 1)
+
+    return None
+
+
 integer_re = re.compile(
     r"""
     (
@@ -463,11 +499,50 @@ def __new__(cls, *members, **kwargs): # type: ignore
 
 
 class _Rule(t.NamedTuple):
-    pattern: t.Pattern[str]
+    pattern: t.Pattern[str] | "_StringPattern"
     tokens: str | tuple[str, ...] | tuple[Failure]
     command: str | None
 
 
+class _StringMatch:
+    """Minimal :class:`re.Match` stand-in returned by :class:`_StringPattern`."""
+
+    __slots__ = ("_text", "_pos")
+
+    def __init__(self, text: str, pos: int) -> None:
+        self._text = text
+        self._pos = pos
+
+    def group(self, n: int = 0) -> str:
+        if n == 0:
+            return self._text
+        raise IndexError("no such group")  # only the whole match (group 0) exists
+
+    def end(self) -> int:
+        return self._pos + len(self._text)  # index just past the closing quote
+
+    def groups(self) -> tuple[str, ...]:
+        return ()  # the literal is matched as a whole; no capture groups
+
+    def groupdict(self) -> dict[str, str]:
+        return {}  # no named groups either
+
+
+class _StringPattern:
+    """Regex-like object exposing only ``match(source, pos)``; it scans
+    string literals with :func:`_match_string` instead of a regex, to
+    avoid catastrophic backtracking on unclosed, escape-heavy strings.
+    """
+
+    __slots__ = ()
+
+    def match(self, source: str, pos: int) -> _StringMatch | None:
+        result = _match_string(source, pos)
+        if result is None:
+            return None
+        return _StringMatch(result, pos)
+
+
 class Lexer:
     """Class that implements a lexer for a given environment. Automatically
     created by the environment class, usually you don't have to do that.
@@ -489,7 +564,7 @@ def c(x: str) -> t.Pattern[str]:
             _Rule(float_re, TOKEN_FLOAT, None),
             _Rule(integer_re, TOKEN_INTEGER, None),
             _Rule(name_re, TOKEN_NAME, None),
-            _Rule(string_re, TOKEN_STRING, None),
+            _Rule(_StringPattern(), TOKEN_STRING, None),
             _Rule(operator_re, TOKEN_OPERATOR, None),
         ]
 