From 7dcc6117c8fbd1cb3e8a22f639a8e06ded79847a Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Sun, 24 May 2026 00:15:50 +0200 Subject: [PATCH] Parser: fix exponential parse time on speculative prefix parsing --- sqlparser_bench/benches/sqlparser_bench.rs | 51 +++++++++++++++++++++- src/parser/mod.rs | 49 ++++++++++++++++++++- tests/sqlparser_common.rs | 46 +++++++++++++++++++ 3 files changed, 143 insertions(+), 3 deletions(-) diff --git a/sqlparser_bench/benches/sqlparser_bench.rs b/sqlparser_bench/benches/sqlparser_bench.rs index 46c201540..eb9316b47 100644 --- a/sqlparser_bench/benches/sqlparser_bench.rs +++ b/sqlparser_bench/benches/sqlparser_bench.rs @@ -16,7 +16,7 @@ // under the License. use criterion::{criterion_group, criterion_main, Criterion}; -use sqlparser::dialect::GenericDialect; +use sqlparser::dialect::{GenericDialect, PostgreSqlDialect, SQLiteDialect}; use sqlparser::keywords::Keyword; use sqlparser::parser::Parser; use sqlparser::tokenizer::{Span, Word}; @@ -177,11 +177,58 @@ fn parse_compound_chain(c: &mut Criterion) { group.finish(); } +/// Benchmark parsing pathological `IF(current_time(current_time(...x` chains +/// that previously caused 2^N work in `parse_prefix`. Each nested +/// `current_time(` segment used to be explored twice at every level (once via +/// the speculative reserved-word arm, once via the unreserved-word fallback), +/// doubling work per level. Post-fix the cost is linear in chain length. 
+fn parse_prefix_keyword_call_chain(c: &mut Criterion) { + let mut group = c.benchmark_group("parse_prefix_keyword_call_chain"); + let dialect = PostgreSqlDialect {}; + + for &n in &[10usize, 20, 30] { + let sql = String::from("if(") + &"current_time(".repeat(n) + "x"; + + group.bench_function(format!("chain_{n}"), |b| { + b.iter(|| { + let _ = Parser::parse_sql(&dialect, std::hint::black_box(&sql)); + }); + }); + } + + group.finish(); +} + +/// Benchmark parsing pathological `case-case-case-...c` chains that +/// previously caused 2^N work in `parse_prefix`. Each `case` token used to +/// trigger a speculative `parse_case_expr` that recursively descends the +/// chain, but the unreserved-word fallback returns `Identifier(case)` so the +/// overall `parse_prefix` succeeds and the failure cache never fires. +/// Post-fix the per-arm cache short-circuits the speculative descent. +fn parse_prefix_case_chain(c: &mut Criterion) { + let mut group = c.benchmark_group("parse_prefix_case_chain"); + let dialect = SQLiteDialect {}; + + for &n in &[10usize, 20, 30] { + let sql = "case\t-".repeat(n) + "c"; + + group.bench_function(format!("chain_{n}"), |b| { + b.iter(|| { + let _ = Parser::parse_sql(&dialect, std::hint::black_box(&sql)); + }); + }); + } + + group.finish(); +} + criterion_group!( benches, basic_queries, word_to_ident, parse_many_identifiers, - parse_compound_chain + parse_compound_chain, + parse_prefix_keyword_call_chain, + parse_prefix_case_chain ); criterion_main!(benches); diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 91ac386ae..91240ed03 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -15,6 +15,7 @@ #[cfg(not(feature = "std"))] use alloc::{ boxed::Box, + collections::BTreeMap, format, string::{String, ToString}, vec, @@ -24,6 +25,9 @@ use core::{ fmt::{self, Display}, str::FromStr, }; +#[cfg(feature = "std")] +use std::collections::BTreeMap; + use helpers::attached_token::AttachedToken; use log::debug; @@ -359,6 +363,12 @@ pub struct 
Parser<'a> { options: ParserOptions, /// Ensures the stack does not overflow by limiting recursion depth. recursion_counter: RecursionCounter, + /// Cached errors from `parse_prefix` calls that returned `Err`. See + /// [`Parser::parse_prefix`] for the 2^N patterns this guards. + failed_prefix_positions: BTreeMap, + /// Cached errors from the speculative reserved-word prefix arm. See + /// [`Parser::parse_prefix`] for the 2^N patterns this guards. + failed_reserved_word_prefix_positions: BTreeMap, } impl<'a> Parser<'a> { @@ -385,6 +395,8 @@ impl<'a> Parser<'a> { dialect, recursion_counter: RecursionCounter::new(DEFAULT_REMAINING_DEPTH), options: ParserOptions::new().with_trailing_commas(dialect.supports_trailing_commas()), + failed_prefix_positions: BTreeMap::new(), + failed_reserved_word_prefix_positions: BTreeMap::new(), } } @@ -446,6 +458,8 @@ impl<'a> Parser<'a> { pub fn with_tokens_with_locations(mut self, tokens: Vec) -> Self { self.tokens = tokens; self.index = 0; + self.failed_prefix_positions.clear(); + self.failed_reserved_word_prefix_positions.clear(); self } @@ -1717,6 +1731,23 @@ impl<'a> Parser<'a> { return prefix; } + // Memoize parse_prefix failures to break 2^N speculation when both + // prefix arms fail at every level (e.g. `IF(current_time(...x`). + // The per-arm cache in `parse_prefix_inner` complements this for + // chains where the reserved arm fails but the unreserved fallback + // succeeds (e.g. `case-case-...c`). + let start_index = self.index; + if let Some(cached) = self.failed_prefix_positions.get(&start_index) { + return Err(cached.clone()); + } + let result = self.parse_prefix_inner(); + if let Err(ref e) = result { + self.failed_prefix_positions.insert(start_index, e.clone()); + } + result + } + + fn parse_prefix_inner(&mut self) -> Result { // PostgreSQL allows any string literal to be preceded by a type name, indicating that the // string literal represents a literal of that type. 
Some examples: // @@ -1801,7 +1832,21 @@ impl<'a> Parser<'a> { // We first try to parse the word and following tokens as a special expression, and if that fails, // we rollback and try to parse it as an identifier. let w = w.clone(); - match self.try_parse(|parser| parser.parse_expr_prefix_by_reserved_word(&w, span)) { + // Memoize failed speculative reserved-word parses. When + // the reserved arm (CASE, CURRENT_TIME, etc.) does + // exponential work but the unreserved fallback ultimately + // succeeds, the overall `parse_prefix` returns `Ok` and the + // outer cache never fires. Chains like `case-case-...c` + // need this per-arm cache to break the doubling. + let try_parse_result = if let Some(cached) = self + .failed_reserved_word_prefix_positions + .get(&next_token_index) + { + Err(cached.clone()) + } else { + self.try_parse(|parser| parser.parse_expr_prefix_by_reserved_word(&w, span)) + }; + match try_parse_result { // This word indicated an expression prefix and parsing was successful Ok(Some(expr)) => Ok(expr), @@ -1815,6 +1860,8 @@ impl<'a> Parser<'a> { // we rollback and return the parsing error we got from trying to parse a // special expression (to maintain backwards compatibility of parsing errors). Err(e) => { + self.failed_reserved_word_prefix_positions + .insert(next_token_index, e.clone()); if !self.dialect.is_reserved_for_identifier(w.keyword) { if let Ok(Some(expr)) = self.maybe_parse(|parser| { parser.parse_expr_prefix_by_unreserved_word(&w, span) diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs index f470b93ca..fc95b9770 100644 --- a/tests/sqlparser_common.rs +++ b/tests/sqlparser_common.rs @@ -19004,3 +19004,49 @@ fn parse_compound_chain_no_exponential_blowup() { rx.recv_timeout(Duration::from_secs(5)) .expect("parser should reject this quickly, not loop exponentially"); } + +/// Regression test for the 2^N parse-time blowup in `parse_prefix` on inputs +/// like `IF(current_time(current_time(...x`. 
Each nested `current_time(` used +/// to be explored twice at every level (once via the speculative reserved-word +/// arm, once via the unreserved-word fallback), doubling work per level. +/// Post-fix the failing parse short-circuits via the position-keyed cache. +#[test] +fn parse_prefix_keyword_call_chain_no_exponential_blowup() { + use std::sync::mpsc; + use std::thread; + use std::time::Duration; + + let sql = String::from("if(") + &"current_time(".repeat(30) + "x"; + + let (tx, rx) = mpsc::channel(); + thread::spawn(move || { + let _ = Parser::parse_sql(&PostgreSqlDialect {}, &sql); + let _ = tx.send(()); + }); + + rx.recv_timeout(Duration::from_secs(5)) + .expect("parser should reject this quickly, not loop exponentially"); +} + +/// Regression test for the 2^N parse-time blowup in `parse_prefix` on inputs +/// like `case-case-case-...c`. Each `case` token triggers a speculative +/// `parse_case_expr` that fails, but the unreserved-word fallback returns +/// `Identifier(case)`, so the outer failure cache never fires. Post-fix the +/// per-arm cache short-circuits the speculative descent. +#[test] +fn parse_prefix_case_chain_no_exponential_blowup() { + use std::sync::mpsc; + use std::thread; + use std::time::Duration; + + let sql = "case\t-".repeat(30) + "c"; + + let (tx, rx) = mpsc::channel(); + thread::spawn(move || { + let _ = Parser::parse_sql(&SQLiteDialect {}, &sql); + let _ = tx.send(()); + }); + + rx.recv_timeout(Duration::from_secs(5)) + .expect("parser should reject this quickly, not loop exponentially"); +}