diff --git a/sqlparser_bench/benches/sqlparser_bench.rs b/sqlparser_bench/benches/sqlparser_bench.rs index 46c201540..835be0493 100644 --- a/sqlparser_bench/benches/sqlparser_bench.rs +++ b/sqlparser_bench/benches/sqlparser_bench.rs @@ -177,11 +177,36 @@ fn parse_compound_chain(c: &mut Criterion) { group.finish(); } +/// Benchmark parsing pathological chains of `-NOT-.` segments +/// that previously triggered 2^N speculative `parse_not` work. The input +/// `SELECT x-not-b.x-not-b...` rejects at the trailing `.`. Each `-NOT-` +/// segment used to recurse into `parse_not` which re-walks the rest of the +/// chain, doubling the work at every level. Post-fix the cost is linear in +/// the number of segments. +fn parse_not_chain(c: &mut Criterion) { + let mut group = c.benchmark_group("parse_not_chain"); + let dialect = GenericDialect {}; + + for &n in &[10usize, 20, 30] { + let body: String = std::iter::repeat_n("x-not-b.", n).collect(); + let sql = format!("SELECT {body}"); + + group.bench_function(format!("chain_{n}"), |b| { + b.iter(|| { + let _ = Parser::parse_sql(&dialect, std::hint::black_box(&sql)); + }); + }); + } + + group.finish(); +} + criterion_group!( benches, basic_queries, word_to_ident, parse_many_identifiers, - parse_compound_chain + parse_compound_chain, + parse_not_chain ); criterion_main!(benches); diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 91ac386ae..e648e1fd2 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -12,6 +12,8 @@ //! SQL Parser +#[cfg(not(feature = "std"))] +use alloc::collections::BTreeSet; #[cfg(not(feature = "std"))] use alloc::{ boxed::Box, @@ -25,6 +27,8 @@ use core::{ str::FromStr, }; use helpers::attached_token::AttachedToken; +#[cfg(feature = "std")] +use std::collections::BTreeSet; use log::debug; @@ -359,6 +363,14 @@ pub struct Parser<'a> { options: ParserOptions, /// Ensures the stack does not overflow by limiting recursion depth. recursion_counter: RecursionCounter, + /// Token indices where a speculative attempt to parse `NOT` as a unary + /// prefix operator has already failed during this parse. Re-entering + /// `parse_not` at the same position would repeat the same work and fail + /// again, so the second visit short-circuits to identifier interpretation. + /// Without this cache, inputs like `SELECT x-not-b.x-not-b...` (chains of + /// `-NOT-.` ending in a parse error) trigger 2^N exploration + /// because each `-NOT-` segment otherwise re-walks the rest of the chain. + failed_unary_not_positions: BTreeSet, } impl<'a> Parser<'a> { @@ -385,6 +397,7 @@ impl<'a> Parser<'a> { dialect, recursion_counter: RecursionCounter::new(DEFAULT_REMAINING_DEPTH), options: ParserOptions::new().with_trailing_commas(dialect.supports_trailing_commas()), + failed_unary_not_positions: BTreeSet::new(), } } @@ -446,6 +459,7 @@ impl<'a> Parser<'a> { pub fn with_tokens_with_locations(mut self, tokens: Vec) -> Self { self.tokens = tokens; self.index = 0; + self.failed_unary_not_positions.clear(); self } @@ -1801,6 +1815,17 @@ impl<'a> Parser<'a> { // We first try to parse the word and following tokens as a special expression, and if that fails, // we rollback and try to parse it as an identifier. let w = w.clone(); + // If we already tried to parse `NOT` as a unary prefix operator + // at this exact position and it failed, skip the speculative + // path and fall back to identifier interpretation directly. The + // speculative `parse_not` re-walks the rest of the expression, + // so repeating it on every visit causes 2^N work on chains + // like `x-not-b.x-not-b...` that end in a parse error. + if w.keyword == Keyword::NOT + && self.failed_unary_not_positions.contains(&self.index) + { + return self.parse_expr_prefix_by_unreserved_word(&w, span); + } match self.try_parse(|parser| parser.parse_expr_prefix_by_reserved_word(&w, span)) { // This word indicated an expression prefix and parsing was successful Ok(Some(expr)) => Ok(expr), @@ -1815,6 +1840,9 @@ impl<'a> Parser<'a> { // we rollback and return the parsing error we got from trying to parse a // special expression (to maintain backwards compatibility of parsing errors). Err(e) => { + if w.keyword == Keyword::NOT { + self.failed_unary_not_positions.insert(self.index); + } if !self.dialect.is_reserved_for_identifier(w.keyword) { if let Ok(Some(expr)) = self.maybe_parse(|parser| { parser.parse_expr_prefix_by_unreserved_word(&w, span) diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs index f470b93ca..25b8bdccc 100644 --- a/tests/sqlparser_common.rs +++ b/tests/sqlparser_common.rs @@ -19004,3 +19004,29 @@ fn parse_compound_chain_no_exponential_blowup() { rx.recv_timeout(Duration::from_secs(5)) .expect("parser should reject this quickly, not loop exponentially"); } + +/// Regression test for the 2^N parse-time blowup on chains of +/// `-NOT-.` segments ending in a parse error. Each `-NOT-` +/// segment used to trigger a speculative `parse_not` that re-walks the rest of +/// the chain, so re-entering it at every `NOT` doubled the work. The fix +/// caches positions where the speculative `parse_not` already failed and +/// skips repeating it. Post-fix the parser rejects this in well under a +/// millisecond, so the timeout is a hang guard, not a perf threshold. +#[test] +fn parse_not_chain_no_exponential_blowup() { + use std::sync::mpsc; + use std::thread; + use std::time::Duration; + + let body: String = std::iter::repeat_n("x-not-b.", 30).collect(); + let sql = format!("SELECT {body}"); + + let (tx, rx) = mpsc::channel(); + thread::spawn(move || { + let _ = Parser::parse_sql(&GenericDialect {}, &sql); + let _ = tx.send(()); + }); + + rx.recv_timeout(Duration::from_secs(5)) + .expect("parser should reject this quickly, not loop exponentially"); +}