From b03ecbe6c769acd262b3bccc0cc7303a8a5bd927 Mon Sep 17 00:00:00 2001 From: Krishnan Govindraj Date: Fri, 27 Mar 2026 00:04:13 +0100 Subject: [PATCH 01/10] Implement unicode escaping --- rust/common/error/mod.rs | 2 ++ rust/value.rs | 60 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/rust/common/error/mod.rs b/rust/common/error/mod.rs index 6f4eac01..52877f68 100644 --- a/rust/common/error/mod.rs +++ b/rust/common/error/mod.rs @@ -84,6 +84,8 @@ error_messages! { TypeQLError 6: "Encountered invalid escape sequence {escape:?} while parsing {full_string:?}.", ReservedKeywordAsIdentifier { identifier: Identifier } = 7: "A reserved keyword '{identifier}' was used as identifier.", + InvalidUnicodeEscapeInString { escape: String, full_string: String } = + 8: "Encountered an invalid unicode escape sequence {escape:?} while parsing {full_string:?}.", /* MissingPatterns = 5: "The query has not been provided with any patterns.", diff --git a/rust/value.rs b/rust/value.rs index 564d749d..461c54ce 100644 --- a/rust/value.rs +++ b/rust/value.rs @@ -342,6 +342,16 @@ impl fmt::Display for StructLiteral { } impl StringLiteral { + fn unescape_unicode<'a>(bytes: &'a [u8]) -> std::result::Result { + let as_hex = std::str::from_utf8(bytes).expect("Should still be utf8"); + if bytes.len() == 4 { + let as_u32 = u32::from_str_radix(as_hex, 16).map_err(|_| as_hex)?; + char::from_u32(as_u32).ok_or(as_hex) + } else { + Err(as_hex) + } + } + pub fn unescape(&self) -> Result { self.process_unescape(|bytes, _buf, rest| match bytes[1] { BSP => Ok(('\x08', 2)), @@ -350,7 +360,13 @@ impl StringLiteral { FF_ => Ok(('\x0c', 2)), CR_ => Ok(('\x0d', 2)), c @ (b'"' | b'\'' | b'\\') => Ok((c as char, 2)), - b'u' => todo!("Unicode escape handling"), + b'u' => Self::unescape_unicode(&bytes[2..std::cmp::min(6, bytes.len())]).map(|c| (c, 6)).map_err(|hex| { + TypeQLError::InvalidUnicodeEscapeInString { + full_string: rest.to_owned(), + escape: 
format!(r"\u{}", hex), + } + .into() + }), _ => Err(TypeQLError::InvalidStringEscape { full_string: rest.to_owned(), escape: format!(r"\{}", rest.chars().nth(1).unwrap()), @@ -407,3 +423,45 @@ const TAB: u8 = b't'; const LF_: u8 = b'n'; const FF_: u8 = b'f'; const CR_: u8 = b'r'; + +#[cfg(test)] +pub mod tests { + use crate::value::TypeQLError; + #[test] + fn test_unicode_unescape() { + { + // Works + let escaped = r#""... \u0ca0\u005f\u0ca0""#; + let crate::ValueLiteral::String(parsed) = crate::parse_value(escaped).unwrap() else { + panic!("Not parsed as string"); + }; + assert_eq!(parsed.unescape().unwrap().as_str(), "... ಠ_ಠ"); + } + + { + // Not enough bytes + let escaped = r#""... \u012""#; + let crate::ValueLiteral::String(parsed) = crate::parse_value(escaped).unwrap() else { + panic!("Not parsed as string"); + }; + let error = parsed.unescape().unwrap_err(); + let TypeQLError::InvalidUnicodeEscapeInString { escape, .. } = &error.errors()[0] else { + panic!("Wrong error type. Was {error:?}") + }; + assert_eq!(escape, r"\u012"); + } + + { + // Invalid hex + let escaped = r#""... \uwu/ ...""#; + let crate::ValueLiteral::String(parsed) = crate::parse_value(escaped).unwrap() else { + panic!("Not parsed as string"); + }; + let error = parsed.unescape().unwrap_err(); + let TypeQLError::InvalidUnicodeEscapeInString { escape, .. } = &error.errors()[0] else { + panic!("Wrong error type. Was {error:?}") + }; + assert_eq!(escape, r"\uwu/ "); + } + } +} From 6671c1fad981f43ddf354d64a880c2c743e6b15f Mon Sep 17 00:00:00 2001 From: Krishnan Govindraj Date: Fri, 27 Mar 2026 00:07:44 +0100 Subject: [PATCH 02/10] add test using capitals --- rust/value.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/rust/value.rs b/rust/value.rs index 461c54ce..dd93a87a 100644 --- a/rust/value.rs +++ b/rust/value.rs @@ -438,6 +438,15 @@ pub mod tests { assert_eq!(parsed.unescape().unwrap().as_str(), "... ಠ_ಠ"); } + { + // Capital hex works too + let escaped = r#""... 
\u0CA0\u005F\u0CA0""#; + let crate::ValueLiteral::String(parsed) = crate::parse_value(escaped).unwrap() else { + panic!("Not parsed as string"); + }; + assert_eq!(parsed.unescape().unwrap().as_str(), "... ಠ_ಠ"); + } + { // Not enough bytes let escaped = r#""... \u012""#; From 569070d8cae2c71555fa3670013e2ffa990b4870 Mon Sep 17 00:00:00 2001 From: Krishnan Govindraj Date: Fri, 27 Mar 2026 13:01:59 +0100 Subject: [PATCH 03/10] Problem with rest. I might yeet --- rust/value.rs | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/rust/value.rs b/rust/value.rs index dd93a87a..d99beefc 100644 --- a/rust/value.rs +++ b/rust/value.rs @@ -360,13 +360,17 @@ impl StringLiteral { FF_ => Ok(('\x0c', 2)), CR_ => Ok(('\x0d', 2)), c @ (b'"' | b'\'' | b'\\') => Ok((c as char, 2)), - b'u' => Self::unescape_unicode(&bytes[2..std::cmp::min(6, bytes.len())]).map(|c| (c, 6)).map_err(|hex| { - TypeQLError::InvalidUnicodeEscapeInString { - full_string: rest.to_owned(), - escape: format!(r"\u{}", hex), + b'u' => { + compile_error!("Our 'escape' fields are wrong because \"rest\" isn't acutally rest here."); + let escape = &bytes[2..std::cmp::min(6, bytes.len())]; + match decode_four_hex_bytes(escape) { + Some(char) => Ok((char, 6)), + None => Err(TypeQLError::InvalidUnicodeEscapeInString { + full_string: rest.to_owned(), + escape: format!(r"\u{}", &rest[2..6]), + }.into()) } - .into() - }), + }, _ => Err(TypeQLError::InvalidStringEscape { full_string: rest.to_owned(), escape: format!(r"\{}", rest.chars().nth(1).unwrap()), @@ -424,6 +428,21 @@ const LF_: u8 = b'n'; const FF_: u8 = b'f'; const CR_: u8 = b'r'; +#[allow(arithmetic_overflow)] +fn decode_four_hex_bytes(bytes: &[u8]) -> Option { + if bytes.len() == 4 { + let u32_le: u32 = 0u32 + | (bytes[0] as char).to_digit(16)? << 12 + | (bytes[1] as char).to_digit(16)? << 8 + | (bytes[2] as char).to_digit(16)? << 4 + | (bytes[3] as char).to_digit(16)? 
<< 0 ; + debug_assert!(char::from_u32(u32_le).is_some()); + char::from_u32(u32_le) + } else { + None + } +} + #[cfg(test)] pub mod tests { use crate::value::TypeQLError; @@ -447,6 +466,15 @@ pub mod tests { assert_eq!(parsed.unescape().unwrap().as_str(), "... ಠ_ಠ"); } + { + // Longer ones are just + let escaped = r#""... \u0CA01234""#; + let crate::ValueLiteral::String(parsed) = crate::parse_value(escaped).unwrap() else { + panic!("Not parsed as string"); + }; + assert_eq!(parsed.unescape().unwrap().as_str(), "... ಠ1234"); + } + { // Not enough bytes let escaped = r#""... \u012""#; From b48cbb44aa0b909127a641544dd3371928e8cf76 Mon Sep 17 00:00:00 2001 From: Krishnan Govindraj Date: Fri, 27 Mar 2026 14:28:32 +0100 Subject: [PATCH 04/10] Add lots of tests too --- rust/common/error/mod.rs | 2 - rust/value.rs | 173 +++++++++++++++++++++------------------ 2 files changed, 94 insertions(+), 81 deletions(-) diff --git a/rust/common/error/mod.rs b/rust/common/error/mod.rs index 52877f68..6f4eac01 100644 --- a/rust/common/error/mod.rs +++ b/rust/common/error/mod.rs @@ -84,8 +84,6 @@ error_messages! 
{ TypeQLError 6: "Encountered invalid escape sequence {escape:?} while parsing {full_string:?}.", ReservedKeywordAsIdentifier { identifier: Identifier } = 7: "A reserved keyword '{identifier}' was used as identifier.", - InvalidUnicodeEscapeInString { escape: String, full_string: String } = - 8: "Encountered an invalid unicode escape sequence {escape:?} while parsing {full_string:?}.", /* MissingPatterns = 5: "The query has not been provided with any patterns.", diff --git a/rust/value.rs b/rust/value.rs index d99beefc..90118f05 100644 --- a/rust/value.rs +++ b/rust/value.rs @@ -353,42 +353,39 @@ impl StringLiteral { } pub fn unescape(&self) -> Result { - self.process_unescape(|bytes, _buf, rest| match bytes[1] { - BSP => Ok(('\x08', 2)), - TAB => Ok(('\x09', 2)), - LF_ => Ok(('\x0a', 2)), - FF_ => Ok(('\x0c', 2)), - CR_ => Ok(('\x0d', 2)), - c @ (b'"' | b'\'' | b'\\') => Ok((c as char, 2)), - b'u' => { - compile_error!("Our 'escape' fields are wrong because \"rest\" isn't acutally rest here."); - let escape = &bytes[2..std::cmp::min(6, bytes.len())]; - match decode_four_hex_bytes(escape) { - Some(char) => Ok((char, 6)), - None => Err(TypeQLError::InvalidUnicodeEscapeInString { - full_string: rest.to_owned(), - escape: format!(r"\u{}", &rest[2..6]), - }.into()) + self.process_unescape(|bytes| { + if bytes.len() < 2 { + return Err(1); + } + match bytes[1] { + BSP => Ok(('\x08', 2)), + TAB => Ok(('\x09', 2)), + LF_ => Ok(('\x0a', 2)), + FF_ => Ok(('\x0c', 2)), + CR_ => Ok(('\x0d', 2)), + c @ (b'"' | b'\'' | b'\\') => Ok((c as char, 2)), + b'u' => { + let escape = &bytes[2..std::cmp::min(6, bytes.len())]; + match decode_four_hex_bytes(escape) { + Some(char) => Ok((char, 6)), + None => Err(6), + } } - }, - _ => Err(TypeQLError::InvalidStringEscape { - full_string: rest.to_owned(), - escape: format!(r"\{}", rest.chars().nth(1).unwrap()), + _ => Err(2), } - .into()), }) } pub fn unescape_regex(&self) -> Result { - self.process_unescape(|bytes, _, _| match bytes[1] { - c 
@ b'"' => Ok((c as char, 2)), + self.process_unescape(|bytes| match bytes.get(1) { + Some(b'"') => Ok(('"', 2)), _ => Ok(('\\', 1)), }) } fn process_unescape(&self, escape_handler: F) -> Result where - F: Fn(&[u8], &mut String, &str) -> Result<(char, usize)>, + F: Fn(&[u8]) -> std::result::Result<(char, usize), usize>, { let bytes = self.value.as_bytes(); assert_eq!(bytes[0], bytes[bytes.len() - 1]); @@ -400,17 +397,13 @@ impl StringLiteral { while !rest.is_empty() { let (char, escaped_len) = if rest.as_bytes()[0] == b'\\' { - let bytes = rest.as_bytes(); - - if bytes.len() < 2 { - return Err(TypeQLError::InvalidStringEscape { + escape_handler(rest.as_bytes()).map_err(|expected_escaped_len| { + let safe_len = std::cmp::min(rest.len(), expected_escaped_len); + Into::::into(TypeQLError::InvalidStringEscape { full_string: escaped_string.to_owned(), - escape: String::from(r"\"), - } - .into()); - } - - escape_handler(bytes, &mut buf, escaped_string)? + escape: rest[..safe_len].to_owned(), + }) + })? } else { let char = rest.chars().next().expect("string is non-empty"); (char, char.len_utf8()) @@ -433,9 +426,9 @@ fn decode_four_hex_bytes(bytes: &[u8]) -> Option { if bytes.len() == 4 { let u32_le: u32 = 0u32 | (bytes[0] as char).to_digit(16)? << 12 - | (bytes[1] as char).to_digit(16)? << 8 - | (bytes[2] as char).to_digit(16)? << 4 - | (bytes[3] as char).to_digit(16)? << 0 ; + | (bytes[1] as char).to_digit(16)? << 8 + | (bytes[2] as char).to_digit(16)? << 4 + | (bytes[3] as char).to_digit(16)? 
<< 0; debug_assert!(char::from_u32(u32_le).is_some()); char::from_u32(u32_le) } else { @@ -445,60 +438,82 @@ fn decode_four_hex_bytes(bytes: &[u8]) -> Option { #[cfg(test)] pub mod tests { - use crate::value::TypeQLError; + use crate::{ + value::{StringLiteral, TypeQLError}, + Result, + }; + + fn parse_to_string_literal(escaped: &str) -> StringLiteral { + let crate::ValueLiteral::String(parsed) = crate::parse_value(escaped).unwrap() else { + panic!("Not parsed as string"); + }; + parsed + } + #[test] - fn test_unicode_unescape() { + fn test_unescape_regex() { { - // Works - let escaped = r#""... \u0ca0\u005f\u0ca0""#; - let crate::ValueLiteral::String(parsed) = crate::parse_value(escaped).unwrap() else { - panic!("Not parsed as string"); - }; - assert_eq!(parsed.unescape().unwrap().as_str(), "... ಠ_ಠ"); + let escaped = r#""a\"b\"c""#; + let unescaped = parse_to_string_literal(escaped).unescape_regex().unwrap(); + assert_eq!(unescaped.as_str(), r#"a"b"c"#); } - { - // Capital hex works too - let escaped = r#""... \u0CA0\u005F\u0CA0""#; - let crate::ValueLiteral::String(parsed) = crate::parse_value(escaped).unwrap() else { - panic!("Not parsed as string"); - }; - assert_eq!(parsed.unescape().unwrap().as_str(), "... ಠ_ಠ"); + let escaped = r#""abc\123""#; + let unescaped = parse_to_string_literal(escaped).unescape_regex().unwrap(); + assert_eq!(unescaped.as_str(), r#"abc\123"#); } - + // Cases that fail at parsing { - // Longer ones are just - let escaped = r#""... \u0CA01234""#; - let crate::ValueLiteral::String(parsed) = crate::parse_value(escaped).unwrap() else { - panic!("Not parsed as string"); - }; - assert_eq!(parsed.unescape().unwrap().as_str(), "... 
ಠ1234"); + let escaped = r#""abc\""#; + assert!(crate::parse_value(escaped).is_err()); // Parsing fails as incomplete string literal + let string_literal = StringLiteral { value: escaped.to_owned() }; + let unescaped = string_literal.unescape_regex().unwrap(); + assert_eq!(unescaped.as_str(), r#"abc\"#); } + } - { - // Not enough bytes - let escaped = r#""... \u012""#; - let crate::ValueLiteral::String(parsed) = crate::parse_value(escaped).unwrap() else { - panic!("Not parsed as string"); - }; - let error = parsed.unescape().unwrap_err(); - let TypeQLError::InvalidUnicodeEscapeInString { escape, .. } = &error.errors()[0] else { - panic!("Wrong error type. Was {error:?}") - }; - assert_eq!(escape, r"\u012"); - } + fn assert_unescapes_to(escaped: &str, expected: &str) { + let unescaped = parse_to_string_literal(escaped).unescape().unwrap(); + assert_eq!(unescaped, expected); + } + + fn assert_unescape_errors(escaped: &str, expected_escape_sequence: &str) { + let error = parse_to_string_literal(escaped).unescape().unwrap_err(); + let TypeQLError::InvalidStringEscape { escape, .. } = &error.errors()[0] else { + panic!("Wrong error type. 
Was {error:?}") + }; + assert_eq!(escape, expected_escape_sequence); + } + #[test] + fn test_unescape() { + // Succeeds + assert_unescapes_to(r#""a\tb\tc""#, "a\tb\tc"); // works + assert_unescapes_to(r#""a\"b\"c""#, r#"a"b"c"#); // works + assert_unescapes_to(r#""a\'b\'c""#, r#"a'b'c"#); // works + assert_unescapes_to(r#""a\\b\\c""#, r#"a\b\c"#); // works + // - Unicode + assert_unescapes_to(r#""abc \u0ca0\u005f\u0ca0""#, "abc ಠ_ಠ"); // works + assert_unescapes_to(r#""abc \u0CA0\u005F\u0CA0""#, "abc ಠ_ಠ"); // caps + assert_unescapes_to(r#""abc \u0CA01234""#, "abc ಠ1234"); // consumes only 4 + + // Errors + assert_unescape_errors(r#""ab\c""#, r"\c"); // Invalid escape + + // - Unicode + assert_unescape_errors(r#""abc \u""#, r"\u"); // Not enough bytes + assert_unescape_errors(r#""abc \u012""#, r"\u012"); // Not enough bytes + assert_unescape_errors(r#""abc \uwu/ abc""#, r"\uwu/ "); // Invalid hex + // Cases that fail at parsing { - // Invalid hex - let escaped = r#""... \uwu/ ...""#; - let crate::ValueLiteral::String(parsed) = crate::parse_value(escaped).unwrap() else { - panic!("Not parsed as string"); - }; - let error = parsed.unescape().unwrap_err(); - let TypeQLError::InvalidUnicodeEscapeInString { escape, .. } = &error.errors()[0] else { + let escaped = r#""abc\""#; + assert!(crate::parse_value(escaped).is_err()); // Parsing fails as incomplete string literal + let string_literal = StringLiteral { value: escaped.to_owned() }; + let error = string_literal.unescape().unwrap_err(); + let TypeQLError::InvalidStringEscape { escape, .. } = &error.errors()[0] else { panic!("Wrong error type. 
Was {error:?}") }; - assert_eq!(escape, r"\uwu/ "); + assert_eq!(escape, r#"\"#); } } } From d454420f60ca4d4b39cf78307b33d6a5a6d79fa3 Mon Sep 17 00:00:00 2001 From: Krishnan Govindraj Date: Fri, 27 Mar 2026 14:57:34 +0100 Subject: [PATCH 05/10] Fix ugly into + unsafe slicing --- rust/value.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/rust/value.rs b/rust/value.rs index 90118f05..b4046d8e 100644 --- a/rust/value.rs +++ b/rust/value.rs @@ -397,13 +397,15 @@ impl StringLiteral { while !rest.is_empty() { let (char, escaped_len) = if rest.as_bytes()[0] == b'\\' { - escape_handler(rest.as_bytes()).map_err(|expected_escaped_len| { - let safe_len = std::cmp::min(rest.len(), expected_escaped_len); - Into::::into(TypeQLError::InvalidStringEscape { - full_string: escaped_string.to_owned(), - escape: rest[..safe_len].to_owned(), - }) - })? + match escape_handler(rest.as_bytes()) { + Ok((char, escaped_len)) => (char, escaped_len), + Err(considered_escape_byte_length) => { + return Err(TypeQLError::InvalidStringEscape { + full_string: escaped_string.to_owned(), + escape: rest.chars().take(considered_escape_byte_length).collect(), + }.into()); + } + } } else { let char = rest.chars().next().expect("string is non-empty"); (char, char.len_utf8()) From 9f10deea8cc356ece1f49f290585d4f3455ca630 Mon Sep 17 00:00:00 2001 From: Krishnan Govindraj Date: Fri, 27 Mar 2026 21:21:04 +0100 Subject: [PATCH 06/10] Add a test to measure performance, but I've broken the implementation --- rust/BUILD | 1 + rust/value.rs | 94 ++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 72 insertions(+), 23 deletions(-) diff --git a/rust/BUILD b/rust/BUILD index afa26bb6..cb7ee4c5 100644 --- a/rust/BUILD +++ b/rust/BUILD @@ -40,6 +40,7 @@ rust_test( deps = [ "@crates//:syn", "@crates//:proc-macro2", + "@crates//:rand", ], ) diff --git a/rust/value.rs b/rust/value.rs index b4046d8e..b930edd8 100644 --- a/rust/value.rs +++ b/rust/value.rs @@ -342,16 
+342,6 @@ impl fmt::Display for StructLiteral { } impl StringLiteral { - fn unescape_unicode<'a>(bytes: &'a [u8]) -> std::result::Result { - let as_hex = std::str::from_utf8(bytes).expect("Should still be utf8"); - if bytes.len() == 4 { - let as_u32 = u32::from_str_radix(as_hex, 16).map_err(|_| as_hex)?; - char::from_u32(as_u32).ok_or(as_hex) - } else { - Err(as_hex) - } - } - pub fn unescape(&self) -> Result { self.process_unescape(|bytes| { if bytes.len() < 2 { @@ -392,28 +382,31 @@ impl StringLiteral { assert!(matches!(bytes[0], b'\'' | b'"')); let escaped_string = &self.value[1..self.value.len() - 1]; - let mut buf = String::with_capacity(escaped_string.len()); + let mut buf = Vec::with_capacity(escaped_string.len()); let mut rest = escaped_string; - while !rest.is_empty() { - let (char, escaped_len) = if rest.as_bytes()[0] == b'\\' { + let escaped_len = if rest.as_bytes()[0] == b'\\' { match escape_handler(rest.as_bytes()) { - Ok((char, escaped_len)) => (char, escaped_len), + Ok((char, escaped_len)) => { + let start = buf.len(); + buf.resize(buf.len() + char.len_utf8(),0); + char.encode_utf8(&mut buf[start..]); + rest = &rest[escaped_len..]; + }, Err(considered_escape_byte_length) => { + let considered_escape_sequence = rest.chars().take(considered_escape_byte_length).collect(); return Err(TypeQLError::InvalidStringEscape { full_string: escaped_string.to_owned(), - escape: rest.chars().take(considered_escape_byte_length).collect(), + escape: considered_escape_sequence, }.into()); } } } else { - let char = rest.chars().next().expect("string is non-empty"); - (char, char.len_utf8()) + buf.push(rest.as_bytes()[0]); + rest = &rest[1..]; }; - buf.push(char); - rest = &rest[escaped_len..]; } - Ok(buf) + Ok(String::from_utf8(buf).expect("Expected valid utf8").to_owned()) } } @@ -494,7 +487,7 @@ pub mod tests { assert_unescapes_to(r#""a\"b\"c""#, r#"a"b"c"#); // works assert_unescapes_to(r#""a\'b\'c""#, r#"a'b'c"#); // works assert_unescapes_to(r#""a\\b\\c""#, 
r#"a\b\c"#); // works - // - Unicode + // - Unicode assert_unescapes_to(r#""abc \u0ca0\u005f\u0ca0""#, "abc ಠ_ಠ"); // works assert_unescapes_to(r#""abc \u0CA0\u005F\u0CA0""#, "abc ಠ_ಠ"); // caps assert_unescapes_to(r#""abc \u0CA01234""#, "abc ಠ1234"); // consumes only 4 @@ -506,7 +499,7 @@ pub mod tests { assert_unescape_errors(r#""abc \u""#, r"\u"); // Not enough bytes assert_unescape_errors(r#""abc \u012""#, r"\u012"); // Not enough bytes assert_unescape_errors(r#""abc \uwu/ abc""#, r"\uwu/ "); // Invalid hex - // Cases that fail at parsing + // Cases that fail at parsing { let escaped = r#""abc\""#; assert!(crate::parse_value(escaped).is_err()); // Parsing fails as incomplete string literal @@ -518,4 +511,59 @@ pub mod tests { assert_eq!(escape, r#"\"#); } } -} + + #[test] + fn time_unescape_ascii() { + let text = generate_string(TIME_UNESCAPE_TEXT_LEN, |x| 32 + (x % 94)); + time_unescape(text); + } + + #[test] + fn time_unescape_unicode() { + // assert_eq!(None, (0..0x07ff).filter(|x| char::from_u32(*x).is_none()).next()); + let text = generate_string(TIME_UNESCAPE_TEXT_LEN, move |x| x & 0x07ff); + time_unescape(text); + } + + const TIME_UNESCAPE_TEXT_LEN: usize = 100000; + fn time_unescape(text: String) { + use std::time::Instant; + let iters = 10000; + + let string_literal = StringLiteral { value: text }; + let start = Instant::now(); + for _ in 0..iters { + string_literal.unescape().unwrap(); + } + let end = Instant::now(); + println!("{iters} on string of length {} iters in {}", string_literal.value.as_str().len(), (end - start).as_secs_f64()) + } + + fn generate_string(length: usize, mapper: fn(u32) -> u32) -> String { + use rand::{thread_rng, Rng, RngCore}; + let mut rng = thread_rng(); + let capacity: i64 = (1.2 * length as f64).ceil() as i64; + let mut text = String::with_capacity(capacity as usize); + text.push('"'); + let mut sanity: i64 = capacity; + while text.as_str().len() < length+1 && sanity >= 0 { + sanity -= 1; + match 
char::from_u32(mapper(rng.next_u32())) { + Some('\\') => { text.push('\\'); text.push('\\'); } + Some('\'') => { text.push('\\'); text.push('\''); } + Some('\"') => { text.push('\\'); text.push('\"'); } + Some('\x08') => { text.push('\\'); text.push('b'); } + Some('\x09') => { text.push('\\'); text.push('t'); } + Some('\x0a') => { text.push('\\'); text.push('n'); } + Some('\x0c') => { text.push('\\'); text.push('f'); } + Some('\x0d') => { text.push('\\'); text.push('r'); } + Some(ch) => { text.push(ch) }, + None => {} + } + } + text.push('"'); + assert!(text.as_str().len() > length && text.as_str().len() < length + 10); + text + } + +} \ No newline at end of file From 783e0b281cc8185b0f55a3d844f611f88171f74e Mon Sep 17 00:00:00 2001 From: Krishnan Govindraj Date: Fri, 27 Mar 2026 21:22:42 +0100 Subject: [PATCH 07/10] Revert the implementation --- rust/value.rs | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/rust/value.rs b/rust/value.rs index b930edd8..2f4e2f91 100644 --- a/rust/value.rs +++ b/rust/value.rs @@ -341,7 +341,18 @@ impl fmt::Display for StructLiteral { } } + impl StringLiteral { + fn unescape_unicode<'a>(bytes: &'a [u8]) -> std::result::Result { + let as_hex = std::str::from_utf8(bytes).expect("Should still be utf8"); + if bytes.len() == 4 { + let as_u32 = u32::from_str_radix(as_hex, 16).map_err(|_| as_hex)?; + char::from_u32(as_u32).ok_or(as_hex) + } else { + Err(as_hex) + } + } + pub fn unescape(&self) -> Result { self.process_unescape(|bytes| { if bytes.len() < 2 { @@ -382,31 +393,28 @@ impl StringLiteral { assert!(matches!(bytes[0], b'\'' | b'"')); let escaped_string = &self.value[1..self.value.len() - 1]; - let mut buf = Vec::with_capacity(escaped_string.len()); + let mut buf = String::with_capacity(escaped_string.len()); let mut rest = escaped_string; + while !rest.is_empty() { - let escaped_len = if rest.as_bytes()[0] == b'\\' { + let (char, escaped_len) = if rest.as_bytes()[0] == b'\\' { 
match escape_handler(rest.as_bytes()) { - Ok((char, escaped_len)) => { - let start = buf.len(); - buf.resize(buf.len() + char.len_utf8(),0); - char.encode_utf8(&mut buf[start..]); - rest = &rest[escaped_len..]; - }, + Ok((char, escaped_len)) => (char, escaped_len), Err(considered_escape_byte_length) => { - let considered_escape_sequence = rest.chars().take(considered_escape_byte_length).collect(); return Err(TypeQLError::InvalidStringEscape { full_string: escaped_string.to_owned(), - escape: considered_escape_sequence, + escape: rest.chars().take(considered_escape_byte_length).collect(), }.into()); } } } else { - buf.push(rest.as_bytes()[0]); - rest = &rest[1..]; + let char = rest.chars().next().expect("string is non-empty"); + (char, char.len_utf8()) }; + buf.push(char); + rest = &rest[escaped_len..]; } - Ok(String::from_utf8(buf).expect("Expected valid utf8").to_owned()) + Ok(buf) } } From a730d2bd302c5ee373a842c95d70ae30754d6f9d Mon Sep 17 00:00:00 2001 From: Krishnan Govindraj Date: Fri, 27 Mar 2026 21:24:03 +0100 Subject: [PATCH 08/10] And replay the broken stuff so I can fix it --- rust/value.rs | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/rust/value.rs b/rust/value.rs index 2f4e2f91..b930edd8 100644 --- a/rust/value.rs +++ b/rust/value.rs @@ -341,7 +341,18 @@ impl fmt::Display for StructLiteral { } } - impl StringLiteral { - fn unescape_unicode<'a>(bytes: &'a [u8]) -> std::result::Result { - let as_hex = std::str::from_utf8(bytes).expect("Should still be utf8"); - if bytes.len() == 4 { - let as_u32 = u32::from_str_radix(as_hex, 16).map_err(|_| as_hex)?; - char::from_u32(as_u32).ok_or(as_hex) - } else { - Err(as_hex) - } - } - pub fn unescape(&self) -> Result { self.process_unescape(|bytes| { if bytes.len() < 2 { @@ -393,28 +382,31 @@ impl StringLiteral { assert!(matches!(bytes[0], b'\'' | b'"')); let escaped_string = &self.value[1..self.value.len() - 1]; - let mut buf = 
String::with_capacity(escaped_string.len()); + let mut buf = Vec::with_capacity(escaped_string.len()); let mut rest = escaped_string; - while !rest.is_empty() { - let (char, escaped_len) = if rest.as_bytes()[0] == b'\\' { + let escaped_len = if rest.as_bytes()[0] == b'\\' { match escape_handler(rest.as_bytes()) { - Ok((char, escaped_len)) => (char, escaped_len), + Ok((char, escaped_len)) => { + let start = buf.len(); + buf.resize(buf.len() + char.len_utf8(),0); + char.encode_utf8(&mut buf[start..]); + rest = &rest[escaped_len..]; + }, Err(considered_escape_byte_length) => { + let considered_escape_sequence = rest.chars().take(considered_escape_byte_length).collect(); return Err(TypeQLError::InvalidStringEscape { full_string: escaped_string.to_owned(), - escape: rest.chars().take(considered_escape_byte_length).collect(), + escape: considered_escape_sequence, }.into()); } } } else { - let char = rest.chars().next().expect("string is non-empty"); - (char, char.len_utf8()) + buf.push(rest.as_bytes()[0]); + rest = &rest[1..]; }; - buf.push(char); - rest = &rest[escaped_len..]; } - Ok(buf) + Ok(String::from_utf8(buf).expect("Expected valid utf8").to_owned()) } } From 2fdc3f0b3d33be362a536db7949b647db6942d95 Mon Sep 17 00:00:00 2001 From: Krishnan Govindraj Date: Fri, 27 Mar 2026 22:53:44 +0100 Subject: [PATCH 09/10] Quick one with just bytes: 0.94s on ascii, 1.28 on unicode --- rust/value.rs | 79 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/rust/value.rs b/rust/value.rs index b930edd8..c8e39ffc 100644 --- a/rust/value.rs +++ b/rust/value.rs @@ -383,26 +383,29 @@ impl StringLiteral { let escaped_string = &self.value[1..self.value.len() - 1]; let mut buf = Vec::with_capacity(escaped_string.len()); - let mut rest = escaped_string; + let mut rest: &[u8] = escaped_string.as_bytes(); while !rest.is_empty() { - let escaped_len = if rest.as_bytes()[0] == b'\\' { - match escape_handler(rest.as_bytes()) { + let 
escaped_len = if rest[0] == b'\\' { + match escape_handler(rest) { Ok((char, escaped_len)) => { let start = buf.len(); - buf.resize(buf.len() + char.len_utf8(),0); + buf.resize(buf.len() + char.len_utf8(), 0); char.encode_utf8(&mut buf[start..]); rest = &rest[escaped_len..]; - }, - Err(considered_escape_byte_length) => { - let considered_escape_sequence = rest.chars().take(considered_escape_byte_length).collect(); + } + Err(considered_escape_seq_length) => { + let offset = escaped_string.len() - rest.len(); + let considered_escape_sequence = + escaped_string[offset..].chars().take(considered_escape_seq_length).collect(); return Err(TypeQLError::InvalidStringEscape { full_string: escaped_string.to_owned(), escape: considered_escape_sequence, - }.into()); + } + .into()); } } } else { - buf.push(rest.as_bytes()[0]); + buf.push(rest[0]); rest = &rest[1..]; }; } @@ -487,7 +490,7 @@ pub mod tests { assert_unescapes_to(r#""a\"b\"c""#, r#"a"b"c"#); // works assert_unescapes_to(r#""a\'b\'c""#, r#"a'b'c"#); // works assert_unescapes_to(r#""a\\b\\c""#, r#"a\b\c"#); // works - // - Unicode + // - Unicode assert_unescapes_to(r#""abc \u0ca0\u005f\u0ca0""#, "abc ಠ_ಠ"); // works assert_unescapes_to(r#""abc \u0CA0\u005F\u0CA0""#, "abc ಠ_ಠ"); // caps assert_unescapes_to(r#""abc \u0CA01234""#, "abc ಠ1234"); // consumes only 4 @@ -499,6 +502,9 @@ pub mod tests { assert_unescape_errors(r#""abc \u""#, r"\u"); // Not enough bytes assert_unescape_errors(r#""abc \u012""#, r"\u012"); // Not enough bytes assert_unescape_errors(r#""abc \uwu/ abc""#, r"\uwu/ "); // Invalid hex + assert_unescape_errors(r#""abc \uΣ12Σ abc""#, r"\uΣ12Σ"); // Invalid hex, 4 chars more than 4 bytes + assert_unescape_errors(r#""abc \u123Σ abc""#, r"\u123Σ"); // Invalid hex, 4 chars more than 4 bytes + // Cases that fail at parsing { let escaped = r#""abc\""#; @@ -536,7 +542,11 @@ pub mod tests { string_literal.unescape().unwrap(); } let end = Instant::now(); - println!("{iters} on string of length {} iters in {}", 
string_literal.value.as_str().len(), (end - start).as_secs_f64()) + println!( + "{iters} on string of length {} iters in {}", + string_literal.value.as_str().len(), + (end - start).as_secs_f64() + ) } fn generate_string(length: usize, mapper: fn(u32) -> u32) -> String { @@ -546,18 +556,42 @@ pub mod tests { let mut text = String::with_capacity(capacity as usize); text.push('"'); let mut sanity: i64 = capacity; - while text.as_str().len() < length+1 && sanity >= 0 { + while text.as_str().len() < length + 1 && sanity >= 0 { sanity -= 1; match char::from_u32(mapper(rng.next_u32())) { - Some('\\') => { text.push('\\'); text.push('\\'); } - Some('\'') => { text.push('\\'); text.push('\''); } - Some('\"') => { text.push('\\'); text.push('\"'); } - Some('\x08') => { text.push('\\'); text.push('b'); } - Some('\x09') => { text.push('\\'); text.push('t'); } - Some('\x0a') => { text.push('\\'); text.push('n'); } - Some('\x0c') => { text.push('\\'); text.push('f'); } - Some('\x0d') => { text.push('\\'); text.push('r'); } - Some(ch) => { text.push(ch) }, + Some('\\') => { + text.push('\\'); + text.push('\\'); + } + Some('\'') => { + text.push('\\'); + text.push('\''); + } + Some('\"') => { + text.push('\\'); + text.push('\"'); + } + Some('\x08') => { + text.push('\\'); + text.push('b'); + } + Some('\x09') => { + text.push('\\'); + text.push('t'); + } + Some('\x0a') => { + text.push('\\'); + text.push('n'); + } + Some('\x0c') => { + text.push('\\'); + text.push('f'); + } + Some('\x0d') => { + text.push('\\'); + text.push('r'); + } + Some(ch) => text.push(ch), None => {} } } @@ -565,5 +599,4 @@ pub mod tests { assert!(text.as_str().len() > length && text.as_str().len() < length + 10); text } - -} \ No newline at end of file +} From ce73b3b9208227868263304f268c9b1de61ea711 Mon Sep 17 00:00:00 2001 From: Krishnan Govindraj Date: Fri, 27 Mar 2026 23:04:41 +0100 Subject: [PATCH 10/10] Add ignore to the bench tests --- rust/value.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/rust/value.rs b/rust/value.rs index c8e39ffc..98783e20 100644 --- a/rust/value.rs +++ b/rust/value.rs @@ -518,12 +518,14 @@ pub mod tests { } } + #[ignore] #[test] fn time_unescape_ascii() { let text = generate_string(TIME_UNESCAPE_TEXT_LEN, |x| 32 + (x % 94)); time_unescape(text); } + #[ignore] #[test] fn time_unescape_unicode() { // assert_eq!(None, (0..0x07ff).filter(|x| char::from_u32(*x).is_none()).next());