Implement unicode unescaping#438
Conversation
rust/value.rs
Outdated
|
|
||
| impl StringLiteral { | ||
| fn unescape_unicode<'a>(bytes: &'a [u8]) -> std::result::Result<char, &'a str> { | ||
| let as_hex = std::str::from_utf8(bytes).expect("Should still be utf8"); |
There was a problem hiding this comment.
You sliced the bytes, and you could have ended up slicing a multibyte character in the middle. You can't expect valid UTF-8.
rust/value.rs
Outdated
| } | ||
|
|
||
| impl StringLiteral { | ||
| fn unescape_unicode<'a>(bytes: &'a [u8]) -> std::result::Result<char, &'a str> { |
There was a problem hiding this comment.
No reason for this to be an associated function
rust/value.rs
Outdated
| b'u' => Self::unescape_unicode(&bytes[2..std::cmp::min(6, bytes.len())]).map(|c| (c, 6)).map_err(|hex| { | ||
| TypeQLError::InvalidUnicodeEscapeInString { | ||
| full_string: rest.to_owned(), | ||
| escape: format!(r"\u{}", hex), | ||
| } | ||
| .into() | ||
| }), |
There was a problem hiding this comment.
.map(..).map_err(...) is more readable as a match
rust/value.rs
Outdated
| let as_u32 = u32::from_str_radix(as_hex, 16).map_err(|_| as_hex)?; | ||
| char::from_u32(as_u32).ok_or(as_hex) |
There was a problem hiding this comment.
It might be easier and more performant to iterate through the bytes and accumulate rather than going through UTF-8 decoding and handling decoding and parsing errors.
rust/value.rs
Outdated
| return Err(TypeQLError::InvalidStringEscape { | ||
| escape_handler(rest.as_bytes()).map_err(|expected_escaped_len| { | ||
| let safe_len = std::cmp::min(rest.len(), expected_escaped_len); | ||
| Into::<crate::common::error::Error>::into(TypeQLError::InvalidStringEscape { |
There was a problem hiding this comment.
Whoa, what is going on here?
There was a problem hiding this comment.
It didn't infer for some reason. I can try again.
rust/value.rs
Outdated
| if bytes.len() < 2 { | ||
| return Err(TypeQLError::InvalidStringEscape { | ||
| escape_handler(rest.as_bytes()).map_err(|expected_escaped_len| { | ||
| let safe_len = std::cmp::min(rest.len(), expected_escaped_len); |
There was a problem hiding this comment.
That's still not safe, we need to make sure we slice on the character boundary. Something like \て would cause a panic.
There was a problem hiding this comment.
rest is still chars though. Would this happen?
| Err(considered_escape_seq_length) => { | ||
| let offset = escaped_string.len() - rest.len(); | ||
| let considered_escape_sequence = | ||
| escaped_string[offset..].chars().take(considered_escape_seq_length).collect(); |
There was a problem hiding this comment.
I've not changed this, but I also don't get why it's semantically wrong.
There was a problem hiding this comment.
Also, ceil_char_boundary doesn't look stable in the version of rust we're using.
Product change and motivation
Escaped unicode characters can be used in TypeQL string literals. These must be of the form '\uXXXX
whereX` is any hex digit.