Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ pub use presign::presign_upload_to_stage;
pub use presign::PresignedResponse;
pub use response::QueryStats;
pub use response::SchemaField;
pub use settings::BinaryFormat;
pub use settings::GeometryDataType;
pub use settings::QueryResultFormatSettings;
pub use settings::ResultFormatSettings;
Expand Down
32 changes: 32 additions & 0 deletions driver/tests/driver/select_simple.rs
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,38 @@ async fn select_array() {
assert_eq!(val4, vec![vec![120, 121, 122]]);
}

/// Checks that binary columns decode to the same bytes regardless of the
/// session's `binary_output_format` setting, both at the top level and when
/// nested inside arrays and tuples.
#[tokio::test]
async fn select_binary_respects_server_binary_output_format_for_json_results() {
    let dsn = option_env!("TEST_DATABEND_DSN").unwrap_or(DEFAULT_DSN);
    // NOTE(review): Flight SQL DSNs are skipped, presumably because they do not
    // go through the JSON result path this test targets; confirm.
    if dsn.starts_with("databend+flight://") {
        return;
    }

    let client = Client::new(dsn.to_string());
    let conn = client.get_conn().await.unwrap();

    // base64: one top-level binary column plus one binary nested in an array.
    conn.exec("SET binary_output_format='base64'")
        .await
        .unwrap();
    let base64_row = conn
        .query_row("select to_binary('xyz'), [to_binary('xyz')]")
        .await
        .unwrap()
        .unwrap();
    let (scalar, nested): (Vec<u8>, Vec<Vec<u8>>) = base64_row.try_into().unwrap();
    assert_eq!(scalar, b"xyz".to_vec());
    assert_eq!(nested, vec![b"xyz".to_vec()]);

    // utf-8: two binaries nested in a tuple.
    conn.exec("SET binary_output_format='utf-8'").await.unwrap();
    let utf8_row = conn
        .query_row("select (to_binary('xyz'), to_binary('ab'))")
        .await
        .unwrap()
        .unwrap();
    let (pair,): ((Vec<u8>, Vec<u8>),) = utf8_row.try_into().unwrap();
    assert_eq!(pair, (b"xyz".to_vec(), b"ab".to_vec()));
}

#[tokio::test]
async fn select_map() {
let conn = prepare().await;
Expand Down
1 change: 1 addition & 0 deletions sql/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ flight-sql = ["dep:tonic"]
arrow = { workspace = true }
arrow-array = { workspace = true }
arrow-schema = { workspace = true }
base64 = "0.22.1"
chrono = { workspace = true }
chrono-tz = { workspace = true }
ethnum = "1.5.1"
Expand Down
168 changes: 161 additions & 7 deletions sql/src/value/string_decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ use crate::cursor_ext::{
};
use crate::error::{ConvertError, Result};
use crate::value::base::GeoValue;
use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
use base64::Engine;
use chrono::{Datelike, NaiveDate};
use databend_client::schema::{DataType, DecimalDataType, DecimalSize, NumberDataType};
use databend_client::ResultFormatSettings;
use databend_client::{BinaryFormat, ResultFormatSettings};
use ethnum::i256;
use hex;
use jiff::{civil::DateTime as JiffDateTime, tz::TimeZone, Zoned};
Expand Down Expand Up @@ -63,7 +65,7 @@ impl TryFrom<(&DataType, String, &ResultFormatSettings)> for Value {
DataType::EmptyArray => Ok(Self::EmptyArray),
DataType::EmptyMap => Ok(Self::EmptyMap),
DataType::Boolean => Ok(Self::Boolean(v == "1")),
DataType::Binary => Ok(Self::Binary(hex::decode(v)?)),
DataType::Binary => Ok(Self::Binary(parse_binary_value(v.as_str(), settings)?)),
DataType::String => Ok(Self::String(v)),
DataType::Number(NumberDataType::Int8) => {
Ok(Self::Number(NumberValue::Int8(v.parse()?)))
Expand Down Expand Up @@ -300,11 +302,11 @@ impl ValueDecoder {
}

/// Reads one nested `Binary` value from `reader`.
///
/// The textual token is extracted with `read_binary_text` and then decoded
/// by `parse_binary_value` according to `self.settings`, so the value is no
/// longer assumed to be hex-encoded.
///
/// Returns an error if the token cannot be read or cannot be decoded.
fn read_binary<R: AsRef<[u8]>>(&self, reader: &mut Cursor<R>) -> Result<Value> {
    let text = self.read_binary_text(reader)?;
    let bytes = parse_binary_value(&text, &self.settings)?;
    Ok(Value::Binary(bytes))
}

fn read_date<R: AsRef<[u8]>>(&self, reader: &mut Cursor<R>) -> Result<Value> {
Expand Down Expand Up @@ -500,6 +502,33 @@ impl ValueDecoder {
reader.set_position((start + raw.get().len()) as u64);
Ok(raw.to_string())
}

/// Reads the textual form of one binary value from `reader`.
///
/// A double- or single-quoted token is tried first. If neither matches, the
/// reader is rolled back to where it started and an unquoted token is taken
/// from the buffered input: hex digits via `collect_binary_number` when the
/// configured format is `Hex`, otherwise everything up to the next delimiter
/// via `collect_binary_token`.
///
/// Returns an error when the unquoted token is empty or when the collected
/// bytes are not valid UTF-8.
fn read_binary_text<R: AsRef<[u8]>>(&self, reader: &mut Cursor<R>) -> Result<String> {
    let pos = reader.checkpoint();
    let mut quoted = Vec::new();
    if reader.read_quoted_text(&mut quoted, b'"').is_ok()
        || reader.read_quoted_text(&mut quoted, b'\'').is_ok()
    {
        // The bytes come from external input, so validate them instead of
        // assuming UTF-8; invalid data becomes an error rather than an
        // ill-formed `String`.
        return Ok(String::from_utf8(quoted).map_err(|e| e.utf8_error())?);
    }
    reader.rollback(pos);

    let buf = reader.fill_buf()?;
    let n = match self.settings.binary_output_format {
        BinaryFormat::Hex => collect_binary_number(buf),
        BinaryFormat::Base64 | BinaryFormat::Utf8 | BinaryFormat::Utf8Lossy => {
            collect_binary_token(buf)
        }
    };
    if n == 0 {
        return Err(
            ConvertError::new("binary", String::from_utf8_lossy(buf).into_owned()).into(),
        );
    }
    // Copy the token out before consuming, since `buf` borrows from `reader`.
    let text = std::str::from_utf8(&buf[..n])?.to_string();
    reader.consume(n);
    Ok(text)
}
}

fn parse_timestamp(ts_string: &str, tz: &TimeZone) -> Result<Value> {
Expand Down Expand Up @@ -580,3 +609,128 @@ fn parse_decimal(text: &str, size: DecimalSize) -> Result<NumberValue> {
}
}
}

fn parse_binary_value(v: &str, settings: &ResultFormatSettings) -> Result<Vec<u8>> {
match settings.binary_output_format {
BinaryFormat::Hex => Ok(hex::decode(v)?),
BinaryFormat::Base64 => BASE64_STANDARD
.decode(v)
.map_err(|e| Error::Parsing(e.to_string())),
BinaryFormat::Utf8 | BinaryFormat::Utf8Lossy => Ok(v.as_bytes().to_vec()),
}
}

/// Returns the length of the unquoted token at the start of `buffer`.
///
/// The token ends at the first structural delimiter (`,`, `:`, `]`, `)`,
/// `}`) or ASCII whitespace byte, or at the end of `buffer` when none is
/// present. `:` is a delimiter so that an unquoted binary map key such as
/// the one in `{eHl6:1}` stops before the key/value separator instead of
/// swallowing the value.
///
/// Returns `0` when `buffer` is empty or starts with a delimiter.
fn collect_binary_token(buffer: &[u8]) -> usize {
    buffer
        .iter()
        .position(|&byte| {
            matches!(
                byte,
                b',' | b':' | b']' | b')' | b'}' | b' ' | b'\t' | b'\r' | b'\n'
            )
        })
        .unwrap_or(buffer.len())
}

#[cfg(test)]
mod tests {
    use super::*;
    use databend_client::schema::DataType;

    /// Default result-format settings with only the binary format overridden.
    fn settings(binary_output_format: BinaryFormat) -> ResultFormatSettings {
        ResultFormatSettings {
            binary_output_format,
            ..ResultFormatSettings::default()
        }
    }

    /// Decodes `text` as a top-level `Binary` column under `format`.
    fn decode_top_level(format: BinaryFormat, text: &str) -> Value {
        Value::try_from((&DataType::Binary, text.to_string(), &settings(format))).unwrap()
    }

    /// Decodes `input` as a nested value of type `data_type` under `format`.
    fn decode_nested(format: BinaryFormat, data_type: &DataType, input: &[u8]) -> Value {
        let decoder = ValueDecoder {
            settings: settings(format),
        };
        let mut reader = Cursor::new(input);
        decoder.read_field(data_type, &mut reader).unwrap()
    }

    #[test]
    fn decode_binary_top_level_respects_hex_format() {
        let decoded = decode_top_level(BinaryFormat::Hex, "78797A");
        assert_eq!(decoded, Value::Binary(b"xyz".to_vec()));
    }

    #[test]
    fn decode_binary_top_level_respects_base64_format() {
        let decoded = decode_top_level(BinaryFormat::Base64, "eHl6");
        assert_eq!(decoded, Value::Binary(b"xyz".to_vec()));
    }

    #[test]
    fn decode_binary_top_level_respects_utf8_format() {
        let decoded = decode_top_level(BinaryFormat::Utf8, "xyz");
        assert_eq!(decoded, Value::Binary(b"xyz".to_vec()));
    }

    #[test]
    fn decode_binary_top_level_respects_utf8_lossy_format() {
        // The replacement character is passed through as its UTF-8 bytes.
        let decoded = decode_top_level(BinaryFormat::Utf8Lossy, "xy\u{FFFD}");
        assert_eq!(decoded, Value::Binary("xy\u{FFFD}".as_bytes().to_vec()));
    }

    #[test]
    fn decode_nested_binary_array_respects_hex_format() {
        let array_of_binary = DataType::Array(Box::new(DataType::Binary));
        let decoded = decode_nested(BinaryFormat::Hex, &array_of_binary, br#"[78797A]"#);
        assert_eq!(decoded, Value::Array(vec![Value::Binary(b"xyz".to_vec())]));
    }

    #[test]
    fn decode_nested_binary_array_respects_base64_format() {
        let array_of_binary = DataType::Array(Box::new(DataType::Binary));
        let decoded = decode_nested(BinaryFormat::Base64, &array_of_binary, br#"["eHl6"]"#);
        assert_eq!(decoded, Value::Array(vec![Value::Binary(b"xyz".to_vec())]));
    }

    #[test]
    fn decode_nested_binary_tuple_respects_utf8_format() {
        let tuple_type = DataType::Tuple(vec![DataType::Binary, DataType::Timestamp]);
        let decoded = decode_nested(
            BinaryFormat::Utf8,
            &tuple_type,
            br#"("xyz","2024-10-22 10:11:12.000000")"#,
        );

        // Only the binary element is checked; the timestamp just has to parse.
        assert!(
            matches!(decoded, Value::Tuple(values) if values[0] == Value::Binary(b"xyz".to_vec()))
        );
    }
}
Loading