diff --git a/be/src/core/data_type_serde/data_type_timestamptz_serde.cpp b/be/src/core/data_type_serde/data_type_timestamptz_serde.cpp index 3727faa6beee96..c22c65dbd5a011 100644 --- a/be/src/core/data_type_serde/data_type_timestamptz_serde.cpp +++ b/be/src/core/data_type_serde/data_type_timestamptz_serde.cpp @@ -250,4 +250,21 @@ std::string DataTypeTimeStampTzSerDe::to_olap_string(const Field& field) const { return CastToString::from_timestamptz(field.get(), 6); } +void DataTypeTimeStampTzSerDe::write_one_cell_to_binary(const IColumn& src_column, + ColumnString::Chars& chars, + int64_t row_num) const { + const auto type = static_cast(FieldType::OLAP_FIELD_TYPE_TIMESTAMPTZ); + const auto& data_ref = assert_cast(src_column).get_data_at(row_num); + const auto sc = static_cast(_scale); + + const size_t old_size = chars.size(); + const size_t new_size = old_size + sizeof(uint8_t) + sizeof(uint8_t) + data_ref.size; + chars.resize(new_size); + memcpy(chars.data() + old_size, reinterpret_cast(&type), sizeof(uint8_t)); + memcpy(chars.data() + old_size + sizeof(uint8_t), reinterpret_cast(&sc), + sizeof(uint8_t)); + memcpy(chars.data() + old_size + sizeof(uint8_t) + sizeof(uint8_t), data_ref.data, + data_ref.size); +} + } // namespace doris diff --git a/be/src/core/data_type_serde/data_type_timestamptz_serde.h b/be/src/core/data_type_serde/data_type_timestamptz_serde.h index 459003e040f241..0a595935d8fdd6 100644 --- a/be/src/core/data_type_serde/data_type_timestamptz_serde.h +++ b/be/src/core/data_type_serde/data_type_timestamptz_serde.h @@ -72,6 +72,10 @@ class DataTypeTimeStampTzSerDe : public DataTypeNumberSerDe(0); +static auto serde_tz_3 = std::make_shared(3); static auto serde_tz_6 = std::make_shared(6); static ColumnTimeStampTz::MutablePtr column_tz_0; +static ColumnTimeStampTz::MutablePtr column_tz_3; static ColumnTimeStampTz::MutablePtr column_tz_6; class DataTypeTimeStampTzSerDeTest : public ::testing::Test { @@ -51,6 +56,7 @@ class DataTypeTimeStampTzSerDeTest : public ::testing::Test { test_data_dir = root_dir + "/be/test/data/vec/columns"; column_tz_0 = ColumnTimeStampTz::create(); + column_tz_3 = ColumnTimeStampTz::create(); column_tz_6 = ColumnTimeStampTz::create(); load_columns_data(); @@ -70,6 +76,7 @@ class DataTypeTimeStampTzSerDeTest : public ::testing::Test { EXPECT_TRUE(!column->empty()); }; test_func(column_tz_0->get_ptr(), serde_tz_0, "TIMESTAMPTZ(0).csv"); + test_func(column_tz_3->get_ptr(), serde_tz_3, "TIMESTAMPTZ(3).csv"); test_func(column_tz_6->get_ptr(), serde_tz_6, "TIMESTAMPTZ(6).csv"); std::cout << "loading test dataset done" << std::endl; @@ -189,6 +196,82 @@ TEST_F(DataTypeTimeStampTzSerDeTest, serdes) { } }; test_func(*serde_tz_0, column_tz_0); + test_func(*serde_tz_3, column_tz_3); test_func(*serde_tz_6, column_tz_6); } + +// Roundtrip via write_one_cell_to_binary + DataTypeSerDe::deserialize_binary_to_* +// (the variant sparse-column path). Regression for DORIS-25915: the writer +// previously inherited DataTypeNumberSerDe's default, which omitted the scale +// byte, while the reader expected a scale byte — a 1-byte layout mismatch that +// silently corrupted timestamptz values stored in variant sparse paths. +TEST_F(DataTypeTimeStampTzSerDeTest, binary_roundtrip) { + auto test_func = [&](const DataTypeTimeStampTzSerDe& serde, + const ColumnTimeStampTz::MutablePtr& source_column, uint32_t scale) { + const size_t row_count = source_column->size(); + ASSERT_GT(row_count, 0); + + // Per-row layout: [type:1][scale:1][value:8] = 10 bytes. + constexpr size_t kBytesPerRow = + sizeof(uint8_t) + sizeof(uint8_t) + sizeof(TimestampTzValue::underlying_value); + + auto chars_col = ColumnString::create(); + ColumnString::Chars& chars = chars_col->get_chars(); + std::vector row_offsets; + row_offsets.reserve(row_count + 1); + row_offsets.push_back(0); + for (size_t i = 0; i < row_count; ++i) { + serde.write_one_cell_to_binary(*source_column, chars, i); + row_offsets.push_back(chars.size()); + } + + // Layout: each row contributes exactly 10 bytes; first byte is the + // TIMESTAMPTZ FieldType; second is the scale. + ASSERT_EQ(chars.size(), row_count * kBytesPerRow); + for (size_t i = 0; i < row_count; ++i) { + ASSERT_EQ(row_offsets[i + 1] - row_offsets[i], kBytesPerRow); + EXPECT_EQ(chars[row_offsets[i]], + static_cast(FieldType::OLAP_FIELD_TYPE_TIMESTAMPTZ)); + EXPECT_EQ(chars[row_offsets[i] + 1], static_cast(scale)); + } + + // Roundtrip via deserialize_binary_to_column (column path). + { + auto inner_col = ColumnTimeStampTz::create(); + auto null_map = ColumnUInt8::create(); + auto nullable_col = ColumnNullable::create(std::move(inner_col), std::move(null_map)); + for (size_t i = 0; i < row_count; ++i) { + const uint8_t* start = chars.data() + row_offsets[i]; + const uint8_t* end = + DataTypeSerDe::deserialize_binary_to_column(start, *nullable_col); + EXPECT_EQ(end - start, kBytesPerRow); + } + const auto& read_col = + assert_cast(nullable_col->get_nested_column()); + for (size_t i = 0; i < row_count; ++i) { + EXPECT_EQ(read_col.get_element(i), source_column->get_element(i)) + << "row " << i << " mismatched after binary roundtrip"; + } + } + + // Roundtrip via deserialize_binary_to_field (field path); scale + // should propagate into FieldInfo. + { + for (size_t i = 0; i < row_count; ++i) { + const uint8_t* start = chars.data() + row_offsets[i]; + Field f; + FieldInfo info; + const uint8_t* end = DataTypeSerDe::deserialize_binary_to_field(start, f, info); + EXPECT_EQ(end - start, kBytesPerRow); + EXPECT_EQ(info.scale, static_cast(scale)); + EXPECT_EQ(f.get(), source_column->get_element(i)) + << "row " << i << " mismatched after field roundtrip"; + } + } + }; + + test_func(*serde_tz_0, column_tz_0, 0); + test_func(*serde_tz_3, column_tz_3, 3); + test_func(*serde_tz_6, column_tz_6, 6); +} } // namespace doris diff --git a/be/test/data/vec/columns/TIMESTAMPTZ(3).csv b/be/test/data/vec/columns/TIMESTAMPTZ(3).csv new file mode 100644 index 00000000000000..078e9f15e521e4 --- /dev/null +++ b/be/test/data/vec/columns/TIMESTAMPTZ(3).csv @@ -0,0 +1,16 @@ +NULL +0000-01-01 00:00:00 +00:00 +0000-01-01 00:00:00.000 +00:00 +0000-01-01 00:00:00.001 +00:00 +0000-01-01 00:00:00.123 +00:00 +0000-01-01 00:00:00.999 +00:00 +2023-08-08 20:20:20 +00:00 +2023-08-08 20:20:20.000 +00:00 +2023-08-08 20:20:20.001 +00:00 +2023-08-08 20:20:20.123 +00:00 +2023-08-08 20:20:20.999 +00:00 +9999-12-31 23:59:59 +08:00 +9999-12-31 23:59:59.000 +08:00 +9999-12-31 23:59:59.001 +08:00 +9999-12-31 23:59:59.123 +08:00 +9999-12-31 23:59:59.999 +08:00 diff --git a/regression-test/data/variant_p0/test_variant_timestamptz_sparse.out b/regression-test/data/variant_p0/test_variant_timestamptz_sparse.out new file mode 100644 index 00000000000000..0ad3d02aa156e8 --- /dev/null +++ b/regression-test/data/variant_p0/test_variant_timestamptz_sparse.out @@ -0,0 +1,7 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_all -- +1 2003-12-21 10:29:00+08:00 2003-12-21 10:29:00+08:00 2003-12-21 10:29:00.000+08:00 2003-12-21 15:29:00.000+08:00 2003-12-21 15:29:00.000000+08:00 2003-12-21 07:29:00.000000+08:00 + +-- !select_cast_ts0 -- +1 2003-12-21 10:29:00+08:00 2003-12-21 10:29:00.000+08:00 2003-12-21 15:29:00.000000+08:00 + diff --git a/regression-test/suites/variant_p0/test_variant_timestamptz_sparse.groovy b/regression-test/suites/variant_p0/test_variant_timestamptz_sparse.groovy new file mode 100644 index 00000000000000..f2f28031892a6a --- /dev/null +++ b/regression-test/suites/variant_p0/test_variant_timestamptz_sparse.groovy @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Reproduces DORIS-25915: +// When VARIANT has typed paths that exceed variant_max_subcolumns_count and +// variant_enable_typed_paths_to_sparse=true, timestamptz values that fall to +// the sparse path are read back as only the timezone suffix (e.g. "+00:00") +// instead of the full timestamp. +suite("test_variant_timestamptz_sparse", "p0"){ + sql " set time_zone = '+08:00' " + + def tableName = "test_variant_timestamptz_sparse_repro" + sql "DROP TABLE IF EXISTS ${tableName}" + + sql """ + CREATE TABLE ${tableName} ( + pk int, + var VARIANT< + 'c1':bigint, + 'c2':bigint, + 'd1':date, + 'd2':date, + 'dt0':datetime(0), + 'dt0n':datetime(0), + 'dt3':datetime(3), + 'dt3n':datetime(3), + 'dt6':datetime(6), + 'dt6n':datetime(6), + 'ts0':timestamptz(0), + 'ts0n':timestamptz(0), + 'ts3':timestamptz(3), + 'ts3n':timestamptz(3), + 'ts6':timestamptz(6), + 'ts6n':timestamptz(6), + 'v1':string, + 'v2':string, + properties("variant_max_subcolumns_count" = "2","variant_enable_typed_paths_to_sparse" = "true") + > NULL, + INDEX c1 (`var`) USING INVERTED PROPERTIES("field_pattern" = "c1"), + INDEX c2 (`var`) USING INVERTED PROPERTIES("field_pattern" = "c2"), + INDEX d1 (`var`) USING INVERTED PROPERTIES("field_pattern" = "d1"), + INDEX d2 (`var`) USING INVERTED PROPERTIES("field_pattern" = "d2"), + INDEX dt0 (`var`) USING INVERTED PROPERTIES("field_pattern" = "dt0"), + INDEX dt0n (`var`) USING INVERTED PROPERTIES("field_pattern" = "dt0n"), + INDEX dt3 (`var`) USING INVERTED PROPERTIES("field_pattern" = "dt3"), + INDEX dt3n (`var`) USING INVERTED PROPERTIES("field_pattern" = "dt3n"), + INDEX dt6 (`var`) USING INVERTED PROPERTIES("field_pattern" = "dt6"), + INDEX dt6n (`var`) USING INVERTED PROPERTIES("field_pattern" = "dt6n"), + INDEX v1 (`var`) USING INVERTED PROPERTIES("field_pattern" = "v1"), + INDEX v2 (`var`) USING INVERTED PROPERTIES("field_pattern" = "v2") + ) ENGINE=OLAP + DUPLICATE KEY(pk) + DISTRIBUTED BY HASH(pk) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO ${tableName} VALUES + (1, '{"c1":1699516324,"c2":1690122421,"d1":"2024-02-25","d2":"2024-11-27","dt0":"2002-12-25 12:42:19","dt0n":"2015-02-05 02:36:13","dt3":"2015-02-22 02:09:01","dt3n":"2015-09-16 02:55:07","dt6":"2001-09-19 09:53:52","dt6n":"2003-12-21 02:29:00","ts0":"2003-12-21 02:29:00 +00:00","ts0n":"2003-12-21 02:29:00 +00:00","ts3":"2003-12-21 02:29:00 +00:00","ts3n":"2003-12-21 02:29:00 -05:00","ts6":"2003-12-21 02:29:00 -05:00","ts6n":"2003-12-21 02:29:00 +03:00","v1":"2024-12-10 10:16:19","v2":"2023-12-31 23:59:59"}'); + """ + + sql "SYNC" + + qt_select_all """ + SELECT + pk, + CAST(var['ts0'] AS string) AS str_ts0, + CAST(var['ts0n'] AS string) AS str_ts0n, + CAST(var['ts3'] AS string) AS str_ts3, + CAST(var['ts3n'] AS string) AS str_ts3n, + CAST(var['ts6'] AS string) AS str_ts6, + CAST(var['ts6n'] AS string) AS str_ts6n + FROM ${tableName} + ORDER BY pk; + """ + + qt_select_cast_ts0 """ + SELECT + pk, + CAST(var['ts0'] AS timestamptz(0)) AS cast_ts0, + CAST(var['ts3'] AS timestamptz(3)) AS cast_ts3, + CAST(var['ts6'] AS timestamptz(6)) AS cast_ts6 + FROM ${tableName} + ORDER BY pk; + """ + + // Explicit assertion that reproduces the bug. + // When the bug is present, str_ts0 == "+00:00" (only the timezone), and this + // assertion fails. When the fix lands, the value contains the full timestamp + // "2003-12-21 ..." and the assertion passes. + def rows = sql """ + SELECT + CAST(var['ts0'] AS string) AS str_ts0, + CAST(var['ts3n'] AS string) AS str_ts3n, + CAST(var['ts6'] AS string) AS str_ts6 + FROM ${tableName} + WHERE pk = 1; + """ + logger.info("DORIS-25915 repro rows: ${rows}") + + def str_ts0 = rows[0][0].toString() + def str_ts3n = rows[0][1].toString() + def str_ts6 = rows[0][2].toString() + + assertTrue(str_ts0.contains("2003-12-21"), + "DORIS-25915 bug present: var['ts0'] in sparse path returned '${str_ts0}', expected to contain '2003-12-21'") + assertTrue(str_ts3n.contains("2003-12-21"), + "DORIS-25915 bug present: var['ts3n'] in sparse path returned '${str_ts3n}', expected to contain '2003-12-21'") + assertTrue(str_ts6.contains("2003-12-21"), + "DORIS-25915 bug present: var['ts6'] in sparse path returned '${str_ts6}', expected to contain '2003-12-21'") + + sql "DROP TABLE IF EXISTS ${tableName}" +}