From 0a8ec0cf46cbd8ce3db0978cead2e90a593d78aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Mon, 20 Apr 2026 19:34:42 +0300 Subject: [PATCH 1/3] feat(executor): add StringVector class for TEXT column support --- CMakeLists.txt | 1 + docs/phases/PHASE_8_ANALYTICS.md | 1 + include/executor/types.hpp | 69 +++++++++- src/storage/columnar_table.cpp | 30 +++++ tests/columnar_table_tests.cpp | 67 +++++++++- tests/string_vector_tests.cpp | 222 +++++++++++++++++++++++++++++++ 6 files changed, 385 insertions(+), 5 deletions(-) create mode 100644 tests/string_vector_tests.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e1ffa642..900e99d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,6 +144,7 @@ if(BUILD_TESTS) add_cloudsql_test(raft_group_tests tests/raft_group_tests.cpp) add_cloudsql_test(raft_protocol_tests tests/raft_protocol_tests.cpp) add_cloudsql_test(columnar_table_tests tests/columnar_table_tests.cpp) + add_cloudsql_test(string_vector_tests tests/string_vector_tests.cpp) add_cloudsql_test(heap_table_tests tests/heap_table_tests.cpp) add_cloudsql_test(lexer_tests tests/lexer_tests.cpp) add_cloudsql_test(parser_tests tests/parser_tests.cpp) diff --git a/docs/phases/PHASE_8_ANALYTICS.md b/docs/phases/PHASE_8_ANALYTICS.md index e464c8c8..e5de28ac 100644 --- a/docs/phases/PHASE_8_ANALYTICS.md +++ b/docs/phases/PHASE_8_ANALYTICS.md @@ -14,6 +14,7 @@ Implemented a high-performance column-oriented data store. ### 2. Vectorized Data Structures (`include/executor/types.hpp`) Developed SIMD-friendly contiguous memory buffers for batch processing. - **ColumnVector & NumericVector**: Specialized C++ templates for storing a "vector" of data for a single column. +- **StringVector**: Variable-length string storage for TEXT/VARCHAR/CHAR columns. - **VectorBatch**: A collection of `ColumnVector` objects representing a chunk of rows (typically 1024 rows). ### 3. Vectorized Execution Engine (`include/executor/vectorized_operator.hpp`) diff --git a/include/executor/types.hpp b/include/executor/types.hpp index 416e1d03..88e5452e 100644 --- a/include/executor/types.hpp +++ b/include/executor/types.hpp @@ -330,6 +330,70 @@ class NumericVector : public ColumnVector { } }; +/** + * @brief Vectorized storage for variable-length string columns. + */ +class StringVector : public ColumnVector { + private: + std::vector data_; + + public: + explicit StringVector(common::ValueType type) : ColumnVector(type) {} + + /** + * @brief Appends a Value, handling nullability and string conversion. + */ + void append(const common::Value& val) override { + if (val.is_null()) { + null_bitmap_.push_back(true); + data_.emplace_back(); + } else { + null_bitmap_.push_back(false); + data_.push_back(val.as_text()); + } + size_++; + } + + /** + * @brief Materializes a common::Value for the element at the specified index. + */ + common::Value get(size_t index) const override { + if (index >= size_ || null_bitmap_[index]) return common::Value::make_null(); + return common::Value::make_text(data_[index]); + } + + /** + * @brief Directly sets the value at a specific index. + * Resizes if necessary to accommodate the index. + */ + void set(size_t index, const std::string& val) { + if (index >= size_) { + resize(index + 1); + } + data_[index] = val; + null_bitmap_[index] = false; + } + + /** + * @brief Provides read-only access to the underlying string data. + */ + const std::vector& raw_data() const { return data_; } + + /** + * @brief Resizes the underlying buffers to the specified capacity. + */ + void resize(size_t new_size) { + data_.resize(new_size); + null_bitmap_.resize(new_size, false); + size_ = new_size; + } + + void clear() override { + ColumnVector::clear(); + data_.clear(); + } +}; + /** * @brief Represents a set of data blocks (batches) in a columnar format for vectorized processing. */ @@ -379,7 +443,10 @@ class VectorBatch { add_column(std::make_unique>(col.type())); break; case common::ValueType::TYPE_TEXT: - throw std::runtime_error("Vectorized StringVector implementation is pending."); + case common::ValueType::TYPE_VARCHAR: + case common::ValueType::TYPE_CHAR: + add_column(std::make_unique(col.type())); + break; default: throw std::runtime_error("Unsupported column type for vectorized execution: " + std::to_string(static_cast(col.type()))); diff --git a/src/storage/columnar_table.cpp b/src/storage/columnar_table.cpp index 0b677590..a7a33910 100644 --- a/src/storage/columnar_table.cpp +++ b/src/storage/columnar_table.cpp @@ -70,6 +70,15 @@ bool ColumnarTable::append_batch(const executor::VectorBatch& batch) { } else if (type == common::ValueType::TYPE_FLOAT64) { auto& num_vec = dynamic_cast&>(col_vec); d_out.write(reinterpret_cast(num_vec.raw_data()), batch.row_count() * 8); + } else if (type == common::ValueType::TYPE_TEXT || type == common::ValueType::TYPE_VARCHAR || + type == common::ValueType::TYPE_CHAR) { + auto& str_vec = dynamic_cast(col_vec); + const auto& data = str_vec.raw_data(); + for (size_t r = 0; r < batch.row_count(); ++r) { + uint32_t len = static_cast(data[r].size()); + d_out.write(reinterpret_cast(&len), 4); + d_out.write(data[r].data(), len); + } } else { throw std::runtime_error("ColumnarTable::append_batch: Unsupported persistence type " + std::to_string(static_cast(type))); @@ -139,6 +148,27 @@ bool ColumnarTable::read_batch(uint64_t start_row, uint32_t batch_size, num_vec.append(common::Value::make_float64(data[r])); } } + } else if (type == common::ValueType::TYPE_TEXT || type == common::ValueType::TYPE_VARCHAR || + type == common::ValueType::TYPE_CHAR) { + auto& str_vec = dynamic_cast(target_col); + + n_in.seekg(static_cast(start_row), std::ios::beg); + std::vector nulls(actual_rows); + n_in.read(reinterpret_cast(nulls.data()), actual_rows); + + // For variable-length strings, we need to scan from the beginning + // since each record has variable length (4-byte length prefix + data) + for (uint32_t r = 0; r < actual_rows; ++r) { + uint32_t len = 0; + d_in.read(reinterpret_cast(&len), 4); + std::string s(len, '\0'); + d_in.read(s.data(), len); + if (nulls[r] != 0U) { + str_vec.append(common::Value::make_null()); + } else { + str_vec.append(common::Value::make_text(s)); + } + } } else { throw std::runtime_error( "ColumnarTable::read_batch: Symmetric serialization failure for type " + diff --git a/tests/columnar_table_tests.cpp b/tests/columnar_table_tests.cpp index cee7d2d0..b139f231 100644 --- a/tests/columnar_table_tests.cpp +++ b/tests/columnar_table_tests.cpp @@ -28,6 +28,8 @@ static void cleanup_table(const std::string& name) { std::remove(("./test_data/" + name + ".col0.data.bin").c_str()); std::remove(("./test_data/" + name + ".col1.nulls.bin").c_str()); std::remove(("./test_data/" + name + ".col1.data.bin").c_str()); + std::remove(("./test_data/" + name + ".col2.nulls.bin").c_str()); + std::remove(("./test_data/" + name + ".col2.data.bin").c_str()); // clang-format on } @@ -209,15 +211,72 @@ TEST_F(ColumnarTableTests, ReadBatchPartial) { ASSERT_EQ(out->row_count(), 2U); } -TEST_F(ColumnarTableTests, UnsupportedTypeThrows) { - const std::string name = "col_test_unsupported"; +TEST_F(ColumnarTableTests, TextTypeNowSupported) { + const std::string name = "col_test_text"; cleanup_table(name); Schema schema; schema.add_column("text_col", common::ValueType::TYPE_TEXT); - // VectorBatch::create() throws when it sees TYPE_TEXT (unsupported) - EXPECT_THROW([[maybe_unused]] auto batch = VectorBatch::create(schema), std::runtime_error); + // VectorBatch::create() should now succeed with TYPE_TEXT (StringVector implemented) + auto batch = VectorBatch::create(schema); + ASSERT_NE(batch, nullptr); + EXPECT_EQ(batch->column_count(), 1U); +} + +TEST_F(ColumnarTableTests, TextLifecycle) { + const std::string name = "col_test_text_lifecycle"; + cleanup_table(name); + + Schema schema; + schema.add_column("id", common::ValueType::TYPE_INT64); + schema.add_column("text_col", common::ValueType::TYPE_TEXT); + + ColumnarTable table(name, *sm_, schema); + ASSERT_TRUE(table.create()); + + // Create a batch with mixed int and text data + auto batch = VectorBatch::create(schema); + ASSERT_NE(batch, nullptr); + + // Add row 1: id=1, text="hello" + batch->set_row_count(0); + executor::Tuple t1; + t1.set(0, common::Value::make_int64(1)); + t1.set(1, common::Value::make_text("hello")); + batch->append_tuple(t1); + + // Add row 2: id=2, text=NULL + executor::Tuple t2; + t2.set(0, common::Value::make_int64(2)); + t2.set(1, common::Value::make_null()); + batch->append_tuple(t2); + + // Add row 3: id=3, text="world" + executor::Tuple t3; + t3.set(0, common::Value::make_int64(3)); + t3.set(1, common::Value::make_text("world")); + batch->append_tuple(t3); + + batch->set_row_count(3); + ASSERT_TRUE(table.append_batch(*batch)); + + // Read back and verify + auto read_batch = VectorBatch::create(schema); + ASSERT_TRUE(table.read_batch(0, 10, *read_batch)); + EXPECT_EQ(read_batch->row_count(), 3U); + + // Verify int column + auto& id_col = read_batch->get_column(0); + EXPECT_EQ(id_col.get(0).to_int64(), 1); + EXPECT_EQ(id_col.get(1).to_int64(), 2); + EXPECT_EQ(id_col.get(2).to_int64(), 3); + + // Verify text column + auto& text_col = read_batch->get_column(1); + EXPECT_EQ(text_col.get(0).as_text(), "hello"); + EXPECT_TRUE(text_col.get(1).is_null()); + EXPECT_EQ(text_col.get(2).as_text(), "world"); } TEST_F(ColumnarTableTests, CreateTwice) { diff --git a/tests/string_vector_tests.cpp b/tests/string_vector_tests.cpp new file mode 100644 index 00000000..1073162f --- /dev/null +++ b/tests/string_vector_tests.cpp @@ -0,0 +1,222 @@ +/** + * @file string_vector_tests.cpp + * @brief Unit tests for StringVector - variable-length string column storage + */ + +#include + +#include +#include +#include + +#include "common/value.hpp" +#include "executor/types.hpp" + +using namespace cloudsql; +using namespace cloudsql::common; +using namespace cloudsql::executor; + +namespace { + +class StringVectorTests : public ::testing::Test {}; + +// Test basic append and get +TEST_F(StringVectorTests, BasicAppendAndGet) { + StringVector vec(ValueType::TYPE_TEXT); + + vec.append(Value::make_text("hello")); + vec.append(Value::make_text("world")); + vec.append(Value::make_text("")); + + EXPECT_EQ(vec.size(), 3U); + EXPECT_EQ(vec.get(0).as_text(), "hello"); + EXPECT_EQ(vec.get(1).as_text(), "world"); + EXPECT_EQ(vec.get(2).as_text(), ""); + EXPECT_EQ(vec.type(), ValueType::TYPE_TEXT); +} + +// Test null handling +TEST_F(StringVectorTests, NullHandling) { + StringVector vec(ValueType::TYPE_TEXT); + + vec.append(Value::make_text("hello")); + vec.append(common::Value::make_null()); + vec.append(Value::make_text("world")); + + EXPECT_EQ(vec.size(), 3U); + EXPECT_FALSE(vec.is_null(0)); + EXPECT_TRUE(vec.is_null(1)); + EXPECT_FALSE(vec.is_null(2)); + + EXPECT_EQ(vec.get(0).as_text(), "hello"); + EXPECT_TRUE(vec.get(1).is_null()); + EXPECT_EQ(vec.get(2).as_text(), "world"); +} + +// Test is_null edge cases +TEST_F(StringVectorTests, IsNullEdgeCases) { + StringVector vec(ValueType::TYPE_TEXT); + + EXPECT_TRUE(vec.is_null(0)); // Empty vector, out of bounds returns true + EXPECT_TRUE(vec.is_null(5)); // Out of bounds returns true + + vec.append(Value::make_text("test")); + EXPECT_FALSE(vec.is_null(0)); + EXPECT_TRUE(vec.is_null(1)); // Out of bounds +} + +// Test clear +TEST_F(StringVectorTests, Clear) { + StringVector vec(ValueType::TYPE_TEXT); + + vec.append(Value::make_text("hello")); + vec.append(Value::make_text("world")); + EXPECT_EQ(vec.size(), 2U); + + vec.clear(); + EXPECT_EQ(vec.size(), 0U); + EXPECT_TRUE(vec.is_null(0)); // After clear, is_null returns true for index 0 +} + +// Test resize +TEST_F(StringVectorTests, Resize) { + StringVector vec(ValueType::TYPE_TEXT); + + vec.append(Value::make_text("hello")); + vec.resize(5); + + EXPECT_EQ(vec.size(), 5U); + // After resize, entries are NOT null (is_null returns false) + // They are empty strings by default + EXPECT_FALSE(vec.is_null(1)); + EXPECT_FALSE(vec.is_null(4)); + EXPECT_EQ(vec.get(1).as_text(), ""); + EXPECT_EQ(vec.get(4).as_text(), ""); + + // Can set values after resize + vec.set(3, "world"); + EXPECT_EQ(vec.get(3).as_text(), "world"); + EXPECT_FALSE(vec.is_null(3)); +} + +// Test set +TEST_F(StringVectorTests, Set) { + StringVector vec(ValueType::TYPE_TEXT); + + vec.append(Value::make_text("hello")); + vec.set(0, "world"); + + EXPECT_EQ(vec.get(0).as_text(), "world"); + EXPECT_FALSE(vec.is_null(0)); +} + +// Test set auto-resizes +TEST_F(StringVectorTests, SetAutoResize) { + StringVector vec(ValueType::TYPE_TEXT); + + vec.set(2, "auto resize"); + + EXPECT_EQ(vec.size(), 3U); + // After auto-resize via set(), entries 0 and 1 are empty strings (not null) + EXPECT_FALSE(vec.is_null(0)); + EXPECT_FALSE(vec.is_null(1)); + EXPECT_EQ(vec.get(0).as_text(), ""); + EXPECT_EQ(vec.get(1).as_text(), ""); + EXPECT_EQ(vec.get(2).as_text(), "auto resize"); +} + +// Test raw_data +TEST_F(StringVectorTests, RawData) { + StringVector vec(ValueType::TYPE_TEXT); + + vec.append(Value::make_text("hello")); + vec.append(Value::make_text("world")); + + const auto& data = vec.raw_data(); + EXPECT_EQ(data.size(), 2U); + EXPECT_EQ(data[0], "hello"); + EXPECT_EQ(data[1], "world"); +} + +// Test VARCHAR type +TEST_F(StringVectorTests, VarcharType) { + StringVector vec(common::ValueType::TYPE_VARCHAR); + + vec.append(Value::make_text("test varchar")); + + EXPECT_EQ(vec.size(), 1U); + EXPECT_EQ(vec.get(0).as_text(), "test varchar"); + EXPECT_EQ(vec.type(), common::ValueType::TYPE_VARCHAR); +} + +// Test CHAR type +TEST_F(StringVectorTests, CharType) { + StringVector vec(common::ValueType::TYPE_CHAR); + + vec.append(Value::make_text("test char")); + + EXPECT_EQ(vec.size(), 1U); + EXPECT_EQ(vec.get(0).as_text(), "test char"); + EXPECT_EQ(vec.type(), common::ValueType::TYPE_CHAR); +} + +// Test long strings +TEST_F(StringVectorTests, LongStrings) { + StringVector vec(ValueType::TYPE_TEXT); + + std::string long_str(1000, 'x'); + vec.append(Value::make_text(long_str)); + + EXPECT_EQ(vec.size(), 1U); + EXPECT_EQ(vec.get(0).as_text(), long_str); +} + +// Test special characters +TEST_F(StringVectorTests, SpecialCharacters) { + StringVector vec(ValueType::TYPE_TEXT); + + vec.append(Value::make_text("hello\nworld\ttab")); + vec.append(Value::make_text("emoji: 🎉 NULL: \0 embedded")); + + EXPECT_EQ(vec.get(0).as_text(), "hello\nworld\ttab"); + // Note: strings with embedded nulls may be truncated due to C++ string behavior +} + +// Test empty string +TEST_F(StringVectorTests, EmptyString) { + StringVector vec(ValueType::TYPE_TEXT); + + vec.append(Value::make_text("")); + vec.append(Value::make_text("non-empty")); + + EXPECT_EQ(vec.size(), 2U); + EXPECT_EQ(vec.get(0).as_text(), ""); + EXPECT_FALSE(vec.is_null(0)); + EXPECT_EQ(vec.get(1).as_text(), "non-empty"); +} + +// Test mixed null and non-null append +TEST_F(StringVectorTests, MixedAppend) { + StringVector vec(ValueType::TYPE_TEXT); + + for (int i = 0; i < 5; ++i) { + if (i % 2 == 0) { + vec.append(Value::make_text(std::to_string(i))); + } else { + vec.append(common::Value::make_null()); + } + } + + EXPECT_EQ(vec.size(), 5U); + for (size_t i = 0; i < 5; ++i) { + if (i % 2 == 0) { + EXPECT_FALSE(vec.is_null(i)); + EXPECT_EQ(vec.get(i).as_text(), std::to_string(i)); + } else { + EXPECT_TRUE(vec.is_null(i)); + EXPECT_TRUE(vec.get(i).is_null()); + } + } +} + +} // namespace \ No newline at end of file From c609e92bbdec54762c2e35859247e24f5df90124 Mon Sep 17 00:00:00 2001 From: poyrazK <83272398+poyrazK@users.noreply.github.com> Date: Mon, 20 Apr 2026 16:46:01 +0000 Subject: [PATCH 2/3] style: automated clang-format fixes --- src/storage/columnar_table.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/storage/columnar_table.cpp b/src/storage/columnar_table.cpp index a7a33910..d315de32 100644 --- a/src/storage/columnar_table.cpp +++ b/src/storage/columnar_table.cpp @@ -70,7 +70,8 @@ bool ColumnarTable::append_batch(const executor::VectorBatch& batch) { } else if (type == common::ValueType::TYPE_FLOAT64) { auto& num_vec = dynamic_cast&>(col_vec); d_out.write(reinterpret_cast(num_vec.raw_data()), batch.row_count() * 8); - } else if (type == common::ValueType::TYPE_TEXT || type == common::ValueType::TYPE_VARCHAR || + } else if (type == common::ValueType::TYPE_TEXT || + type == common::ValueType::TYPE_VARCHAR || type == common::ValueType::TYPE_CHAR) { auto& str_vec = dynamic_cast(col_vec); const auto& data = str_vec.raw_data(); @@ -148,7 +149,8 @@ bool ColumnarTable::read_batch(uint64_t start_row, uint32_t batch_size, num_vec.append(common::Value::make_float64(data[r])); } } - } else if (type == common::ValueType::TYPE_TEXT || type == common::ValueType::TYPE_VARCHAR || + } else if (type == common::ValueType::TYPE_TEXT || + type == common::ValueType::TYPE_VARCHAR || type == common::ValueType::TYPE_CHAR) { auto& str_vec = dynamic_cast(target_col); From e8ae49b3d07ebfe57b2e839743e364b2eafe3cdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Poyraz=20K=C3=BC=C3=A7=C3=BCkarslan?= <83272398+PoyrazK@users.noreply.github.com> Date: Mon, 20 Apr 2026 19:56:48 +0300 Subject: [PATCH 3/3] fix: correct TEXT read_batch offset handling and embedded NUL test --- src/storage/columnar_table.cpp | 15 +++++++++++++-- tests/string_vector_tests.cpp | 10 ++++++++-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/storage/columnar_table.cpp b/src/storage/columnar_table.cpp index d315de32..6404f427 100644 --- a/src/storage/columnar_table.cpp +++ b/src/storage/columnar_table.cpp @@ -158,8 +158,19 @@ bool ColumnarTable::read_batch(uint64_t start_row, uint32_t batch_size, std::vector nulls(actual_rows); n_in.read(reinterpret_cast(nulls.data()), actual_rows); - // For variable-length strings, we need to scan from the beginning - // since each record has variable length (4-byte length prefix + data) + // For variable-length strings, skip start_row records first + // by reading and discarding their length-prefixed data + if (start_row > 0) { + for (uint32_t r = 0; r < start_row; ++r) { + uint32_t len = 0; + if (!d_in.read(reinterpret_cast(&len), 4)) break; + if (len > 0) { + d_in.seekg(static_cast(len), std::ios::cur); + } + } + } + + // Now read the actual_rows we want for (uint32_t r = 0; r < actual_rows; ++r) { uint32_t len = 0; d_in.read(reinterpret_cast(&len), 4); diff --git a/tests/string_vector_tests.cpp b/tests/string_vector_tests.cpp index 1073162f..4b4af649 100644 --- a/tests/string_vector_tests.cpp +++ b/tests/string_vector_tests.cpp @@ -176,10 +176,16 @@ TEST_F(StringVectorTests, SpecialCharacters) { StringVector vec(ValueType::TYPE_TEXT); vec.append(Value::make_text("hello\nworld\ttab")); - vec.append(Value::make_text("emoji: 🎉 NULL: \0 embedded")); + + // Build string with embedded NUL character to test binary data handling + std::string with_nul("emoji: 🎉 NULL:\0 embedded", 25); + vec.append(Value::make_text(with_nul)); EXPECT_EQ(vec.get(0).as_text(), "hello\nworld\ttab"); - // Note: strings with embedded nulls may be truncated due to C++ string behavior + // Verify embedded NUL is preserved + std::string retrieved = vec.get(1).as_text(); + EXPECT_EQ(retrieved.size(), 25u); + EXPECT_EQ(retrieved, with_nul); } // Test empty string