Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ if(BUILD_TESTS)
add_cloudsql_test(raft_group_tests tests/raft_group_tests.cpp)
add_cloudsql_test(raft_protocol_tests tests/raft_protocol_tests.cpp)
add_cloudsql_test(columnar_table_tests tests/columnar_table_tests.cpp)
add_cloudsql_test(string_vector_tests tests/string_vector_tests.cpp)
add_cloudsql_test(heap_table_tests tests/heap_table_tests.cpp)
add_cloudsql_test(lexer_tests tests/lexer_tests.cpp)
add_cloudsql_test(parser_tests tests/parser_tests.cpp)
Expand Down
1 change: 1 addition & 0 deletions docs/phases/PHASE_8_ANALYTICS.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Implemented a high-performance column-oriented data store.
### 2. Vectorized Data Structures (`include/executor/types.hpp`)
Developed SIMD-friendly contiguous memory buffers for batch processing.
- **ColumnVector & NumericVector**: Specialized C++ templates for storing a "vector" of data for a single column.
- **StringVector**: Variable-length string storage for TEXT/VARCHAR/CHAR columns.
- **VectorBatch**: A collection of `ColumnVector` objects representing a chunk of rows (typically 1024 rows).

### 3. Vectorized Execution Engine (`include/executor/vectorized_operator.hpp`)
Expand Down
69 changes: 68 additions & 1 deletion include/executor/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,70 @@ class NumericVector : public ColumnVector {
}
};

/**
 * @brief Column vector that stores one std::string per row.
 *
 * NULL rows are tracked in the null bitmap and keep an empty placeholder
 * string, so data_ and the bitmap always share the same indices.
 */
class StringVector : public ColumnVector {
public:
    explicit StringVector(common::ValueType type) : ColumnVector(type) {}

    /**
     * @brief Adds one row at the end of the column.
     * @param val Value to store. A NULL value is flagged in the null bitmap and
     *            backed by an empty string; any other value is stored through
     *            its as_text() representation.
     */
    void append(const common::Value& val) override {
        const bool is_null = val.is_null();
        null_bitmap_.push_back(is_null);
        if (is_null) {
            // Placeholder keeps data_ aligned with the bitmap.
            data_.emplace_back();
        } else {
            data_.push_back(val.as_text());
        }
        ++size_;
    }

    /**
     * @brief Builds a common::Value for the row at @p index.
     * @return A NULL value when @p index is past the end or the row is flagged
     *         NULL; otherwise a text value holding a copy of the stored string.
     */
    common::Value get(size_t index) const override {
        if (index >= size_) {
            return common::Value::make_null();
        }
        if (null_bitmap_[index]) {
            return common::Value::make_null();
        }
        return common::Value::make_text(data_[index]);
    }

    /**
     * @brief Overwrites the row at @p index with @p val and marks it non-NULL.
     *
     * When @p index is at or beyond the current size, the vector is first grown
     * to index + 1 rows.
     */
    void set(size_t index, const std::string& val) {
        const bool needs_growth = index >= size_;
        if (needs_growth) {
            resize(index + 1);
        }
        data_[index] = val;
        null_bitmap_[index] = false;
    }

    /**
     * @brief Read-only view of the stored strings (NULL rows included as stored).
     */
    const std::vector<std::string>& raw_data() const { return data_; }

    /**
     * @brief Sets the row count to @p new_size.
     *
     * Both the string storage and the null bitmap are resized; rows added by
     * growing are empty strings flagged as non-NULL.
     */
    void resize(size_t new_size) {
        data_.resize(new_size);
        null_bitmap_.resize(new_size, false);
        size_ = new_size;
    }

    /**
     * @brief Resets the base-class state, then drops all stored strings.
     */
    void clear() override {
        ColumnVector::clear();
        data_.clear();
    }

private:
    std::vector<std::string> data_;
};

/**
* @brief Represents a set of data blocks (batches) in a columnar format for vectorized processing.
*/
Expand Down Expand Up @@ -379,7 +443,10 @@ class VectorBatch {
add_column(std::make_unique<NumericVector<bool>>(col.type()));
break;
case common::ValueType::TYPE_TEXT:
throw std::runtime_error("Vectorized StringVector implementation is pending.");
case common::ValueType::TYPE_VARCHAR:
case common::ValueType::TYPE_CHAR:
add_column(std::make_unique<StringVector>(col.type()));
break;
default:
throw std::runtime_error("Unsupported column type for vectorized execution: " +
std::to_string(static_cast<int>(col.type())));
Expand Down
43 changes: 43 additions & 0 deletions src/storage/columnar_table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,16 @@ bool ColumnarTable::append_batch(const executor::VectorBatch& batch) {
} else if (type == common::ValueType::TYPE_FLOAT64) {
auto& num_vec = dynamic_cast<executor::NumericVector<double>&>(col_vec);
d_out.write(reinterpret_cast<const char*>(num_vec.raw_data()), batch.row_count() * 8);
} else if (type == common::ValueType::TYPE_TEXT ||
type == common::ValueType::TYPE_VARCHAR ||
type == common::ValueType::TYPE_CHAR) {
auto& str_vec = dynamic_cast<executor::StringVector&>(col_vec);
const auto& data = str_vec.raw_data();
for (size_t r = 0; r < batch.row_count(); ++r) {
uint32_t len = static_cast<uint32_t>(data[r].size());
d_out.write(reinterpret_cast<const char*>(&len), 4);
d_out.write(data[r].data(), len);
}
} else {
throw std::runtime_error("ColumnarTable::append_batch: Unsupported persistence type " +
std::to_string(static_cast<int>(type)));
Expand Down Expand Up @@ -139,6 +149,39 @@ bool ColumnarTable::read_batch(uint64_t start_row, uint32_t batch_size,
num_vec.append(common::Value::make_float64(data[r]));
}
}
} else if (type == common::ValueType::TYPE_TEXT ||
type == common::ValueType::TYPE_VARCHAR ||
type == common::ValueType::TYPE_CHAR) {
auto& str_vec = dynamic_cast<executor::StringVector&>(target_col);

n_in.seekg(static_cast<std::streamoff>(start_row), std::ios::beg);
std::vector<uint8_t> nulls(actual_rows);
n_in.read(reinterpret_cast<char*>(nulls.data()), actual_rows);

// For variable-length strings, skip start_row records first
// by reading and discarding their length-prefixed data
if (start_row > 0) {
for (uint32_t r = 0; r < start_row; ++r) {
uint32_t len = 0;
if (!d_in.read(reinterpret_cast<char*>(&len), 4)) break;
if (len > 0) {
d_in.seekg(static_cast<std::streamoff>(len), std::ios::cur);
}
}
}

// Now read the actual_rows we want
for (uint32_t r = 0; r < actual_rows; ++r) {
uint32_t len = 0;
d_in.read(reinterpret_cast<char*>(&len), 4);
std::string s(len, '\0');
d_in.read(s.data(), len);
if (nulls[r] != 0U) {
str_vec.append(common::Value::make_null());
} else {
str_vec.append(common::Value::make_text(s));
}
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
} else {
throw std::runtime_error(
"ColumnarTable::read_batch: Symmetric serialization failure for type " +
Expand Down
67 changes: 63 additions & 4 deletions tests/columnar_table_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ static void cleanup_table(const std::string& name) {
std::remove(("./test_data/" + name + ".col0.data.bin").c_str());
std::remove(("./test_data/" + name + ".col1.nulls.bin").c_str());
std::remove(("./test_data/" + name + ".col1.data.bin").c_str());
std::remove(("./test_data/" + name + ".col2.nulls.bin").c_str());
std::remove(("./test_data/" + name + ".col2.data.bin").c_str());
// clang-format on
}

Expand Down Expand Up @@ -209,15 +211,72 @@ TEST_F(ColumnarTableTests, ReadBatchPartial) {
ASSERT_EQ(out->row_count(), 2U);
}

/**
 * @brief Checks that VectorBatch::create() accepts a schema with a TYPE_TEXT column.
 *
 * The test passes when the factory returns a non-null batch that exposes
 * exactly one column. The earlier expectation that create() throws
 * std::runtime_error for TYPE_TEXT is dropped because it contradicts the
 * success checks made here.
 */
TEST_F(ColumnarTableTests, TextTypeNowSupported) {
    const std::string name = "col_test_text";
    cleanup_table(name);

    Schema schema;
    schema.add_column("text_col", common::ValueType::TYPE_TEXT);

    // VectorBatch::create() should now succeed with TYPE_TEXT (StringVector implemented)
    auto batch = VectorBatch::create(schema);
    ASSERT_NE(batch, nullptr);
    EXPECT_EQ(batch->column_count(), 1U);
}

/**
 * @brief Writes an INT64 + TEXT batch (with one NULL text) to a ColumnarTable
 *        and checks that reading it back yields the same three rows.
 */
TEST_F(ColumnarTableTests, TextLifecycle) {
    const std::string name = "col_test_text_lifecycle";
    cleanup_table(name);

    Schema schema;
    schema.add_column("id", common::ValueType::TYPE_INT64);
    schema.add_column("text_col", common::ValueType::TYPE_TEXT);

    ColumnarTable table(name, *sm_, schema);
    ASSERT_TRUE(table.create());

    // Source batch: starts empty, then receives three tuples.
    auto source = VectorBatch::create(schema);
    ASSERT_NE(source, nullptr);
    source->set_row_count(0);

    // (1, "hello")
    executor::Tuple first_row;
    first_row.set(0, common::Value::make_int64(1));
    first_row.set(1, common::Value::make_text("hello"));
    source->append_tuple(first_row);

    // (2, NULL)
    executor::Tuple second_row;
    second_row.set(0, common::Value::make_int64(2));
    second_row.set(1, common::Value::make_null());
    source->append_tuple(second_row);

    // (3, "world")
    executor::Tuple third_row;
    third_row.set(0, common::Value::make_int64(3));
    third_row.set(1, common::Value::make_text("world"));
    source->append_tuple(third_row);

    source->set_row_count(3);
    ASSERT_TRUE(table.append_batch(*source));

    // Ask for up to 10 rows from the start; only the 3 stored rows come back.
    auto loaded = VectorBatch::create(schema);
    ASSERT_TRUE(table.read_batch(0, 10, *loaded));
    EXPECT_EQ(loaded->row_count(), 3U);

    // Ids were written as 1, 2, 3 in row order.
    auto& ids = loaded->get_column(0);
    for (int row = 0; row < 3; ++row) {
        EXPECT_EQ(ids.get(row).to_int64(), row + 1);
    }

    // Text column: "hello", NULL, "world".
    auto& texts = loaded->get_column(1);
    EXPECT_EQ(texts.get(0).as_text(), "hello");
    EXPECT_TRUE(texts.get(1).is_null());
    EXPECT_EQ(texts.get(2).as_text(), "world");
}

TEST_F(ColumnarTableTests, CreateTwice) {
Expand Down
Loading
Loading