54 commits
0d442fd
Add alp code
sfc-gh-pgaur Dec 4, 2025
06d1e19
Integrate ALP with arrow
sfc-gh-pgaur Dec 4, 2025
a98c594
Add alp benchmark
sfc-gh-pgaur Dec 4, 2025
c297f97
Add datasets for alp benchmarking
sfc-gh-pgaur Dec 4, 2025
ab928e8
Update cmake file
sfc-gh-pgaur Dec 4, 2025
6a95a59
Move hpp files to h
sfc-gh-pgaur Dec 6, 2025
865e46a
Update flow diagram and layout diagram to use ASCII and not unicode cha…
sfc-gh-pgaur Dec 7, 2025
cb6d0b6
Rename cpp files to cc
sfc-gh-pgaur Dec 7, 2025
496e23b
Update documentation to align with arrow's doxygen style
sfc-gh-pgaur Dec 7, 2025
8803b52
Adapt methods and variable names to arrow style
sfc-gh-pgaur Dec 7, 2025
31e94ec
Update the tests to adhere to arrow code style
sfc-gh-pgaur Dec 7, 2025
46c0ecc
Update callers
sfc-gh-pgaur Dec 7, 2025
a70b08f
Fuse FOR and decode loop
sfc-gh-pgaur Dec 7, 2025
ccbb1dd
Reduce memory allocation in the decompress call
sfc-gh-pgaur Dec 7, 2025
6a01df2
Attempt at making decoding faster with SIMD
sfc-gh-pgaur Dec 8, 2025
4ced783
Revert "Attempt at making decoding faster with SIMD"
sfc-gh-pgaur Dec 8, 2025
4fac73c
Move cpp files to cc
sfc-gh-pgaur Dec 8, 2025
1cb0852
Move data file to parquet-testing submodule
sfc-gh-pgaur Dec 8, 2025
8d307a6
Update path to the data file
sfc-gh-pgaur Dec 9, 2025
0908342
Adapt file names to arrow convention
sfc-gh-pgaur Dec 15, 2025
e56c877
File rename
sfc-gh-pgaur Dec 15, 2025
cfa00ba
Obtain compressed size and number of elements from page header
sfc-gh-pgaur Dec 15, 2025
a1d11ee
Fix namespace depth
sfc-gh-pgaur Dec 16, 2025
719468b
Better pack the compression block header
sfc-gh-pgaur Dec 16, 2025
69b4e07
Rename class
sfc-gh-pgaur Dec 16, 2025
193a808
Rearrange field for vector metadata for better packing
sfc-gh-pgaur Dec 16, 2025
11aaab3
Add spec files
sfc-gh-pgaur Dec 31, 2025
c7e7569
Add more tests
sfc-gh-pgaur Jan 2, 2026
acbc8ba
Move number of elements metadata info to uber header
sfc-gh-pgaur Jan 12, 2026
6b3e2ed
Keep version as 1
sfc-gh-pgaur Jan 12, 2026
506088f
Tests pass
sfc-gh-pgaur Jan 12, 2026
f5f5011
Get rid of numElements and derive it for each vector
sfc-gh-pgaur Jan 12, 2026
e0e367c
Update the md files
sfc-gh-pgaur Jan 12, 2026
81bcc81
Fix ubsan error
sfc-gh-pgaur Jan 13, 2026
71b244a
Use safe copy
sfc-gh-pgaur Jan 13, 2026
18e3c59
Add more unit tests
sfc-gh-pgaur Jan 13, 2026
687e1af
Templatize AlpEncodedVectorInfo
sfc-gh-pgaur Jan 14, 2026
3a538cc
Fix
sfc-gh-pgaur Jan 14, 2026
8b942af
[SpecReview] Num values in page is at max int32
sfc-gh-pgaur Jan 14, 2026
82a3c3b
[SpecReview] Rename layout to IntegerEncoding
sfc-gh-pgaur Jan 14, 2026
5b4d16d
[SpecReview] Rename enum
sfc-gh-pgaur Jan 14, 2026
ac2d679
[SpecReview] Make AlpHeader exact 8 bytes
sfc-gh-pgaur Jan 14, 2026
f66fe95
[SpecReview] Move all vector data towards the start
sfc-gh-pgaur Jan 15, 2026
c67c745
[SpecReview][Metadata] Introduce wrapper to read metadata
sfc-gh-pgaur Jan 15, 2026
37ac31d
[SpecReview] Separate out ALP and FOR metadata
sfc-gh-pgaur Jan 15, 2026
58df7a3
[SpecReview] Remove unnecessary field
sfc-gh-pgaur Jan 16, 2026
50f380c
[FutureFlexibility] Add the FOR specific code behind switch
sfc-gh-pgaur Jan 16, 2026
283de64
Add unit test
sfc-gh-pgaur Jan 17, 2026
777c9a8
ALP: Implement offset-based layout for O(1) random access
sfc-gh-pgaur Jan 26, 2026
63cbec9
ALP: Update documentation for offset-based layout
sfc-gh-pgaur Jan 26, 2026
b813a6d
ALP: Add CreateSamplingPreset and EncodeWithPreset APIs
sfc-gh-pgaur Jan 27, 2026
bdb28ec
ALP: Update all comments and docs for offset-based layout
sfc-gh-pgaur Jan 27, 2026
850f16f
[WIP] test driver for creating ALP encoded files
alamb Feb 5, 2026
7cb4f48
example
alamb Feb 5, 2026
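
For context on what the ALP commits above build up to: ALP (Adaptive Lossless floating-Point) encodes doubles as decimal-scaled integers that round-trip exactly, stores values that do not round-trip as exceptions, and compresses the integers with frame of reference (FOR) plus bit-packing; the "Fuse FOR and decode loop" commit merges the FOR add-back into the rescaling loop. The sketch below is a minimal illustration of that round-trip, not the code in this PR; AlpEncodeOne and AlpDecodeFused are invented names.

// Minimal illustration of the ALP idea; not this PR's implementation.
#include <cmath>
#include <cstdint>
#include <optional>

// Try to encode one double as a decimal-scaled integer with exponent e and
// factor f. Returns nullopt when the value does not round-trip exactly and
// would have to be stored as an exception instead.
std::optional<int64_t> AlpEncodeOne(double v, int e, int f) {
  const int64_t digits = std::llround(v * std::pow(10.0, e - f));
  const double decoded = static_cast<double>(digits) * std::pow(10.0, f - e);
  if (decoded == v) return digits;
  return std::nullopt;
}

// Decode with FOR fused in: add the per-vector frame of reference back to
// each unpacked delta and rescale in the same loop.
void AlpDecodeFused(const uint32_t* deltas, int64_t frame_of_reference,
                    double scale /* = 10^(f - e) */, int64_t n, double* out) {
  for (int64_t i = 0; i < n; ++i) {
    out[i] = static_cast<double>(frame_of_reference +
                                 static_cast<int64_t>(deltas[i])) *
             scale;
  }
}
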
3 changes: 3 additions & 0 deletions cpp/examples/parquet/CMakeLists.txt
@@ -17,6 +17,7 @@

add_executable(parquet-low-level-example low_level_api/reader_writer.cc)
add_executable(parquet-low-level-example2 low_level_api/reader_writer2.cc)
add_executable(parquet-write-parquet low_level_api/write_parquet.cc)
add_executable(parquet-arrow-example parquet_arrow/reader_writer.cc)
add_executable(parquet-stream-api-example parquet_stream_api/stream_reader_writer.cc)
target_include_directories(parquet-low-level-example PRIVATE low_level_api/)
@@ -58,6 +59,7 @@ endif()
target_link_libraries(parquet-arrow-example ${PARQUET_EXAMPLE_LINK_LIBS})
target_link_libraries(parquet-low-level-example ${PARQUET_EXAMPLE_LINK_LIBS})
target_link_libraries(parquet-low-level-example2 ${PARQUET_EXAMPLE_LINK_LIBS})
target_link_libraries(parquet-write-parquet ${PARQUET_EXAMPLE_LINK_LIBS})
target_link_libraries(parquet-stream-api-example ${PARQUET_EXAMPLE_LINK_LIBS})

if(PARQUET_REQUIRE_ENCRYPTION)
@@ -69,6 +71,7 @@ endif()
add_dependencies(parquet
parquet-low-level-example
parquet-low-level-example2
parquet-write-parquet
parquet-arrow-example
parquet-stream-api-example)

172 changes: 172 additions & 0 deletions cpp/examples/parquet/low_level_api/write_parquet.cc
@@ -0,0 +1,172 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <arrow/io/file.h>
#include <arrow/util/logging.h>
#include <parquet/api/writer.h>
#include <parquet/types.h>

#include <cctype>
#include <filesystem>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

namespace {

constexpr int64_t kNumRows = 1024;
constexpr char kColumnName[] = "value_f64";

struct EncodingSelection {
parquet::Encoding::type encoding = parquet::Encoding::UNKNOWN;
bool requires_dictionary = false;
};

std::string NormalizeEncodingName(const std::string& raw) {
std::string normalized;
normalized.reserve(raw.size());
for (char ch : raw) {
if (ch == '-' || ch == ' ') {
normalized.push_back('_');
} else {
normalized.push_back(static_cast<char>(std::toupper(static_cast<unsigned char>(ch))));
}
}
return normalized;
}

EncodingSelection ParseEncoding(const std::string& raw_name) {
const std::string name = NormalizeEncodingName(raw_name);
if (name == "PLAIN") {
return {parquet::Encoding::PLAIN, false};
}
if (name == "BYTE_STREAM_SPLIT" || name == "BYTESTREAMSPLIT") {
return {parquet::Encoding::BYTE_STREAM_SPLIT, false};
}
if (name == "ALP") {
return {parquet::Encoding::ALP, false};
}
if (name == "RLE_DICTIONARY") {
return {parquet::Encoding::RLE_DICTIONARY, true};
}
if (name == "PLAIN_DICTIONARY") {
return {parquet::Encoding::PLAIN_DICTIONARY, true};
}
return {};
}

void PrintUsage() {
std::cerr << "Usage: write_parquet --encoding <encoding_name> <output_directory>\n"
<< "Supported encodings for DOUBLE: PLAIN, BYTE_STREAM_SPLIT, ALP,\n"
<< " RLE_DICTIONARY, PLAIN_DICTIONARY\n";
}

std::shared_ptr<parquet::schema::GroupNode> MakeSchema() {
parquet::schema::NodeVector fields;
fields.push_back(parquet::schema::PrimitiveNode::Make(
kColumnName, parquet::Repetition::REQUIRED, parquet::Type::DOUBLE,
parquet::ConvertedType::NONE));
return std::static_pointer_cast<parquet::schema::GroupNode>(
parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields));
}

std::vector<double> MakeValues(bool dictionary_friendly) {
std::vector<double> values;
values.reserve(kNumRows);
if (dictionary_friendly) {
for (int64_t i = 0; i < kNumRows; ++i) {
values.push_back(static_cast<double>(i % 8));
}
} else {
for (int64_t i = 0; i < kNumRows; ++i) {
values.push_back(0.125 + static_cast<double>(i) * 0.25);
}
}
return values;
}

} // namespace

int main(int argc, char** argv) {
if (argc != 4 || std::string(argv[1]) != "--encoding") {
PrintUsage();
return 2;
}

const std::string encoding_name = argv[2];
const std::filesystem::path output_dir(argv[3]);

if (!std::filesystem::exists(output_dir) || !std::filesystem::is_directory(output_dir)) {
std::cerr << "Output directory does not exist or is not a directory: "
<< output_dir.string() << "\n";
return 2;
}

EncodingSelection selection = ParseEncoding(encoding_name);
if (selection.encoding == parquet::Encoding::UNKNOWN) {
std::cerr << "Unsupported encoding: " << encoding_name << "\n";
PrintUsage();
return 2;
}

try {
std::string normalized = NormalizeEncodingName(encoding_name);
std::filesystem::path out_path =
output_dir / ("single_f64_" + normalized + ".parquet");

std::shared_ptr<arrow::io::FileOutputStream> out_file;
PARQUET_ASSIGN_OR_THROW(out_file, arrow::io::FileOutputStream::Open(out_path.string()));

auto schema = MakeSchema();

parquet::WriterProperties::Builder builder;
builder.compression(parquet::Compression::UNCOMPRESSED);

if (selection.requires_dictionary) {
builder.enable_dictionary();
builder.encoding(parquet::Encoding::PLAIN);
} else {
builder.disable_dictionary();
builder.encoding(selection.encoding);
}

std::shared_ptr<parquet::WriterProperties> props = builder.build();

std::shared_ptr<parquet::ParquetFileWriter> file_writer =
parquet::ParquetFileWriter::Open(out_file, schema, props);

parquet::RowGroupWriter* row_group_writer = file_writer->AppendRowGroup();
auto* double_writer =
static_cast<parquet::DoubleWriter*>(row_group_writer->NextColumn());

std::vector<double> values = MakeValues(selection.requires_dictionary);
double_writer->WriteBatch(static_cast<int64_t>(values.size()), nullptr, nullptr,
values.data());

file_writer->Close();
ARROW_DCHECK(out_file->Close().ok());

std::cout << "Wrote " << out_path.string() << "\n";
std::cout << "Requested encoding: " << normalized << "\n";
} catch (const std::exception& e) {
std::cerr << "Parquet write error: " << e.what() << "\n";
return 1;
}

return 0;
}
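
A hedged usage note, not part of the diff: after building the parquet-write-parquet target added above, a file can be produced with, for example, parquet-write-parquet --encoding ALP /tmp/out, and the requested encoding can be verified by reading the column chunk metadata back. PrintEncodings below is an invented helper; the calls it makes are the standard parquet::ParquetFileReader metadata APIs.

#include <parquet/api/reader.h>

#include <iostream>
#include <memory>
#include <string>

// Invented verification helper: prints every encoding recorded for the
// file's single column chunk, e.g. "ALP" for --encoding ALP output.
void PrintEncodings(const std::string& path) {
  std::unique_ptr<parquet::ParquetFileReader> reader =
      parquet::ParquetFileReader::OpenFile(path);
  auto chunk = reader->metadata()->RowGroup(0)->ColumnChunk(0);
  for (parquet::Encoding::type encoding : chunk->encodings()) {
    std::cout << parquet::EncodingToString(encoding) << "\n";
  }
}
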
3 changes: 3 additions & 0 deletions cpp/examples/parquet/meson.build
@@ -24,6 +24,9 @@ example_execs = {
'sources': files('low_level_api/reader_writer2.cc'),
'include_dir': include_directories('low_level_api'),
},
'parquet-write-parquet': {
'sources': files('low_level_api/write_parquet.cc'),
},
'parquet-arrow-example': {
'sources': files('parquet_arrow/reader_writer.cc'),
},
6 changes: 6 additions & 0 deletions cpp/src/arrow/CMakeLists.txt
@@ -565,6 +565,12 @@ if(ARROW_WITH_ZSTD)
list(APPEND ARROW_UTIL_SRCS util/compression_zstd.cc)
endif()

# ALP (for Parquet encoder/decoder)
list(APPEND ARROW_UTIL_SRCS
util/alp/alp.cc
util/alp/alp_sampler.cc
util/alp/alp_wrapper.cc)

arrow_add_object_library(ARROW_UTIL ${ARROW_UTIL_SRCS})

# Disable DLL exports in vendored uriparser library
7 changes: 7 additions & 0 deletions cpp/src/arrow/util/CMakeLists.txt
@@ -103,6 +103,13 @@ add_arrow_test(bit-utility-test
rle_encoding_test.cc
test_common.cc)

add_arrow_test(alp-test
SOURCES
alp/alp_test.cc
alp/alp.cc
alp/alp_sampler.cc
alp/alp_wrapper.cc)

add_arrow_test(crc32-test
SOURCES
crc32_test.cc