54 commits
0d442fd
Add alp code
sfc-gh-pgaur Dec 4, 2025
06d1e19
Integrate ALP with arrow
sfc-gh-pgaur Dec 4, 2025
a98c594
Add alp benchmark
sfc-gh-pgaur Dec 4, 2025
c297f97
Add datasets for alp benchmarking
sfc-gh-pgaur Dec 4, 2025
ab928e8
Update cmake file
sfc-gh-pgaur Dec 4, 2025
6a95a59
Move hpp files to h
sfc-gh-pgaur Dec 6, 2025
865e46a
Update flow diagram and layout diagram to use ASCII and not unicode cha…
sfc-gh-pgaur Dec 7, 2025
cb6d0b6
Rename cpp files to cc
sfc-gh-pgaur Dec 7, 2025
496e23b
Update documentation to align with arrow's doxygen style
sfc-gh-pgaur Dec 7, 2025
8803b52
Adapt methods and variable names to arrow style
sfc-gh-pgaur Dec 7, 2025
31e94ec
Update the tests to adhere to arrow code style
sfc-gh-pgaur Dec 7, 2025
46c0ecc
Update callers
sfc-gh-pgaur Dec 7, 2025
a70b08f
Fuse FOR and decode loop
sfc-gh-pgaur Dec 7, 2025
ccbb1dd
Reduce memory allocation in the decompress call
sfc-gh-pgaur Dec 7, 2025
6a01df2
Attempt at making decoding faster with SIMD
sfc-gh-pgaur Dec 8, 2025
4ced783
Revert "Attempt at making decoding faster with SIMD"
sfc-gh-pgaur Dec 8, 2025
4fac73c
Move cpp files to cc
sfc-gh-pgaur Dec 8, 2025
1cb0852
Move data file to parquet-testing submodule
sfc-gh-pgaur Dec 8, 2025
8d307a6
Update path to the data file
sfc-gh-pgaur Dec 9, 2025
0908342
Adapt file names to arrow convention
sfc-gh-pgaur Dec 15, 2025
e56c877
File rename
sfc-gh-pgaur Dec 15, 2025
cfa00ba
Obtain compressed size and number of elements from page header
sfc-gh-pgaur Dec 15, 2025
a1d11ee
Fix namespace depth
sfc-gh-pgaur Dec 16, 2025
719468b
Better pack the compression block header
sfc-gh-pgaur Dec 16, 2025
69b4e07
Rename class
sfc-gh-pgaur Dec 16, 2025
193a808
Rearrange field for vector metadata for better packing
sfc-gh-pgaur Dec 16, 2025
11aaab3
Add spec files
sfc-gh-pgaur Dec 31, 2025
c7e7569
Add more tests
sfc-gh-pgaur Jan 2, 2026
acbc8ba
Move number of elements metadata info to uber header
sfc-gh-pgaur Jan 12, 2026
6b3e2ed
Keep version as 1
sfc-gh-pgaur Jan 12, 2026
506088f
Tests pass
sfc-gh-pgaur Jan 12, 2026
f5f5011
Get rid of numElements and derive it for each vector
sfc-gh-pgaur Jan 12, 2026
e0e367c
Update the md files
sfc-gh-pgaur Jan 12, 2026
81bcc81
Fix ubsan error
sfc-gh-pgaur Jan 13, 2026
71b244a
Use safe copy
sfc-gh-pgaur Jan 13, 2026
18e3c59
Add more unit tests
sfc-gh-pgaur Jan 13, 2026
687e1af
Templatize AlpEncodedVectorInfo
sfc-gh-pgaur Jan 14, 2026
3a538cc
Fix
sfc-gh-pgaur Jan 14, 2026
8b942af
[SpecReview] Num values in page is at max int32
sfc-gh-pgaur Jan 14, 2026
82a3c3b
[SpecReview] Rename layout to IntegerEncoding
sfc-gh-pgaur Jan 14, 2026
5b4d16d
[SpecReview] Rename enum
sfc-gh-pgaur Jan 14, 2026
ac2d679
[SpecReview] Make AlpHeader exact 8 bytes
sfc-gh-pgaur Jan 14, 2026
f66fe95
[SpecReview] Move all vector data towards the start
sfc-gh-pgaur Jan 15, 2026
c67c745
[SpecReview][Metadata] Introduce wrapper to read metadata
sfc-gh-pgaur Jan 15, 2026
37ac31d
[SpecReview] Separate out ALP and FOR metadata
sfc-gh-pgaur Jan 15, 2026
58df7a3
[SpecReview] Remove unnecessary field
sfc-gh-pgaur Jan 16, 2026
50f380c
[FutureFlexibility] Add the FOR specific code behind switch
sfc-gh-pgaur Jan 16, 2026
283de64
Add unit test
sfc-gh-pgaur Jan 17, 2026
777c9a8
ALP: Implement offset-based layout for O(1) random access
sfc-gh-pgaur Jan 26, 2026
63cbec9
ALP: Update documentation for offset-based layout
sfc-gh-pgaur Jan 26, 2026
b813a6d
ALP: Add CreateSamplingPreset and EncodeWithPreset APIs
sfc-gh-pgaur Jan 27, 2026
bdb28ec
ALP: Update all comments and docs for offset-based layout
sfc-gh-pgaur Jan 27, 2026
850f16f
[WIP] test driver for creating ALP encoded files
alamb Feb 5, 2026
7cb4f48
example
alamb Feb 5, 2026
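
For context on what the ALP commits above build up to: ALP (Adaptive Lossless floating-Point) encodes doubles as decimal-scaled integers that round-trip exactly, stores values that do not round-trip as exceptions, and compresses the integers with frame of reference (FOR) plus bit-packing; the "Fuse FOR and decode loop" commit merges the FOR add-back into the rescaling loop. The sketch below is a minimal illustration of that round-trip, not the code in this PR; AlpEncodeOne and AlpDecodeFused are invented names.

// Minimal illustration of the ALP idea; not this PR's implementation.
#include <cmath>
#include <cstdint>
#include <optional>

// Try to encode one double as a decimal-scaled integer with exponent e and
// factor f. Returns nullopt when the value does not round-trip exactly and
// would have to be stored as an exception instead.
std::optional<int64_t> AlpEncodeOne(double v, int e, int f) {
  const int64_t digits = std::llround(v * std::pow(10.0, e - f));
  const double decoded = static_cast<double>(digits) * std::pow(10.0, f - e);
  if (decoded == v) return digits;
  return std::nullopt;
}

// Decode with FOR fused in: add the per-vector frame of reference back to
// each unpacked delta and rescale in the same loop.
void AlpDecodeFused(const uint32_t* deltas, int64_t frame_of_reference,
                    double scale /* = 10^(f - e) */, int64_t n, double* out) {
  for (int64_t i = 0; i < n; ++i) {
    out[i] = static_cast<double>(frame_of_reference +
                                 static_cast<int64_t>(deltas[i])) *
             scale;
  }
}
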
3 changes: 3 additions & 0 deletions cpp/examples/parquet/CMakeLists.txt
@@ -17,6 +17,7 @@

add_executable(parquet-low-level-example low_level_api/reader_writer.cc)
add_executable(parquet-low-level-example2 low_level_api/reader_writer2.cc)
add_executable(parquet-write-parquet low_level_api/write_parquet.cc)
add_executable(parquet-arrow-example parquet_arrow/reader_writer.cc)
add_executable(parquet-stream-api-example parquet_stream_api/stream_reader_writer.cc)
target_include_directories(parquet-low-level-example PRIVATE low_level_api/)
@@ -58,6 +59,7 @@ endif()
target_link_libraries(parquet-arrow-example ${PARQUET_EXAMPLE_LINK_LIBS})
target_link_libraries(parquet-low-level-example ${PARQUET_EXAMPLE_LINK_LIBS})
target_link_libraries(parquet-low-level-example2 ${PARQUET_EXAMPLE_LINK_LIBS})
target_link_libraries(parquet-write-parquet ${PARQUET_EXAMPLE_LINK_LIBS})
target_link_libraries(parquet-stream-api-example ${PARQUET_EXAMPLE_LINK_LIBS})

if(PARQUET_REQUIRE_ENCRYPTION)
@@ -69,6 +71,7 @@ endif()
add_dependencies(parquet
parquet-low-level-example
parquet-low-level-example2
parquet-write-parquet
parquet-arrow-example
parquet-stream-api-example)

172 changes: 172 additions & 0 deletions cpp/examples/parquet/low_level_api/write_parquet.cc
@@ -0,0 +1,172 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <arrow/io/file.h>
#include <arrow/util/logging.h>
#include <parquet/api/writer.h>
#include <parquet/types.h>

#include <cctype>
#include <filesystem>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

namespace {

constexpr int64_t kNumRows = 1024;
constexpr char kColumnName[] = "value_f64";

struct EncodingSelection {
parquet::Encoding::type encoding = parquet::Encoding::UNKNOWN;
bool requires_dictionary = false;
};

std::string NormalizeEncodingName(const std::string& raw) {
std::string normalized;
normalized.reserve(raw.size());
for (char ch : raw) {
if (ch == '-' || ch == ' ') {
normalized.push_back('_');
} else {
normalized.push_back(static_cast<char>(std::toupper(static_cast<unsigned char>(ch))));
}
}
return normalized;
}

EncodingSelection ParseEncoding(const std::string& raw_name) {
const std::string name = NormalizeEncodingName(raw_name);
if (name == "PLAIN") {
return {parquet::Encoding::PLAIN, false};
}
if (name == "BYTE_STREAM_SPLIT" || name == "BYTESTREAMSPLIT") {
return {parquet::Encoding::BYTE_STREAM_SPLIT, false};
}
if (name == "ALP") {
return {parquet::Encoding::ALP, false};
}
if (name == "RLE_DICTIONARY") {
return {parquet::Encoding::RLE_DICTIONARY, true};
}
if (name == "PLAIN_DICTIONARY") {
return {parquet::Encoding::PLAIN_DICTIONARY, true};
}
return {};
}

void PrintUsage() {
std::cerr << "Usage: write_parquet --encoding <encoding_name> <output_directory>\n"
<< "Supported encodings for DOUBLE: PLAIN, BYTE_STREAM_SPLIT, ALP,\n"
<< " RLE_DICTIONARY, PLAIN_DICTIONARY\n";
}

std::shared_ptr<parquet::schema::GroupNode> MakeSchema() {
parquet::schema::NodeVector fields;
fields.push_back(parquet::schema::PrimitiveNode::Make(
kColumnName, parquet::Repetition::REQUIRED, parquet::Type::DOUBLE,
parquet::ConvertedType::NONE));
return std::static_pointer_cast<parquet::schema::GroupNode>(
parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields));
}

std::vector<double> MakeValues(bool dictionary_friendly) {
std::vector<double> values;
values.reserve(kNumRows);
if (dictionary_friendly) {
for (int64_t i = 0; i < kNumRows; ++i) {
values.push_back(static_cast<double>(i % 8));
}
} else {
for (int64_t i = 0; i < kNumRows; ++i) {
values.push_back(0.125 + static_cast<double>(i) * 0.25);
}
}
return values;
}

} // namespace

int main(int argc, char** argv) {
if (argc != 4 || std::string(argv[1]) != "--encoding") {
PrintUsage();
return 2;
}

const std::string encoding_name = argv[2];
const std::filesystem::path output_dir(argv[3]);

if (!std::filesystem::exists(output_dir) || !std::filesystem::is_directory(output_dir)) {
std::cerr << "Output directory does not exist or is not a directory: "
<< output_dir.string() << "\n";
return 2;
}

EncodingSelection selection = ParseEncoding(encoding_name);
if (selection.encoding == parquet::Encoding::UNKNOWN) {
std::cerr << "Unsupported encoding: " << encoding_name << "\n";
PrintUsage();
return 2;
}

try {
std::string normalized = NormalizeEncodingName(encoding_name);
std::filesystem::path out_path =
output_dir / ("single_f64_" + normalized + ".parquet");

std::shared_ptr<arrow::io::FileOutputStream> out_file;
PARQUET_ASSIGN_OR_THROW(out_file, arrow::io::FileOutputStream::Open(out_path.string()));

auto schema = MakeSchema();

parquet::WriterProperties::Builder builder;
builder.compression(parquet::Compression::UNCOMPRESSED);

if (selection.requires_dictionary) {
builder.enable_dictionary();
builder.encoding(parquet::Encoding::PLAIN);
} else {
builder.disable_dictionary();
builder.encoding(selection.encoding);
}

std::shared_ptr<parquet::WriterProperties> props = builder.build();

std::shared_ptr<parquet::ParquetFileWriter> file_writer =
parquet::ParquetFileWriter::Open(out_file, schema, props);

parquet::RowGroupWriter* row_group_writer = file_writer->AppendRowGroup();
auto* double_writer =
static_cast<parquet::DoubleWriter*>(row_group_writer->NextColumn());

std::vector<double> values = MakeValues(selection.requires_dictionary);
double_writer->WriteBatch(static_cast<int64_t>(values.size()), nullptr, nullptr,
values.data());

file_writer->Close();
ARROW_DCHECK(out_file->Close().ok());

std::cout << "Wrote " << out_path.string() << "\n";
std::cout << "Requested encoding: " << normalized << "\n";
} catch (const std::exception& e) {
std::cerr << "Parquet write error: " << e.what() << "\n";
return 1;
}

return 0;
}
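
A hedged usage note, not part of the diff: after building the parquet-write-parquet target added above, a file can be produced with, for example, parquet-write-parquet --encoding ALP /tmp/out, and the requested encoding can be verified by reading the column chunk metadata back. PrintEncodings below is an invented helper; the calls it makes are the standard parquet::ParquetFileReader metadata APIs.

#include <parquet/api/reader.h>

#include <iostream>
#include <memory>
#include <string>

// Invented verification helper: prints every encoding recorded for the
// file's single column chunk, e.g. "ALP" for --encoding ALP output.
void PrintEncodings(const std::string& path) {
  std::unique_ptr<parquet::ParquetFileReader> reader =
      parquet::ParquetFileReader::OpenFile(path);
  auto chunk = reader->metadata()->RowGroup(0)->ColumnChunk(0);
  for (parquet::Encoding::type encoding : chunk->encodings()) {
    std::cout << parquet::EncodingToString(encoding) << "\n";
  }
}
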
3 changes: 3 additions & 0 deletions cpp/examples/parquet/meson.build
@@ -24,6 +24,9 @@ example_execs = {
'sources': files('low_level_api/reader_writer2.cc'),
'include_dir': include_directories('low_level_api'),
},
'parquet-write-parquet': {
'sources': files('low_level_api/write_parquet.cc'),
},
'parquet-arrow-example': {
'sources': files('parquet_arrow/reader_writer.cc'),
},
6 changes: 6 additions & 0 deletions cpp/src/arrow/CMakeLists.txt
@@ -565,6 +565,12 @@ if(ARROW_WITH_ZSTD)
list(APPEND ARROW_UTIL_SRCS util/compression_zstd.cc)
endif()

# ALP (for Parquet encoder/decoder)
list(APPEND ARROW_UTIL_SRCS
util/alp/alp.cc
util/alp/alp_sampler.cc
util/alp/alp_wrapper.cc)

arrow_add_object_library(ARROW_UTIL ${ARROW_UTIL_SRCS})

# Disable DLL exports in vendored uriparser library
7 changes: 7 additions & 0 deletions cpp/src/arrow/util/CMakeLists.txt
@@ -103,6 +103,13 @@ add_arrow_test(bit-utility-test
rle_encoding_test.cc
test_common.cc)

add_arrow_test(alp-test
SOURCES
alp/alp_test.cc
alp/alp.cc
alp/alp_sampler.cc
alp/alp_wrapper.cc)

add_arrow_test(crc32-test
SOURCES
crc32_test.cc