From 7e98e82bc6f9b5f4b45a6e187a9ae2023acea3b7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 25 Jun 2024 11:05:02 -0400 Subject: [PATCH 1/5] Implement parquet-integration-testing Integration Tests --- Cargo.toml | 2 + parquet-integration-testing/Cargo.toml | 35 ++++++ parquet-integration-testing/README.md | 32 ++++++ .../src/bin/parquet-integration-testing.rs | 108 ++++++++++++++++++ 4 files changed, 177 insertions(+) create mode 100644 parquet-integration-testing/Cargo.toml create mode 100644 parquet-integration-testing/README.md create mode 100644 parquet-integration-testing/src/bin/parquet-integration-testing.rs diff --git a/Cargo.toml b/Cargo.toml index e0144faa7a92..a6beb899dbc2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,6 +57,8 @@ exclude = [ # significantly changing how it is compiled within the workspace, causing the whole workspace to be compiled from # scratch this way, this is a stand-alone package that compiles independently of the others. "arrow-pyarrow-integration-testing", + # parquet integration testing likewise contains different flags + "parquet-integration-testing", # object_store is excluded because it follows a separate release cycle from the other arrow crates "object_store" ] diff --git a/parquet-integration-testing/Cargo.toml b/parquet-integration-testing/Cargo.toml new file mode 100644 index 000000000000..71b49322b79c --- /dev/null +++ b/parquet-integration-testing/Cargo.toml @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "python-integration-testing" +description = "Binaries used for testing parquet-rs compatibility (NOT published to crates.io)" +version = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +edition = { workspace = true } +publish = false +rust-version = { workspace = true } + + +[dependencies] +arrow = { path = "../arrow" } +parquet = { path = "../parquet" } +serde_json = { version = "1.0", default-features = false, features = ["std"] } +pretty_assertions = "1.4.0" \ No newline at end of file diff --git a/parquet-integration-testing/README.md b/parquet-integration-testing/README.md new file mode 100644 index 000000000000..eca0867c8cc1 --- /dev/null +++ b/parquet-integration-testing/README.md @@ -0,0 +1,32 @@ + + +# Apache Parquet Rust Integration Testing + +The binary in this repo: + +1. Reads files from the parquet-testing repo +2. Creates a JSON file with appropriately formatted contents +3. Compares these JSON files with "known good" golden master files + +## Running + +```shell +cargo run +``` diff --git a/parquet-integration-testing/src/bin/parquet-integration-testing.rs b/parquet-integration-testing/src/bin/parquet-integration-testing.rs new file mode 100644 index 000000000000..14570a35042e --- /dev/null +++ b/parquet-integration-testing/src/bin/parquet-integration-testing.rs @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +// Draft program for testing the parquet-rs library + +use std::fs::{canonicalize, File}; +use std::path::Path; +use arrow::util::display::array_value_to_string; +use arrow::util::pretty::pretty_format_columns; +use parquet::arrow::arrow_reader::ArrowReaderBuilder; +use serde::Serialize; +use serde_json::json; + +fn main() { + println!("PWD: {:?}", std::env::var("PWD")); + let parquet_data_path = "/Users/andrewlamb/Software/arrow-rs/parquet-testing/data"; + let expected_data_path = "/Users/andrewlamb/Software/arrow-rs/parquet-integration-testing/data"; + + let filenames = vec!["alltypes_plain.parquet", "alltypes_plain_dictionary.parquet"]; + + for filename in filenames { + let parquet_file_path = Path::from(parquet_data_path).join(filename).canonicalize().unwrap(); + let expected_file_path = Path::from(expected_data_path).join(format!("{}filename}.json")); + + println!("Begin test: {filename}"); + println!(" Reading parquet file: {parquet_file_path}"); + println!(" Expected JSON file: {expected_file_path}"); + let parquet_json = read_parquet_data(&parquet_file_path); + let expected_json = std::fs::read_to_string(expected_file_path); + } + + +} + +// prototype demonstration of checking type support for parquet-rs encoding +// check read support by reading a file with the 
specified encoding correctly + +// | PLAIN | | | | | +// | PLAIN_DICTIONARY | | | | | +// | RLE_DICTIONARY | | | | | +// | RLE | | | | | +// | BIT_PACKED (deprecated) | | | | | +// | DELTA_BINARY_PACKED | | | | | + +// The idea is to produce a file like this: +// ```text +// { +// filename: "filename.parquet", +// rows: [ +// { +// "column1": "value1", +// "column2": 123, +// "column3": null +// }, +// .. +// { +// "column1": "value2", +// "column2": 456, +// "column3": "value3" +// } +// ] +// } +// ``` + +/// The function reads a parquet file and writes a JSON representation of the data within +fn read_parquet_data(parquet_data_path: &str) -> String { + let file = File::open(&parquet_data_path).unwrap(); + let mut reader = ArrowReaderBuilder::try_new(file).unwrap().build().unwrap(); + + + let mut rows = vec![]; + while let Some(batch) = reader.next() { + let batch = batch.unwrap(); + let columns = batch.columns(); + let schema = batch.schema(); + for i in 0..batch.num_rows() { + let mut row = vec![]; + for (field, column) in schema.fields.iter().zip(columns.iter()) { + let name = field.name(); + let value = array_value_to_string(column.as_ref(), i).unwrap(); + row.push(json!({name: value})); + } + rows.push(json!(row)); + } + } + + let value = json!({ + "filename": parquet_data_path, + "rows": rows + }); + + serde_json::to_string_pretty(&value).unwrap(); +} \ No newline at end of file From 74597f1643080ed7ba737dba51b76fe2c9f2e353 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 25 Jun 2024 11:53:31 -0400 Subject: [PATCH 2/5] initial data path --- Cargo.toml | 2 +- parquet-integration-testing/.gitignore | 1 + parquet-integration-testing/Cargo.toml | 13 ++-- .../data/alltypes_plain.parquet.json | 1 + .../src/bin/parquet-integration-testing.rs | 68 ++++++++++++------- 5 files changed, 50 insertions(+), 35 deletions(-) create mode 100644 parquet-integration-testing/.gitignore create mode 100644 parquet-integration-testing/data/alltypes_plain.parquet.json diff 
--git a/Cargo.toml b/Cargo.toml index a6beb899dbc2..7798ea0b239c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,7 +36,7 @@ members = [ "arrow-row", "arrow-schema", "arrow-select", - "arrow-string", + "arrow-string", "foo", "parquet", "parquet_derive", "parquet_derive_test", diff --git a/parquet-integration-testing/.gitignore b/parquet-integration-testing/.gitignore new file mode 100644 index 000000000000..1fcb1529f8e5 --- /dev/null +++ b/parquet-integration-testing/.gitignore @@ -0,0 +1 @@ +out diff --git a/parquet-integration-testing/Cargo.toml b/parquet-integration-testing/Cargo.toml index 71b49322b79c..c85ba8be39bc 100644 --- a/parquet-integration-testing/Cargo.toml +++ b/parquet-integration-testing/Cargo.toml @@ -18,18 +18,13 @@ [package] name = "python-integration-testing" description = "Binaries used for testing parquet-rs compatibility (NOT published to crates.io)" -version = { workspace = true } -homepage = { workspace = true } -repository = { workspace = true } -authors = { workspace = true } -license = { workspace = true } -edition = { workspace = true } publish = false -rust-version = { workspace = true } +edition = "2021" [dependencies] -arrow = { path = "../arrow" } -parquet = { path = "../parquet" } +arrow = { path = "../arrow", features = ["prettyprint"] } +parquet = { path = "../parquet", features = ["arrow"]} +serde = "1.0.203" serde_json = { version = "1.0", default-features = false, features = ["std"] } pretty_assertions = "1.4.0" \ No newline at end of file diff --git a/parquet-integration-testing/data/alltypes_plain.parquet.json b/parquet-integration-testing/data/alltypes_plain.parquet.json new file mode 100644 index 000000000000..1288ce1f39b6 --- /dev/null +++ b/parquet-integration-testing/data/alltypes_plain.parquet.json @@ -0,0 +1 @@ 
+{"filename":"/Users/andrewlamb/Software/arrow-rs/parquet-testing/data/alltypes_plain.parquet","rows":[[{"id":"4"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30332f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-03-01T00:00:00"}],[{"id":"5"},{"bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"10.1"},{"date_string_col":"30332f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-03-01T00:01:00"}],[{"id":"6"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30342f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-04-01T00:00:00"}],[{"id":"7"},{"bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"10.1"},{"date_string_col":"30342f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-04-01T00:01:00"}],[{"id":"2"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30322f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-02-01T00:00:00"}],[{"id":"3"},{"bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"10.1"},{"date_string_col":"30322f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-02-01T00:01:00"}],[{"id":"0"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30312f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-01-01T00:00:00"}],[{"id":"1"},{"bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"
10.1"},{"date_string_col":"30312f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-01-01T00:01:00"}]]} \ No newline at end of file diff --git a/parquet-integration-testing/src/bin/parquet-integration-testing.rs b/parquet-integration-testing/src/bin/parquet-integration-testing.rs index 14570a35042e..7eeecba0d04a 100644 --- a/parquet-integration-testing/src/bin/parquet-integration-testing.rs +++ b/parquet-integration-testing/src/bin/parquet-integration-testing.rs @@ -15,36 +15,57 @@ // specific language governing permissions and limitations // under the License. - -// Draft program for testing the parquet-rs library - -use std::fs::{canonicalize, File}; -use std::path::Path; use arrow::util::display::array_value_to_string; use arrow::util::pretty::pretty_format_columns; use parquet::arrow::arrow_reader::ArrowReaderBuilder; +use pretty_assertions::assert_eq; use serde::Serialize; -use serde_json::json; +use serde_json::{json, Value}; +/// Test driver for parquet-integration testing +use std::fs::{canonicalize, File}; +use std::path::{Path, PathBuf}; fn main() { - println!("PWD: {:?}", std::env::var("PWD")); - let parquet_data_path = "/Users/andrewlamb/Software/arrow-rs/parquet-testing/data"; - let expected_data_path = "/Users/andrewlamb/Software/arrow-rs/parquet-integration-testing/data"; + // paths are relative to arrow-rs/parquet-integration-testing + let parquet_data_path = + PathBuf::from("../parquet-testing/data").canonicalize().unwrap(); + let expected_data_path = + PathBuf::from("data").canonicalize().unwrap(); + let output_data_path = + PathBuf::from("out").canonicalize().unwrap(); - let filenames = vec!["alltypes_plain.parquet", "alltypes_plain_dictionary.parquet"]; + std::fs::create_dir_all(&output_data_path).unwrap(); + + let filenames = vec![ + "alltypes_plain.parquet", + //"alltypes_plain_dictionary.parquet", + ]; for filename in filenames { - let parquet_file_path = Path::from(parquet_data_path).join(filename).canonicalize().unwrap(); - let 
expected_file_path = Path::from(expected_data_path).join(format!("{}filename}.json")); + let parquet_file_path = parquet_data_path.join(filename); + + let expected_file_path = expected_data_path + .join(format!("{filename}.json")); + + // For development, also write the actual parsed value to a file + let output_file_path = output_data_path + .join(format!("{filename}.json")); + + println!("Begin test: {filename}"); - println!(" Reading parquet file: {parquet_file_path}"); - println!(" Expected JSON file: {expected_file_path}"); - let parquet_json = read_parquet_data(&parquet_file_path); - let expected_json = std::fs::read_to_string(expected_file_path); - } + println!(" Input parquet file: {parquet_file_path:?}"); + println!(" Expected JSON file: {expected_file_path:?}"); + println!(" Output JSON file: {output_file_path:?}"); + let parquet_json = read_parquet_data(&parquet_file_path); + let output_file = File::create(&output_file_path).unwrap(); + serde_json::to_writer(output_file, &parquet_json).unwrap(); + let expected_file = File::open(expected_file_path).unwrap(); + let expected_json: Value = serde_json::from_reader(expected_file).unwrap(); + assert_eq!(parquet_json, expected_json) + } } // prototype demonstration of checking type support for parquet-rs encoding @@ -78,11 +99,10 @@ fn main() { // ``` /// The function reads a parquet file and writes a JSON representation of the data within -fn read_parquet_data(parquet_data_path: &str) -> String { - let file = File::open(&parquet_data_path).unwrap(); +fn read_parquet_data(parquet_data_path: &Path) -> Value { + let file = File::open(parquet_data_path).unwrap(); let mut reader = ArrowReaderBuilder::try_new(file).unwrap().build().unwrap(); - let mut rows = vec![]; while let Some(batch) = reader.next() { let batch = batch.unwrap(); @@ -99,10 +119,8 @@ fn read_parquet_data(parquet_data_path: &str) -> String { } } - let value = json!({ + json!({ "filename": parquet_data_path, "rows": rows - }); - - 
serde_json::to_string_pretty(&value).unwrap(); -} \ No newline at end of file + }) +} From cd46026ae19dbcf2df2b079fb6511f50f93b06cb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 25 Jun 2024 12:09:16 -0400 Subject: [PATCH 3/5] refactor --- .../src/bin/parquet-integration-testing.rs | 107 ++++++++++-------- 1 file changed, 61 insertions(+), 46 deletions(-) diff --git a/parquet-integration-testing/src/bin/parquet-integration-testing.rs b/parquet-integration-testing/src/bin/parquet-integration-testing.rs index 7eeecba0d04a..443a871625b7 100644 --- a/parquet-integration-testing/src/bin/parquet-integration-testing.rs +++ b/parquet-integration-testing/src/bin/parquet-integration-testing.rs @@ -26,32 +26,77 @@ use std::fs::{canonicalize, File}; use std::path::{Path, PathBuf}; fn main() { - // paths are relative to arrow-rs/parquet-integration-testing - let parquet_data_path = - PathBuf::from("../parquet-testing/data").canonicalize().unwrap(); - let expected_data_path = - PathBuf::from("data").canonicalize().unwrap(); - let output_data_path = - PathBuf::from("out").canonicalize().unwrap(); - - std::fs::create_dir_all(&output_data_path).unwrap(); + let integration_test = IntegrationTest::new(); let filenames = vec![ "alltypes_plain.parquet", - //"alltypes_plain_dictionary.parquet", ]; for filename in filenames { - let parquet_file_path = parquet_data_path.join(filename); + integration_test.read_test(filename); + } +} + +// prototype demonstration of checking type support for parquet-rs encoding +// check read support by reading a file with the specified encoding correctly +#[derive(Debug)] +struct IntegrationTest { + parquet_data_path: PathBuf, + expected_data_path: PathBuf, + output_data_path: PathBuf, +} - let expected_file_path = expected_data_path - .join(format!("{filename}.json")); +impl IntegrationTest { + pub fn new() -> Self { + // TODO error handling - // For development, also write the actual parsed value to a file - let output_file_path = output_data_path 
- .join(format!("{filename}.json")); + // paths are relative to arrow-rs/parquet-integration-testing + let parquet_data_path = PathBuf::from("../parquet-testing/data") + .canonicalize() + .unwrap(); + let expected_data_path = PathBuf::from("data").canonicalize().unwrap(); + let output_data_path = PathBuf::from("out").canonicalize().unwrap(); + std::fs::create_dir_all(&output_data_path).unwrap(); + Self { + parquet_data_path, + expected_data_path, + output_data_path, + } + } + + /// Read a parquet file, create a JSON representation, and compare to the + /// known good value in data + /// + /// The output JSON looks like this: + /// + /// ```text + /// { + /// filename: "filename.parquet", + /// rows: [ + /// { + /// "column1": "value1", + /// "column2": 123, + /// "column3": null + /// }, + /// .. + /// { + /// "column1": "value2", + /// "column2": 456, + /// "column3": "value3" + /// } + /// ] + /// } + /// ``` + fn read_test(&self, filename: &str) { + let parquet_file_path = self.parquet_data_path.join(filename); + + let expected_file_path = self.expected_data_path.join(format!("{filename}.json")); + + // For ease of development, write the actual parsed value to a file (to + // permit easy updates, for example) + let output_file_path = self.output_data_path.join(format!("{filename}.json")); println!("Begin test: {filename}"); println!(" Input parquet file: {parquet_file_path:?}"); @@ -68,36 +113,6 @@ fn main() { } } -// prototype demonstration of checking type support for parquet-rs encoding -// check read support by reading a file with the specified encoding correctly - -// | PLAIN | | | | | -// | PLAIN_DICTIONARY | | | | | -// | RLE_DICTIONARY | | | | | -// | RLE | | | | | -// | BIT_PACKED (deprecated) | | | | | -// | DELTA_BINARY_PACKED | | | | | - -// The idea is to produce a file like this: -// ```text -// { -// filename: "filename.parquet", -// rows: [ -// { -// "column1": "value1", -// "column2": 123, -// "column3": null -// }, -// .. 
-// { -// "column1": "value2", -// "column2": 456, -// "column3": "value3" -// } -// ] -// } -// ``` - /// The function reads a parquet file and writes a JSON representation of the data within fn read_parquet_data(parquet_data_path: &Path) -> Value { let file = File::open(parquet_data_path).unwrap(); From a345055eeb74cfc272b718a3b89e4b209ab4e8b9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 25 Jun 2024 12:27:51 -0400 Subject: [PATCH 4/5] Add basic metadata test --- .../data/alltypes_plain.parquet.data.json | 285 ++++++++++++++++++ .../data/alltypes_plain.parquet.json | 1 - .../data/alltypes_plain.parquet.metadata.json | 8 + .../src/bin/parquet-integration-testing.rs | 92 +++++- 4 files changed, 374 insertions(+), 12 deletions(-) create mode 100644 parquet-integration-testing/data/alltypes_plain.parquet.data.json delete mode 100644 parquet-integration-testing/data/alltypes_plain.parquet.json create mode 100644 parquet-integration-testing/data/alltypes_plain.parquet.metadata.json diff --git a/parquet-integration-testing/data/alltypes_plain.parquet.data.json b/parquet-integration-testing/data/alltypes_plain.parquet.data.json new file mode 100644 index 000000000000..2c24a72290c8 --- /dev/null +++ b/parquet-integration-testing/data/alltypes_plain.parquet.data.json @@ -0,0 +1,285 @@ +{ + "filename": "alltypes_plain.parquet", + "rows": [ + [ + { + "id": "4" + }, + { + "bool_col": "true" + }, + { + "tinyint_col": "0" + }, + { + "smallint_col": "0" + }, + { + "int_col": "0" + }, + { + "bigint_col": "0" + }, + { + "float_col": "0.0" + }, + { + "double_col": "0.0" + }, + { + "date_string_col": "30332f30312f3039" + }, + { + "string_col": "30" + }, + { + "timestamp_col": "2009-03-01T00:00:00" + } + ], + [ + { + "id": "5" + }, + { + "bool_col": "false" + }, + { + "tinyint_col": "1" + }, + { + "smallint_col": "1" + }, + { + "int_col": "1" + }, + { + "bigint_col": "10" + }, + { + "float_col": "1.1" + }, + { + "double_col": "10.1" + }, + { + "date_string_col": 
"30332f30312f3039" + }, + { + "string_col": "31" + }, + { + "timestamp_col": "2009-03-01T00:01:00" + } + ], + [ + { + "id": "6" + }, + { + "bool_col": "true" + }, + { + "tinyint_col": "0" + }, + { + "smallint_col": "0" + }, + { + "int_col": "0" + }, + { + "bigint_col": "0" + }, + { + "float_col": "0.0" + }, + { + "double_col": "0.0" + }, + { + "date_string_col": "30342f30312f3039" + }, + { + "string_col": "30" + }, + { + "timestamp_col": "2009-04-01T00:00:00" + } + ], + [ + { + "id": "7" + }, + { + "bool_col": "false" + }, + { + "tinyint_col": "1" + }, + { + "smallint_col": "1" + }, + { + "int_col": "1" + }, + { + "bigint_col": "10" + }, + { + "float_col": "1.1" + }, + { + "double_col": "10.1" + }, + { + "date_string_col": "30342f30312f3039" + }, + { + "string_col": "31" + }, + { + "timestamp_col": "2009-04-01T00:01:00" + } + ], + [ + { + "id": "2" + }, + { + "bool_col": "true" + }, + { + "tinyint_col": "0" + }, + { + "smallint_col": "0" + }, + { + "int_col": "0" + }, + { + "bigint_col": "0" + }, + { + "float_col": "0.0" + }, + { + "double_col": "0.0" + }, + { + "date_string_col": "30322f30312f3039" + }, + { + "string_col": "30" + }, + { + "timestamp_col": "2009-02-01T00:00:00" + } + ], + [ + { + "id": "3" + }, + { + "bool_col": "false" + }, + { + "tinyint_col": "1" + }, + { + "smallint_col": "1" + }, + { + "int_col": "1" + }, + { + "bigint_col": "10" + }, + { + "float_col": "1.1" + }, + { + "double_col": "10.1" + }, + { + "date_string_col": "30322f30312f3039" + }, + { + "string_col": "31" + }, + { + "timestamp_col": "2009-02-01T00:01:00" + } + ], + [ + { + "id": "0" + }, + { + "bool_col": "true" + }, + { + "tinyint_col": "0" + }, + { + "smallint_col": "0" + }, + { + "int_col": "0" + }, + { + "bigint_col": "0" + }, + { + "float_col": "0.0" + }, + { + "double_col": "0.0" + }, + { + "date_string_col": "30312f30312f3039" + }, + { + "string_col": "30" + }, + { + "timestamp_col": "2009-01-01T00:00:00" + } + ], + [ + { + "id": "1" + }, + { + "bool_col": "false" + }, + { 
+ "tinyint_col": "1" + }, + { + "smallint_col": "1" + }, + { + "int_col": "1" + }, + { + "bigint_col": "10" + }, + { + "float_col": "1.1" + }, + { + "double_col": "10.1" + }, + { + "date_string_col": "30312f30312f3039" + }, + { + "string_col": "31" + }, + { + "timestamp_col": "2009-01-01T00:01:00" + } + ] + ] +} \ No newline at end of file diff --git a/parquet-integration-testing/data/alltypes_plain.parquet.json b/parquet-integration-testing/data/alltypes_plain.parquet.json deleted file mode 100644 index 1288ce1f39b6..000000000000 --- a/parquet-integration-testing/data/alltypes_plain.parquet.json +++ /dev/null @@ -1 +0,0 @@ -{"filename":"/Users/andrewlamb/Software/arrow-rs/parquet-testing/data/alltypes_plain.parquet","rows":[[{"id":"4"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30332f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-03-01T00:00:00"}],[{"id":"5"},{"bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"10.1"},{"date_string_col":"30332f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-03-01T00:01:00"}],[{"id":"6"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30342f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-04-01T00:00:00"}],[{"id":"7"},{"bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"10.1"},{"date_string_col":"30342f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-04-01T00:01:00"}],[{"id":"2"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30322f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-02-01T00:00:00"}],[{"id":"3"},{"
bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"10.1"},{"date_string_col":"30322f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-02-01T00:01:00"}],[{"id":"0"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30312f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-01-01T00:00:00"}],[{"id":"1"},{"bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"10.1"},{"date_string_col":"30312f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-01-01T00:01:00"}]]} \ No newline at end of file diff --git a/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json b/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json new file mode 100644 index 000000000000..9bb5c3a56265 --- /dev/null +++ b/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json @@ -0,0 +1,8 @@ +{ + "filename33": "alltypes_plain.parquet", + "row_goups": [ + { + "num_rows": 8 + } + ] +} \ No newline at end of file diff --git a/parquet-integration-testing/src/bin/parquet-integration-testing.rs b/parquet-integration-testing/src/bin/parquet-integration-testing.rs index 443a871625b7..d40f620fe603 100644 --- a/parquet-integration-testing/src/bin/parquet-integration-testing.rs +++ b/parquet-integration-testing/src/bin/parquet-integration-testing.rs @@ -33,7 +33,8 @@ fn main() { ]; for filename in filenames { - integration_test.read_test(filename); + integration_test.data_test(filename); + integration_test.metadata_test(filename) } } @@ -89,31 +90,75 @@ impl IntegrationTest { /// ] /// } /// ``` - fn read_test(&self, filename: &str) { + fn data_test(&self, filename: &str) { let parquet_file_path = self.parquet_data_path.join(filename); - - let expected_file_path = 
self.expected_data_path.join(format!("{filename}.json")); + let expected_file_path = self.expected_data_path.join(format!("{filename}.data.json")); // For ease of development, write the actual parsed value to a file (to // permit easy updates, for example) - let output_file_path = self.output_data_path.join(format!("{filename}.json")); + let output_file_path = self.output_data_path.join(format!("{filename}.data.json")); - println!("Begin test: {filename}"); + println!("Begin data test: {filename}"); println!(" Input parquet file: {parquet_file_path:?}"); println!(" Expected JSON file: {expected_file_path:?}"); println!(" Output JSON file: {output_file_path:?}"); let parquet_json = read_parquet_data(&parquet_file_path); let output_file = File::create(&output_file_path).unwrap(); - serde_json::to_writer(output_file, &parquet_json).unwrap(); + serde_json::to_writer_pretty(output_file, &parquet_json).unwrap(); + + // read expected file if present, default to {} if not + let expected_json = if let Ok(expected_file) = File::open(expected_file_path) { + serde_json::from_reader(expected_file).unwrap() + } else { + json!({}) + }; + assert_eq!(parquet_json, expected_json) + } + + /// Read a parquet file, create a JSON representation of its metadata, and compares to the + /// known good value in data + /// + /// The output JSON looks like this: + /// + /// ```text + /// { + /// filename: "filename.parquet", + /// .. + /// .. 
+ /// } + /// ``` + fn metadata_test(&self, filename: &str) { + let parquet_file_path = self.parquet_data_path.join(filename); + let expected_file_path = self.expected_data_path.join(format!("{filename}.metadata.json")); + + // For ease of development, write the actual parsed value to a file (to + // permit easy updates, for example) + let output_file_path = self.output_data_path.join(format!("{filename}.metadata.json")); + + println!("Begin metadata test: {filename}"); + println!(" Input parquet file: {parquet_file_path:?}"); + println!(" Expected JSON file: {expected_file_path:?}"); + println!(" Output JSON file: {output_file_path:?}"); - let expected_file = File::open(expected_file_path).unwrap(); - let expected_json: Value = serde_json::from_reader(expected_file).unwrap(); + let parquet_json = read_parquet_metadata(&parquet_file_path); + let output_file = File::create(&output_file_path).unwrap(); + serde_json::to_writer_pretty(output_file, &parquet_json).unwrap(); + + // read expected file if present, default to {} if not + let expected_json = if let Ok(expected_file) = File::open(expected_file_path) { + serde_json::from_reader(expected_file).unwrap() + } else { + json!({}) + }; assert_eq!(parquet_json, expected_json) } } -/// The function reads a parquet file and writes a JSON representation of the data within + + +/// The function reads a parquet file and returns a JSON representation of the +/// data within fn read_parquet_data(parquet_data_path: &Path) -> Value { let file = File::open(parquet_data_path).unwrap(); let mut reader = ArrowReaderBuilder::try_new(file).unwrap().build().unwrap(); @@ -134,8 +179,33 @@ fn read_parquet_data(parquet_data_path: &Path) -> Value { } } + let filename = parquet_data_path.file_name().unwrap().to_string_lossy(); + json!({ - "filename": parquet_data_path, + "filename": filename, "rows": rows }) } + + +/// The function reads a parquet file and returns a JSON representation of the +/// metadata (thrift encoded) within +fn 
read_parquet_metadata(parquet_data_path: &Path) -> Value { + let file = File::open(parquet_data_path).unwrap(); + let metadata = ArrowReaderBuilder::try_new(file).unwrap().metadata().clone(); + + // todo print out schema + let row_groups : Vec<_> = metadata.row_groups().iter() + .map(|rg| { + json!({ + "num_rows": rg.num_rows(), + }) + }).collect();; + + let filename = parquet_data_path.file_name().unwrap().to_string_lossy(); + + json!({ + "filename33": filename, + "row_goups": row_groups, + }) +} \ No newline at end of file From 168cec8f3e77af29903542594de6e2408312bdc0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 25 Jun 2024 12:35:41 -0400 Subject: [PATCH 5/5] updates --- .../data/alltypes_plain.parquet.metadata.json | 51 ++++++++++++++++++- .../src/bin/parquet-integration-testing.rs | 23 +++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json b/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json index 9bb5c3a56265..2c66d3cf2ac3 100644 --- a/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json +++ b/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json @@ -2,7 +2,56 @@ "filename33": "alltypes_plain.parquet", "row_goups": [ { - "num_rows": 8 + "columns": [ + { + "file_offset": 77, + "file_path": null + }, + { + "file_offset": 133, + "file_path": null + }, + { + "file_offset": 215, + "file_path": null + }, + { + "file_offset": 303, + "file_path": null + }, + { + "file_offset": 392, + "file_path": null + }, + { + "file_offset": 484, + "file_path": null + }, + { + "file_offset": 571, + "file_path": null + }, + { + "file_offset": 665, + "file_path": null + }, + { + "file_offset": 793, + "file_path": null + }, + { + "file_offset": 889, + "file_path": null + }, + { + "file_offset": 1068, + "file_path": null + } + ], + "file_offset": null, + "num_rows": 8, + "ordinal": null, + "total_byte_size": 671 } ] } \ No newline at end of 
file diff --git a/parquet-integration-testing/src/bin/parquet-integration-testing.rs b/parquet-integration-testing/src/bin/parquet-integration-testing.rs index d40f620fe603..8317149b1563 100644 --- a/parquet-integration-testing/src/bin/parquet-integration-testing.rs +++ b/parquet-integration-testing/src/bin/parquet-integration-testing.rs @@ -24,6 +24,8 @@ use serde_json::{json, Value}; /// Test driver for parquet-integration testing use std::fs::{canonicalize, File}; use std::path::{Path, PathBuf}; +use parquet::file::metadata::ColumnChunkMetaData; +use parquet::format::ColumnMetaData; fn main() { let integration_test = IntegrationTest::new(); @@ -197,8 +199,15 @@ fn read_parquet_metadata(parquet_data_path: &Path) -> Value { // todo print out schema let row_groups : Vec<_> = metadata.row_groups().iter() .map(|rg| { + let columns: Vec<_> = rg.columns().iter() + .map(column_metadata_to_json) + .collect(); json!({ "num_rows": rg.num_rows(), + "total_byte_size": rg.total_byte_size(), + "file_offset": rg.file_offset(), + "ordinal": rg.ordinal(), + "columns": columns, }) }).collect();; @@ -208,4 +217,18 @@ fn read_parquet_metadata(parquet_data_path: &Path) -> Value { "filename33": filename, "row_goups": row_groups, }) +} + +fn column_metadata_to_json(column_metadata: &ColumnChunkMetaData) -> Value { + + json!({ + "file_path": column_metadata.file_path(), + "file_offset": column_metadata.file_offset(), + // todo: column metadata + // "num_values": column_metadata.num_values(), + // todo column-type/ column-path/descr + //"file_path": column_metadata.file_path(), + + }) + } \ No newline at end of file