From 7e98e82bc6f9b5f4b45a6e187a9ae2023acea3b7 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Tue, 25 Jun 2024 11:05:02 -0400
Subject: [PATCH 1/5] Implement parquet-integration-testing Integration Tests

---
 Cargo.toml                                    |   2 +
 parquet-integration-testing/Cargo.toml        |  35 ++++++
 parquet-integration-testing/README.md         |  32 ++++++
 .../src/bin/parquet-integration-testing.rs    | 108 ++++++++++++++++++
 4 files changed, 177 insertions(+)
 create mode 100644 parquet-integration-testing/Cargo.toml
 create mode 100644 parquet-integration-testing/README.md
 create mode 100644 parquet-integration-testing/src/bin/parquet-integration-testing.rs

diff --git a/Cargo.toml b/Cargo.toml
index e0144faa7a92..a6beb899dbc2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -57,6 +57,8 @@ exclude = [
     # significantly changing how it is compiled within the workspace, causing the whole workspace to be compiled from
     # scratch this way, this is a stand-alone package that compiles independently of the others.
     "arrow-pyarrow-integration-testing",
+    # parquet integration testing likewise contains different flags
+    "parquet-integration-testing",
     # object_store is excluded because it follows a separate release cycle from the other arrow crates
     "object_store"
 ]
diff --git a/parquet-integration-testing/Cargo.toml b/parquet-integration-testing/Cargo.toml
new file mode 100644
index 000000000000..71b49322b79c
--- /dev/null
+++ b/parquet-integration-testing/Cargo.toml
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "python-integration-testing"
+description = "Binaries used for testing parquet-rs compatibility (NOT published to crates.io)"
+version = { workspace = true }
+homepage = { workspace = true }
+repository = { workspace = true }
+authors = { workspace = true }
+license = { workspace = true }
+edition = { workspace = true }
+publish = false
+rust-version = { workspace = true }
+
+
+[dependencies]
+arrow = { path = "../arrow" }
+parquet = { path = "../parquet" }
+serde_json = { version = "1.0", default-features = false, features = ["std"] }
+pretty_assertions = "1.4.0"
\ No newline at end of file
diff --git a/parquet-integration-testing/README.md b/parquet-integration-testing/README.md
new file mode 100644
index 000000000000..eca0867c8cc1
--- /dev/null
+++ b/parquet-integration-testing/README.md
@@ -0,0 +1,32 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Apache Parquet Rust Integration Testing
+
+The binary in this repo:
+
+1. Reads files from the parquet-testing repo
+2. Creates a JSON file with appropriately formatted contents
+3. Compares these JSON files with "known good" golden master files
+
+## Running
+
+```shell
+cargo run
+```
diff --git a/parquet-integration-testing/src/bin/parquet-integration-testing.rs b/parquet-integration-testing/src/bin/parquet-integration-testing.rs
new file mode 100644
index 000000000000..14570a35042e
--- /dev/null
+++ b/parquet-integration-testing/src/bin/parquet-integration-testing.rs
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+// Draft program for testing the parquet-rs library
+
+use std::fs::{canonicalize, File};
+use std::path::Path;
+use arrow::util::display::array_value_to_string;
+use arrow::util::pretty::pretty_format_columns;
+use parquet::arrow::arrow_reader::ArrowReaderBuilder;
+use serde::Serialize;
+use serde_json::json;
+
+fn main() {
+    println!("PWD: {:?}", std::env::var("PWD"));
+    let parquet_data_path = "/Users/andrewlamb/Software/arrow-rs/parquet-testing/data";
+    let expected_data_path = "/Users/andrewlamb/Software/arrow-rs/parquet-integration-testing/data";
+
+    let filenames = vec!["alltypes_plain.parquet", "alltypes_plain_dictionary.parquet"];
+
+    for filename in filenames {
+        let parquet_file_path = Path::from(parquet_data_path).join(filename).canonicalize().unwrap();
+        let expected_file_path = Path::from(expected_data_path).join(format!("{}filename}.json"));
+
+        println!("Begin test: {filename}");
+        println!("  Reading parquet file: {parquet_file_path}");
+        println!("  Expected JSON file: {expected_file_path}");
+        let parquet_json = read_parquet_data(&parquet_file_path);
+        let expected_json = std::fs::read_to_string(expected_file_path);
+    }
+
+
+}
+
+// prototype demonstration of checking type support for parquet-rs encoding
+// check read support by reading a file with the specified encoding correctly
+
+// | PLAIN                                     |       |        |       |       |
+// | PLAIN_DICTIONARY                          |       |        |       |       |
+// | RLE_DICTIONARY                            |       |        |       |       |
+// | RLE                                       |       |        |       |       |
+// | BIT_PACKED (deprecated)                   |       |        |       |       |
+// | DELTA_BINARY_PACKED                       |       |        |       |       |
+
+// The idea is to produce a file like this:
+// ```text
+// {
+//   filename: "filename.parquet",
+//   rows: [
+//     {
+//       "column1": "value1",
+//       "column2": 123,
+//       "column3": null
+//     },
+//     ..
+//     {
+//       "column1": "value2",
+//       "column2": 456,
+//       "column3": "value3"
+//     }
+//   ]
+// }
+// ```
+
+/// The function reads a parquet file and writes a JSON representation of the data within
+fn read_parquet_data(parquet_data_path: &str) -> String {
+    let file = File::open(&parquet_data_path).unwrap();
+    let mut reader = ArrowReaderBuilder::try_new(file).unwrap().build().unwrap();
+
+
+    let mut rows = vec![];
+    while let Some(batch) = reader.next() {
+        let batch = batch.unwrap();
+        let columns = batch.columns();
+        let schema = batch.schema();
+        for i in 0..batch.num_rows() {
+            let mut row = vec![];
+            for (field, column) in schema.fields.iter().zip(columns.iter()) {
+                let name = field.name();
+                let value = array_value_to_string(column.as_ref(), i).unwrap();
+                row.push(json!({name: value}));
+            }
+            rows.push(json!(row));
+        }
+    }
+
+    let value = json!({
+        "filename": parquet_data_path,
+        "rows": rows
+    });
+
+    serde_json::to_string_pretty(&value).unwrap();
+}
\ No newline at end of file

From 74597f1643080ed7ba737dba51b76fe2c9f2e353 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Tue, 25 Jun 2024 11:53:31 -0400
Subject: [PATCH 2/5] initial data path

---
 Cargo.toml                                    |  2 +-
 parquet-integration-testing/.gitignore        |  1 +
 parquet-integration-testing/Cargo.toml        | 13 ++--
 .../data/alltypes_plain.parquet.json          |  1 +
 .../src/bin/parquet-integration-testing.rs    | 68 ++++++++++++-------
 5 files changed, 50 insertions(+), 35 deletions(-)
 create mode 100644 parquet-integration-testing/.gitignore
 create mode 100644 parquet-integration-testing/data/alltypes_plain.parquet.json

diff --git a/Cargo.toml b/Cargo.toml
index a6beb899dbc2..7798ea0b239c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,7 +36,7 @@ members = [
     "arrow-row",
     "arrow-schema",
     "arrow-select",
-    "arrow-string",
+    "arrow-string", "foo",
     "parquet",
     "parquet_derive",
     "parquet_derive_test",
diff --git a/parquet-integration-testing/.gitignore b/parquet-integration-testing/.gitignore
new file mode 100644
index 000000000000..1fcb1529f8e5
--- /dev/null
+++ b/parquet-integration-testing/.gitignore
@@ -0,0 +1 @@
+out
diff --git a/parquet-integration-testing/Cargo.toml b/parquet-integration-testing/Cargo.toml
index 71b49322b79c..c85ba8be39bc 100644
--- a/parquet-integration-testing/Cargo.toml
+++ b/parquet-integration-testing/Cargo.toml
@@ -18,18 +18,13 @@
 [package]
 name = "python-integration-testing"
 description = "Binaries used for testing parquet-rs compatibility (NOT published to crates.io)"
-version = { workspace = true }
-homepage = { workspace = true }
-repository = { workspace = true }
-authors = { workspace = true }
-license = { workspace = true }
-edition = { workspace = true }
 publish = false
-rust-version = { workspace = true }
+edition = "2021"
 
 
 [dependencies]
-arrow = { path = "../arrow" }
-parquet = { path = "../parquet" }
+arrow = { path = "../arrow", features = ["prettyprint"] }
+parquet = { path = "../parquet", features = ["arrow"]}
+serde = "1.0.203"
 serde_json = { version = "1.0", default-features = false, features = ["std"] }
 pretty_assertions = "1.4.0"
\ No newline at end of file
diff --git a/parquet-integration-testing/data/alltypes_plain.parquet.json b/parquet-integration-testing/data/alltypes_plain.parquet.json
new file mode 100644
index 000000000000..1288ce1f39b6
--- /dev/null
+++ b/parquet-integration-testing/data/alltypes_plain.parquet.json
@@ -0,0 +1 @@
+{"filename":"/Users/andrewlamb/Software/arrow-rs/parquet-testing/data/alltypes_plain.parquet","rows":[[{"id":"4"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30332f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-03-01T00:00:00"}],[{"id":"5"},{"bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"10.1"},{"date_string_col":"30332f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-03-01T00:01:00"}],[{"id":"6"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30342f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-04-01T00:00:00"}],[{"id":"7"},{"bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"10.1"},{"date_string_col":"30342f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-04-01T00:01:00"}],[{"id":"2"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30322f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-02-01T00:00:00"}],[{"id":"3"},{"bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"10.1"},{"date_string_col":"30322f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-02-01T00:01:00"}],[{"id":"0"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30312f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-01-01T00:00:00"}],[{"id":"1"},{"bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"
10.1"},{"date_string_col":"30312f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-01-01T00:01:00"}]]}
\ No newline at end of file
diff --git a/parquet-integration-testing/src/bin/parquet-integration-testing.rs b/parquet-integration-testing/src/bin/parquet-integration-testing.rs
index 14570a35042e..7eeecba0d04a 100644
--- a/parquet-integration-testing/src/bin/parquet-integration-testing.rs
+++ b/parquet-integration-testing/src/bin/parquet-integration-testing.rs
@@ -15,36 +15,57 @@
 // specific language governing permissions and limitations
 // under the License.
 
-
-// Draft program for testing the parquet-rs library
-
-use std::fs::{canonicalize, File};
-use std::path::Path;
 use arrow::util::display::array_value_to_string;
 use arrow::util::pretty::pretty_format_columns;
 use parquet::arrow::arrow_reader::ArrowReaderBuilder;
+use pretty_assertions::assert_eq;
 use serde::Serialize;
-use serde_json::json;
+use serde_json::{json, Value};
+/// Test driver for parquet-integration testing
+use std::fs::{canonicalize, File};
+use std::path::{Path, PathBuf};
 
 fn main() {
-    println!("PWD: {:?}", std::env::var("PWD"));
-    let parquet_data_path = "/Users/andrewlamb/Software/arrow-rs/parquet-testing/data";
-    let expected_data_path = "/Users/andrewlamb/Software/arrow-rs/parquet-integration-testing/data";
+    // paths are relative to arrow-rs/parquet-integration-testing
+    let parquet_data_path =
+        PathBuf::from("../parquet-testing/data").canonicalize().unwrap();
+    let expected_data_path =
+        PathBuf::from("data").canonicalize().unwrap();
+    let output_data_path =
+        PathBuf::from("out").canonicalize().unwrap();
 
-    let filenames = vec!["alltypes_plain.parquet", "alltypes_plain_dictionary.parquet"];
+    std::fs::create_dir_all(&output_data_path).unwrap();
+
+    let filenames = vec![
+        "alltypes_plain.parquet",
+        //"alltypes_plain_dictionary.parquet",
+    ];
 
     for filename in filenames {
-        let parquet_file_path = Path::from(parquet_data_path).join(filename).canonicalize().unwrap();
-        let expected_file_path = Path::from(expected_data_path).join(format!("{}filename}.json"));
+        let parquet_file_path = parquet_data_path.join(filename);
+
+        let expected_file_path = expected_data_path
+            .join(format!("{filename}.json"));
+
+        // For development, also write the actual parsed value to a file
+        let output_file_path = output_data_path
+            .join(format!("{filename}.json"));
+
+
 
         println!("Begin test: {filename}");
-        println!("  Reading parquet file: {parquet_file_path}");
-        println!("  Expected JSON file: {expected_file_path}");
-        let parquet_json = read_parquet_data(&parquet_file_path);
-        let expected_json = std::fs::read_to_string(expected_file_path);
-    }
+        println!("  Input parquet file: {parquet_file_path:?}");
+        println!("  Expected JSON file: {expected_file_path:?}");
+        println!("  Output JSON file: {output_file_path:?}");
 
+        let parquet_json = read_parquet_data(&parquet_file_path);
+        let output_file = File::create(&output_file_path).unwrap();
+        serde_json::to_writer(output_file, &parquet_json).unwrap();
 
+        let expected_file = File::open(expected_file_path).unwrap();
+        let expected_json: Value = serde_json::from_reader(expected_file).unwrap();
+        assert_eq!(parquet_json, expected_json)
+    }
 }
 
 // prototype demonstration of checking type support for parquet-rs encoding
@@ -78,11 +99,10 @@ fn main() {
 // ```
 
 /// The function reads a parquet file and writes a JSON representation of the data within
-fn read_parquet_data(parquet_data_path: &str) -> String {
-    let file = File::open(&parquet_data_path).unwrap();
+fn read_parquet_data(parquet_data_path: &Path) -> Value {
+    let file = File::open(parquet_data_path).unwrap();
     let mut reader = ArrowReaderBuilder::try_new(file).unwrap().build().unwrap();
 
-
     let mut rows = vec![];
     while let Some(batch) = reader.next() {
         let batch = batch.unwrap();
@@ -99,10 +119,8 @@ fn read_parquet_data(parquet_data_path: &str) -> String {
         }
     }
 
-    let value = json!({
+    json!({
         "filename": parquet_data_path,
         "rows": rows
-    });
-
-    serde_json::to_string_pretty(&value).unwrap();
-}
\ No newline at end of file
+    })
+}

From cd46026ae19dbcf2df2b079fb6511f50f93b06cb Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Tue, 25 Jun 2024 12:09:16 -0400
Subject: [PATCH 3/5] refactor

---
 .../src/bin/parquet-integration-testing.rs    | 107 ++++++++++--------
 1 file changed, 61 insertions(+), 46 deletions(-)

diff --git a/parquet-integration-testing/src/bin/parquet-integration-testing.rs b/parquet-integration-testing/src/bin/parquet-integration-testing.rs
index 7eeecba0d04a..443a871625b7 100644
--- a/parquet-integration-testing/src/bin/parquet-integration-testing.rs
+++ b/parquet-integration-testing/src/bin/parquet-integration-testing.rs
@@ -26,32 +26,77 @@ use std::fs::{canonicalize, File};
 use std::path::{Path, PathBuf};
 
 fn main() {
-    // paths are relative to arrow-rs/parquet-integration-testing
-    let parquet_data_path =
-        PathBuf::from("../parquet-testing/data").canonicalize().unwrap();
-    let expected_data_path =
-        PathBuf::from("data").canonicalize().unwrap();
-    let output_data_path =
-        PathBuf::from("out").canonicalize().unwrap();
-
-    std::fs::create_dir_all(&output_data_path).unwrap();
+    let integration_test = IntegrationTest::new();
 
     let filenames = vec![
         "alltypes_plain.parquet",
-        //"alltypes_plain_dictionary.parquet",
     ];
 
     for filename in filenames {
-        let parquet_file_path = parquet_data_path.join(filename);
+        integration_test.read_test(filename);
+    }
+}
+
+// prototype demonstration of checking type support for parquet-rs encoding
+// check read support by reading a file with the specified encoding correctly
+#[derive(Debug)]
+struct IntegrationTest {
+    parquet_data_path: PathBuf,
+    expected_data_path: PathBuf,
+    output_data_path: PathBuf,
+}
 
-        let expected_file_path = expected_data_path
-            .join(format!("{filename}.json"));
+impl IntegrationTest {
+    pub fn new() -> Self {
+        // TODO error handling
 
-        // For development, also write the actual parsed value to a file
-        let output_file_path = output_data_path
-            .join(format!("{filename}.json"));
+        // paths are relative to arrow-rs/parquet-integration-testing
+        let parquet_data_path = PathBuf::from("../parquet-testing/data")
+            .canonicalize()
+            .unwrap();
+        let expected_data_path = PathBuf::from("data").canonicalize().unwrap();
+        let output_data_path = PathBuf::from("out").canonicalize().unwrap();
 
+        std::fs::create_dir_all(&output_data_path).unwrap();
 
+        Self {
+            parquet_data_path,
+            expected_data_path,
+            output_data_path,
+        }
+    }
+
+    /// Read a parquet file, create a JSON representation, and compare to the
+    /// known good value in data
+    ///
+    /// The output JSON looks like this:
+    ///
+    /// ```text
+    /// {
+    ///   filename: "filename.parquet",
+    ///   rows: [
+    ///     {
+    ///       "column1": "value1",
+    ///       "column2": 123,
+    ///       "column3": null
+    ///     },
+    ///     ..
+    ///     {
+    ///       "column1": "value2",
+    ///       "column2": 456,
+    ///       "column3": "value3"
+    ///     }
+    ///   ]
+    /// }
+    /// ```
+    fn read_test(&self, filename: &str) {
+        let parquet_file_path = self.parquet_data_path.join(filename);
+
+        let expected_file_path = self.expected_data_path.join(format!("{filename}.json"));
+
+        // For ease of development, write the actual parsed value to a file (to
+        // permit easy updates, for example)
+        let output_file_path = self.output_data_path.join(format!("{filename}.json"));
 
         println!("Begin test: {filename}");
         println!("  Input parquet file: {parquet_file_path:?}");
@@ -68,36 +113,6 @@ fn main() {
     }
 }
 
-// prototype demonstration of checking type support for parquet-rs encoding
-// check read support by reading a file with the specified encoding correctly
-
-// | PLAIN                                     |       |        |       |       |
-// | PLAIN_DICTIONARY                          |       |        |       |       |
-// | RLE_DICTIONARY                            |       |        |       |       |
-// | RLE                                       |       |        |       |       |
-// | BIT_PACKED (deprecated)                   |       |        |       |       |
-// | DELTA_BINARY_PACKED                       |       |        |       |       |
-
-// The idea is to produce a file like this:
-// ```text
-// {
-//   filename: "filename.parquet",
-//   rows: [
-//     {
-//       "column1": "value1",
-//       "column2": 123,
-//       "column3": null
-//     },
-//     ..
-//     {
-//       "column1": "value2",
-//       "column2": 456,
-//       "column3": "value3"
-//     }
-//   ]
-// }
-// ```
-
 /// The function reads a parquet file and writes a JSON representation of the data within
 fn read_parquet_data(parquet_data_path: &Path) -> Value {
     let file = File::open(parquet_data_path).unwrap();

From a345055eeb74cfc272b718a3b89e4b209ab4e8b9 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Tue, 25 Jun 2024 12:27:51 -0400
Subject: [PATCH 4/5] Add basic metadata test

---
 .../data/alltypes_plain.parquet.data.json     | 285 ++++++++++++++++++
 .../data/alltypes_plain.parquet.json          |   1 -
 .../data/alltypes_plain.parquet.metadata.json |   8 +
 .../src/bin/parquet-integration-testing.rs    |  92 +++++-
 4 files changed, 374 insertions(+), 12 deletions(-)
 create mode 100644 parquet-integration-testing/data/alltypes_plain.parquet.data.json
 delete mode 100644 parquet-integration-testing/data/alltypes_plain.parquet.json
 create mode 100644 parquet-integration-testing/data/alltypes_plain.parquet.metadata.json

diff --git a/parquet-integration-testing/data/alltypes_plain.parquet.data.json b/parquet-integration-testing/data/alltypes_plain.parquet.data.json
new file mode 100644
index 000000000000..2c24a72290c8
--- /dev/null
+++ b/parquet-integration-testing/data/alltypes_plain.parquet.data.json
@@ -0,0 +1,285 @@
+{
+  "filename": "alltypes_plain.parquet",
+  "rows": [
+    [
+      {
+        "id": "4"
+      },
+      {
+        "bool_col": "true"
+      },
+      {
+        "tinyint_col": "0"
+      },
+      {
+        "smallint_col": "0"
+      },
+      {
+        "int_col": "0"
+      },
+      {
+        "bigint_col": "0"
+      },
+      {
+        "float_col": "0.0"
+      },
+      {
+        "double_col": "0.0"
+      },
+      {
+        "date_string_col": "30332f30312f3039"
+      },
+      {
+        "string_col": "30"
+      },
+      {
+        "timestamp_col": "2009-03-01T00:00:00"
+      }
+    ],
+    [
+      {
+        "id": "5"
+      },
+      {
+        "bool_col": "false"
+      },
+      {
+        "tinyint_col": "1"
+      },
+      {
+        "smallint_col": "1"
+      },
+      {
+        "int_col": "1"
+      },
+      {
+        "bigint_col": "10"
+      },
+      {
+        "float_col": "1.1"
+      },
+      {
+        "double_col": "10.1"
+      },
+      {
+        "date_string_col": "30332f30312f3039"
+      },
+      {
+        "string_col": "31"
+      },
+      {
+        "timestamp_col": "2009-03-01T00:01:00"
+      }
+    ],
+    [
+      {
+        "id": "6"
+      },
+      {
+        "bool_col": "true"
+      },
+      {
+        "tinyint_col": "0"
+      },
+      {
+        "smallint_col": "0"
+      },
+      {
+        "int_col": "0"
+      },
+      {
+        "bigint_col": "0"
+      },
+      {
+        "float_col": "0.0"
+      },
+      {
+        "double_col": "0.0"
+      },
+      {
+        "date_string_col": "30342f30312f3039"
+      },
+      {
+        "string_col": "30"
+      },
+      {
+        "timestamp_col": "2009-04-01T00:00:00"
+      }
+    ],
+    [
+      {
+        "id": "7"
+      },
+      {
+        "bool_col": "false"
+      },
+      {
+        "tinyint_col": "1"
+      },
+      {
+        "smallint_col": "1"
+      },
+      {
+        "int_col": "1"
+      },
+      {
+        "bigint_col": "10"
+      },
+      {
+        "float_col": "1.1"
+      },
+      {
+        "double_col": "10.1"
+      },
+      {
+        "date_string_col": "30342f30312f3039"
+      },
+      {
+        "string_col": "31"
+      },
+      {
+        "timestamp_col": "2009-04-01T00:01:00"
+      }
+    ],
+    [
+      {
+        "id": "2"
+      },
+      {
+        "bool_col": "true"
+      },
+      {
+        "tinyint_col": "0"
+      },
+      {
+        "smallint_col": "0"
+      },
+      {
+        "int_col": "0"
+      },
+      {
+        "bigint_col": "0"
+      },
+      {
+        "float_col": "0.0"
+      },
+      {
+        "double_col": "0.0"
+      },
+      {
+        "date_string_col": "30322f30312f3039"
+      },
+      {
+        "string_col": "30"
+      },
+      {
+        "timestamp_col": "2009-02-01T00:00:00"
+      }
+    ],
+    [
+      {
+        "id": "3"
+      },
+      {
+        "bool_col": "false"
+      },
+      {
+        "tinyint_col": "1"
+      },
+      {
+        "smallint_col": "1"
+      },
+      {
+        "int_col": "1"
+      },
+      {
+        "bigint_col": "10"
+      },
+      {
+        "float_col": "1.1"
+      },
+      {
+        "double_col": "10.1"
+      },
+      {
+        "date_string_col": "30322f30312f3039"
+      },
+      {
+        "string_col": "31"
+      },
+      {
+        "timestamp_col": "2009-02-01T00:01:00"
+      }
+    ],
+    [
+      {
+        "id": "0"
+      },
+      {
+        "bool_col": "true"
+      },
+      {
+        "tinyint_col": "0"
+      },
+      {
+        "smallint_col": "0"
+      },
+      {
+        "int_col": "0"
+      },
+      {
+        "bigint_col": "0"
+      },
+      {
+        "float_col": "0.0"
+      },
+      {
+        "double_col": "0.0"
+      },
+      {
+        "date_string_col": "30312f30312f3039"
+      },
+      {
+        "string_col": "30"
+      },
+      {
+        "timestamp_col": "2009-01-01T00:00:00"
+      }
+    ],
+    [
+      {
+        "id": "1"
+      },
+      {
+        "bool_col": "false"
+      },
+      {
+        "tinyint_col": "1"
+      },
+      {
+        "smallint_col": "1"
+      },
+      {
+        "int_col": "1"
+      },
+      {
+        "bigint_col": "10"
+      },
+      {
+        "float_col": "1.1"
+      },
+      {
+        "double_col": "10.1"
+      },
+      {
+        "date_string_col": "30312f30312f3039"
+      },
+      {
+        "string_col": "31"
+      },
+      {
+        "timestamp_col": "2009-01-01T00:01:00"
+      }
+    ]
+  ]
+}
\ No newline at end of file
diff --git a/parquet-integration-testing/data/alltypes_plain.parquet.json b/parquet-integration-testing/data/alltypes_plain.parquet.json
deleted file mode 100644
index 1288ce1f39b6..000000000000
--- a/parquet-integration-testing/data/alltypes_plain.parquet.json
+++ /dev/null
@@ -1 +0,0 @@
-{"filename":"/Users/andrewlamb/Software/arrow-rs/parquet-testing/data/alltypes_plain.parquet","rows":[[{"id":"4"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30332f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-03-01T00:00:00"}],[{"id":"5"},{"bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"10.1"},{"date_string_col":"30332f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-03-01T00:01:00"}],[{"id":"6"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30342f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-04-01T00:00:00"}],[{"id":"7"},{"bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"10.1"},{"date_string_col":"30342f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-04-01T00:01:00"}],[{"id":"2"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30322f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-02-01T00:00:00"}],[{"id":"3"},{"bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"10.1"},{"date_string_col":"30322f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-02-01T00:01:00"}],[{"id":"0"},{"bool_col":"true"},{"tinyint_col":"0"},{"smallint_col":"0"},{"int_col":"0"},{"bigint_col":"0"},{"float_col":"0.0"},{"double_col":"0.0"},{"date_string_col":"30312f30312f3039"},{"string_col":"30"},{"timestamp_col":"2009-01-01T00:00:00"}],[{"id":"1"},{"bool_col":"false"},{"tinyint_col":"1"},{"smallint_col":"1"},{"int_col":"1"},{"bigint_col":"10"},{"float_col":"1.1"},{"double_col":"
10.1"},{"date_string_col":"30312f30312f3039"},{"string_col":"31"},{"timestamp_col":"2009-01-01T00:01:00"}]]}
\ No newline at end of file
diff --git a/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json b/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json
new file mode 100644
index 000000000000..9bb5c3a56265
--- /dev/null
+++ b/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json
@@ -0,0 +1,8 @@
+{
+  "filename33": "alltypes_plain.parquet",
+  "row_goups": [
+    {
+      "num_rows": 8
+    }
+  ]
+}
\ No newline at end of file
diff --git a/parquet-integration-testing/src/bin/parquet-integration-testing.rs b/parquet-integration-testing/src/bin/parquet-integration-testing.rs
index 443a871625b7..d40f620fe603 100644
--- a/parquet-integration-testing/src/bin/parquet-integration-testing.rs
+++ b/parquet-integration-testing/src/bin/parquet-integration-testing.rs
@@ -33,7 +33,8 @@ fn main() {
     ];
 
     for filename in filenames {
-        integration_test.read_test(filename);
+        integration_test.data_test(filename);
+        integration_test.metadata_test(filename)
     }
 }
 
@@ -89,31 +90,75 @@ impl IntegrationTest {
     ///   ]
     /// }
     /// ```
-    fn read_test(&self, filename: &str) {
+    fn data_test(&self, filename: &str) {
         let parquet_file_path = self.parquet_data_path.join(filename);
-
-        let expected_file_path = self.expected_data_path.join(format!("{filename}.json"));
+        let expected_file_path = self.expected_data_path.join(format!("{filename}.data.json"));
 
         // For ease of development, write the actual parsed value to a file (to
         // permit easy updates, for example)
-        let output_file_path = self.output_data_path.join(format!("{filename}.json"));
+        let output_file_path = self.output_data_path.join(format!("{filename}.data.json"));
 
-        println!("Begin test: {filename}");
+        println!("Begin data test: {filename}");
         println!("  Input parquet file: {parquet_file_path:?}");
         println!("  Expected JSON file: {expected_file_path:?}");
         println!("  Output JSON file: {output_file_path:?}");
 
         let parquet_json = read_parquet_data(&parquet_file_path);
         let output_file = File::create(&output_file_path).unwrap();
-        serde_json::to_writer(output_file, &parquet_json).unwrap();
+        serde_json::to_writer_pretty(output_file, &parquet_json).unwrap();
+
+        // read expected file if present, default to {} if not
+        let expected_json = if let Ok(expected_file) = File::open(expected_file_path) {
+            serde_json::from_reader(expected_file).unwrap()
+        } else {
+            json!({})
+        };
+        assert_eq!(parquet_json, expected_json)
+    }
+
+    /// Reads a parquet file, creates a JSON representation of its metadata, and
+    /// compares it to the known good value in data
+    ///
+    /// The output JSON looks like this:
+    ///
+    /// ```text
+    /// {
+    ///   filename: "filename.parquet",
+    ///   ..
+    ///     ..
+    /// }
+    /// ```
+    fn metadata_test(&self, filename: &str) {
+        let parquet_file_path = self.parquet_data_path.join(filename);
+        let expected_file_path = self.expected_data_path.join(format!("{filename}.metadata.json"));
+
+        // For ease of development, write the actual parsed value to a file (to
+        // permit easy updates, for example)
+        let output_file_path = self.output_data_path.join(format!("{filename}.metadata.json"));
+
+        println!("Begin metadata test: {filename}");
+        println!("  Input parquet file: {parquet_file_path:?}");
+        println!("  Expected JSON file: {expected_file_path:?}");
+        println!("  Output JSON file: {output_file_path:?}");
 
-        let expected_file = File::open(expected_file_path).unwrap();
-        let expected_json: Value = serde_json::from_reader(expected_file).unwrap();
+        let parquet_json = read_parquet_metadata(&parquet_file_path);
+        let output_file = File::create(&output_file_path).unwrap();
+        serde_json::to_writer_pretty(output_file, &parquet_json).unwrap();
+
+        // read expected file if present, default to {} if not
+        let expected_json = if let Ok(expected_file) = File::open(expected_file_path) {
+            serde_json::from_reader(expected_file).unwrap()
+        } else {
+            json!({})
+        };
         assert_eq!(parquet_json, expected_json)
     }
 }
 
-/// The function reads a parquet file and writes a JSON representation of the data within
+
+
+/// The function reads a parquet file and returns a JSON representation of the
+/// data within
 fn read_parquet_data(parquet_data_path: &Path) -> Value {
     let file = File::open(parquet_data_path).unwrap();
     let mut reader = ArrowReaderBuilder::try_new(file).unwrap().build().unwrap();
@@ -134,8 +179,33 @@ fn read_parquet_data(parquet_data_path: &Path) -> Value {
         }
     }
 
+    let filename = parquet_data_path.file_name().unwrap().to_string_lossy();
+
     json!({
-        "filename": parquet_data_path,
+        "filename": filename,
         "rows": rows
     })
 }
+
+
+/// The function reads a parquet file and returns a JSON representation of the
+/// metadata (thrift encoded) within
+fn read_parquet_metadata(parquet_data_path: &Path) -> Value {
+    let file = File::open(parquet_data_path).unwrap();
+    let metadata = ArrowReaderBuilder::try_new(file).unwrap().metadata().clone();
+
+    // todo print out schema
+    let row_groups : Vec<_> = metadata.row_groups().iter()
+        .map(|rg| {
+        json!({
+            "num_rows": rg.num_rows(),
+        })
+    }).collect();;
+
+    let filename = parquet_data_path.file_name().unwrap().to_string_lossy();
+
+    json!({
+        "filename33": filename,
+        "row_goups": row_groups,
+    })
+}
\ No newline at end of file

From 168cec8f3e77af29903542594de6e2408312bdc0 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Tue, 25 Jun 2024 12:35:41 -0400
Subject: [PATCH 5/5] updates

---
 .../data/alltypes_plain.parquet.metadata.json | 51 ++++++++++++++++++-
 .../src/bin/parquet-integration-testing.rs    | 23 +++++++++
 2 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json b/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json
index 9bb5c3a56265..2c66d3cf2ac3 100644
--- a/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json
+++ b/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json
@@ -2,7 +2,56 @@
   "filename33": "alltypes_plain.parquet",
   "row_goups": [
     {
-      "num_rows": 8
+      "columns": [
+        {
+          "file_offset": 77,
+          "file_path": null
+        },
+        {
+          "file_offset": 133,
+          "file_path": null
+        },
+        {
+          "file_offset": 215,
+          "file_path": null
+        },
+        {
+          "file_offset": 303,
+          "file_path": null
+        },
+        {
+          "file_offset": 392,
+          "file_path": null
+        },
+        {
+          "file_offset": 484,
+          "file_path": null
+        },
+        {
+          "file_offset": 571,
+          "file_path": null
+        },
+        {
+          "file_offset": 665,
+          "file_path": null
+        },
+        {
+          "file_offset": 793,
+          "file_path": null
+        },
+        {
+          "file_offset": 889,
+          "file_path": null
+        },
+        {
+          "file_offset": 1068,
+          "file_path": null
+        }
+      ],
+      "file_offset": null,
+      "num_rows": 8,
+      "ordinal": null,
+      "total_byte_size": 671
     }
   ]
 }
\ No newline at end of file
diff --git a/parquet-integration-testing/src/bin/parquet-integration-testing.rs b/parquet-integration-testing/src/bin/parquet-integration-testing.rs
index d40f620fe603..8317149b1563 100644
--- a/parquet-integration-testing/src/bin/parquet-integration-testing.rs
+++ b/parquet-integration-testing/src/bin/parquet-integration-testing.rs
@@ -24,6 +24,8 @@ use serde_json::{json, Value};
 /// Test driver for parquet-integration testing
 use std::fs::{canonicalize, File};
 use std::path::{Path, PathBuf};
+use parquet::file::metadata::ColumnChunkMetaData;
+use parquet::format::ColumnMetaData;
 
 fn main() {
     let integration_test = IntegrationTest::new();
@@ -197,8 +199,15 @@ fn read_parquet_metadata(parquet_data_path: &Path) -> Value {
     // todo print out schema
     let row_groups : Vec<_> = metadata.row_groups().iter()
         .map(|rg| {
+            let columns: Vec<_> = rg.columns().iter()
+                .map(column_metadata_to_json)
+                .collect();
         json!({
             "num_rows": rg.num_rows(),
+            "total_byte_size": rg.total_byte_size(),
+            "file_offset": rg.file_offset(),
+            "ordinal": rg.ordinal(),
+            "columns": columns,
         })
     }).collect();;
 
@@ -208,4 +217,18 @@ fn read_parquet_metadata(parquet_data_path: &Path) -> Value {
         "filename33": filename,
         "row_goups": row_groups,
     })
+}
+
+fn column_metadata_to_json(column_metadata: &ColumnChunkMetaData) -> Value {
+
+    json!({
+        "file_path": column_metadata.file_path(),
+        "file_offset": column_metadata.file_offset(),
+        // todo: column metadata
+        // "num_values": column_metadata.num_values(),
+        // todo column-type/ column-path/descr
+        //"file_path": column_metadata.file_path(),
+
+    })
+
 }
\ No newline at end of file