diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs index 24b18b39bc18..2362f12019f4 100644 --- a/parquet/src/arrow/schema/primitive.rs +++ b/parquet/src/arrow/schema/primitive.rs @@ -276,8 +276,8 @@ fn from_byte_array(info: &BasicTypeInfo, precision: i32, scale: i32) -> Result Ok(DataType::Utf8), (Some(LogicalType::Bson), _) => Ok(DataType::Binary), (Some(LogicalType::Enum), _) => Ok(DataType::Binary), - (Some(LogicalType::Geometry), _) => Ok(DataType::Binary), - (Some(LogicalType::Geography), _) => Ok(DataType::Binary), + (Some(LogicalType::Geometry { .. }), _) => Ok(DataType::Binary), + (Some(LogicalType::Geography { .. }), _) => Ok(DataType::Binary), (None, ConvertedType::NONE) => Ok(DataType::Binary), (None, ConvertedType::JSON) => Ok(DataType::Utf8), (None, ConvertedType::BSON) => Ok(DataType::Binary), diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 6353b5f4ee63..fa183ebd58dd 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -28,8 +28,8 @@ use crate::errors::{ParquetError, Result}; // Re-export crate::format types used in this module pub use crate::format::{ - BsonType, DateType, DecimalType, EnumType, IntType, JsonType, ListType, MapType, NullType, - StringType, TimeType, TimeUnit, TimestampType, UUIDType, + BsonType, DateType, DecimalType, EnumType, GeographyType, GeometryType, IntType, JsonType, + ListType, MapType, NullType, StringType, TimeType, TimeUnit, TimestampType, UUIDType, }; // ---------------------------------------------------------------------- @@ -231,9 +231,18 @@ pub enum LogicalType { /// A Variant value. Variant, /// A geospatial feature in the Well-Known Binary (WKB) format with linear/planar edges interpolation. - Geometry, + Geometry { + /// A custom CRS. If unset, it defaults to "OGC:CRS84", which means that the geometries + /// must be stored in longitude, latitude based on the WGS84 datum. + crs: Option, + }, /// A geospatial feature in the WKB format with an explicit (non-linear/non-planar) edges interpolation. - Geography, + Geography { + /// A custom CRS. If unset, the CRS defaults to "OGC:CRS84". + crs: Option, + /// Edge interpolation method. + algorithm: EdgeInterpolationAlgorithm, + }, } // ---------------------------------------------------------------------- @@ -584,9 +593,9 @@ impl ColumnOrder { LogicalType::Unknown => SortOrder::UNDEFINED, LogicalType::Uuid => SortOrder::UNSIGNED, LogicalType::Float16 => SortOrder::SIGNED, - LogicalType::Variant | LogicalType::Geometry | LogicalType::Geography => { - SortOrder::UNDEFINED - } + LogicalType::Variant + | LogicalType::Geometry { .. } + | LogicalType::Geography { .. } => SortOrder::UNDEFINED, }, // Fall back to converted type None => Self::get_converted_sort_order(converted_type, physical_type), @@ -660,6 +669,37 @@ impl ColumnOrder { } } +// ---------------------------------------------------------------------- +// Mirrors `parquet::EdgeInterpolationAlgorithm` + +/// Edge interpolation algorithm for Geography logical type +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum EdgeInterpolationAlgorithm { + /// Edges are interpolated as geodesics on a sphere. + SPHERICAL, + + /// + VINCENTY, + + /// Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970 + THOMAS, + + /// Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965. + ANDOYER, + + /// Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55 + KARNEY, + + /// An unknown/unrecognized algorithm + UNKNOWN(i32), +} + +impl Default for EdgeInterpolationAlgorithm { + fn default() -> Self { + Self::SPHERICAL + } +} + impl fmt::Display for Type { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{self:?}") @@ -708,6 +748,12 @@ impl fmt::Display for ColumnOrder { } } +impl fmt::Display for EdgeInterpolationAlgorithm { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{self:?}") + } +} + // ---------------------------------------------------------------------- // parquet::Type <=> Type conversion @@ -850,8 +896,31 @@ impl From for LogicalType { parquet::LogicalType::UUID(_) => LogicalType::Uuid, parquet::LogicalType::FLOAT16(_) => LogicalType::Float16, parquet::LogicalType::VARIANT(_) => LogicalType::Variant, - parquet::LogicalType::GEOMETRY(_) => LogicalType::Geometry, - parquet::LogicalType::GEOGRAPHY(_) => LogicalType::Geography, + parquet::LogicalType::GEOMETRY(t) => LogicalType::Geometry { crs: t.crs }, + parquet::LogicalType::GEOGRAPHY(t) => LogicalType::Geography { + crs: t.crs, + algorithm: t + .algorithm + .map(|algorithm| match algorithm { + parquet::EdgeInterpolationAlgorithm::SPHERICAL => { + EdgeInterpolationAlgorithm::SPHERICAL + } + parquet::EdgeInterpolationAlgorithm::VINCENTY => { + EdgeInterpolationAlgorithm::VINCENTY + } + parquet::EdgeInterpolationAlgorithm::ANDOYER => { + EdgeInterpolationAlgorithm::ANDOYER + } + parquet::EdgeInterpolationAlgorithm::THOMAS => { + EdgeInterpolationAlgorithm::THOMAS + } + parquet::EdgeInterpolationAlgorithm::KARNEY => { + EdgeInterpolationAlgorithm::KARNEY + } + _ => EdgeInterpolationAlgorithm::UNKNOWN(algorithm.0), + }) + .unwrap_or(EdgeInterpolationAlgorithm::SPHERICAL), + }, } } } @@ -894,8 +963,30 @@ impl From for parquet::LogicalType { LogicalType::Uuid => parquet::LogicalType::UUID(Default::default()), LogicalType::Float16 => parquet::LogicalType::FLOAT16(Default::default()), LogicalType::Variant => parquet::LogicalType::VARIANT(Default::default()), - LogicalType::Geometry => parquet::LogicalType::GEOMETRY(Default::default()), - LogicalType::Geography => parquet::LogicalType::GEOGRAPHY(Default::default()), + LogicalType::Geometry { crs } => parquet::LogicalType::GEOMETRY(GeometryType { crs }), + LogicalType::Geography { crs, algorithm } => { + parquet::LogicalType::GEOGRAPHY(GeographyType { + crs, + algorithm: match algorithm { + EdgeInterpolationAlgorithm::SPHERICAL => None, + EdgeInterpolationAlgorithm::VINCENTY => { + Some(parquet::EdgeInterpolationAlgorithm::VINCENTY) + } + EdgeInterpolationAlgorithm::THOMAS => { + Some(parquet::EdgeInterpolationAlgorithm::THOMAS) + } + EdgeInterpolationAlgorithm::ANDOYER => { + Some(parquet::EdgeInterpolationAlgorithm::ANDOYER) + } + EdgeInterpolationAlgorithm::KARNEY => { + Some(parquet::EdgeInterpolationAlgorithm::KARNEY) + } + EdgeInterpolationAlgorithm::UNKNOWN(code) => { + Some(parquet::EdgeInterpolationAlgorithm(code)) + } + }, + }) + } } } } @@ -950,8 +1041,8 @@ impl From> for ConvertedType { LogicalType::Uuid | LogicalType::Float16 | LogicalType::Variant - | LogicalType::Geometry - | LogicalType::Geography + | LogicalType::Geometry { .. } + | LogicalType::Geography { .. } | LogicalType::Unknown => ConvertedType::NONE, }, None => ConvertedType::NONE, @@ -1201,8 +1292,11 @@ impl str::FromStr for LogicalType { "Interval parquet logical type not yet supported" )), "FLOAT16" => Ok(LogicalType::Float16), - "GEOMETRY" => Ok(LogicalType::Geometry), - "GEOGRAPHY" => Ok(LogicalType::Geography), + "GEOMETRY" => Ok(LogicalType::Geometry { crs: None }), + "GEOGRAPHY" => Ok(LogicalType::Geography { + crs: None, + algorithm: EdgeInterpolationAlgorithm::SPHERICAL, + }), other => Err(general_err!("Invalid parquet logical type {}", other)), } } @@ -1852,6 +1946,17 @@ mod tests { ConvertedType::from(Some(LogicalType::Float16)), ConvertedType::NONE ); + assert_eq!( + ConvertedType::from(Some(LogicalType::Geometry { crs: None })), + ConvertedType::NONE + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::Geography { + crs: None, + algorithm: EdgeInterpolationAlgorithm::default() + })), + ConvertedType::NONE + ); assert_eq!( ConvertedType::from(Some(LogicalType::Unknown)), ConvertedType::NONE @@ -2239,7 +2344,15 @@ mod tests { check_sort_order(signed, SortOrder::SIGNED); // Undefined comparison - let undefined = vec![LogicalType::List, LogicalType::Map]; + let undefined = vec![ + LogicalType::List, + LogicalType::Map, + LogicalType::Geometry { crs: None }, + LogicalType::Geography { + crs: None, + algorithm: EdgeInterpolationAlgorithm::default(), + }, + ]; check_sort_order(undefined, SortOrder::UNDEFINED); } @@ -2428,4 +2541,20 @@ mod tests { "Parquet error: unknown encoding: gzip(-10)" ); } + + #[test] + fn test_display_edge_algorithm() { + assert_eq!( + EdgeInterpolationAlgorithm::SPHERICAL.to_string(), + "SPHERICAL" + ); + assert_eq!(EdgeInterpolationAlgorithm::VINCENTY.to_string(), "VINCENTY"); + assert_eq!(EdgeInterpolationAlgorithm::THOMAS.to_string(), "THOMAS"); + assert_eq!(EdgeInterpolationAlgorithm::ANDOYER.to_string(), "ANDOYER"); + assert_eq!(EdgeInterpolationAlgorithm::KARNEY.to_string(), "KARNEY"); + assert_eq!( + EdgeInterpolationAlgorithm::UNKNOWN(99).to_string(), + "UNKNOWN(99)" + ); + } } diff --git a/parquet/src/geospatial/statistics.rs b/parquet/src/geospatial/statistics.rs index 2a39c494bd0f..7eddae55610c 100644 --- a/parquet/src/geospatial/statistics.rs +++ b/parquet/src/geospatial/statistics.rs @@ -44,9 +44,7 @@ use crate::geospatial::bounding_box::BoundingBox; /// ``` #[derive(Clone, Debug, PartialEq, Default)] pub struct GeospatialStatistics { - /// Optional bounding defining the spatial extent, where None represents a lack of information. bbox: Option, - /// Optional list of geometry type identifiers, where None represents lack of information geospatial_types: Option>, } @@ -58,6 +56,16 @@ impl GeospatialStatistics { geospatial_types, } } + + /// Optional list of geometry type identifiers, where None represents lack of information + pub fn geospatial_types(&self) -> Option<&Vec> { + self.geospatial_types.as_ref() + } + + /// Optional bounding defining the spatial extent, where None represents a lack of information. + pub fn bounding_box(&self) -> Option<&BoundingBox> { + self.bbox.as_ref() + } } /// Converts a Thrift-generated geospatial statistics object to the internal representation. @@ -127,38 +135,4 @@ mod tests { assert_eq!(thrift_bbox_m.mmin, Some(OrderedFloat(10.0))); assert_eq!(thrift_bbox_m.mmax, Some(OrderedFloat(20.0))); } - - #[test] - fn test_read_geospatial_statistics_from_file() { - use crate::file::reader::{FileReader, SerializedFileReader}; - use std::fs::File; - - let path = format!( - "{}/geospatial/geospatial.parquet", - arrow::util::test_util::parquet_test_data(), - ); - let file = File::open(path).unwrap(); - let reader = SerializedFileReader::try_from(file).unwrap(); - let metadata = reader.metadata(); - - // geospatial.parquet schema: - // optional binary field_id=-1 group (String); - // optional binary field_id=-1 wkt (String); - // optional binary field_id=-1 geometry (Geometry(crs=)); - let geo_statistics = metadata.row_group(0).column(2).geo_statistics(); - assert!(geo_statistics.is_some()); - - let expected_bbox = BoundingBox::new(10.0, 40.0, 10.0, 40.0) - .with_zrange(30.0, 80.0) - .with_mrange(200.0, 1600.0); - let expected_geospatial_types = vec![ - 1, 2, 3, 4, 5, 6, 7, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 2001, 2002, 2003, 2004, - 2005, 2006, 2007, 3001, 3002, 3003, 3004, 3005, 3006, 3007, - ]; - assert_eq!( - geo_statistics.unwrap().geospatial_types, - Some(expected_geospatial_types) - ); - assert_eq!(geo_statistics.unwrap().bbox, Some(expected_bbox)); - } } diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs index 5ef068da915b..ee8e2a54de4e 100644 --- a/parquet/src/schema/printer.rs +++ b/parquet/src/schema/printer.rs @@ -327,8 +327,20 @@ fn print_logical_and_converted( LogicalType::Map => "MAP".to_string(), LogicalType::Float16 => "FLOAT16".to_string(), LogicalType::Variant => "VARIANT".to_string(), - LogicalType::Geometry => "GEOMETRY".to_string(), - LogicalType::Geography => "GEOGRAPHY".to_string(), + LogicalType::Geometry { crs } => { + if let Some(crs) = crs { + format!("GEOMETRY({crs})") + } else { + "GEOMETRY".to_string() + } + } + LogicalType::Geography { crs, algorithm } => { + if let Some(crs) = crs { + format!("GEOGRAPHY({algorithm}, {crs})") + } else { + format!("GEOGRAPHY({algorithm})") + } + } LogicalType::Unknown => "UNKNOWN".to_string(), }, None => { @@ -449,7 +461,7 @@ mod tests { use std::sync::Arc; - use crate::basic::{Repetition, Type as PhysicalType}; + use crate::basic::{EdgeInterpolationAlgorithm, Repetition, Type as PhysicalType}; use crate::errors::Result; use crate::schema::parser::parse_message_type; @@ -779,6 +791,62 @@ mod tests { .unwrap(), "REQUIRED BYTE_ARRAY field [42] (STRING);", ), + ( + build_primitive_type( + "field", + None, + PhysicalType::BYTE_ARRAY, + Some(LogicalType::Geometry { crs: None }), + ConvertedType::NONE, + Repetition::REQUIRED, + ) + .unwrap(), + "REQUIRED BYTE_ARRAY field (GEOMETRY);", + ), + ( + build_primitive_type( + "field", + None, + PhysicalType::BYTE_ARRAY, + Some(LogicalType::Geometry { + crs: Some("non-missing CRS".to_string()), + }), + ConvertedType::NONE, + Repetition::REQUIRED, + ) + .unwrap(), + "REQUIRED BYTE_ARRAY field (GEOMETRY(non-missing CRS));", + ), + ( + build_primitive_type( + "field", + None, + PhysicalType::BYTE_ARRAY, + Some(LogicalType::Geography { + crs: None, + algorithm: EdgeInterpolationAlgorithm::default(), + }), + ConvertedType::NONE, + Repetition::REQUIRED, + ) + .unwrap(), + "REQUIRED BYTE_ARRAY field (GEOGRAPHY(SPHERICAL));", + ), + ( + build_primitive_type( + "field", + None, + PhysicalType::BYTE_ARRAY, + Some(LogicalType::Geography { + crs: Some("non-missing CRS".to_string()), + algorithm: EdgeInterpolationAlgorithm::default(), + }), + ConvertedType::NONE, + Repetition::REQUIRED, + ) + .unwrap(), + "REQUIRED BYTE_ARRAY field (GEOGRAPHY(SPHERICAL, non-missing CRS));", + ), ]; types_and_strings.into_iter().for_each(|(field, expected)| { diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 2f6131571ecc..3b8d1328ac00 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -401,8 +401,8 @@ impl<'a> PrimitiveTypeBuilder<'a> { (LogicalType::String, PhysicalType::BYTE_ARRAY) => {} (LogicalType::Json, PhysicalType::BYTE_ARRAY) => {} (LogicalType::Bson, PhysicalType::BYTE_ARRAY) => {} - (LogicalType::Geometry, PhysicalType::BYTE_ARRAY) => {} - (LogicalType::Geography, PhysicalType::BYTE_ARRAY) => {} + (LogicalType::Geometry { .. }, PhysicalType::BYTE_ARRAY) => {} + (LogicalType::Geography { .. }, PhysicalType::BYTE_ARRAY) => {} (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) if self.length == 16 => {} (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) => { return Err(general_err!( diff --git a/parquet/tests/geospatial.rs b/parquet/tests/geospatial.rs new file mode 100644 index 000000000000..733aa8864f3b --- /dev/null +++ b/parquet/tests/geospatial.rs @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests for Geometry and Geography logical types + +use parquet::{ + basic::{EdgeInterpolationAlgorithm, LogicalType}, + file::{ + metadata::ParquetMetaData, + reader::{FileReader, SerializedFileReader}, + }, + geospatial::bounding_box::BoundingBox, +}; +use serde_json::Value; +use std::fs::File; + +fn read_metadata(geospatial_test_file: &str) -> ParquetMetaData { + let path = format!( + "{}/geospatial/{geospatial_test_file}", + arrow::util::test_util::parquet_test_data(), + ); + let file = File::open(path).unwrap(); + let reader = SerializedFileReader::try_from(file).unwrap(); + reader.metadata().clone() +} + +#[test] +fn test_read_logical_type() { + // Some crs values are short strings + let expected_logical_type = [ + ("crs-default.parquet", LogicalType::Geometry { crs: None }), + ( + "crs-srid.parquet", + LogicalType::Geometry { + crs: Some("srid:5070".to_string()), + }, + ), + ( + "crs-projjson.parquet", + LogicalType::Geometry { + crs: Some("projjson:projjson_epsg_5070".to_string()), + }, + ), + ( + "crs-geography.parquet", + LogicalType::Geography { + crs: None, + algorithm: EdgeInterpolationAlgorithm::SPHERICAL, + }, + ), + ]; + + for (geospatial_file, expected_type) in expected_logical_type { + let metadata = read_metadata(geospatial_file); + let logical_type = metadata + .file_metadata() + .schema_descr() + .column(1) + .logical_type() + .unwrap(); + + assert_eq!(logical_type, expected_type); + } + + // The crs value may also contain arbitrary values (in this case some JSON + // a bit too lengthy to type out) + let metadata = read_metadata("crs-arbitrary-value.parquet"); + let logical_type = metadata + .file_metadata() + .schema_descr() + .column(1) + .logical_type() + .unwrap(); + + if let LogicalType::Geometry { crs } = logical_type { + let crs_parsed: Value = serde_json::from_str(&crs.unwrap()).unwrap(); + assert_eq!(crs_parsed.get("id").unwrap().get("code").unwrap(), 5070); + } else { + panic!("Expected geometry type but got {logical_type:?}"); + } +} + +#[test] +fn test_read_geospatial_statistics() { + let metadata = read_metadata("geospatial.parquet"); + + // geospatial.parquet schema: + // optional binary field_id=-1 group (String); + // optional binary field_id=-1 wkt (String); + // optional binary field_id=-1 geometry (Geometry(crs=)); + let fields = metadata.file_metadata().schema().get_fields(); + let logical_type = fields[2].get_basic_info().logical_type().unwrap(); + assert_eq!(logical_type, LogicalType::Geometry { crs: None }); + + let geo_statistics = metadata.row_group(0).column(2).geo_statistics(); + assert!(geo_statistics.is_some()); + + let expected_bbox = BoundingBox::new(10.0, 40.0, 10.0, 40.0) + .with_zrange(30.0, 80.0) + .with_mrange(200.0, 1600.0); + let expected_geospatial_types = vec![ + 1, 2, 3, 4, 5, 6, 7, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 2001, 2002, 2003, 2004, + 2005, 2006, 2007, 3001, 3002, 3003, 3004, 3005, 3006, 3007, + ]; + assert_eq!( + geo_statistics.unwrap().geospatial_types(), + Some(&expected_geospatial_types) + ); + assert_eq!(geo_statistics.unwrap().bounding_box(), Some(&expected_bbox)); +}