diff --git a/Cargo.lock b/Cargo.lock index 9101ba3..c5358e9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -981,6 +981,19 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +[[package]] +name = "bigdecimal" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a22f228ab7a1b23027ccc6c350b72868017af7ea8356fbdf19f8d991c690013" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "bumpalo" version = "3.16.0" @@ -1174,6 +1187,28 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jiff" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f33145a5cbea837164362c7bd596106eb7c5198f97d1ba6f6ebb3223952e488" +dependencies = [ + "jiff-static", + "portable-atomic", + "portable-atomic-util", +] + +[[package]] +name = "jiff-static" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43ce13c40ec6956157a3635d97a1ee2df323b263f09ea14165131289cb0f5c19" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "js-sys" version = "0.3.70" @@ -1288,6 +1323,27 @@ dependencies = [ "serde", ] +[[package]] +name = "marrow-convert" +version = "0.1.0" +dependencies = [ + "bigdecimal", + "chrono", + "jiff", + "marrow", + "marrow-convert-derive", + "uuid", +] + +[[package]] +name = "marrow-convert-derive" +version = "0.1.0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "memchr" version = "2.7.4" @@ -1379,24 +1435,33 @@ dependencies = [ [[package]] name = "portable-atomic" -version = "1.9.0" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" + +[[package]] +name = "portable-atomic-util" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] [[package]] name = "proc-macro2" -version = "1.0.86" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.37" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] @@ -1468,9 +1533,9 @@ checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" [[package]] name = "syn" -version = "2.0.79" +version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" dependencies = [ "proc-macro2", "quote", @@ -1540,6 +1605,12 @@ version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" +[[package]] +name = "uuid" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9" + [[package]] name = "version_check" version = "0.9.5" diff --git a/Cargo.toml b/Cargo.toml index e169fc3..b610719 100644 --- a/Cargo.toml 
+++ b/Cargo.toml @@ -1,6 +1,5 @@ [workspace] -members = ["marrow", "test_with_arrow"] -default-members = ["marrow", "test_with_arrow"] +members = ["marrow", "marrow-convert", "marrow-convert-derive", "test_with_arrow"] resolver = "2" diff --git a/marrow-convert-derive/Cargo.toml b/marrow-convert-derive/Cargo.toml new file mode 100644 index 0000000..d369e5f --- /dev/null +++ b/marrow-convert-derive/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "marrow-convert-derive" +version = "0.1.0" +edition = "2024" + +[lib] +proc-macro = true + +[dependencies] +syn = "2.0" +quote = "1.0" +proc-macro2 = "1.0" diff --git a/marrow-convert-derive/src/array_push.rs b/marrow-convert-derive/src/array_push.rs new file mode 100644 index 0000000..c3c5043 --- /dev/null +++ b/marrow-convert-derive/src/array_push.rs @@ -0,0 +1,92 @@ +use quote::{format_ident, quote}; +use syn::{ + Data, DataEnum, DataStruct, DeriveInput, Fields, GenericParam, Ident, Type, spanned::Spanned, +}; + +pub fn derive_array_push(input: proc_macro2::TokenStream) -> proc_macro2::TokenStream { + let input: DeriveInput = syn::parse2(input).unwrap(); + + if input + .generics + .params + .iter() + .any(|p| matches!(p, GenericParam::Type(_))) + { + panic!("Deriving TypeInfo for generics with type parameters is not supported") + } + + match &input.data { + Data::Struct(data) => derive_for_struct(&input, data), + Data::Enum(data) => derive_for_enum(&input, data), + Data::Union(_) => panic!("Deriving TypeInfo for unions is not supported"), + } +} + +fn derive_for_struct(input: &DeriveInput, data: &DataStruct) -> proc_macro2::TokenStream { + if data.fields.len() > 16 { /* StructBuilder has tuple impls for up to 16 children, so exactly 16 fields is still supported */ + panic!("Only structs with at most 16 fields are supported"); + } + + let ident = &input.ident; + let generics = &input.generics; + let generics = if generics.params.is_empty() { + quote! {} + } else { + quote!
{ #generics, } + }; + + let mut field_defs = Vec::new(); + let mut field_uses = Vec::new(); + let mut field_push = Vec::new(); + + for (idx, (name, ty)) in get_fields_and_names(&data.fields).into_iter().enumerate() { + let ident = format_ident!("T{idx}"); + field_defs.push(quote! { #ident : ::marrow_convert::builder::ArrayPush<#ty> }); + field_uses.push(quote! { #ident }); + field_push.push( + quote! { ::marrow_convert::builder::ArrayPush::push_value(#ident, &value.#name )?; }, + ); + } + + quote! { + const _: () = { + impl<#generics #(#field_defs),*> ::marrow_convert::builder::ArrayPush<#ident> for ::marrow_convert::builder::compound::StructBuilder<(#(#field_uses,)*)> { + fn push_value(&mut self, value: &#ident) -> ::marrow_convert::Result<()> { + self.len += 1; + let (#(#field_uses,)*) = &mut self.children; + #(#field_push)* + Ok(()) + } + } + }; + } +} + +pub fn get_fields_and_names(fields: &Fields) -> Vec<(Ident, Type)> { + let mut result = Vec::new(); + match fields { + Fields::Named(fields) => { + for field in &fields.named { + let ident = field.ident.clone().expect("Named field without ident"); + let ty = field.ty.clone(); + result.push((ident, ty)); + } + } + Fields::Unnamed(fields) => { + for (idx, field) in fields.unnamed.iter().enumerate() { + result.push(( + Ident::new(&idx.to_string(), field.ty.span()), + field.ty.clone(), + )); + } + } + Fields::Unit => unimplemented!("Unit structs are currently not implemented"), + } + + result +} + +fn derive_for_enum(input: &DeriveInput, data: &DataEnum) -> proc_macro2::TokenStream { + let _ = (input, data); + todo!() +} diff --git a/marrow-convert-derive/src/default_builder.rs b/marrow-convert-derive/src/default_builder.rs new file mode 100644 index 0000000..d825cfa --- /dev/null +++ b/marrow-convert-derive/src/default_builder.rs @@ -0,0 +1,104 @@ +use quote::{format_ident, quote}; +use syn::{Data, DataEnum, DataStruct, DeriveInput, GenericParam, LitStr}; + +use super::array_push::get_fields_and_names; + +pub fn 
derive_default_builder(input: proc_macro2::TokenStream) -> proc_macro2::TokenStream { + let input: DeriveInput = syn::parse2(input).unwrap(); + + if input + .generics + .params + .iter() + .any(|p| matches!(p, GenericParam::Type(_))) + { + panic!("Deriving TypeInfo for generics with type parameters is not supported") + } + + match &input.data { + Data::Struct(data) => derive_for_struct(&input, data), + Data::Enum(data) => derive_for_enum(&input, data), + Data::Union(_) => panic!("Deriving TypeInfo for unions is not supported"), + } +} + +fn derive_for_struct(input: &DeriveInput, data: &DataStruct) -> proc_macro2::TokenStream { + if data.fields.len() > 16 { /* StructBuilder has tuple impls for up to 16 children, so exactly 16 fields is still supported */ + panic!("Only structs with at most 16 fields are supported"); + } + + let ident = &input.ident; + + let builder_ident = format_ident!("{ident}Builder"); + + let mut field_uses = Vec::new(); + let mut field_push = Vec::new(); + let mut field_builders = Vec::new(); + let mut field_metas = Vec::new(); + let mut field_inits = Vec::new(); + + for (idx, (name, ty)) in get_fields_and_names(&data.fields).into_iter().enumerate() { + let ident = format_ident!("t{idx}"); + field_uses.push(quote! { #ident }); + field_push.push( + quote! { ::marrow_convert::builder::ArrayPush::push_value(#ident, &value.#name )?; }, + ); + + let field_name = LitStr::new(&name.to_string(), name.span()); + + field_builders + .push(quote! { <#ty as ::marrow_convert::builder::DefaultArrayBuilder>::ArrayBuilder }); + field_metas.push(quote! { + ::marrow::datatypes::FieldMeta { + name: String::from(#field_name), + ..::std::default::Default::default() + } + }); + field_inits.push(quote! { + <#ty as ::marrow_convert::builder::DefaultArrayBuilder>::default_builder() + }) + } + + return quote!
{ + const _: () = { + pub struct #builder_ident(::marrow_convert::builder::compound::StructBuilder<(#(#field_builders,)*)>); + + impl ::marrow_convert::builder::DefaultArrayBuilder for #ident { + type ArrayBuilder = #builder_ident; + + fn default_builder() -> Self::ArrayBuilder { + #builder_ident(::marrow_convert::builder::compound::StructBuilder { + len: 0, + meta: vec![#(#field_metas),*], + children: (#(#field_inits,)*), + }) + } + } + + impl ::marrow_convert::builder::ArrayBuilder for #builder_ident { + fn push_default(&mut self) -> ::marrow_convert::Result<()> { + self.0.push_default() + } + + fn build_array(&mut self) -> ::marrow_convert::Result<::marrow::array::Array> { + self.0.build_array() + } + } + + impl ::marrow_convert::builder::ArrayPush<#ident> for #builder_ident { + fn push_value(&mut self, value: &#ident) -> ::marrow_convert::Result<()> { + self.0.len += 1; + let (#(#field_uses,)*) = &mut self.0.children; + #(#field_push)* + Ok(()) + } + } + + }; + }; +} + +fn derive_for_enum(input: &DeriveInput, data: &DataEnum) -> proc_macro2::TokenStream { + let _ = (input, data); + todo!() +} diff --git a/marrow-convert-derive/src/lib.rs b/marrow-convert-derive/src/lib.rs new file mode 100644 index 0000000..9dc0e0d --- /dev/null +++ b/marrow-convert-derive/src/lib.rs @@ -0,0 +1,20 @@ +use proc_macro::TokenStream; + +mod array_push; +mod default_builder; +mod type_info; + +#[proc_macro_derive(DefaultArrayType, attributes(marrow))] +pub fn derive_type_info(input: TokenStream) -> TokenStream { + type_info::derive_type_info(input.into()).into() +} + +#[proc_macro_derive(ArrayPush, attributes(marrow))] +pub fn derive_array_push(input: TokenStream) -> TokenStream { + array_push::derive_array_push(input.into()).into() +} + +#[proc_macro_derive(DefaultArrayBuilder, attributes(marrow))] +pub fn derive_default_builder(input: TokenStream) -> TokenStream { + default_builder::derive_default_builder(input.into()).into() +} diff --git a/marrow-convert-derive/src/type_info.rs 
b/marrow-convert-derive/src/type_info.rs new file mode 100644 index 0000000..4cb8a3f --- /dev/null +++ b/marrow-convert-derive/src/type_info.rs @@ -0,0 +1,326 @@ +use quote::{ToTokens, quote}; +use syn::{ + Attribute, Data, DataEnum, DataStruct, DeriveInput, Expr, Field, Fields, GenericParam, Ident, + Lit, LitStr, Meta, Token, punctuated::Punctuated, spanned::Spanned, +}; + +pub fn derive_type_info(input: proc_macro2::TokenStream) -> proc_macro2::TokenStream { + let input: DeriveInput = syn::parse2(input).unwrap(); + + if input + .generics + .params + .iter() + .any(|p| matches!(p, GenericParam::Type(_))) + { + panic!("Deriving TypeInfo for generics with type parameters is not supported") + } + + match &input.data { + Data::Struct(data) => derive_for_struct(&input, data), + Data::Enum(data) => derive_for_enum(&input, data), + Data::Union(_) => panic!("Deriving TypeInfo for unions is not supported"), + } +} + +#[derive(Debug, Default)] +struct FieldArgs { + // TODO: use a path here + with: Option, +} + +impl FieldArgs { + pub fn from_attrs(attrs: &[Attribute]) -> Self { + let mut result = Self::default(); + + for attr in attrs { + if !attr.path().is_ident("marrow") { + continue; + } + + let nested = attr + .parse_args_with(Punctuated::::parse_terminated) + .unwrap(); + for meta in nested { + match meta { + Meta::NameValue(meta) => { + if !meta.path.is_ident("with") { + continue; + } + match meta.value { + Expr::Lit(lit) => match lit.lit { + Lit::Str(str) => { + result.with = Some(Ident::new(&str.value(), str.span())); + } + _ => unimplemented!(), + }, + _ => unimplemented!(), + } + } + _ => unimplemented!(), + } + } + } + result + } +} + +#[derive(Debug, Default)] +struct VariantArgs { + with: Option, +} + +impl VariantArgs { + pub fn from_attrs(attrs: &[Attribute]) -> Self { + let mut result = Self::default(); + + for attr in attrs { + if !attr.path().is_ident("marrow") { /* the derive only registers the `marrow` helper attribute, same as FieldArgs */ + continue; + } + + let nested = attr +
.parse_args_with(Punctuated::::parse_terminated) + .unwrap(); + for meta in nested { + match meta { + Meta::NameValue(meta) => { + if !meta.path.is_ident("with") { + continue; + } + match meta.value { + Expr::Lit(lit) => match lit.lit { + Lit::Str(str) => { + result.with = Some(Ident::new(&str.value(), str.span())); + } + _ => unimplemented!(), + }, + _ => unimplemented!(), + } + } + _ => unimplemented!(), + } + } + } + result + } +} + +fn derive_for_struct(input: &DeriveInput, data: &DataStruct) -> proc_macro2::TokenStream { + let name = &input.ident; + + let generics_decl = &input.generics; + let generics_use = if !input.generics.params.is_empty() { + let generics_use = input.generics.params.iter().map(|p| match p { + GenericParam::Const(p) => p.ident.to_token_stream(), + GenericParam::Lifetime(p) => p.lifetime.to_token_stream(), + GenericParam::Type(_) => panic!(), + }); + quote! { + <#(#generics_use),*> + } + } else { + quote! {} + }; + + let fields = get_fields(&data.fields); + let body = match fields.as_slice() { + [] => panic!(), + [(NameSource::Index, _, field)] => { + // TODO: ensure no args + let field_ty = &field.ty; + quote! { <#field_ty>::get_field(context) } + } + fields => { + let mut field_exprs = Vec::new(); + + for (_, field_name, field) in fields { + let ty = &field.ty; + let args = FieldArgs::from_attrs(&field.attrs); + + if let Some(func) = args.with.as_ref() { + field_exprs.push(quote! { + fields.push(context.nest(#field_name, #func::<#ty>)?); + }); + } else { + field_exprs.push(quote! { + fields.push(context.get_field::<#ty>(#field_name)?); + }) + } + } + + quote! { + let mut fields = ::std::vec::Vec::<::marrow::datatypes::Field>::new(); + #( #field_exprs; )* + + Ok(::marrow::datatypes::Field { + name: ::std::string::String::from(context.get_name()), + data_type: ::marrow::datatypes::DataType::Struct(fields), + nullable: false, + metadata: ::std::default::Default::default(), + }) + } + } + }; + + quote! 
{ + const _: () = { + impl #generics_decl ::marrow_convert::types::DefaultArrayType for #name #generics_use { + fn get_field( + context: ::marrow_convert::types::Context<'_>, + ) -> ::marrow_convert::Result<::marrow::datatypes::Field> { + #body + } + } + }; + } +} + +fn derive_for_enum(input: &DeriveInput, data: &DataEnum) -> proc_macro2::TokenStream { + let mut variant_exprs = Vec::new(); + + let name = &input.ident; + let generics_decl = &input.generics; + let generics_use = if !input.generics.params.is_empty() { + let generics_use = input.generics.params.iter().map(|p| match p { + GenericParam::Const(p) => p.ident.to_token_stream(), + GenericParam::Lifetime(p) => p.lifetime.to_token_stream(), + GenericParam::Type(_) => panic!(), + }); + quote! { + <#(#generics_use),*> + } + } else { + quote! {} + }; + + for (idx, variant) in data.variants.iter().enumerate() { + let variant_name = &variant.ident; + let variant_name = LitStr::new(&variant_name.to_string(), variant_name.span()); + let variant_args = VariantArgs::from_attrs(&variant.attrs); + + if let Some(func) = variant_args.with.as_ref() { + variant_exprs.push(quote! { #func(stringify!(#variant_name)) }); + continue; + } + + let variant_idx = i8::try_from(idx).unwrap(); + + let fields = get_fields(&variant.fields); + match fields.as_slice() { + [] => { + // use nesting to allow overwrites + variant_exprs.push(quote! { + (#variant_idx, context.nest(#variant_name, |context| { + Ok(::marrow::datatypes::Field { + name: ::std::string::String::from(context.get_name()), + data_type: ::marrow::datatypes::DataType::Null, + nullable: true, + metadata: ::std::default::Default::default(), + }) + })?) + }); + } + [(NameSource::Index, _, field)] => { + let field_ty = &field.ty; + variant_exprs.push(quote! { + (#variant_idx, context.nest(#variant_name, <#field_ty>::get_field)?) 
+ }); + } + fields => { + let mut field_exprs = Vec::new(); + for (_, field_name, field) in fields { + let field_ty = &field.ty; + field_exprs.push(quote! { + context.get_field::<#field_ty>(#field_name)? + }); + } + variant_exprs.push(quote! { + (#variant_idx, context.nest(#variant_name, |context| Ok(::marrow::datatypes::Field { + name: ::std::string::String::from(context.get_name()), + data_type: ::marrow::datatypes::DataType::Struct(vec![#(#field_exprs),*]), + nullable: false, + metadata: ::std::default::Default::default(), + }))?) + }); + } + } + } + + quote! { + const _: () = { + impl #generics_decl ::marrow_convert::types::DefaultArrayType for #name #generics_use { + fn get_field( + context: ::marrow_convert::types::Context<'_>, + ) -> ::marrow_convert::Result<::marrow::datatypes::Field> { + let mut variants = ::std::vec::Vec::<(::std::primitive::i8, ::marrow::datatypes::Field)>::new(); + #( variants.push(#variant_exprs); )* + + Ok(::marrow::datatypes::Field { + name: ::std::string::String::from(context.get_name()), + data_type: ::marrow::datatypes::DataType::Union(variants, ::marrow::datatypes::UnionMode::Dense), + nullable: false, + metadata: ::std::default::Default::default(), + }) + } + } + }; + } +} + +fn get_fields(fields: &Fields) -> Vec<(NameSource, LitStr, &Field)> { + let mut result = Vec::new(); + match fields { + Fields::Unit => {} + Fields::Named(fields) => { + for field in &fields.named { + let Some(name) = field.ident.as_ref() else { + unreachable!("Named field must have a name"); + }; + let name = LitStr::new(&name.to_string(), name.span()); + result.push((NameSource::Ident, name, field)); + } + } + Fields::Unnamed(fields) => { + for (idx, field) in fields.unnamed.iter().enumerate() { + let name = LitStr::new(&idx.to_string(), field.span()); + result.push((NameSource::Index, name, field)); + } + } + } + result +} + +#[derive(Debug, Clone, Copy, PartialEq)] +enum NameSource { + Ident, + Index, +} + +#[test] +#[should_panic(expected = "Deriving 
TypeInfo for generics with type parameters is not supported")] +fn reject_unsupported() { + derive_type_info(quote! { + struct Example { + field: T, + } + }); +} + +#[test] +fn lifetimes_are_supported() { + derive_type_info(quote! { + struct Example<'a> { + field: &'a i64, + } + }); +} + +#[test] +fn const_params_are_supported() { + derive_type_info(quote! { + struct Example { + field: [u8; N], + } + }); +} diff --git a/marrow-convert/Cargo.toml b/marrow-convert/Cargo.toml new file mode 100644 index 0000000..adc86af --- /dev/null +++ b/marrow-convert/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "marrow-convert" +version = "0.1.0" +edition = "2024" + +[dependencies] +marrow = { path = "../marrow", default-features = false } +marrow-convert-derive = { path = "../marrow-convert-derive" } + +jiff = { version = "0.2", default-features = false } + +chrono = { version = "0.4", default-features = false } +bigdecimal = {version = "0.4", default-features = false } +uuid = { version = "1.10.0", default-features = false} diff --git a/marrow-convert/Design.md b/marrow-convert/Design.md new file mode 100644 index 0000000..684e637 --- /dev/null +++ b/marrow-convert/Design.md @@ -0,0 +1,9 @@ +# Overall design + +- M:N relationship between Rust and Arrow types + - A single Rust type can be converted into different Arrow types + - Different Rust types can be converted into the same Arrow type + - E.g., `jiff::Timestamp` and `chrono::DateTime` can both be converted to the Arrow + `Timestamp` type + - E.g., `jiff::Timestamp` can be converted to both the Arrow `Timestamp` and the Arrow `Utf8` type +- Allow the builders to be fully specified at compile time \ No newline at end of file diff --git a/marrow-convert/src/error.rs b/marrow-convert/src/error.rs new file mode 100644 index 0000000..b2a3d2c --- /dev/null +++ b/marrow-convert/src/error.rs @@ -0,0 +1,26 @@ +use std::{convert::Infallible, num::TryFromIntError}; + +pub type Result = std::result::Result; + +#[derive(Debug, PartialEq)]
+pub struct Error(pub(crate) String); + +impl std::error::Error for Error {} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Error({:?})", self.0) + } +} + +impl From for Error { + fn from(_: Infallible) -> Self { + unreachable!() + } +} + +impl From for Error { + fn from(value: TryFromIntError) -> Self { + Self(value.to_string()) + } +} diff --git a/marrow-convert/src/internal/builder/list.rs b/marrow-convert/src/internal/builder/list.rs new file mode 100644 index 0000000..93d8183 --- /dev/null +++ b/marrow-convert/src/internal/builder/list.rs @@ -0,0 +1,152 @@ +use marrow::{ + array::{Array, ListArray}, + datatypes::FieldMeta, +}; + +use crate::{Error, Result}; + +use super::{ArrayBuilder, ArrayPush, DefaultArrayBuilder}; + +struct GenericListBuilder { + offsets: Vec, + builder: B, +} + +impl GenericListBuilder { + pub fn new(builder: B) -> Self { + Self { + offsets: vec![O::default()], + builder, + } + } +} + +trait Offset: Default + Copy + std::ops::Add { + const ONE: Self; + const ARRAY_VARIANT: fn(ListArray) -> Array; +} + +impl Offset for i32 { + const ONE: Self = 1; + const ARRAY_VARIANT: fn(ListArray) -> Array = Array::List; +} + +impl Offset for i64 { + const ONE: Self = 1; + const ARRAY_VARIANT: fn(ListArray) -> Array = Array::LargeList; +} + +impl ArrayBuilder for GenericListBuilder { + fn push_default(&mut self) -> Result<()> { + let Some(last_offset) = self.offsets.last() else { + return Err(Error(String::from("invalid state"))); + }; + self.offsets.push(*last_offset); + Ok(()) + } + + fn build_array(&mut self) -> Result { + Ok(O::ARRAY_VARIANT(ListArray { + validity: None, + offsets: std::mem::replace(&mut self.offsets, vec![O::default()]), + meta: FieldMeta { + name: String::from("element"), + ..Default::default() + }, + elements: Box::new(self.builder.build_array()?), + })) + } +} + +impl> ArrayPush<[T]> for GenericListBuilder { + fn push_value(&mut self, value: &[T]) -> 
Result<()> { + let Some(last_offset) = self.offsets.last().copied() else { + return Err(Error(String::from("invalid state"))); + }; + + let mut pushed = O::default(); + for item in value { + self.builder.push_value(item)?; + pushed = pushed + O::ONE; + } + + self.offsets.push(last_offset + pushed); + Ok(()) + } +} + +pub struct ListBuilder(GenericListBuilder); + +impl ListBuilder { + pub fn new(builder: B) -> Self { + Self(GenericListBuilder::new(builder)) + } +} + +impl ArrayBuilder for ListBuilder { + fn push_default(&mut self) -> Result<()> { + self.0.push_default() + } + + fn build_array(&mut self) -> Result { + self.0.build_array() + } +} + +impl> ArrayPush<[T]> for ListBuilder { + fn push_value(&mut self, value: &[T]) -> Result<()> { + self.0.push_value(value) + } +} + +impl> ArrayPush> for ListBuilder { + fn push_value(&mut self, value: &Vec) -> Result<()> { + self.0.push_value(value.as_slice()) + } +} + +pub struct LargeListBuilder(GenericListBuilder); + +impl LargeListBuilder { + pub fn new(builder: B) -> Self { + Self(GenericListBuilder::new(builder)) + } +} + +impl ArrayBuilder for LargeListBuilder { + fn push_default(&mut self) -> Result<()> { + self.0.push_default() + } + + fn build_array(&mut self) -> Result { + self.0.build_array() + } +} + +impl> ArrayPush<[T]> for LargeListBuilder { + fn push_value(&mut self, value: &[T]) -> Result<()> { + self.0.push_value(value) + } +} + +impl> ArrayPush> for LargeListBuilder { + fn push_value(&mut self, value: &Vec) -> Result<()> { + self.0.push_value(value.as_slice()) + } +} + +impl DefaultArrayBuilder for Vec { + type ArrayBuilder = LargeListBuilder; + + fn default_builder() -> Self::ArrayBuilder { + LargeListBuilder::new(T::default_builder()) + } +} + +impl DefaultArrayBuilder for [T] { + type ArrayBuilder = LargeListBuilder; + + fn default_builder() -> Self::ArrayBuilder { + LargeListBuilder::new(T::default_builder()) + } +} diff --git a/marrow-convert/src/internal/builder/mod.rs 
b/marrow-convert/src/internal/builder/mod.rs new file mode 100644 index 0000000..6f096a7 --- /dev/null +++ b/marrow-convert/src/internal/builder/mod.rs @@ -0,0 +1,52 @@ +use marrow::array::Array; + +use crate::Result; + +pub mod list; +pub mod option; +pub mod primitive; +pub mod r#struct; +pub mod union; + +pub trait ArrayBuilder { + fn push_default(&mut self) -> Result<()>; + fn build_array(&mut self) -> Result; +} + +pub trait ArrayPush: ArrayBuilder { + fn push_value(&mut self, value: &T) -> Result<()>; +} + +impl> ArrayPush<&T> for B { + fn push_value(&mut self, value: &&T) -> Result<()> { + self.push_value(*value) + } +} + +impl> ArrayPush<&mut T> for B { + fn push_value(&mut self, value: &&mut T) -> Result<()> { + self.push_value(*value) + } +} + +pub trait DefaultArrayBuilder { + type ArrayBuilder: ArrayBuilder; + + fn default_builder() -> Self::ArrayBuilder; +} + +impl DefaultArrayBuilder for &T { + type ArrayBuilder = T::ArrayBuilder; + + fn default_builder() -> Self::ArrayBuilder { + T::default_builder() + } +} + +impl DefaultArrayBuilder for &mut T { + type ArrayBuilder = T::ArrayBuilder; + + fn default_builder() -> Self::ArrayBuilder { + T::default_builder() + } +} diff --git a/marrow-convert/src/internal/builder/option.rs b/marrow-convert/src/internal/builder/option.rs new file mode 100644 index 0000000..1ce8e88 --- /dev/null +++ b/marrow-convert/src/internal/builder/option.rs @@ -0,0 +1,69 @@ +use marrow::array::Array; + +use crate::{Error, Result}; + +use super::{ArrayBuilder, ArrayPush, DefaultArrayBuilder}; + +pub struct OptionBuilder { + len: usize, + validity: Vec, + builder: B, +} + +impl OptionBuilder { + pub fn new(builder: B) -> Self { + Self { + len: 0, + validity: Vec::new(), + builder, + } + } +} + +impl ArrayBuilder for OptionBuilder { + fn push_default(&mut self) -> Result<()> { + marrow::bits::push(&mut self.validity, &mut self.len, false); + self.builder.push_default()?; + Ok(()) + } + + fn build_array(&mut self) -> Result { + let 
array = self.builder.build_array()?; + let validity = std::mem::take(&mut self.validity); + let _ = std::mem::take(&mut self.len); + with_validity(array, validity) + } +} + +impl> ArrayPush> for OptionBuilder { + fn push_value(&mut self, value: &Option) -> Result<()> { + match value { + Some(value) => { + marrow::bits::push(&mut self.validity, &mut self.len, true); + self.builder.push_value(value) + } + None => self.push_default(), + } + } +} + +impl DefaultArrayBuilder for Option { + type ArrayBuilder = OptionBuilder; + + fn default_builder() -> Self::ArrayBuilder { + OptionBuilder::new(T::default_builder()) + } +} + +fn with_validity(array: Array, validity: Vec) -> Result { + // TODO: check compatibility + match array { + Array::Null(array) => Ok(Array::Null(array)), + Array::Boolean(mut array) => { + array.validity = Some(validity); + Ok(Array::Boolean(array)) + } + // TODO: add more .. + _ => Err(Error(String::from("Cannot set valditiy for array"))), + } +} diff --git a/marrow-convert/src/internal/builder/primitive.rs b/marrow-convert/src/internal/builder/primitive.rs new file mode 100644 index 0000000..edc69e7 --- /dev/null +++ b/marrow-convert/src/internal/builder/primitive.rs @@ -0,0 +1,162 @@ +use marrow::{ + array::{Array, BooleanArray, NullArray, PrimitiveArray}, + types::f16, +}; + +use crate::Result; + +use super::{ArrayBuilder, ArrayPush, DefaultArrayBuilder}; + +#[derive(Debug, Default)] +struct PrimitiveBuilder { + values: Vec, + build_impl: B, +} + +trait BuildPrimitiveArrayImpl { + fn build(&self, values: &mut Vec) -> Result; +} + +impl> ArrayBuilder for PrimitiveBuilder { + fn push_default(&mut self) -> Result<()> { + self.values.push(T::default()); + Ok(()) + } + + fn build_array(&mut self) -> Result { + self.build_impl.build(&mut self.values) + } +} + +#[derive(Debug, Default)] +struct BuildNative; + +macro_rules! 
impl_build_native { + ($(($ty:ident, $variant:ident),)*) => { + $( + impl BuildPrimitiveArrayImpl<$ty> for BuildNative { + fn build(&self, values: &mut Vec<$ty>) -> Result { + Ok(Array::$variant(PrimitiveArray { + validity: None, + values: std::mem::take(values), + })) + } + } + )* + }; +} + +impl_build_native!( + (i8, Int8), + (i16, Int16), + (i32, Int32), + (i64, Int64), + (u8, UInt8), + (u16, UInt16), + (u32, UInt32), + (u64, UInt64), + (f16, Float16), + (f32, Float32), + (f64, Float64), +); + +macro_rules! define_builder { + ($(($builder:ident, $ty:ident),)*) => { + $( + #[derive(Debug, Default)] + pub struct $builder(PrimitiveBuilder<$ty, BuildNative>); + + impl ArrayBuilder for $builder { + fn push_default(&mut self) -> Result<()> { + self.0.push_default() + } + + fn build_array(&mut self) -> Result { + self.0.build_array() + } + } + + impl ArrayPush<$ty> for $builder { + fn push_value(&mut self, value: &$ty) -> Result<()> { + self.0.values.push(*value); + Ok(()) + } + } + + impl DefaultArrayBuilder for $ty { + type ArrayBuilder = $builder; + + fn default_builder() -> Self::ArrayBuilder { + $builder::default() + } + } + )* + }; +} + +define_builder!( + (Int8Builder, i8), + (Int16Builder, i16), + (Int32Builder, i32), + (Int64Builder, i64), + (UInt8Builder, u8), + (UInt16Builder, u16), + (UInt32Builder, u32), + (UInt64Builder, u64), + (Float16Builder, f16), + (Float32Builder, f32), + (Float64Builder, f64), +); + +#[derive(Debug, Default)] +pub struct NullBuilder(usize); + +impl ArrayBuilder for NullBuilder { + fn push_default(&mut self) -> Result<()> { + self.0 += 1; + Ok(()) + } + + fn build_array(&mut self) -> Result { + Ok(Array::Null(NullArray { + len: std::mem::take(&mut self.0), + })) + } +} + +impl DefaultArrayBuilder for () { + type ArrayBuilder = NullBuilder; + + fn default_builder() -> Self::ArrayBuilder { + NullBuilder::default() + } +} + +#[derive(Debug, Default)] +pub struct BooleanBuilder { + len: usize, + values: Vec, +} + +impl ArrayBuilder for 
BooleanBuilder { + fn push_default(&mut self) -> Result<()> { + marrow::bits::push(&mut self.values, &mut self.len, false); + Ok(()) + } + + fn build_array(&mut self) -> Result { + Ok(Array::Boolean(BooleanArray { + len: std::mem::take(&mut self.len), + values: std::mem::take(&mut self.values), + validity: None, + })) + } +} + +impl DefaultArrayBuilder for bool { + type ArrayBuilder = BooleanBuilder; + + fn default_builder() -> Self::ArrayBuilder { + BooleanBuilder::default() + } +} diff --git a/marrow-convert/src/internal/builder/struct.rs b/marrow-convert/src/internal/builder/struct.rs new file mode 100644 index 0000000..86e55fe --- /dev/null +++ b/marrow-convert/src/internal/builder/struct.rs @@ -0,0 +1,163 @@ +use marrow::{ + array::{Array, StructArray}, + datatypes::FieldMeta, +}; + +use crate::{Error, Result}; + +use super::ArrayBuilder; + +// TODO: add simple doc test showing how to implement a custom impl +/// Support to build struct builders +/// +/// When pushing a value the following invariants need to be observed: +/// +/// - A value must be pushed to each child field +/// - The `len` field must be incremented +/// +pub struct StructBuilder { + pub meta: Vec, + pub len: usize, + pub children: C, +} + +macro_rules! impl_struct_builder { + ($($el:ident,)*) => { + #[allow(non_snake_case, clippy::vec_init_then_push)] + impl<$($el: ArrayBuilder),*> ArrayBuilder for StructBuilder<($($el,)*)> { + fn push_default(&mut self) -> Result<()> { + let ($($el,)*) = &mut self.children; + self.len += 1; + $($el.push_default()?;)* + Ok(()) + } + + fn build_array(&mut self) -> Result { + let ($($el,)*) = &mut self.children; + let mut arrays = Vec::new(); + // TODO: ensure all builders are called? 
+ $(arrays.push($el.build_array()?);)* + + if arrays.len() != self.meta.len() { + return Err(Error(String::from("Not matching number of meta and children"))); + } + + let fields = std::iter::zip(&self.meta, arrays).map(|(meta, array)| (meta.clone(), array)).collect(); + + Ok(Array::Struct(StructArray { + len: self.len, + validity: None, + fields, + })) + } + } + }; +} + +// TODO: is a struct without fields valid? +impl_struct_builder!(A,); +impl_struct_builder!(A, B,); +impl_struct_builder!(A, B, C,); +impl_struct_builder!(A, B, C, D,); +impl_struct_builder!(A, B, C, D, E,); +impl_struct_builder!(A, B, C, D, E, F,); +impl_struct_builder!(A, B, C, D, E, F, G,); +impl_struct_builder!(A, B, C, D, E, F, G, H,); +impl_struct_builder!(A, B, C, D, E, F, G, H, I,); +impl_struct_builder!(A, B, C, D, E, F, G, H, I, J,); +impl_struct_builder!(A, B, C, D, E, F, G, H, I, J, K,); +impl_struct_builder!(A, B, C, D, E, F, G, H, I, J, K, L,); +impl_struct_builder!(A, B, C, D, E, F, G, H, I, J, K, L, M,); +impl_struct_builder!(A, B, C, D, E, F, G, H, I, J, K, L, M, N,); +impl_struct_builder!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O,); +impl_struct_builder!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P,); + +#[test] +fn struct_example() { + use super::{ArrayPush, DefaultArrayBuilder}; + + struct S { + a: i8, + b: i32, + } + + // move into derive(ArrayPush) + // Allows to customize the builder + const _: () = { + impl, B: ArrayPush> ArrayPush for StructBuilder<(A, B)> { + fn push_value(&mut self, value: &S) -> Result<()> { + self.len += 1; + self.children.0.push_value(&value.a)?; + self.children.1.push_value(&value.b)?; + Ok(()) + } + } + }; + + // move into derive(DefaultBuilder) + const _: () = { + struct Builder( + StructBuilder<( + ::ArrayBuilder, + ::ArrayBuilder, + )>, + ); + + impl ArrayBuilder for Builder { + fn push_default(&mut self) -> Result<()> { + self.0.push_default() + } + + fn build_array(&mut self) -> Result { + self.0.build_array() + } + } + + impl 
DefaultArrayBuilder for S { + type ArrayBuilder = Builder; + + fn default_builder() -> Self::ArrayBuilder { + Builder(StructBuilder { + len: 0, + meta: vec![ + FieldMeta { + name: String::from("a"), + ..Default::default() + }, + FieldMeta { + name: String::from("b"), + ..Default::default() + }, + ], + children: ( + (::default_builder()), + (::default_builder()), + ), + }) + } + } + + // NOTE: implement separately to allow independent derives + impl ArrayPush for Builder { + fn push_value(&mut self, value: &S) -> Result<()> { + self.0.len += 1; + self.0.children.0.push_value(&value.a)?; + self.0.children.1.push_value(&value.b)?; + Ok(()) + } + } + }; + + // the public API + let mut builder = S::default_builder(); + builder.push_value(&S { a: 0, b: -21 }).unwrap(); + builder.push_value(&S { a: 1, b: -42 }).unwrap(); + let array = builder.build_array().unwrap(); + + let [(_, a), (_, b)] = array.into_struct_fields().expect("invalid array type"); + let a = a.into_int8().expect("invalid array type"); + let b = b.into_int32().expect("invalid array type"); + + assert_eq!(a.values, vec![0, 1]); + assert_eq!(b.values, vec![-21, -42]); +} diff --git a/marrow-convert/src/internal/builder/union.rs b/marrow-convert/src/internal/builder/union.rs new file mode 100644 index 0000000..7ccbbe7 --- /dev/null +++ b/marrow-convert/src/internal/builder/union.rs @@ -0,0 +1,284 @@ +use marrow::{ + array::{Array, UnionArray}, + datatypes::FieldMeta, +}; + +use crate::Result; + +use crate::internal::util::TupleLen; + +use super::ArrayBuilder; + +/// Helper struct to simplify implementing sparse Union builders +/// +/// When pushing a value the following invariants need to be observed: +/// +/// - A discriminator must be pushed to the `types` value +/// - A value must be pushed to each child field +#[derive(Debug)] +pub struct SparseUnionBuilder { + pub types: Vec, + pub meta: Vec, + pub children: C, +} + +macro_rules! 
impl_sparse_union_builder { + ($($el:ident,)*) => { + #[allow(non_snake_case, clippy::vec_init_then_push)] + impl<$($el: ArrayBuilder),*> ArrayBuilder for SparseUnionBuilder<($($el,)*)> { + fn push_default(&mut self) -> Result<()> { + let ($($el,)*) = &mut self.children; + $($el.push_default()?;)* + self.types.push(0); + Ok(()) + } + + fn build_array(&mut self) -> Result { + const { + assert!(<($($el,)*) as TupleLen>::LEN < (i8::MAX as usize)); + } + + let types = std::mem::take(&mut self.types); + let mut arrays = Vec::new(); + let ($($el,)*) = &mut self.children; + $(arrays.push($el.build_array()?);)* + + let fields = std::iter::zip(&self.meta, arrays) + .enumerate() + .map(|(i, (meta, array))| (i as i8, meta.clone(), array)) + .collect(); + + Ok(Array::Union(UnionArray { + types, + fields, + offsets: None, + })) + } + } + }; +} + +impl_sparse_union_builder!(A,); +impl_sparse_union_builder!(A, B,); +impl_sparse_union_builder!(A, B, C,); +impl_sparse_union_builder!(A, B, C, D,); +impl_sparse_union_builder!(A, B, C, D, E,); +impl_sparse_union_builder!(A, B, C, D, E, F,); +impl_sparse_union_builder!(A, B, C, D, E, F, G,); +impl_sparse_union_builder!(A, B, C, D, E, F, G, H,); +impl_sparse_union_builder!(A, B, C, D, E, F, G, H, I,); +impl_sparse_union_builder!(A, B, C, D, E, F, G, H, I, J,); +impl_sparse_union_builder!(A, B, C, D, E, F, G, H, I, J, K,); +impl_sparse_union_builder!(A, B, C, D, E, F, G, H, I, J, K, L,); +impl_sparse_union_builder!(A, B, C, D, E, F, G, H, I, J, K, L, M,); +impl_sparse_union_builder!(A, B, C, D, E, F, G, H, I, J, K, L, M, N,); +impl_sparse_union_builder!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O,); +impl_sparse_union_builder!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P,); + +/// Helper struct to simplify implementing dense Union builders +/// +/// When pushing a value the following invariants need to be observed: +/// +/// - A discriminator must be pushed to the `types` value +/// - A value must be pushed for the relevant variant 
+#[derive(Debug)] +pub struct DenseUnionBuilder { + pub types: DenseTypes, + pub offsets: Vec, + pub meta: Vec, + pub children: C, +} + +#[derive(Debug)] +pub struct DenseTypes { + types: Vec, + offsets: Vec, + current_offset: Vec, +} + +impl DenseTypes { + pub fn new(num_types: usize) -> Self { + Self { + types: Vec::new(), + offsets: Vec::new(), + current_offset: vec![0; num_types], + } + } + + pub fn take(&mut self) -> Self { + let num_types = self.current_offset.len(); + Self { + types: std::mem::take(&mut self.types), + offsets: std::mem::take(&mut self.offsets), + current_offset: std::mem::replace(&mut self.current_offset, vec![0; num_types]), + } + } + + pub fn push(&mut self, variant: i8) -> Result<()> { + assert!(variant >= 0); + + self.types.push(variant); + self.offsets.push(self.current_offset[variant as usize]); + self.current_offset[variant as usize] += 1; + Ok(()) + } +} + +macro_rules! impl_dense_union_builder { + ($first:ident, $($el:ident,)*) => { + #[allow(non_snake_case, clippy::vec_init_then_push)] + impl<$first: ArrayBuilder $(, $el: ArrayBuilder)*> ArrayBuilder for DenseUnionBuilder<($first, $($el,)*)> { + fn push_default(&mut self) -> Result<()> { + #[allow(unused_variables)] + let ($first, $($el,)*) = &mut self.children; + $first.push_default()?; + self.types.push(0)?; + Ok(()) + } + + fn build_array(&mut self) -> Result { + const { + assert!(<($first, $($el,)*) as TupleLen>::LEN < (i8::MAX as usize)); + } + + let DenseTypes { types, offsets, ..} = self.types.take(); + let mut arrays = Vec::new(); + let ($first, $($el,)*) = &mut self.children; + arrays.push($first.build_array()?); + $(arrays.push($el.build_array()?);)* + + let fields = std::iter::zip(&self.meta, arrays) + .enumerate() + .map(|(i, (meta, array))| (i as i8, meta.clone(), array)) + .collect(); + + Ok(Array::Union(UnionArray { + types, + offsets: Some(offsets), + fields, + })) + } + } + }; +} + +impl_dense_union_builder!(A,); +impl_dense_union_builder!(A, B,); 
+impl_dense_union_builder!(A, B, C,); +impl_dense_union_builder!(A, B, C, D,); +impl_dense_union_builder!(A, B, C, D, E,); +impl_dense_union_builder!(A, B, C, D, E, F,); +impl_dense_union_builder!(A, B, C, D, E, F, G,); +impl_dense_union_builder!(A, B, C, D, E, F, G, H,); +impl_dense_union_builder!(A, B, C, D, E, F, G, H, I,); +impl_dense_union_builder!(A, B, C, D, E, F, G, H, I, J,); +impl_dense_union_builder!(A, B, C, D, E, F, G, H, I, J, K,); +impl_dense_union_builder!(A, B, C, D, E, F, G, H, I, J, K, L,); +impl_dense_union_builder!(A, B, C, D, E, F, G, H, I, J, K, L, M,); +impl_dense_union_builder!(A, B, C, D, E, F, G, H, I, J, K, L, M, N,); +impl_dense_union_builder!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O,); +impl_dense_union_builder!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P,); + +#[test] +fn enum_example() { + use super::{ArrayPush, DefaultArrayBuilder}; + + enum Enum { + A(i32), + B(i64), + } + + // TODO: push into derive(ArrayPush) + const _: () = { + impl, B: ArrayPush> ArrayPush for SparseUnionBuilder<(A, B)> { + #[allow(non_snake_case)] + fn push_value(&mut self, value: &Enum) -> Result<()> { + match value { + Enum::A(inner) => { + self.types.push(0); + let (A, B) = &mut self.children; + A.push_value(inner)?; + B.push_default()?; + } + Enum::B(inner) => { + self.types.push(1); + let (A, B) = &mut self.children; + A.push_default()?; + B.push_value(inner)?; + } + } + Ok(()) + } + } + }; + + // TODO: push into derive(DefaultArrayBuilder) + const _: () = { + struct Builder( + SparseUnionBuilder<( + ::ArrayBuilder, + ::ArrayBuilder, + )>, + ); + + #[allow(non_snake_case)] + impl ArrayBuilder for Builder { + fn push_default(&mut self) -> Result<()> { + self.0.types.push(0); + + let (A, B) = &mut self.0.children; + A.push_default()?; + B.push_default()?; + Ok(()) + } + + fn build_array(&mut self) -> Result { + self.0.build_array() + } + } + + // TODO: in practice implement separately to allow indepdent derives + impl ArrayPush for Builder { + fn 
push_value(&mut self, value: &Enum) -> Result<()> { + self.0.push_value(value) + } + } + + impl DefaultArrayBuilder for Enum { + type ArrayBuilder = Builder; + + fn default_builder() -> Self::ArrayBuilder { + Builder(SparseUnionBuilder { + types: Vec::new(), + meta: vec![ + FieldMeta { + name: String::from("A"), + ..Default::default() + }, + FieldMeta { + name: String::from("B"), + ..Default::default() + }, + ], + children: ( + ::default_builder(), + ::default_builder(), + ), + }) + } + } + }; + + // the public API + let mut builder = Enum::default_builder(); + builder.push_value(&Enum::A(13)).unwrap(); + builder.push_value(&Enum::B(21)).unwrap(); + let array = builder.build_array().unwrap(); + + let [(_, _, a), (_, _, b)] = array.into_union_fields().expect("invalid array type"); + let a = a.into_int32().expect("invalid array type"); + let b = b.into_int64().expect("invalid array type"); + + assert_eq!(a.values, vec![13, 0]); + assert_eq!(b.values, vec![0, 21]); +} diff --git a/marrow-convert/src/internal/mod.rs b/marrow-convert/src/internal/mod.rs new file mode 100644 index 0000000..bf7bf7a --- /dev/null +++ b/marrow-convert/src/internal/mod.rs @@ -0,0 +1,4 @@ +pub mod builder; +pub mod type_info; +pub mod type_info_impls; +pub mod util; diff --git a/marrow-convert/src/internal/type_info.rs b/marrow-convert/src/internal/type_info.rs new file mode 100644 index 0000000..691aac8 --- /dev/null +++ b/marrow-convert/src/internal/type_info.rs @@ -0,0 +1,140 @@ +use std::{ + any::{Any, TypeId}, + collections::HashMap, + rc::Rc, +}; + +use marrow::datatypes::{DataType, Field}; + +use crate::{Error, Result}; + +#[derive(Debug, Default)] +pub struct Options { + data: HashMap>, + overwrites: HashMap, +} + +impl Options { + pub fn set(&mut self, value: T) { + let type_id = TypeId::of::(); + self.data.insert(type_id, Rc::new(value)); + } + + pub fn get(&self) -> Option<&T> { + let key = TypeId::of::(); + let value = self.data.get(&key)?; + let Some(value) = value.downcast_ref() 
else { + unreachable!(); + }; + Some(value) + } + + pub fn with_default_string_type(mut self, data_type: DataType) -> Self { + // TOOD: check for valid string type + self.set(DefaultStringType(data_type)); + self + } + + pub fn with_default_list_index_type(mut self, list_type: ListIndexType) -> Self { + self.set(LargeList(matches!(list_type, ListIndexType::Int64))); + self + } + + pub fn overwrite(mut self, path: &str, field: Field) -> Self { + self.overwrites.insert(path.to_owned(), field); + self + } +} + +pub enum ListIndexType { + Int32, + Int64, +} + +impl TryFrom for ListIndexType { + type Error = Error; + + fn try_from(value: DataType) -> std::result::Result { + match value { + DataType::Int32 => Ok(Self::Int32), + DataType::Int64 => Ok(Self::Int64), + dt => Err(Error(format!( + "Cannot interpretr {dt:?} as a ListIndexType" + ))), + } + } +} + +#[derive(Debug, Clone, Copy)] +pub struct Context<'a> { + path: &'a str, + name: &'a str, + options: &'a Options, +} + +impl Context<'_> { + pub fn get_name(&self) -> &str { + self.name + } + + pub fn get_path(&self) -> &str { + self.path + } + + pub fn get_options(&self) -> &Options { + self.options + } + + pub fn get_field(&self, name: &str) -> Result { + self.nest(name, T::get_field) + } + + /// Call a function with a context for nested field + pub fn nest) -> Result>( + &self, + name: &str, + scope: F, + ) -> Result { + let path = format!("{}.{}", self.path, name); + + if let Some(overwrite) = self.options.overwrites.get(&path) { + let mut overwrite = overwrite.clone(); + overwrite.name = String::from(name); + return Ok(overwrite); + } + + let child_context = Context { + path: &path, + name, + options: self.options, + }; + + scope(child_context) + } +} + +pub fn get_field(name: &str, options: &Options) -> Result { + let context = Context { + path: "$", + name, + options, + }; + T::get_field(context) +} + +pub fn get_data_type(options: &Options) -> Result { + Ok(get_field::("item", options)?.data_type) +} + +pub 
struct DefaultStringType(pub DataType); + +pub struct LargeList(pub bool); + +/// Get the Arrow type information for a given Rust type +/// +/// The functions cannot be called directly. First construct a [Context], then call the +/// corresponding methods. +pub trait DefaultArrayType { + /// See [get_field] + fn get_field(context: Context<'_>) -> Result; +} diff --git a/marrow-convert/src/internal/type_info_impls/collections.rs b/marrow-convert/src/internal/type_info_impls/collections.rs new file mode 100644 index 0000000..964d51f --- /dev/null +++ b/marrow-convert/src/internal/type_info_impls/collections.rs @@ -0,0 +1,66 @@ +use std::collections::{BTreeMap, BTreeSet, BinaryHeap, HashMap, HashSet, LinkedList, VecDeque}; + +use marrow::datatypes::Field; + +use crate::{ + Result, + types::{Context, DefaultArrayType}, +}; + +use super::utils::{new_list_field, new_map_field}; + +/// Map a vec to an Arrow List +impl DefaultArrayType for Vec { + fn get_field(context: Context<'_>) -> Result { + new_list_field::(context) + } +} + +/// Map a `VecDeque` to an Arrow List +impl DefaultArrayType for VecDeque { + fn get_field(context: Context<'_>) -> Result { + new_list_field::(context) + } +} + +/// Map a `LinkedList` to an Arrow List +impl DefaultArrayType for LinkedList { + fn get_field(context: Context<'_>) -> Result { + new_list_field::(context) + } +} + +/// Map a `BinaryHeap` to an Arrow List +impl DefaultArrayType for BinaryHeap { + fn get_field(context: Context<'_>) -> Result { + new_list_field::(context) + } +} + +/// Map a `BTreeSet` to an Arrow List +impl DefaultArrayType for BTreeSet { + fn get_field(context: Context<'_>) -> Result { + new_list_field::(context) + } +} + +/// Map a `HashSet` to an Arrow List +impl DefaultArrayType for HashSet { + fn get_field(context: Context<'_>) -> Result { + new_list_field::(context) + } +} + +/// Map a `BTreeMap` to an Arrow Map +impl DefaultArrayType for BTreeMap { + fn get_field(context: Context<'_>) -> Result { + 
new_map_field::(context) + } +} + +/// Map a `HashMap` to an Arrow Map +impl DefaultArrayType for HashMap { + fn get_field(context: Context<'_>) -> Result { + new_map_field::(context) + } +} diff --git a/marrow-convert/src/internal/type_info_impls/compounds.rs b/marrow-convert/src/internal/type_info_impls/compounds.rs new file mode 100644 index 0000000..baa9b72 --- /dev/null +++ b/marrow-convert/src/internal/type_info_impls/compounds.rs @@ -0,0 +1,78 @@ +use marrow::datatypes::{DataType, Field}; + +use crate::{ + Result, + types::{Context, DefaultArrayType}, +}; + +use super::utils::new_list_field; + +impl DefaultArrayType for [T] { + fn get_field(context: Context<'_>) -> Result { + new_list_field::(context) + } +} + +impl DefaultArrayType for [T; N] { + fn get_field(context: Context<'_>) -> Result { + let base_field = context.get_field::("element")?; + let n = i32::try_from(N)?; + + // TODO: allow to customize + let data_type = if matches!(base_field.data_type, DataType::UInt8) { + DataType::FixedSizeBinary(n) + } else { + DataType::FixedSizeList(Box::new(base_field), n) + }; + + Ok(Field { + name: context.get_name().to_owned(), + data_type, + nullable: false, + metadata: Default::default(), + }) + } +} + +macro_rules! 
impl_tuples { + ($( ( $($name:ident,)* ), )*) => { + $( + impl<$($name: DefaultArrayType),*> DefaultArrayType for ( $($name,)* ) { + #[allow(unused_assignments, clippy::vec_init_then_push)] + fn get_field(context: Context<'_>) -> Result { + let mut idx = 0; + let mut fields = Vec::new(); + $( + fields.push(context.get_field::<$name>(&idx.to_string())?); + idx += 1; + )* + + Ok(Field { + name: context.get_name().to_owned(), + data_type: DataType::Struct(fields), + ..Field::default() + }) + } + } + )* + }; +} + +impl_tuples!( + (A,), + (A, B,), + (A, B, C,), + (A, B, C, D,), + (A, B, C, D, E,), + (A, B, C, D, E, F,), + (A, B, C, D, E, F, G,), + (A, B, C, D, E, F, G, H,), + (A, B, C, D, E, F, G, H, I,), + (A, B, C, D, E, F, G, H, I, J,), + (A, B, C, D, E, F, G, H, I, J, K,), + (A, B, C, D, E, F, G, H, I, J, K, L,), + (A, B, C, D, E, F, G, H, I, J, K, L, M,), + (A, B, C, D, E, F, G, H, I, J, K, L, M, N,), + (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O,), + (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P,), +); diff --git a/marrow-convert/src/internal/type_info_impls/ext/bigdecimal.rs b/marrow-convert/src/internal/type_info_impls/ext/bigdecimal.rs new file mode 100644 index 0000000..7e15e29 --- /dev/null +++ b/marrow-convert/src/internal/type_info_impls/ext/bigdecimal.rs @@ -0,0 +1,17 @@ +use marrow::datatypes::{DataType, Field}; + +use crate::{ + Result, + types::{Context, DefaultArrayType}, +}; + +impl DefaultArrayType for bigdecimal::BigDecimal { + fn get_field(context: Context<'_>) -> Result { + Ok(Field { + name: String::from(context.get_name()), + // TODO: find better defaults + data_type: DataType::Decimal128(5, 5), + ..Default::default() + }) + } +} diff --git a/marrow-convert/src/internal/type_info_impls/ext/chrono.rs b/marrow-convert/src/internal/type_info_impls/ext/chrono.rs new file mode 100644 index 0000000..771b88e --- /dev/null +++ b/marrow-convert/src/internal/type_info_impls/ext/chrono.rs @@ -0,0 +1,47 @@ +use chrono::Utc; +use 
marrow::datatypes::{DataType, Field, TimeUnit}; + +use crate::{ + Result, + types::{Context, DefaultArrayType}, +}; + +impl DefaultArrayType for chrono::NaiveDate { + fn get_field(context: Context<'_>) -> Result { + Ok(Field { + name: String::from(context.get_name()), + data_type: DataType::Date32, + ..Default::default() + }) + } +} + +impl DefaultArrayType for chrono::NaiveTime { + fn get_field(context: Context<'_>) -> Result { + Ok(Field { + name: String::from(context.get_name()), + data_type: DataType::Time32(TimeUnit::Millisecond), + ..Default::default() + }) + } +} + +impl DefaultArrayType for chrono::NaiveDateTime { + fn get_field(context: Context<'_>) -> Result { + Ok(Field { + name: String::from(context.get_name()), + data_type: DataType::Timestamp(TimeUnit::Millisecond, None), + ..Default::default() + }) + } +} + +impl DefaultArrayType for chrono::DateTime { + fn get_field(context: Context<'_>) -> Result { + Ok(Field { + name: String::from(context.get_name()), + data_type: DataType::Timestamp(TimeUnit::Millisecond, Some(String::from("UTC"))), + ..Default::default() + }) + } +} diff --git a/marrow-convert/src/internal/type_info_impls/ext/jiff.rs b/marrow-convert/src/internal/type_info_impls/ext/jiff.rs new file mode 100644 index 0000000..8feea93 --- /dev/null +++ b/marrow-convert/src/internal/type_info_impls/ext/jiff.rs @@ -0,0 +1,46 @@ +use marrow::datatypes::{DataType, Field, TimeUnit}; + +use crate::{ + Result, + types::{Context, DefaultArrayType}, +}; + +impl DefaultArrayType for jiff::civil::Date { + fn get_field(context: Context<'_>) -> Result { + Ok(Field { + name: String::from(context.get_name()), + data_type: DataType::Date32, + ..Default::default() + }) + } +} + +impl DefaultArrayType for jiff::civil::Time { + fn get_field(context: Context<'_>) -> Result { + Ok(Field { + name: String::from(context.get_name()), + data_type: DataType::Time32(TimeUnit::Millisecond), + ..Default::default() + }) + } +} + +impl DefaultArrayType for jiff::Span { + fn 
get_field(context: Context<'_>) -> Result { + Ok(Field { + name: String::from(context.get_name()), + data_type: DataType::Duration(TimeUnit::Millisecond), + ..Default::default() + }) + } +} + +impl DefaultArrayType for jiff::Timestamp { + fn get_field(context: Context<'_>) -> Result { + Ok(Field { + name: String::from(context.get_name()), + data_type: DataType::Timestamp(TimeUnit::Millisecond, None), + ..Default::default() + }) + } +} diff --git a/marrow-convert/src/internal/type_info_impls/ext/mod.rs b/marrow-convert/src/internal/type_info_impls/ext/mod.rs new file mode 100644 index 0000000..b9f9ea4 --- /dev/null +++ b/marrow-convert/src/internal/type_info_impls/ext/mod.rs @@ -0,0 +1,4 @@ +mod bigdecimal; +mod chrono; +mod jiff; +mod uuid; diff --git a/marrow-convert/src/internal/type_info_impls/ext/uuid.rs b/marrow-convert/src/internal/type_info_impls/ext/uuid.rs new file mode 100644 index 0000000..8fd08e4 --- /dev/null +++ b/marrow-convert/src/internal/type_info_impls/ext/uuid.rs @@ -0,0 +1,23 @@ +use std::collections::HashMap; + +use marrow::datatypes::{DataType, Field}; + +use crate::{ + Result, + types::{Context, DefaultArrayType}, +}; + +impl DefaultArrayType for uuid::Uuid { + fn get_field(context: Context<'_>) -> Result { + let mut metadata = HashMap::new(); + metadata.insert("ARROW:extension:name".into(), "arrow.uuid".into()); + metadata.insert("ARROW:extension:metadata".into(), String::new()); + + Ok(Field { + name: String::from(context.get_name()), + data_type: DataType::FixedSizeBinary(16), + metadata, + ..Default::default() + }) + } +} diff --git a/marrow-convert/src/internal/type_info_impls/mod.rs b/marrow-convert/src/internal/type_info_impls/mod.rs new file mode 100644 index 0000000..797a4cd --- /dev/null +++ b/marrow-convert/src/internal/type_info_impls/mod.rs @@ -0,0 +1,7 @@ +mod collections; +mod compounds; +mod ext; +mod primitives; +mod std; +mod utils; +mod wrappers; diff --git a/marrow-convert/src/internal/type_info_impls/primitives.rs 
b/marrow-convert/src/internal/type_info_impls/primitives.rs new file mode 100644 index 0000000..e5d2bab --- /dev/null +++ b/marrow-convert/src/internal/type_info_impls/primitives.rs @@ -0,0 +1,73 @@ +use marrow::{ + datatypes::{DataType, Field}, + types::f16, +}; + +use crate::{ + Result, + types::{Context, DefaultArrayType}, +}; + +use super::utils::new_string_field; + +macro_rules! define_primitive { + ($(($ty:ty, $dt:expr),)*) => { + $( + impl DefaultArrayType for $ty { + fn get_field(context: Context<'_>) -> Result { + Ok(Field { + name: context.get_name().to_owned(), + data_type: $dt, + ..Field::default() + }) + } + } + )* + }; +} + +define_primitive!( + (bool, DataType::Boolean), + (u8, DataType::UInt8), + (u16, DataType::UInt16), + (u32, DataType::UInt32), + (u64, DataType::UInt64), + (i8, DataType::Int8), + (i16, DataType::Int16), + (i32, DataType::Int32), + (i64, DataType::Int64), + (f16, DataType::Float16), + (f32, DataType::Float32), + (f64, DataType::Float64), + (char, DataType::UInt32), +); + +impl DefaultArrayType for () { + fn get_field(context: Context<'_>) -> Result { + let _ = context; + Ok(Field { + name: context.get_name().to_owned(), + data_type: DataType::Null, + nullable: true, + metadata: Default::default(), + }) + } +} + +impl DefaultArrayType for str { + fn get_field(context: Context<'_>) -> Result { + Ok(new_string_field(context)) + } +} + +impl DefaultArrayType for &T { + fn get_field(context: Context<'_>) -> Result { + T::get_field(context) + } +} + +impl DefaultArrayType for &mut T { + fn get_field(context: Context<'_>) -> Result { + T::get_field(context) + } +} diff --git a/marrow-convert/src/internal/type_info_impls/std.rs b/marrow-convert/src/internal/type_info_impls/std.rs new file mode 100644 index 0000000..f10e5be --- /dev/null +++ b/marrow-convert/src/internal/type_info_impls/std.rs @@ -0,0 +1,158 @@ +use std::{ + num::NonZero, + ops::{Bound, Range, RangeFrom, RangeInclusive, RangeTo, RangeToInclusive}, + sync::atomic::{ + 
AtomicBool, AtomicI8, AtomicI16, AtomicI32, AtomicI64, AtomicU8, AtomicU16, AtomicU32, + AtomicU64, + }, + time::{Duration, SystemTime}, +}; + +use marrow::datatypes::{DataType, Field, TimeUnit, UnionMode}; + +use crate::{ + Result, + types::{Context, DefaultArrayType}, +}; + +use super::utils::new_string_field; + +impl DefaultArrayType for String { + fn get_field(context: Context<'_>) -> Result { + Ok(new_string_field(context)) + } +} + +/// Map an option to a nullable field +impl DefaultArrayType for Option { + fn get_field(context: Context<'_>) -> Result { + let mut base_field = T::get_field(context)?; + base_field.nullable = true; + Ok(base_field) + } +} + +/// Map a `Result` to an Arrow Union with `Ok` and `Err` variants +impl DefaultArrayType for Result { + fn get_field(context: Context<'_>) -> Result { + let ok = context.get_field::("Ok")?; + let err = context.get_field::("Err")?; + + Ok(Field { + name: String::from(context.get_name()), + data_type: DataType::Union(vec![(0, ok), (1, err)], UnionMode::Dense), + ..Default::default() + }) + } +} + +/// Map a `Range` to an Arrow `FixedSizeList(.., 2)` +impl DefaultArrayType for Range { + fn get_field(context: Context<'_>) -> Result { + <[T; 2]>::get_field(context) + } +} + +/// Map a `RangeInclusive` to an Arrow `FixedSizeList(.., 2)` +impl DefaultArrayType for RangeInclusive { + fn get_field(context: Context<'_>) -> Result { + <[T; 2]>::get_field(context) + } +} + +/// Map a `RangeTo` to the index type +impl DefaultArrayType for RangeTo { + fn get_field(context: Context<'_>) -> Result { + T::get_field(context) + } +} + +/// Map a `RangeToInclusive` to the index type +impl DefaultArrayType for RangeToInclusive { + fn get_field(context: Context<'_>) -> Result { + T::get_field(context) + } +} + +/// Map a `RangeFrom` to the index type +impl DefaultArrayType for RangeFrom { + fn get_field(context: Context<'_>) -> Result { + T::get_field(context) + } +} + +/// Map a `Bound` to an Arrow Union with variants 
`Included`, `Excluded`, `Unbounded` +impl DefaultArrayType for Bound { + fn get_field(context: Context<'_>) -> Result { + let included = context.get_field::("Included")?; + let excluded = context.get_field::("Excluded")?; + let unbounded = context.get_field::<()>("Unbounded")?; + + Ok(Field { + name: String::from(context.get_name()), + data_type: DataType::Union( + vec![(0, included), (1, excluded), (2, unbounded)], + UnionMode::Dense, + ), + ..Default::default() + }) + } +} + +macro_rules! impl_nonzero { + ($($ty:ident),* $(,)?) => { + $( + impl DefaultArrayType for NonZero<$ty> { + fn get_field(context: Context<'_>) -> Result { + <$ty>::get_field(context) + } + } + )* + }; +} + +impl_nonzero!(u8, u16, u32, u64, i8, i16, i32, i64); + +macro_rules! impl_atomic { + ($(($atomic:ident, $ty:ident)),* $(,)?) => { + $( + impl DefaultArrayType for $atomic { + fn get_field(context: Context<'_>) -> Result { + $ty::get_field(context) + } + } + )* + }; +} + +impl_atomic!( + (AtomicBool, bool), + (AtomicI8, i8), + (AtomicI16, i16), + (AtomicI32, i32), + (AtomicI64, i64), + (AtomicU8, u8), + (AtomicU16, u16), + (AtomicU32, u32), + (AtomicU64, u64), +); + +impl DefaultArrayType for Duration { + fn get_field(context: Context<'_>) -> Result { + Ok(Field { + name: String::from(context.get_name()), + data_type: DataType::Duration(TimeUnit::Millisecond), + ..Default::default() + }) + } +} + +impl DefaultArrayType for SystemTime { + fn get_field(context: Context<'_>) -> Result { + Ok(Field { + name: String::from(context.get_name()), + data_type: DataType::Timestamp(TimeUnit::Millisecond, None), + ..Default::default() + }) + } +} diff --git a/marrow-convert/src/internal/type_info_impls/utils.rs b/marrow-convert/src/internal/type_info_impls/utils.rs new file mode 100644 index 0000000..b5904de --- /dev/null +++ b/marrow-convert/src/internal/type_info_impls/utils.rs @@ -0,0 +1,59 @@ +use marrow::datatypes::{DataType, Field}; + +use crate::{ + Result, + 
internal::type_info::{DefaultStringType, LargeList}, + types::{Context, DefaultArrayType}, +}; + +pub fn new_field(name: &str, data_type: DataType) -> Field { + Field { + name: name.to_owned(), + data_type, + nullable: false, + metadata: Default::default(), + } +} + +pub fn new_string_field(context: Context<'_>) -> Field { + let ty = if let Some(DefaultStringType(ty)) = context.get_options().get() { + ty.clone() + } else { + DataType::LargeUtf8 + }; + new_field(context.get_name(), ty) +} + +pub fn new_list_field(context: Context<'_>) -> Result { + let larget_list = if let Some(LargeList(large_list)) = context.get_options().get() { + *large_list + } else { + false + }; + + let base_field = context.get_field::("element")?; + + Ok(Field { + name: context.get_name().to_owned(), + data_type: if larget_list { + DataType::LargeList(Box::new(base_field)) + } else { + DataType::List(Box::new(base_field)) + }, + nullable: false, + metadata: Default::default(), + }) +} + +pub fn new_map_field( + context: Context<'_>, +) -> Result { + let key_field = context.get_field::("key")?; + let value_field = context.get_field::("value")?; + let entry_field = new_field("entry", DataType::Struct(vec![key_field, value_field])); + + Ok(new_field( + context.get_name(), + DataType::Map(Box::new(entry_field), false), + )) +} diff --git a/marrow-convert/src/internal/type_info_impls/wrappers.rs b/marrow-convert/src/internal/type_info_impls/wrappers.rs new file mode 100644 index 0000000..a0b0491 --- /dev/null +++ b/marrow-convert/src/internal/type_info_impls/wrappers.rs @@ -0,0 +1,70 @@ +use std::{ + borrow::Cow, + cell::{Cell, RefCell}, + marker::PhantomData, + rc::Rc, + sync::{Arc, Mutex, RwLock}, +}; + +use marrow::datatypes::Field; + +use crate::{ + Result, + types::{Context, DefaultArrayType}, +}; + +impl DefaultArrayType for PhantomData { + fn get_field(context: Context<'_>) -> Result { + let mut field = T::get_field(context)?; + field.nullable = true; + Ok(field) + } +} + +impl 
DefaultArrayType for Box { + fn get_field(context: Context<'_>) -> Result { + T::get_field(context) + } +} + +impl DefaultArrayType for Cell { + fn get_field(context: Context<'_>) -> Result { + T::get_field(context) + } +} + +impl DefaultArrayType for RefCell { + fn get_field(context: Context<'_>) -> Result { + T::get_field(context) + } +} + +impl DefaultArrayType for Mutex { + fn get_field(context: Context<'_>) -> Result { + T::get_field(context) + } +} + +impl DefaultArrayType for RwLock { + fn get_field(context: Context<'_>) -> Result { + T::get_field(context) + } +} + +impl DefaultArrayType for Rc { + fn get_field(context: Context<'_>) -> Result { + T::get_field(context) + } +} + +impl DefaultArrayType for Arc { + fn get_field(context: Context<'_>) -> Result { + T::get_field(context) + } +} + +impl<'a, T: DefaultArrayType + ToOwned + ?Sized + 'a> DefaultArrayType for Cow<'a, T> { + fn get_field(context: Context<'_>) -> Result { + T::get_field(context) + } +} diff --git a/marrow-convert/src/internal/util.rs b/marrow-convert/src/internal/util.rs new file mode 100644 index 0000000..61a24b0 --- /dev/null +++ b/marrow-convert/src/internal/util.rs @@ -0,0 +1,34 @@ +pub trait TupleLen { + const LEN: usize; +} + +macro_rules! 
impl_tuple_len { + ($head:ident, $($tail:ident,)*) => { + impl<$head, $($tail),*> TupleLen for ($head, $($tail,)*) { + const LEN: usize = 1 + <($($tail,)*) as TupleLen>::LEN; + } + }; + () => { + impl TupleLen for () { + const LEN: usize = 0; + } + }; +} + +impl_tuple_len!(); +impl_tuple_len!(A,); +impl_tuple_len!(A, B,); +impl_tuple_len!(A, B, C,); +impl_tuple_len!(A, B, C, D,); +impl_tuple_len!(A, B, C, D, E,); +impl_tuple_len!(A, B, C, D, E, F,); +impl_tuple_len!(A, B, C, D, E, F, G,); +impl_tuple_len!(A, B, C, D, E, F, G, H,); +impl_tuple_len!(A, B, C, D, E, F, G, H, I,); +impl_tuple_len!(A, B, C, D, E, F, G, H, I, J,); +impl_tuple_len!(A, B, C, D, E, F, G, H, I, J, K,); +impl_tuple_len!(A, B, C, D, E, F, G, H, I, J, K, L,); +impl_tuple_len!(A, B, C, D, E, F, G, H, I, J, K, L, M,); +impl_tuple_len!(A, B, C, D, E, F, G, H, I, J, K, L, M, N,); +impl_tuple_len!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O,); +impl_tuple_len!(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P,); diff --git a/marrow-convert/src/lib.rs b/marrow-convert/src/lib.rs new file mode 100644 index 0000000..a50b010 --- /dev/null +++ b/marrow-convert/src/lib.rs @@ -0,0 +1,52 @@ +#![deny(rustdoc::broken_intra_doc_links)] +mod error; +mod internal; + +#[cfg(test)] +mod tests; + +pub use error::{Error, Result}; + +/// Traits to derive schema information from a type +pub mod types { + pub use crate::internal::type_info::{ + Context, DefaultArrayType, Options, get_data_type, get_field, + }; + + /// Derive [DefaultArrayType] for a given Rust type + /// + /// Currently structs and enums without type generic are supported. 
+ pub use marrow_convert_derive::DefaultArrayType; +} + +/// Traits to allow constructing arrays from Rust objects +pub mod builder { + pub use crate::internal::builder::list::{LargeListBuilder, ListBuilder}; + pub use crate::internal::builder::primitive::{ + BooleanBuilder, Float16Builder, Float32Builder, Float64Builder, Int8Builder, Int16Builder, + Int32Builder, Int64Builder, NullBuilder, UInt8Builder, UInt16Builder, UInt32Builder, + UInt64Builder, + }; + pub use crate::internal::builder::{ArrayBuilder, ArrayPush, DefaultArrayBuilder}; + + /// Collect builders to simplify implementing custom builders for compound types (structs and + /// enums) + pub mod compound { + pub use crate::internal::builder::{ + r#struct::StructBuilder, + union::{DenseTypes, DenseUnionBuilder, SparseUnionBuilder}, + }; + } + + /// Derive [ArrayPush] for a given type + pub use marrow_convert_derive::ArrayPush; + + /// Derive [DefaultArrayBuilder] for a given type + pub use marrow_convert_derive::DefaultArrayBuilder; +} + +/// Additional documentation +pub mod docs { + #[doc = include_str!("../Design.md")] + pub mod design {} +} diff --git a/marrow-convert/src/tests.rs b/marrow-convert/src/tests.rs new file mode 100644 index 0000000..3a0efab --- /dev/null +++ b/marrow-convert/src/tests.rs @@ -0,0 +1,15 @@ +use marrow::datatypes::DataType; + +use crate::types::{Options, get_data_type}; + +#[test] +fn examples() { + assert_eq!( + get_data_type::(&Options::default()), + Ok(DataType::Int64) + ); + assert_eq!( + get_data_type::<[u8; 8]>(&Options::default()), + Ok(DataType::FixedSizeBinary(8)) + ); +} diff --git a/marrow-convert/tests/derive.rs b/marrow-convert/tests/derive.rs new file mode 100644 index 0000000..b68352d --- /dev/null +++ b/marrow-convert/tests/derive.rs @@ -0,0 +1 @@ +mod derive_tests; diff --git a/marrow-convert/tests/derive_tests/mod.rs b/marrow-convert/tests/derive_tests/mod.rs new file mode 100644 index 0000000..d511cdb --- /dev/null +++ 
b/marrow-convert/tests/derive_tests/mod.rs @@ -0,0 +1,3 @@ +mod test_array_push; +mod test_default_array_builder; +mod test_type_info; diff --git a/marrow-convert/tests/derive_tests/test_array_push.rs b/marrow-convert/tests/derive_tests/test_array_push.rs new file mode 100644 index 0000000..18dea3d --- /dev/null +++ b/marrow-convert/tests/derive_tests/test_array_push.rs @@ -0,0 +1,37 @@ +use marrow::datatypes::FieldMeta; +use marrow_convert::builder::{ArrayBuilder, ArrayPush}; + +#[test] +fn example() { + #[derive(marrow_convert::builder::ArrayPush)] + struct S { + a: i32, + b: i64, + } + + let mut builder = marrow_convert::builder::compound::StructBuilder { + len: 0, + meta: vec![ + FieldMeta { + name: String::from("a"), + ..Default::default() + }, + FieldMeta { + name: String::from("b"), + ..Default::default() + }, + ], + children: ( + marrow_convert::builder::Int32Builder::default(), + marrow_convert::builder::Int64Builder::default(), + ), + }; + + builder.push_value(&S { a: 1, b: -1 }).unwrap(); + builder.push_value(&S { a: 2, b: -2 }).unwrap(); + builder.push_value(&S { a: 3, b: -3 }).unwrap(); + + let array = builder.build_array().unwrap(); + // TODO: check resulting array + std::mem::drop(array); +} diff --git a/marrow-convert/tests/derive_tests/test_default_array_builder.rs b/marrow-convert/tests/derive_tests/test_default_array_builder.rs new file mode 100644 index 0000000..406f070 --- /dev/null +++ b/marrow-convert/tests/derive_tests/test_default_array_builder.rs @@ -0,0 +1,20 @@ +use marrow_convert::builder::{ArrayBuilder, ArrayPush, DefaultArrayBuilder}; + +#[test] +fn example() { + #[derive(DefaultArrayBuilder)] + struct S { + a: i32, + b: i64, + } + + let mut builder = S::default_builder(); + + builder.push_value(&S { a: 1, b: -1 }).unwrap(); + builder.push_value(&S { a: 2, b: -2 }).unwrap(); + builder.push_value(&S { a: 3, b: -3 }).unwrap(); + + let array = builder.build_array().unwrap(); + // TODO: check resulting array + std::mem::drop(array); +} 
diff --git a/marrow-convert/tests/derive_tests/test_type_info.rs b/marrow-convert/tests/derive_tests/test_type_info.rs new file mode 100644 index 0000000..92ddda0 --- /dev/null +++ b/marrow-convert/tests/derive_tests/test_type_info.rs @@ -0,0 +1,563 @@ +use marrow::{ + datatypes::{DataType, Field, TimeUnit, UnionMode}, + types::f16, +}; +use marrow_convert::{ + Result, + types::{Context, DefaultArrayType, Options}, +}; + +#[test] +fn example() { + #[derive(DefaultArrayType)] + #[allow(dead_code)] + struct S { + a: i64, + b: [u8; 4], + } + + assert_eq!( + marrow_convert::types::get_data_type::(&Options::default()), + Ok(DataType::Struct(vec![ + Field { + name: String::from("a"), + data_type: DataType::Int64, + ..Default::default() + }, + Field { + name: String::from("b"), + data_type: DataType::FixedSizeBinary(4), + ..Default::default() + } + ])) + ); +} + +#[test] +fn overwrites() { + #[derive(DefaultArrayType)] + #[allow(dead_code)] + struct S { + a: i64, + b: [u8; 4], + } + + assert_eq!( + marrow_convert::types::get_data_type::(&Options::default().overwrite( + "$.b", + Field { + data_type: DataType::Binary, + ..Field::default() + } + )), + Ok(DataType::Struct(vec![ + Field { + name: String::from("a"), + data_type: DataType::Int64, + ..Default::default() + }, + Field { + name: String::from("b"), + data_type: DataType::Binary, + ..Default::default() + } + ])) + ); +} + +#[test] +fn newtype() { + #[derive(DefaultArrayType)] + #[allow(dead_code)] + struct S(f16); + + assert_eq!( + marrow_convert::types::get_data_type::(&Options::default()), + Ok(DataType::Float16) + ); +} + +#[test] +fn tuple() { + #[derive(DefaultArrayType)] + #[allow(dead_code)] + struct S(u8, [u8; 4]); + + assert_eq!( + marrow_convert::types::get_data_type::(&Options::default()), + Ok(DataType::Struct(vec![ + Field { + name: String::from("0"), + data_type: DataType::UInt8, + ..Field::default() + }, + Field { + name: String::from("1"), + data_type: DataType::FixedSizeBinary(4), + ..Field::default() 
+ }, + ])) + ); +} + +#[test] +fn customize() { + #[derive(DefaultArrayType)] + #[allow(dead_code)] + struct S { + #[marrow(with = "timestamp_field")] + a: i64, + b: [u8; 4], + } + + fn timestamp_field(context: Context<'_>) -> Result { + Ok(Field { + name: String::from(context.get_name()), + data_type: DataType::Timestamp(TimeUnit::Millisecond, None), + ..Default::default() + }) + } + + assert_eq!( + marrow_convert::types::get_data_type::(&Options::default()), + Ok(DataType::Struct(vec![ + Field { + name: String::from("a"), + data_type: DataType::Timestamp(TimeUnit::Millisecond, None), + ..Default::default() + }, + Field { + name: String::from("b"), + data_type: DataType::FixedSizeBinary(4), + ..Default::default() + } + ])) + ); +} + +#[test] +fn fieldless_union() { + #[derive(DefaultArrayType)] + #[allow(dead_code)] + enum E { + A, + B, + C, + } + + assert_eq!( + marrow_convert::types::get_data_type::(&Options::default()), + Ok(DataType::Union( + vec![ + ( + 0, + Field { + name: String::from("A"), + data_type: DataType::Null, + nullable: true, + metadata: Default::default(), + } + ), + ( + 1, + Field { + name: String::from("B"), + data_type: DataType::Null, + nullable: true, + metadata: Default::default(), + } + ), + ( + 2, + Field { + name: String::from("C"), + data_type: DataType::Null, + nullable: true, + metadata: Default::default(), + } + ), + ], + UnionMode::Dense + )) + ); +} + +#[test] +fn new_type_enum() { + #[derive(DefaultArrayType)] + #[allow(dead_code)] + enum Enum { + Struct(Struct), + Int64(i64), + } + + #[derive(DefaultArrayType)] + #[allow(dead_code)] + struct Struct { + a: bool, + b: (), + } + + assert_eq!( + marrow_convert::types::get_data_type::(&Options::default()), + Ok(DataType::Union( + vec![ + ( + 0, + Field { + name: String::from("Struct"), + data_type: DataType::Struct(vec![ + Field { + name: String::from("a"), + data_type: DataType::Boolean, + ..Default::default() + }, + Field { + name: String::from("b"), + data_type: DataType::Null, + 
nullable: true, + ..Default::default() + }, + ]), + nullable: false, + metadata: Default::default(), + } + ), + ( + 1, + Field { + name: String::from("Int64"), + data_type: DataType::Int64, + ..Default::default() + } + ), + ], + UnionMode::Dense + )) + ); +} + +#[test] +fn new_tuple_enum() { + #[derive(DefaultArrayType)] + #[allow(dead_code)] + enum Enum { + Int64(i64), + Tuple(i8, u32), + } + + assert_eq!( + marrow_convert::types::get_data_type::(&Options::default()), + Ok(DataType::Union( + vec![ + ( + 0, + Field { + name: String::from("Int64"), + data_type: DataType::Int64, + ..Field::default() + } + ), + ( + 1, + Field { + name: String::from("Tuple"), + data_type: DataType::Struct(vec![ + Field { + name: String::from("0"), + data_type: DataType::Int8, + ..Field::default() + }, + Field { + name: String::from("1"), + data_type: DataType::UInt32, + ..Field::default() + }, + ]), + ..Field::default() + } + ), + ], + UnionMode::Dense + )) + ); +} + +#[test] +fn new_struct_enum() { + #[derive(DefaultArrayType)] + #[allow(dead_code)] + enum Enum { + Int64(i64), + Struct { a: f32, b: String }, + } + + assert_eq!( + marrow_convert::types::get_data_type::(&Options::default()), + Ok(DataType::Union( + vec![ + ( + 0, + Field { + name: String::from("Int64"), + data_type: DataType::Int64, + ..Field::default() + } + ), + ( + 1, + Field { + name: String::from("Struct"), + data_type: DataType::Struct(vec![ + Field { + name: String::from("a"), + data_type: DataType::Float32, + ..Field::default() + }, + Field { + name: String::from("b"), + data_type: DataType::LargeUtf8, + ..Field::default() + }, + ]), + ..Field::default() + } + ), + ], + UnionMode::Dense + )) + ); +} + +#[test] +fn const_generics() { + #[derive(DefaultArrayType)] + #[allow(unused)] + struct Struct { + data: [u8; N], + } + + assert_eq!( + marrow_convert::types::get_data_type::>(&Options::default()), + Ok(DataType::Struct(vec![Field { + name: String::from("data"), + data_type: DataType::FixedSizeBinary(4), + 
nullable: false, + metadata: Default::default(), + },])) + ); +} + +#[test] +fn liftime_generics() { + #[derive(DefaultArrayType)] + #[allow(unused)] + struct Struct<'a, 'b> { + a: &'a u8, + b: &'b u16, + } + + assert_eq!( + marrow_convert::types::get_data_type::(&Options::default()), + Ok(DataType::Struct(vec![ + Field { + name: String::from("a"), + data_type: DataType::UInt8, + ..Default::default() + }, + Field { + name: String::from("b"), + data_type: DataType::UInt16, + ..Default::default() + }, + ])) + ); +} + +#[test] +fn liftime_generics_with_bounds() { + #[derive(DefaultArrayType)] + #[allow(unused)] + struct Struct<'a, 'b: 'a> { + a: &'a u8, + b: &'b u16, + } + + assert_eq!( + marrow_convert::types::get_data_type::(&Options::default()), + Ok(DataType::Struct(vec![ + Field { + name: String::from("a"), + data_type: DataType::UInt8, + ..Default::default() + }, + Field { + name: String::from("b"), + data_type: DataType::UInt16, + ..Default::default() + }, + ])) + ); +} + +#[test] +fn liftime_generics_with_where_clause() { + #[derive(DefaultArrayType)] + #[allow(unused)] + struct Struct<'a, 'b> + where + 'a: 'b, + { + a: &'a u8, + b: &'b u16, + } + + assert_eq!( + marrow_convert::types::get_data_type::(&Options::default()), + Ok(DataType::Struct(vec![ + Field { + name: String::from("a"), + data_type: DataType::UInt8, + ..Default::default() + }, + Field { + name: String::from("b"), + data_type: DataType::UInt16, + ..Default::default() + }, + ])) + ); +} + +#[test] +fn enums_const_generics() { + #[derive(DefaultArrayType)] + #[allow(unused)] + enum Enum { + Data([u8; N]), + } + + assert_eq!( + marrow_convert::types::get_data_type::>(&Options::default()), + Ok(DataType::Union( + vec![( + 0, + Field { + name: String::from("Data"), + data_type: DataType::FixedSizeBinary(4), + nullable: false, + metadata: Default::default(), + } + ),], + UnionMode::Dense + )), + ); +} + +#[test] +fn enums_with_liftime_generics() { + #[derive(DefaultArrayType)] + #[allow(unused)] + 
enum Enum<'a, 'b> { + A(&'a u8), + B(&'b u16), + } + + assert_eq!( + marrow_convert::types::get_data_type::(&Options::default()), + Ok(DataType::Union( + vec![ + ( + 0, + Field { + name: String::from("A"), + data_type: DataType::UInt8, + ..Default::default() + } + ), + ( + 1, + Field { + name: String::from("B"), + data_type: DataType::UInt16, + ..Default::default() + } + ), + ], + UnionMode::Dense + )) + ); +} + +#[test] +fn enum_liftime_generics_with_bounds() { + #[derive(DefaultArrayType)] + #[allow(unused)] + enum Enum<'a, 'b: 'a> { + A(&'a u8), + B(&'b u16), + } + + assert_eq!( + marrow_convert::types::get_data_type::(&Options::default()), + Ok(DataType::Union( + vec![ + ( + 0, + Field { + name: String::from("A"), + data_type: DataType::UInt8, + ..Default::default() + } + ), + ( + 1, + Field { + name: String::from("B"), + data_type: DataType::UInt16, + ..Default::default() + } + ), + ], + UnionMode::Dense + )) + ); +} + +#[test] +fn enum_liftime_generics_with_where_clause() { + #[derive(DefaultArrayType)] + #[allow(unused)] + enum Enum<'a, 'b> + where + 'a: 'b, + { + A(&'a u8), + B(&'b u16), + } + + assert_eq!( + marrow_convert::types::get_data_type::(&Options::default()), + Ok(DataType::Union( + vec![ + ( + 0, + Field { + name: String::from("A"), + data_type: DataType::UInt8, + ..Default::default() + } + ), + ( + 1, + Field { + name: String::from("B"), + data_type: DataType::UInt16, + ..Default::default() + } + ), + ], + UnionMode::Dense + )) + ); +} diff --git a/marrow/src/array.rs b/marrow/src/array.rs index 0a0fb8a..e2ad938 100644 --- a/marrow/src/array.rs +++ b/marrow/src/array.rs @@ -256,6 +256,76 @@ impl Array { Self::Union(array) => View::Union(array.as_view()), } } + + /// Extract the underlying primitive array if the array is of type Int8 + pub fn into_int8(self) -> Result, Array> { + match self { + Self::Int8(res) => Ok(res), + this => Err(this), + } + } + + /// Extract the underlying primitive array if the array is of type Int32 + pub fn 
into_int32(self) -> Result, Array> { + match self { + Self::Int32(res) => Ok(res), + this => Err(this), + } + } + + /// Extract the underlying primitive array if the array is of type Int64 + pub fn into_int64(self) -> Result, Array> { + match self { + Self::Int64(res) => Ok(res), + this => Err(this), + } + } + + /// Extract the underlying arrays of a struct array + pub fn into_struct_fields(self) -> Result<[(FieldMeta, Array); N], Array> { + match self { + Array::Struct(this) => { + let StructArray { + len, + validity, + fields, + } = this; + + match <[(FieldMeta, Array); N]>::try_from(fields) { + Ok(fields) => Ok(fields), + // rebuild the original array + Err(fields) => Err(Array::Struct(StructArray { + len, + validity, + fields, + })), + } + } + this => Err(this), + } + } + + /// Extract the underlying arrays of a union array + pub fn into_union_fields(self) -> Result<[(i8, FieldMeta, Array); N], Array> { + match self { + Array::Union(this) => { + let UnionArray { + types, + offsets, + fields, + } = this; + match <[(i8, FieldMeta, Array); N]>::try_from(fields) { + Ok(fields) => Ok(fields), + Err(fields) => Err(Array::Union(UnionArray { + types, + offsets, + fields, + })), + } + } + this => Err(this), + } + } } /// An array without data diff --git a/marrow/src/datatypes.rs b/marrow/src/datatypes.rs index aa11b1d..22521b5 100644 --- a/marrow/src/datatypes.rs +++ b/marrow/src/datatypes.rs @@ -37,7 +37,7 @@ impl std::default::Default for Field { Self { data_type: DataType::Null, name: Default::default(), - nullable: Default::default(), + nullable: false, metadata: Default::default(), } } diff --git a/marrow/src/types.rs b/marrow/src/types.rs index ec64d1e..4a63387 100644 --- a/marrow/src/types.rs +++ b/marrow/src/types.rs @@ -1,5 +1,8 @@ //! 
Specialized element types of arrays +/// Reexport the used f16 type +pub use half::f16; + /// Represent a calendar interval as days and milliseconds #[derive(Debug, PartialEq, Clone, Copy, bytemuck::AnyBitPattern, bytemuck::NoUninit)] #[repr(C)] diff --git a/x.py b/x.py index cd1af8e..3885a9a 100644 --- a/x.py +++ b/x.py @@ -210,11 +210,11 @@ def doc(private=False, open=False): @cmd() def check_cargo_toml(): - import tomli + import tomllib print(":: check Cargo.toml") with open(self_path / "marrow" / "Cargo.toml", "rb") as fobj: - config = tomli.load(fobj) + config = tomllib.load(fobj) for label, features in [ (