diff --git a/arrow/array/util.go b/arrow/array/util.go index 6a1d29cb..5cbcbeab 100644 --- a/arrow/array/util.go +++ b/arrow/array/util.go @@ -447,6 +447,8 @@ func getMaxBufferLen(dt arrow.DataType, length int) int { return bufferLen case arrow.OffsetsDataType: return maxOf(dt.OffsetTypeTraits().BytesRequired(length + 1)) + case arrow.BinaryViewDataType: + return maxOf(arrow.ViewHeaderSizeBytes * length) case *arrow.FixedSizeListType: return maxOf(getMaxBufferLen(dt.Elem(), int(dt.Len())*length)) case arrow.ExtensionType: @@ -496,6 +498,8 @@ func (n *nullArrayFactory) create() *Data { dictData = arr.Data() case arrow.FixedWidthDataType: bufs = append(bufs, n.buf) + case arrow.BinaryViewDataType: + bufs = append(bufs, n.buf) case arrow.BinaryDataType: bufs = append(bufs, n.buf, n.buf) case arrow.OffsetsDataType: diff --git a/arrow/compute/cast.go b/arrow/compute/cast.go index 95f2f8f1..df95a0f0 100644 --- a/arrow/compute/cast.go +++ b/arrow/compute/cast.go @@ -21,6 +21,7 @@ package compute import ( "context" "fmt" + "math" "sync" "github.com/apache/arrow-go/v18/arrow" @@ -28,6 +29,7 @@ import ( "github.com/apache/arrow-go/v18/arrow/bitutil" "github.com/apache/arrow-go/v18/arrow/compute/exec" "github.com/apache/arrow-go/v18/arrow/compute/internal/kernels" + "github.com/apache/arrow-go/v18/arrow/memory" ) var ( @@ -52,6 +54,21 @@ var ( return NewDatum(d[0]), nil } + // Coalesce multi-buffer view inputs. exec.ArraySpan's fixed + // [3]BufferSpan cannot carry BinaryView/StringView arrays with + // more than one overflow data buffer, so SetMembers would panic + // before any kernel runs. Rebuild such inputs with a single data + // buffer up front. See GH-184 review feedback. + coalescedDatum, err := coalesceMultiBufferViewDatum(ctx, d[0]) + if err != nil { + return nil, err + } + if coalescedDatum != nil { + defer coalescedDatum.Release() + d = append([]Datum(nil), d...) + d[0] = coalescedDatum + } + fn, err := getCastFunction(castOpts.ToType) if err != nil { return nil, fmt.Errorf("%w from %s", err, d[0].(ArrayLikeDatum).Type()) @@ -65,6 +82,245 @@ func RegisterScalarCast(reg FunctionRegistry) { reg.AddFunction(castMetaFunc, false) } +// coalesceMultiBufferViewDatum returns a new Datum whose view arrays each +// have a single overflow data buffer, or nil if the input does not need +// coalescing. Both ArrayDatum and ChunkedDatum view inputs are handled, +// including dictionaries whose values are multi-buffer view arrays (the +// recursive check matches what exec.ArraySpan.SetMembers traverses). +// Other datums are never coalesced. Callers are responsible for +// releasing the returned datum when it is non-nil. +func coalesceMultiBufferViewDatum(ctx context.Context, d Datum) (Datum, error) { + switch v := d.(type) { + case *ArrayDatum: + if !needsViewCoalesce(v.Value) { + return nil, nil + } + mem := exec.GetAllocator(ctx) + newData, err := coalesceArrayData(mem, v.Value) + if err != nil { + return nil, err + } + return &ArrayDatum{Value: newData}, nil + + case *ChunkedDatum: + chunks := v.Value.Chunks() + needAny := false + for _, c := range chunks { + if needsViewCoalesce(c.Data()) { + needAny = true + break + } + } + if !needAny { + return nil, nil + } + mem := exec.GetAllocator(ctx) + newChunks := make([]arrow.Array, len(chunks)) + for i, c := range chunks { + if !needsViewCoalesce(c.Data()) { + c.Retain() + newChunks[i] = c + continue + } + newData, err := coalesceArrayData(mem, c.Data()) + if err != nil { + for j := 0; j < i; j++ { + newChunks[j].Release() + } + return nil, err + } + newChunks[i] = array.MakeFromData(newData) + newData.Release() + } + chunked := arrow.NewChunked(v.Value.DataType(), newChunks) + for _, nc := range newChunks { + nc.Release() + } + return &ChunkedDatum{Value: chunked}, nil + } + return nil, nil +} + +// needsViewCoalesce reports whether data carries a view array whose +// payload spans more than the single overflow data buffer exec.ArraySpan +// can carry. The check recurses into dictionary values and ordinary +// child arrays because exec.ArraySpan.SetMembers recursively spans both; +// extension inputs reuse their storage layout in place, so they are +// treated as their StorageType for this traversal. +func needsViewCoalesce(data arrow.ArrayData) bool { + dt := data.DataType() + if ext, ok := dt.(arrow.ExtensionType); ok { + dt = ext.StorageType() + } + switch dt.ID() { + case arrow.BINARY_VIEW, arrow.STRING_VIEW: + return len(data.Buffers()) > 3 + case arrow.DICTIONARY: + return needsViewCoalesce(data.Dictionary()) + } + for _, c := range data.Children() { + if needsViewCoalesce(c) { + return true + } + } + return false +} + +// coalesceArrayData rebuilds data so that every view descendant lives in +// a single overflow buffer. View arrays are rebuilt via +// rebuildViewSingleBuffer; dictionaries keep their index buffers and +// recursively rebuild their values; nested types (list, struct, etc.) +// keep their own buffers and recursively rebuild each child. Extension +// inputs are rebuilt at their storage layer and then re-wrapped so the +// original extension datatype survives. Shares already-compliant +// children by retaining them rather than copying. +func coalesceArrayData(mem memory.Allocator, data arrow.ArrayData) (arrow.ArrayData, error) { + if ext, ok := data.DataType().(arrow.ExtensionType); ok { + storageData, err := reshapeArrayDataType(data, ext.StorageType()) + if err != nil { + return nil, err + } + defer storageData.Release() + newStorage, err := coalesceArrayData(mem, storageData) + if err != nil { + return nil, err + } + defer newStorage.Release() + return reshapeArrayDataType(newStorage, data.DataType()) + } + + switch data.DataType().ID() { + case arrow.BINARY_VIEW, arrow.STRING_VIEW: + return rebuildViewSingleBuffer(mem, data) + case arrow.DICTIONARY: + newValues, err := coalesceArrayData(mem, data.Dictionary()) + if err != nil { + return nil, err + } + defer newValues.Release() + newDict, ok := newValues.(*array.Data) + if !ok { + return nil, fmt.Errorf("%w: unexpected dictionary values data type %T", arrow.ErrInvalid, newValues) + } + return array.NewDataWithDictionary(data.DataType(), data.Len(), + data.Buffers(), data.NullN(), data.Offset(), newDict), nil + } + + children := data.Children() + if len(children) == 0 { + return nil, fmt.Errorf("%w: coalesceArrayData: no view descendants in %s", arrow.ErrInvalid, data.DataType()) + } + newChildren := make([]arrow.ArrayData, len(children)) + for i, c := range children { + if !needsViewCoalesce(c) { + c.Retain() + newChildren[i] = c + continue + } + nc, err := coalesceArrayData(mem, c) + if err != nil { + for j := 0; j < i; j++ { + newChildren[j].Release() + } + return nil, err + } + newChildren[i] = nc + } + result := array.NewData(data.DataType(), data.Len(), data.Buffers(), + newChildren, data.NullN(), data.Offset()) + for _, nc := range newChildren { + nc.Release() + } + return result, nil +} + +// reshapeArrayDataType returns a new ArrayData that shares src's buffers, +// children, nulls, offset, and dictionary but reports the given datatype. +// Used by the extension unwrap/rewrap path in coalesceArrayData so the +// Dictionary() member survives both directions (extension<->storage) +// when the extension's storage type is a dictionary. +func reshapeArrayDataType(src arrow.ArrayData, dt arrow.DataType) (arrow.ArrayData, error) { + if dict := src.Dictionary(); dict != nil { + d, ok := dict.(*array.Data) + if !ok { + return nil, fmt.Errorf("%w: unexpected dictionary data type %T", arrow.ErrInvalid, dict) + } + return array.NewDataWithDictionary(dt, src.Len(), src.Buffers(), + src.NullN(), src.Offset(), d), nil + } + return array.NewData(dt, src.Len(), src.Buffers(), + src.Children(), src.NullN(), src.Offset()), nil +} + +// rebuildViewSingleBuffer rebuilds a BinaryView/StringView ArrayData so that +// all non-inline payload lives in a single contiguous overflow buffer. The +// caller owns the returned ArrayData. +func rebuildViewSingleBuffer(mem memory.Allocator, data arrow.ArrayData) (arrow.ArrayData, error) { + arr := array.MakeFromData(data) + defer arr.Release() + + var ( + getLen func(int) int + getInto func(i int, dst func([]byte)) + ) + switch a := arr.(type) { + case *array.BinaryView: + getLen = a.ValueLen + getInto = func(i int, dst func([]byte)) { dst(a.Value(i)) } + case *array.StringView: + getLen = a.ValueLen + getInto = func(i int, dst func([]byte)) { + s := a.Value(i) + dst([]byte(s)) + } + default: + return nil, fmt.Errorf("%w: unexpected view array type %T", arrow.ErrInvalid, arr) + } + total, err := sumOutOfLineBytes(arr.Len(), arr.IsNull, getLen) + if err != nil { + return nil, err + } + + bldr := array.NewBuilder(mem, arr.DataType()) + defer bldr.Release() + bldr.Reserve(arr.Len()) + + switch b := bldr.(type) { + case *array.BinaryViewBuilder: + if total > 0 { + b.ReserveData(int(total)) + } + case *array.StringViewBuilder: + if total > 0 { + b.ReserveData(int(total)) + } + default: + return nil, fmt.Errorf("%w: unexpected view builder type %T", arrow.ErrInvalid, bldr) + } + + appendBytes := func([]byte) {} + switch b := bldr.(type) { + case *array.BinaryViewBuilder: + appendBytes = b.Append + case *array.StringViewBuilder: + appendBytes = b.BinaryViewBuilder.Append + } + + for i := 0; i < arr.Len(); i++ { + if arr.IsNull(i) { + bldr.AppendNull() + continue + } + getInto(i, appendBytes) + } + + newArr := bldr.NewArray() + defer newArr.Release() + result := newArr.Data() + result.Retain() + return result, nil +} + type castFunction struct { ScalarFunction @@ -150,7 +406,18 @@ func unpackDictionary(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecR arrow.ErrInvalid, toType, dictType) } - unpacked, err := TakeArray(ctx.Ctx, dictArr.Dictionary(), dictArr.Indices()) + var ( + unpacked arrow.Array + err error + ) + switch dictArr.Dictionary().DataType().ID() { + case arrow.STRING_VIEW, arrow.BINARY_VIEW: + // array_take has no view kernel, so unpack view-typed dictionaries + // directly into a fresh view array instead of going through TakeArray. + unpacked, err = unpackViewDictionary(exec.GetAllocator(ctx.Ctx), dictArr) + default: + unpacked, err = TakeArray(ctx.Ctx, dictArr.Dictionary(), dictArr.Indices()) + } if err != nil { return err } @@ -168,6 +435,135 @@ func unpackDictionary(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecR return nil } +// unpackViewDictionary materializes a dictionary whose values are a view +// type (string_view or binary_view) into a flat view array of the same +// type, preserving per-element nulls from the dictionary indices and +// per-slot nulls from the dictionary values themselves. Out-of-line +// payload is pre-reserved on the builder so the resulting array lives +// in a single overflow data buffer; exec.ArraySpan's fixed [3]BufferSpan +// cannot carry view arrays that span multiple data buffers. This exists +// because array_take has no view kernel; the cast meta function still +// needs to expand dictionaries as part of DICTIONARY -> X casts. +func unpackViewDictionary(mem memory.Allocator, dictArr *array.Dictionary) (arrow.Array, error) { + switch vals := dictArr.Dictionary().(type) { + case *array.StringView: + bldr := array.NewStringViewBuilder(mem) + defer bldr.Release() + if err := unpackViewDictionaryIntoBuilder(dictArr, vals, vals.ValueLen, viewDictBuilderAdapter{ + builder: bldr, + reserveData: bldr.ReserveData, + appendValue: func(idx int) { bldr.Append(vals.Value(idx)) }, + appendNull: bldr.AppendNull, + }); err != nil { + return nil, err + } + return bldr.NewArray(), nil + case *array.BinaryView: + bldr := array.NewBinaryViewBuilder(mem) + defer bldr.Release() + if err := unpackViewDictionaryIntoBuilder(dictArr, vals, vals.ValueLen, viewDictBuilderAdapter{ + builder: bldr, + reserveData: bldr.ReserveData, + appendValue: func(idx int) { bldr.Append(vals.Value(idx)) }, + appendNull: bldr.AppendNull, + }); err != nil { + return nil, err + } + return bldr.NewArray(), nil + default: + return nil, fmt.Errorf("%w: unpackViewDictionary: expected view-typed dictionary values, got %s", + arrow.ErrInvalid, vals.DataType()) + } +} + +// viewValuesNull is the null-check half of the view dictionary values +// interface; exposed separately because *array.StringView and +// *array.BinaryView both satisfy it while their Value() return types +// differ (string vs []byte) and cannot be expressed in one interface. +type viewValuesNull interface { + IsNull(int) bool +} + +// viewDictBuilderAdapter packages the concrete builder + typed closures +// needed by unpackViewDictionaryIntoBuilder; boxing them keeps the +// helper's parameter list tight and lets each caller express the +// builder-specific Append in one place. +type viewDictBuilderAdapter struct { + builder array.Builder + reserveData func(int) + appendValue func(idx int) + appendNull func() +} + +// unpackViewDictionaryIntoBuilder runs the shared reserve/walk pipeline +// for both view-typed dictionary branches of unpackViewDictionary. The +// caller owns the builder lifecycle and supplies the adapter so Append +// can stay typed to the concrete builder (StringViewBuilder.Append takes +// string, BinaryViewBuilder.Append takes []byte). +func unpackViewDictionaryIntoBuilder(dictArr *array.Dictionary, vals viewValuesNull, valLen func(int) int, adapter viewDictBuilderAdapter) error { + outOfLine, err := sumOutOfLineBytes(dictArr.Len(), + func(i int) bool { + if dictArr.IsNull(i) { + return true + } + return vals.IsNull(dictArr.GetValueIndex(i)) + }, + func(i int) int { return valLen(dictArr.GetValueIndex(i)) }, + ) + if err != nil { + return err + } + adapter.builder.Reserve(dictArr.Len()) + if outOfLine > 0 { + adapter.reserveData(int(outOfLine)) + } + buildFromDictionary(dictArr, vals.IsNull, adapter.appendValue, adapter.appendNull) + return nil +} + +// buildFromDictionary walks dictArr and routes each position to either +// appendValue(valueIndex) or appendNull(). A position is null when the +// dictionary index itself is null or when the referenced value is null. +func buildFromDictionary(dictArr *array.Dictionary, valIsNull func(int) bool, appendValue func(idx int), appendNull func()) { + for i := 0; i < dictArr.Len(); i++ { + if dictArr.IsNull(i) { + appendNull() + continue + } + idx := dictArr.GetValueIndex(i) + if valIsNull(idx) { + appendNull() + continue + } + appendValue(idx) + } +} + +// sumOutOfLineBytes returns the total non-inline payload (in bytes) that +// would be written to a view array's overflow buffer when iterating n +// positions with the given predicates, and arrow.ErrInvalid if the total +// exceeds the single-overflow-buffer limit (math.MaxInt32). Used by +// rebuildViewSingleBuffer (iterating top-level view rows), the +// binary-to-view cast kernel, and unpackViewDictionary (where the +// closures translate dictionary positions into value-level lookups). +func sumOutOfLineBytes(n int, isNull func(int) bool, valueLen func(int) int) (int64, error) { + var total int64 + for i := 0; i < n; i++ { + if isNull(i) { + continue + } + vlen := valueLen(i) + if !arrow.IsViewInline(vlen) { + total += int64(vlen) + if total > math.MaxInt32 { + return 0, fmt.Errorf("%w: view out-of-line payload (%d bytes) exceeds single-buffer limit (%d bytes)", + arrow.ErrInvalid, total, math.MaxInt32) + } + } + } + return total, nil +} + func CastFromExtension(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { opts := ctx.State.(kernels.CastState) @@ -537,6 +933,8 @@ func getBinaryLikeCasts() []*castFunction { addFn("cast_large_binary", arrow.LARGE_BINARY, kernels.GetToBinaryKernels(arrow.BinaryTypes.LargeBinary)) addFn("cast_string", arrow.STRING, kernels.GetToBinaryKernels(arrow.BinaryTypes.String)) addFn("cast_large_string", arrow.LARGE_STRING, kernels.GetToBinaryKernels(arrow.BinaryTypes.LargeString)) + addFn("cast_binary_view", arrow.BINARY_VIEW, kernels.GetToBinaryKernels(arrow.BinaryTypes.BinaryView)) + addFn("cast_string_view", arrow.STRING_VIEW, kernels.GetToBinaryKernels(arrow.BinaryTypes.StringView)) addFn("cast_fixed_sized_binary", arrow.FIXED_SIZE_BINARY, kernels.GetFsbCastKernels()) return out } diff --git a/arrow/compute/cast_internal_test.go b/arrow/compute/cast_internal_test.go new file mode 100644 index 00000000..1bfc5dfb --- /dev/null +++ b/arrow/compute/cast_internal_test.go @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.18 + +package compute + +import ( + "strings" + "testing" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/memory" + "github.com/apache/arrow-go/v18/internal/types" +) + +// TestCoalesceArrayDataExtensionOverDictOfView is the regression test for +// the GH-184 twelfth-round review finding: when an extension's storage is +// dictionary<...view...>, coalesceArrayData must preserve the +// Dictionary() member when it unwraps and rewraps the extension so the +// recursive DICTIONARY branch does not see a nil dictionary. Driven +// directly at the coalesce helper because the full cast pipeline for +// extension> hits a pre-existing SetMembers gap unrelated to +// view coalescing (exec.ArraySpan.SetMembers does not recognize a +// dictionary stored under an extension type). +func TestCoalesceArrayDataExtensionOverDictOfView(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + extType := types.NewDictStringViewExtType() + + const valBytes = 20000 + a := strings.Repeat("a", valBytes) + b := strings.Repeat("b", valBytes) + vbldr := array.NewStringViewBuilder(mem) + defer vbldr.Release() + vbldr.Append(a) + vbldr.Append(b) + vals := vbldr.NewArray() + defer vals.Release() + if n := len(vals.Data().Buffers()); n <= 3 { + t.Fatalf("test precondition: dict values must be multi-buffer; got %d", n) + } + + ibldr := array.NewInt32Builder(mem) + defer ibldr.Release() + ibldr.AppendValues([]int32{0, 1, 0, 1, 0}, nil) + idx := ibldr.NewArray() + defer idx.Release() + + valsData, ok := vals.Data().(*array.Data) + if !ok { + t.Fatalf("values data not *array.Data, got %T", vals.Data()) + } + extData := array.NewDataWithDictionary(extType, idx.Len(), + idx.Data().Buffers(), 0, 0, valsData) + defer extData.Release() + + if !needsViewCoalesce(extData) { + t.Fatal("needsViewCoalesce should return true for extension>") + } + + coalesced, err := coalesceArrayData(mem, extData) + if err != nil { + t.Fatalf("coalesceArrayData returned an error: %v", err) + } + defer coalesced.Release() + + if !arrow.TypeEqual(coalesced.DataType(), extType) { + t.Errorf("coalesced datatype mismatch: want %s, got %s", extType, coalesced.DataType()) + } + dict := coalesced.Dictionary() + if dict == nil { + t.Fatal("coalesced extension> must preserve Dictionary()") + } + if got := len(dict.Buffers()); got > 3 { + t.Errorf("coalesced dictionary values must be single-buffer; got %d buffers", got) + } +} diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index 370ced16..330f33fd 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -19,6 +19,7 @@ package compute_test import ( + "bytes" "context" "fmt" "math" @@ -1527,6 +1528,952 @@ func (c *CastSuite) TestStringToString() { } } +// TestBinaryLikeToBinaryView covers GH-184: casting between base-binary +// (binary, large_binary, string, large_string, fixed_size_binary) and the +// view variants (binary_view, string_view). It verifies round-trip fidelity +// for both short (<=12 byte, inline) and long (out-of-line) values, across +// null bitmaps, and exercises UTF-8 validation semantics. +func (c *CastSuite) TestBinaryLikeToBinaryView() { + const strInJSON = `["a", "short", "this is a string well over twelve bytes", "", null]` + + for _, from := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { + c.Run("from "+from.String(), func() { + c.Run("to string_view", func() { + c.checkCastArrayOnly(from, arrow.BinaryTypes.StringView, strInJSON, strInJSON) + c.checkCastArrayOnly(from, arrow.BinaryTypes.StringView, `[]`, `[]`) + }) + c.Run("to binary_view", func() { + exp := c.buildBinaryViewArray([][]byte{ + []byte("a"), []byte("short"), + []byte("this is a string well over twelve bytes"), + []byte(""), nil, + }) + defer exp.Release() + + in, _, err := array.FromJSON(c.mem, from, strings.NewReader(strInJSON), array.WithUseNumber()) + c.Require().NoError(err) + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.BinaryView)) + c.Require().NoError(err) + defer got.Release() + + c.Truef(array.Equal(exp, got), "expected %s, got %s", exp, got) + }) + }) + } + + const binJSON = `["aGk=", "dGhpcyBpcyBieXRlcyBiZXlvbmQgdHdlbHZlIGNoYXJz", "//4=", null]` + for _, from := range []arrow.DataType{arrow.BinaryTypes.Binary, arrow.BinaryTypes.LargeBinary} { + c.Run("from "+from.String(), func() { + c.Run("to binary_view", func() { + c.checkCastArrayOnly(from, arrow.BinaryTypes.BinaryView, binJSON, binJSON) + }) + c.Run("to string_view rejects invalid utf8", func() { + invalid := c.invalidUtf8Arr(from) + defer invalid.Release() + _, err := compute.CastArray(context.Background(), invalid, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.ErrorIs(err, arrow.ErrInvalid) + }) + c.Run("to string_view allows invalid utf8 when opted in", func() { + invalid := c.invalidUtf8Arr(from) + defer invalid.Release() + opts := compute.SafeCastOptions(arrow.BinaryTypes.StringView) + opts.AllowInvalidUtf8 = true + out, err := compute.CastArray(context.Background(), invalid, opts) + c.NoError(err) + defer out.Release() + c.Equal(invalid.Len(), out.Len()) + }) + }) + } + + c.Run("from fixed_size_binary", func() { + fsbType := &arrow.FixedSizeBinaryType{ByteWidth: 3} + in, _, err := array.FromJSON(c.mem, fsbType, + strings.NewReader(`["YWJj", "ZGVm", null]`), array.WithUseNumber()) + c.Require().NoError(err) + defer in.Release() + + exp := c.buildBinaryViewArray([][]byte{[]byte("abc"), []byte("def"), nil}) + defer exp.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.BinaryView)) + c.Require().NoError(err) + defer got.Release() + + c.Truef(array.Equal(exp, got), "expected %s, got %s", exp, got) + }) +} + +// TestBinaryViewToBinaryLike covers the reverse direction of GH-184: +// materializing view arrays back into contiguous binary/string arrays. +func (c *CastSuite) TestBinaryViewToBinaryLike() { + const strJSON = `["a", "short", "this is a string well over twelve bytes", "", null]` + + for _, to := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { + c.Run("string_view to "+to.String(), func() { + c.checkCastArrayOnly(arrow.BinaryTypes.StringView, to, strJSON, strJSON) + c.checkCastArrayOnly(arrow.BinaryTypes.StringView, to, `[]`, `[]`) + }) + } + + const binJSON = `["aGk=", "dGhpcyBpcyBieXRlcyBiZXlvbmQgdHdlbHZlIGNoYXJz", "//4=", null]` + for _, to := range []arrow.DataType{arrow.BinaryTypes.Binary, arrow.BinaryTypes.LargeBinary} { + c.Run("binary_view to "+to.String(), func() { + c.checkCastArrayOnly(arrow.BinaryTypes.BinaryView, to, binJSON, binJSON) + }) + } + + c.Run("binary_view to string rejects invalid utf8", func() { + invalid := c.buildBinaryViewArray([][]byte{[]byte("ok"), {0xff, 0xfe}}) + defer invalid.Release() + + _, err := compute.CastArray(context.Background(), invalid, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.ErrorIs(err, arrow.ErrInvalid) + }) +} + +// TestBinaryViewToBinaryView covers cross-view casts (binary_view <-> +// string_view) and identity casts. These are zero-copy except for UTF-8 +// validation in the binary_view -> string_view direction. +func (c *CastSuite) TestBinaryViewToBinaryView() { + values := [][]byte{ + []byte("a"), []byte("short"), + []byte("this is a string well over twelve bytes"), + []byte(""), nil, + } + + c.Run("string_view to binary_view", func() { + in := c.buildStringViewArray([]string{"a", "short", "this is a string well over twelve bytes", "", ""}, []bool{true, true, true, true, false}) + defer in.Release() + exp := c.buildBinaryViewArray(values) + defer exp.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.BinaryView)) + c.Require().NoError(err) + defer got.Release() + + c.Truef(array.Equal(exp, got), "expected %s, got %s", exp, got) + }) + + c.Run("binary_view to string_view valid utf8", func() { + in := c.buildBinaryViewArray(values) + defer in.Release() + exp := c.buildStringViewArray([]string{"a", "short", "this is a string well over twelve bytes", "", ""}, + []bool{true, true, true, true, false}) + defer exp.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer got.Release() + + c.Truef(array.Equal(exp, got), "expected %s, got %s", exp, got) + }) + + c.Run("binary_view to string_view rejects invalid utf8", func() { + invalid := c.buildBinaryViewArray([][]byte{[]byte("ok"), {0x80, 0x81}}) + defer invalid.Release() + + _, err := compute.CastArray(context.Background(), invalid, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.ErrorIs(err, arrow.ErrInvalid) + }) +} + +// TestCanCastViewTypes verifies CanCast advertises the new view cast paths +// registered for GH-184. +func (c *CastSuite) TestCanCastViewTypes() { + baseBinary := []arrow.DataType{ + arrow.BinaryTypes.Binary, arrow.BinaryTypes.LargeBinary, + arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString, + } + viewTypes := []arrow.DataType{arrow.BinaryTypes.BinaryView, arrow.BinaryTypes.StringView} + + for _, from := range baseBinary { + for _, to := range viewTypes { + c.Truef(compute.CanCast(from, to), "expected CanCast %s -> %s", from, to) + c.Truef(compute.CanCast(to, from), "expected CanCast %s -> %s", to, from) + } + } + + for _, from := range viewTypes { + for _, to := range viewTypes { + c.Truef(compute.CanCast(from, to), "expected CanCast %s -> %s", from, to) + } + } + + c.True(compute.CanCast(&arrow.FixedSizeBinaryType{ByteWidth: 3}, arrow.BinaryTypes.BinaryView)) + c.True(compute.CanCast(&arrow.FixedSizeBinaryType{ByteWidth: 3}, arrow.BinaryTypes.StringView)) +} + +// TestBinaryLikeToBinaryViewLargePayload is a regression test for the GH-184 +// review feedback: when the cast output's out-of-line payload would exceed +// the builder's default 32KB block size, the kernel must still produce a +// single-data-buffer view so the result fits in ArraySpan's fixed 3-buffer +// layout. A multi-buffer result would panic in out.TakeOwnership. +func (c *CastSuite) TestBinaryLikeToBinaryViewLargePayload() { + const ( + perValue = 200 + count = 500 // total out-of-line = 100_000 bytes, comfortably over the 32KB block default + ) + + longStr := strings.Repeat("x", perValue) + + sb := array.NewStringBuilder(c.mem) + defer sb.Release() + for i := 0; i < count; i++ { + sb.Append(longStr) + } + in := sb.NewArray() + defer in.Release() + + for _, to := range []arrow.DataType{arrow.BinaryTypes.StringView, arrow.BinaryTypes.BinaryView} { + c.Run("to "+to.String(), func() { + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(to)) + c.Require().NoError(err) + defer got.Release() + + c.Equal(count, got.Len()) + c.LessOrEqualf(len(got.Data().Buffers()), 3, + "expected <=3 buffers (bitmap + view headers + 1 data), got %d", + len(got.Data().Buffers())) + + switch v := got.(type) { + case *array.StringView: + for i := 0; i < v.Len(); i++ { + c.Equalf(longStr, v.Value(i), "row %d mismatch", i) + } + case *array.BinaryView: + for i := 0; i < v.Len(); i++ { + c.Equalf(longStr, string(v.Value(i)), "row %d mismatch", i) + } + } + }) + } +} + +// TestMultiBufferViewInputCoalesced is a regression test for the second +// round of GH-184 review feedback: a BinaryView/StringView input whose +// payload already spans multiple builder blocks (>32KB) must be coalesced +// into a single-buffer view before entering compute, since +// exec.ArraySpan's fixed [3]BufferSpan cannot carry multi-buffer views. +func (c *CastSuite) TestMultiBufferViewInputCoalesced() { + const ( + perValue = 200 + count = 500 // ~100KB of non-inline payload, forces multi-buffer view + ) + longStr := strings.Repeat("y", perValue) + + buildMultiBufferBinaryView := func() arrow.Array { + bldr := array.NewBinaryViewBuilder(c.mem) + defer bldr.Release() + for i := 0; i < count; i++ { + bldr.Append([]byte(longStr)) + } + return bldr.NewArray() + } + + buildMultiBufferStringView := func() arrow.Array { + bldr := array.NewStringViewBuilder(c.mem) + defer bldr.Release() + for i := 0; i < count; i++ { + bldr.Append(longStr) + } + return bldr.NewArray() + } + + c.Run("binary_view input has multiple data buffers", func() { + in := buildMultiBufferBinaryView() + defer in.Release() + c.Greater(len(in.Data().Buffers()), 3, + "test precondition: input must be multi-buffer") + }) + + c.Run("string_view to utf8", func() { + in := buildMultiBufferStringView() + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.Require().NoError(err) + defer got.Release() + + sv := got.(*array.String) + c.Equal(count, sv.Len()) + for i := 0; i < sv.Len(); i++ { + c.Equalf(longStr, sv.Value(i), "row %d mismatch", i) + } + }) + + c.Run("binary_view to binary", func() { + in := buildMultiBufferBinaryView() + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.Binary)) + c.Require().NoError(err) + defer got.Release() + + bv := got.(*array.Binary) + c.Equal(count, bv.Len()) + for i := 0; i < bv.Len(); i++ { + c.Equalf(longStr, string(bv.Value(i)), "row %d mismatch", i) + } + }) + + c.Run("binary_view to string_view", func() { + in := buildMultiBufferBinaryView() + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer got.Release() + + sv := got.(*array.StringView) + c.Equal(count, sv.Len()) + c.LessOrEqualf(len(got.Data().Buffers()), 3, + "coalesced output must fit in ArraySpan's 3 buffers, got %d", + len(got.Data().Buffers())) + }) +} + +// TestNumericToStringViewLargePayload is a regression test for the second +// round of GH-184 review feedback: casting a numeric/temporal column with +// enough values to produce >32KB of non-inline formatted output must still +// produce a single-buffer string_view rather than spilling into a second +// overflow block (which would panic in out.TakeOwnership). +func (c *CastSuite) TestNumericToStringViewLargePayload() { + const count = 5000 + + c.Run("int64 to string_view", func() { + bldr := array.NewInt64Builder(c.mem) + defer bldr.Release() + for i := 0; i < count; i++ { + // use large magnitudes so formatted strings are 19-20 bytes (non-inline) + bldr.Append(int64(-9223372036854775000 + int64(i))) + } + in := bldr.NewArray() + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer got.Release() + + c.Equal(count, got.Len()) + c.LessOrEqualf(len(got.Data().Buffers()), 3, + "expected <=3 buffers for string_view output, got %d", + len(got.Data().Buffers())) + }) + + c.Run("timestamp to string_view", func() { + tsType := &arrow.TimestampType{Unit: arrow.Nanosecond, TimeZone: "UTC"} + bldr := array.NewTimestampBuilder(c.mem, tsType) + defer bldr.Release() + for i := 0; i < count; i++ { + bldr.Append(arrow.Timestamp(int64(1e18) + int64(i))) + } + in := bldr.NewArray() + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer got.Release() + + c.Equal(count, got.Len()) + c.LessOrEqualf(len(got.Data().Buffers()), 3, + "expected <=3 buffers for string_view output, got %d", + len(got.Data().Buffers())) + }) + + c.Run("time64 to string_view", func() { + t64Type := &arrow.Time64Type{Unit: arrow.Nanosecond} + bldr := array.NewTime64Builder(c.mem, t64Type) + defer bldr.Release() + for i := 0; i < count; i++ { + bldr.Append(arrow.Time64(int64(86400e9-1) - int64(i))) + } + in := bldr.NewArray() + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer got.Release() + + c.Equal(count, got.Len()) + c.LessOrEqualf(len(got.Data().Buffers()), 3, + "expected <=3 buffers for string_view output, got %d", + len(got.Data().Buffers())) + }) + + // Regression test for GH-184 eighth-round review finding: time32[ms] + // formats to exactly 12 bytes ("HH:MM:SS.sss") which ViewHeader + // stores inline (size <= 12), so the inline-skip in + // reserveFormattedData must treat 12 as inline. This is the + // cast-level smoke test verifying the end-to-end path; the boundary + // itself is exercised in TestReserveFormattedDataInlineViewSkip. + // We additionally assert the output has no overflow data buffer, i.e. + // all values stayed inline in view headers. + c.Run("time32[ms] to string_view stays inline", func() { + t32Type := &arrow.Time32Type{Unit: arrow.Millisecond} + bldr := array.NewTime32Builder(c.mem, t32Type) + defer bldr.Release() + for i := 0; i < count; i++ { + bldr.Append(arrow.Time32(int32(i))) + } + in := bldr.NewArray() + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer got.Release() + + c.Equal(count, got.Len()) + bufs := got.Data().Buffers() + c.LessOrEqualf(len(bufs), 3, + "expected <=3 buffers for string_view output, got %d", len(bufs)) + // All values must be inline; overflow buffer (index 2) should be + // absent or empty. + if len(bufs) >= 3 && bufs[2] != nil { + c.Equalf(0, bufs[2].Len(), + "time32[ms] values are <=12 bytes and must stay inline; "+ + "found %d bytes in overflow buffer", bufs[2].Len()) + } + }) +} + +// TestChunkedMultiBufferViewInputCoalesced is a regression test for the +// third round of GH-184 review feedback: a *ChunkedDatum whose chunks are +// multi-buffer view arrays must also be coalesced by the cast meta function +// before dispatch; otherwise executor.SetMembers on the second chunk panics. +func (c *CastSuite) TestChunkedMultiBufferViewInputCoalesced() { + const ( + perValue = 200 + count = 500 // ~100KB non-inline payload per chunk + ) + chunkStr1 := strings.Repeat("a", perValue) + chunkStr2 := strings.Repeat("b", perValue) + + buildChunk := func(s string) arrow.Array { + b := array.NewStringViewBuilder(c.mem) + defer b.Release() + for i := 0; i < count; i++ { + b.Append(s) + } + return b.NewArray() + } + + chunk1 := buildChunk(chunkStr1) + defer chunk1.Release() + chunk2 := buildChunk(chunkStr2) + defer chunk2.Release() + + c.Greater(len(chunk1.Data().Buffers()), 3, + "test precondition: chunk1 must be multi-buffer") + c.Greater(len(chunk2.Data().Buffers()), 3, + "test precondition: chunk2 must be multi-buffer") + + chunked := arrow.NewChunked(arrow.BinaryTypes.StringView, []arrow.Array{chunk1, chunk2}) + defer chunked.Release() + + in := compute.NewDatum(chunked) + defer in.Release() + + out, err := compute.CallFunction(context.Background(), "cast", + compute.SafeCastOptions(arrow.BinaryTypes.String), in) + c.Require().NoError(err) + defer out.Release() + + chunks := out.(compute.ArrayLikeDatum).Chunks() + total := 0 + for idx, ch := range chunks { + sv := ch.(*array.String) + for i := 0; i < sv.Len(); i++ { + want := chunkStr1 + if idx == 1 { + want = chunkStr2 + } + c.Equalf(want, sv.Value(i), "chunk %d row %d mismatch", idx, i) + } + total += sv.Len() + } + c.Equal(count*2, total) +} + +// buildInt32Dictionary builds an int32-indexed *array.Dictionary from the +// given values array, indices slice, and optional per-index validity. Used +// by view-dictionary cast regression tests. Indices must be valid against +// values; the helper handles retain/release bookkeeping internally. +func (c *CastSuite) buildInt32Dictionary(values arrow.Array, indices []int32, valid []bool) *array.Dictionary { + ibldr := array.NewInt32Builder(c.mem) + defer ibldr.Release() + ibldr.AppendValues(indices, valid) + idx := ibldr.NewArray() + defer idx.Release() + + dictType := &arrow.DictionaryType{ + IndexType: arrow.PrimitiveTypes.Int32, + ValueType: values.DataType(), + } + d := array.NewData(dictType, idx.Len(), idx.Data().Buffers(), nil, + idx.NullN(), 0) + d.SetDictionary(values.Data()) + da := array.NewDictionaryData(d) + d.Release() + return da +} + +// TestViewDictionaryUnpackCast is a regression test for the GH-184 +// fifth-round review finding: a cast whose input is a dictionary whose +// values are string_view or binary_view went through unpackDictionary's +// TakeArray path, which has no view kernel and errored out. Dictionary +// unpacking now handles view-typed values directly. Covers dict +// -> utf8, dict -> string_view (dict removal), dict +// -> binary, and null/empty preservation. +func (c *CastSuite) TestViewDictionaryUnpackCast() { + + c.Run("string_view dict to utf8", func() { + vals := c.buildStringViewArray([]string{"foo", "bar", "baz"}, nil) + defer vals.Release() + dict := c.buildInt32Dictionary(vals, []int32{0, 1, 0, 2}, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.Require().NoError(err) + defer out.Release() + + sv := out.(*array.String) + c.Equal(4, sv.Len()) + c.Equal("foo", sv.Value(0)) + c.Equal("bar", sv.Value(1)) + c.Equal("foo", sv.Value(2)) + c.Equal("baz", sv.Value(3)) + }) + + c.Run("string_view dict to string_view", func() { + vals := c.buildStringViewArray([]string{"alpha", "beta"}, nil) + defer vals.Release() + dict := c.buildInt32Dictionary(vals, []int32{1, 0, 1}, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer out.Release() + + sv := out.(*array.StringView) + c.Equal(3, sv.Len()) + c.Equal("beta", sv.Value(0)) + c.Equal("alpha", sv.Value(1)) + c.Equal("beta", sv.Value(2)) + }) + + c.Run("binary_view dict to binary", func() { + vals := c.buildBinaryViewArray([][]byte{[]byte("\x01\x02"), []byte("\x03")}) + defer vals.Release() + dict := c.buildInt32Dictionary(vals, []int32{0, 1, 0}, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.Binary)) + c.Require().NoError(err) + defer out.Release() + + bv := out.(*array.Binary) + c.Equal(3, bv.Len()) + c.Equal([]byte("\x01\x02"), bv.Value(0)) + c.Equal([]byte("\x03"), bv.Value(1)) + c.Equal([]byte("\x01\x02"), bv.Value(2)) + }) + + c.Run("string_view dict with nulls", func() { + vals := c.buildStringViewArray([]string{"x", "y"}, nil) + defer vals.Release() + dict := c.buildInt32Dictionary(vals, []int32{0, 0, 1}, []bool{true, false, true}) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.Require().NoError(err) + defer out.Release() + + sv := out.(*array.String) + c.Equal(3, sv.Len()) + c.Equal(1, sv.NullN()) + c.False(sv.IsNull(0)) + c.True(sv.IsNull(1)) + c.False(sv.IsNull(2)) + c.Equal("x", sv.Value(0)) + c.Equal("y", sv.Value(2)) + }) + + c.Run("string_view dict with null in values array", func() { + vals := c.buildStringViewArray([]string{"a", "", "c"}, []bool{true, false, true}) + defer vals.Release() + dict := c.buildInt32Dictionary(vals, []int32{0, 1, 2, 1}, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.Require().NoError(err) + defer out.Release() + + sv := out.(*array.String) + c.Equal(4, sv.Len()) + c.Equal(2, sv.NullN()) + c.Equal("a", sv.Value(0)) + c.True(sv.IsNull(1)) + c.Equal("c", sv.Value(2)) + c.True(sv.IsNull(3)) + }) + + c.Run("binary_view dict with null in values array", func() { + bldr := array.NewBinaryViewBuilder(c.mem) + defer bldr.Release() + bldr.Append([]byte{0x01, 0x02}) + bldr.AppendNull() + bldr.Append([]byte{0x03}) + vals := bldr.NewArray() + defer vals.Release() + + dict := c.buildInt32Dictionary(vals, []int32{0, 1, 2, 1}, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.Binary)) + c.Require().NoError(err) + defer out.Release() + + bv := out.(*array.Binary) + c.Equal(4, bv.Len()) + c.Equal(2, bv.NullN()) + c.Equal([]byte{0x01, 0x02}, bv.Value(0)) + c.True(bv.IsNull(1)) + c.Equal([]byte{0x03}, bv.Value(2)) + c.True(bv.IsNull(3)) + }) + + // Regression for GH-184 ninth-round review finding: when the top-level + // input is a dictionary whose *values* are a multi-buffer view array, + // exec.ArraySpan.SetMembers recurses into the dictionary child and + // hits the fixed [3]BufferSpan before unpackViewDictionary ever runs. + // coalesceMultiBufferViewDatum must therefore rebuild the dictionary + // values into a single overflow buffer up front. Build dictionary + // values that are large enough to span multiple overflow buffers via + // the default 32KB block size (two 20,000-byte entries are enough to + // trip the multi-buffer condition). + c.Run("dict to utf8", func() { + const valBytes = 20000 + a := strings.Repeat("a", valBytes) + b := strings.Repeat("b", valBytes) + vbldr := array.NewStringViewBuilder(c.mem) + defer vbldr.Release() + // Force two separate overflow buffers by appending values individually + // through the default builder (no pre-reservation). + vbldr.Append(a) + vbldr.Append(b) + vals := vbldr.NewArray() + defer vals.Release() + c.Greaterf(len(vals.Data().Buffers()), 3, + "test precondition: dictionary values must be multi-buffer; got %d", + len(vals.Data().Buffers())) + + dict := c.buildInt32Dictionary(vals, []int32{0, 1, 0, 1}, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.Require().NoError(err) + defer out.Release() + + sv := out.(*array.String) + c.Equal(4, sv.Len()) + c.Equal(a, sv.Value(0)) + c.Equal(b, sv.Value(1)) + c.Equal(a, sv.Value(2)) + c.Equal(b, sv.Value(3)) + }) + + // Regression for GH-184 tenth-round review finding: exec.ArraySpan + // .SetMembers recurses through ordinary nested children as well as + // dictionary children, so a list (or struct-of-view) + // whose descendant is multi-buffer also panics before dispatch. + // coalesceMultiBufferViewDatum must therefore recurse through + // Children() to rebuild any descendant view array in place. + c.Run("list to list", func() { + const valBytes = 20000 + a := strings.Repeat("a", valBytes) + b := strings.Repeat("b", valBytes) + vbldr := array.NewStringViewBuilder(c.mem) + defer vbldr.Release() + vbldr.Append(a) + vbldr.Append(b) + vbldr.Append(a) + vbldr.Append(b) + values := vbldr.NewArray() + defer values.Release() + c.Greaterf(len(values.Data().Buffers()), 3, + "test precondition: list values must be multi-buffer; got %d", + len(values.Data().Buffers())) + + // Build list with two rows: [a, b] and [a, b]. + listType := arrow.ListOf(arrow.BinaryTypes.StringView) + offsetsBuf := memory.NewBufferBytes([]byte{ + 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, + }) + listData := array.NewData(listType, 2, + []*memory.Buffer{nil, offsetsBuf}, + []arrow.ArrayData{values.Data()}, 0, 0) + defer listData.Release() + listArr := array.MakeFromData(listData) + defer listArr.Release() + + out, err := compute.CastArray(context.Background(), listArr, + compute.SafeCastOptions(arrow.ListOf(arrow.BinaryTypes.String))) + c.Require().NoError(err) + defer out.Release() + + got := out.(*array.List) + c.Equal(2, got.Len()) + child := got.ListValues().(*array.String) + c.Equal(4, child.Len()) + c.Equal(a, child.Value(0)) + c.Equal(b, child.Value(1)) + c.Equal(a, child.Value(2)) + c.Equal(b, child.Value(3)) + }) + + // Regression for GH-184 eleventh-round review finding: extension + // arrays reuse their storage buffers/children in place, so an + // extension over a multi-buffer view hit exec.ArraySpan.SetMembers + // before CastFromExtension unwrapped the storage. coalesce must + // therefore traverse extensions through their StorageType and + // re-wrap after rebuilding so the extension datatype survives. + c.Run("extension to utf8", func() { + extType := types.NewStringViewExtType() + arrow.RegisterExtensionType(extType) + defer arrow.UnregisterExtensionType("string_view_ext") + + const valBytes = 20000 + a := strings.Repeat("a", valBytes) + b := strings.Repeat("b", valBytes) + vbldr := array.NewStringViewBuilder(c.mem) + defer vbldr.Release() + vbldr.Append(a) + vbldr.Append(b) + vbldr.Append(a) + storage := vbldr.NewArray() + defer storage.Release() + c.Greaterf(len(storage.Data().Buffers()), 3, + "test precondition: extension storage must be multi-buffer; got %d", + len(storage.Data().Buffers())) + + extData := array.NewData(extType, storage.Len(), storage.Data().Buffers(), + storage.Data().Children(), storage.NullN(), storage.Data().Offset()) + defer extData.Release() + extArr := array.MakeFromData(extData) + defer extArr.Release() + + out, err := compute.CastArray(context.Background(), extArr, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.Require().NoError(err) + defer out.Release() + + sv := out.(*array.String) + c.Equal(3, sv.Len()) + c.Equal(a, sv.Value(0)) + c.Equal(b, sv.Value(1)) + c.Equal(a, sv.Value(2)) + }) + + c.Run("string_view dict to string_view with large non-inline payload", func() { + const ( + valBytes = 200 + count = 500 + ) + a := strings.Repeat("a", valBytes) + b := strings.Repeat("b", valBytes) + vals := c.buildStringViewArray([]string{a, b}, nil) + defer vals.Release() + + indices := make([]int32, count) + for i := range indices { + indices[i] = int32(i & 1) + } + dict := c.buildInt32Dictionary(vals, indices, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer out.Release() + + c.LessOrEqualf(len(out.Data().Buffers()), 3, + "expected <=3 buffers for unpacked view dictionary, got %d", + len(out.Data().Buffers())) + + sv := out.(*array.StringView) + c.Equal(count, sv.Len()) + c.Equal(a, sv.Value(0)) + c.Equal(b, sv.Value(1)) + c.Equal(a, sv.Value(count-2)) + c.Equal(b, sv.Value(count-1)) + }) + + c.Run("binary_view dict to binary_view with large non-inline payload", func() { + const ( + valBytes = 200 + count = 500 + ) + a := bytes.Repeat([]byte{0xAA}, valBytes) + b := bytes.Repeat([]byte{0xBB}, valBytes) + vals := c.buildBinaryViewArray([][]byte{a, b}) + defer vals.Release() + + indices := make([]int32, count) + for i := range indices { + indices[i] = int32(i & 1) + } + dict := c.buildInt32Dictionary(vals, indices, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.BinaryView)) + c.Require().NoError(err) + defer out.Release() + + c.LessOrEqualf(len(out.Data().Buffers()), 3, + "expected <=3 buffers for unpacked binary_view dictionary, got %d", + len(out.Data().Buffers())) + + bv := out.(*array.BinaryView) + c.Equal(count, bv.Len()) + c.Equal(a, bv.Value(0)) + c.Equal(b, bv.Value(1)) + c.Equal(a, bv.Value(count-2)) + c.Equal(b, bv.Value(count-1)) + }) +} + +// TestBoolToStringViewStaysSingleBuffer is a regression test for the +// GH-184 sixth-round review finding: boolToStringCastExec joined the +// numeric/temporal-to-string kernels without the matching pre-reservation +// path, so large bool -> string_view casts could allocate multiple +// overflow data buffers and exceed exec.ArraySpan's fixed [3]BufferSpan. +// The kernel now counts exact formatted bytes ("true"=4, "false"=5) and +// pre-reserves that amount on the builder, so the output always has +// <= 3 buffers and all-true casts near the int32 limit are not +// over-rejected relative to the tight bound. +func (c *CastSuite) TestBoolToStringViewStaysSingleBuffer() { + const count = 1 << 14 + + check := func(name string, pick func(i int) bool, wantBytes int64) { + c.Run(name, func() { + bldr := array.NewBooleanBuilder(c.mem) + defer bldr.Release() + for i := 0; i < count; i++ { + bldr.Append(pick(i)) + } + in := bldr.NewArray() + defer in.Release() + + out, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.Require().NoError(err) + defer out.Release() + + c.Equal(count, out.Len()) + // Data buffer (index 2 for string) holds exactly the formatted + // bytes, validating the exact-count payload logic. + dataBuf := out.Data().Buffers()[2] + c.Require().NotNil(dataBuf) + c.Equalf(wantBytes, int64(dataBuf.Len()), + "%s: data buffer size should equal exact formatted payload", + name) + }) + } + check("all true", func(int) bool { return true }, 4*count) + check("all false", func(int) bool { return false }, 5*count) + check("alternating", func(i int) bool { return i%2 == 0 }, 4*(count/2)+5*(count/2)) + + c.Run("string_view stays within 3 buffers", func() { + bldr := array.NewBooleanBuilder(c.mem) + defer bldr.Release() + for i := 0; i < count; i++ { + bldr.Append(i%2 == 0) + } + in := bldr.NewArray() + defer in.Release() + + out, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer out.Release() + + c.LessOrEqualf(len(out.Data().Buffers()), 3, + "expected <=3 buffers for bool->string_view, got %d", + len(out.Data().Buffers())) + c.Equal(count, out.Len()) + }) +} + +// checkCastArrayOnly performs a cast and compares the resulting array against +// an expected array built from JSON. It deliberately avoids the per-element +// scalar iteration path used by checkCast, which relies on scalar.GetScalar +// - that helper does not yet support view types (unrelated to GH-184). +func (c *CastSuite) checkCastArrayOnly(dtIn, dtOut arrow.DataType, inJSON, outJSON string) { + inArr, _, err := array.FromJSON(c.mem, dtIn, strings.NewReader(inJSON), array.WithUseNumber()) + c.Require().NoError(err) + defer inArr.Release() + + expArr, _, err := array.FromJSON(c.mem, dtOut, strings.NewReader(outJSON), array.WithUseNumber()) + c.Require().NoError(err) + defer expArr.Release() + + opts := compute.SafeCastOptions(dtOut) + got, err := compute.CastArray(context.Background(), inArr, opts) + c.Require().NoError(err) + defer got.Release() + + c.Truef(array.Equal(expArr, got), "cast %s -> %s: expected %s, got %s", + dtIn, dtOut, expArr, got) +} + +func (c *CastSuite) buildBinaryViewArray(values [][]byte) arrow.Array { + bldr := array.NewBinaryViewBuilder(c.mem) + defer bldr.Release() + for _, v := range values { + if v == nil { + bldr.AppendNull() + continue + } + bldr.Append(v) + } + return bldr.NewArray() +} + +func (c *CastSuite) buildStringViewArray(values []string, valid []bool) arrow.Array { + bldr := array.NewStringViewBuilder(c.mem) + defer bldr.Release() + bldr.AppendValues(values, valid) + return bldr.NewArray() +} + func (c *CastSuite) TestStringToInt() { for _, stype := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { for _, dt := range signedIntTypes { diff --git a/arrow/compute/exec/span.go b/arrow/compute/exec/span.go index a235d42b..be1a10a7 100644 --- a/arrow/compute/exec/span.go +++ b/arrow/compute/exec/span.go @@ -579,6 +579,13 @@ func getNumBuffers(dt arrow.DataType) int { return 1 case arrow.BINARY, arrow.LARGE_BINARY, arrow.STRING, arrow.LARGE_STRING, arrow.DENSE_UNION: return 3 + case arrow.BINARY_VIEW, arrow.STRING_VIEW: + // bitmap + view-header buffer + a single overflow data buffer. + // ArraySpan's fixed buffer array limits additional data buffers, + // so callers producing multi-buffer views must keep their data + // within a single block (the default 32KB allocation in the + // builder is sufficient for most use cases). + return 3 case arrow.EXTENSION: return getNumBuffers(dt.(arrow.ExtensionType).StorageType()) default: diff --git a/arrow/compute/internal/kernels/binary_view_casts.go b/arrow/compute/internal/kernels/binary_view_casts.go new file mode 100644 index 00000000..d3cb4dbe --- /dev/null +++ b/arrow/compute/internal/kernels/binary_view_casts.go @@ -0,0 +1,266 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.18 + +package kernels + +import ( + "fmt" + "math" + "unsafe" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/compute/exec" +) + +func validateUtf8View(input *exec.ArraySpan) error { + views := arrow.ViewHeaderTraits.CastFromBytes(input.Buffers[1].Buf) + dataBuffers := make([][]byte, 0, len(input.Buffers)-2) + for i := 2; i < len(input.Buffers); i++ { + dataBuffers = append(dataBuffers, input.Buffers[i].Buf) + } + return validateUTF8Sequence(input.Buffers[0].Buf, input.Offset, input.Len, + func(pos int64) []byte { + h := &views[input.Offset+pos] + if h.IsInline() { + return h.InlineBytes() + } + off := h.BufferOffset() + return dataBuffers[h.BufferIndex()][off : off+int32(h.Len())] + }) +} + +func unsafeStringBytes(s string) []byte { + if len(s) == 0 { + return nil + } + return unsafe.Slice(unsafe.StringData(s), len(s)) +} + +type binaryAppender struct { + bldr array.Builder + appendBytes func([]byte) + reserveData func(int) +} + +func newBinaryAppender(bldr array.Builder) (binaryAppender, error) { + switch b := bldr.(type) { + case *array.BinaryBuilder: + return binaryAppender{bldr: b, appendBytes: b.Append, reserveData: b.ReserveData}, nil + case *array.StringBuilder: + return binaryAppender{bldr: b, appendBytes: b.BinaryBuilder.Append, reserveData: b.ReserveData}, nil + case *array.LargeStringBuilder: + return binaryAppender{bldr: b, appendBytes: b.BinaryBuilder.Append, reserveData: b.ReserveData}, nil + case *array.BinaryViewBuilder: + return binaryAppender{bldr: b, appendBytes: b.Append, reserveData: b.ReserveData}, nil + case *array.StringViewBuilder: + return binaryAppender{bldr: b, appendBytes: b.BinaryViewBuilder.Append, reserveData: b.ReserveData}, nil + default: + return binaryAppender{}, fmt.Errorf("%w: unsupported builder type %T for binary-like output", + arrow.ErrNotImplemented, bldr) + } +} + +// binaryLikeValueAccessor returns a per-index byte slice accessor for any +// binary-like arrow array. Shared by CastBinaryToBinaryView and +// CastBinaryViewToBinary so the input-type switch has one source of truth. +func binaryLikeValueAccessor(arr arrow.Array) (func(int) []byte, error) { + switch a := arr.(type) { + case *array.Binary: + return a.Value, nil + case *array.LargeBinary: + return a.Value, nil + case *array.String: + return func(i int) []byte { return unsafeStringBytes(a.Value(i)) }, nil + case *array.LargeString: + return func(i int) []byte { return unsafeStringBytes(a.Value(i)) }, nil + case *array.FixedSizeBinary: + return a.Value, nil + case *array.BinaryView: + return a.Value, nil + case *array.StringView: + return func(i int) []byte { return unsafeStringBytes(a.Value(i)) }, nil + default: + return nil, fmt.Errorf("%w: unsupported binary-like type: %s", + arrow.ErrNotImplemented, arr.DataType()) + } +} + +// appendBinaryValues drives the null-preserving append loop shared by +// both directions of the binary<->view cast kernels. +func appendBinaryValues(arr arrow.Array, getVal func(int) []byte, ba binaryAppender) { + for i := 0; i < arr.Len(); i++ { + if arr.IsNull(i) { + ba.bldr.AppendNull() + continue + } + ba.appendBytes(getVal(i)) + } +} + +// CastBinaryToBinaryView casts a Binary, LargeBinary, String, LargeString, +// or FixedSizeBinary array into a BinaryView or StringView array. When the +// source is a non-utf8 binary type and the destination is a utf8 view type, +// every non-null element is validated as UTF-8 unless +// CastOptions.AllowInvalidUtf8 is set. +func CastBinaryToBinaryView(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { + opts := ctx.State.(CastState) + input := &batch.Values[0].Array + outputType := out.Type.(arrow.BinaryDataType) + + inputIsUtf8 := false + if b, ok := input.Type.(arrow.BinaryDataType); ok { + inputIsUtf8 = b.IsUtf8() + } + + if !inputIsUtf8 && outputType.IsUtf8() && !opts.AllowInvalidUtf8 { + switch input.Type.ID() { + case arrow.BINARY: + if err := validateUtf8[int32](input); err != nil { + return err + } + case arrow.LARGE_BINARY: + if err := validateUtf8[int64](input); err != nil { + return err + } + case arrow.FIXED_SIZE_BINARY: + if err := validateUtf8Fsb(input); err != nil { + return err + } + } + } + + rawBldr := array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type) + defer rawBldr.Release() + rawBldr.Reserve(int(input.Len)) + + ba, err := newBinaryAppender(rawBldr) + if err != nil { + return err + } + + arr := input.MakeArray() + defer arr.Release() + + getVal, err := binaryLikeValueAccessor(arr) + if err != nil { + return fmt.Errorf("%w: unsupported input type for cast to %s: %s", + arrow.ErrNotImplemented, out.Type, input.Type) + } + + // Pre-size the out-of-line data buffer to the total required capacity so + // the builder allocates a single overflow block. ArraySpan only has three + // buffer slots (bitmap + view headers + one data buffer); if we let the + // builder spill into a second block, TakeOwnership would index past + // Buffers[2] and panic. See the review feedback on GH-184. + var outOfLineTotal int64 + for i := 0; i < arr.Len(); i++ { + if arr.IsNull(i) { + continue + } + vlen := len(getVal(i)) + if !arrow.IsViewInline(vlen) { + outOfLineTotal += int64(vlen) + } + } + if outOfLineTotal > math.MaxInt32 { + return fmt.Errorf("%w: cast from %s to %s: out-of-line payload (%d bytes) exceeds single view data buffer limit (%d bytes)", + arrow.ErrInvalid, input.Type, out.Type, outOfLineTotal, math.MaxInt32) + } + if outOfLineTotal > 0 { + ba.reserveData(int(outOfLineTotal)) + } + + appendBinaryValues(arr, getVal, ba) + + result := ba.bldr.NewArray() + out.TakeOwnership(result.Data()) + return nil +} + +// CastBinaryViewToBinary casts a BinaryView or StringView array into a +// Binary, LargeBinary, String, or LargeString array, materializing the +// referenced byte ranges into a single contiguous data buffer. UTF-8 +// validation is performed when casting from a non-utf8 view into a utf8 +// destination unless CastOptions.AllowInvalidUtf8 is set. +func CastBinaryViewToBinary[OutOffsetT int32 | int64](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { + opts := ctx.State.(CastState) + input := &batch.Values[0].Array + inputType := input.Type.(arrow.BinaryDataType) + outputType := out.Type.(arrow.BinaryDataType) + + if !inputType.IsUtf8() && outputType.IsUtf8() && !opts.AllowInvalidUtf8 { + if err := validateUtf8View(input); err != nil { + return err + } + } + + rawBldr := array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type) + defer rawBldr.Release() + rawBldr.Reserve(int(input.Len)) + + ba, err := newBinaryAppender(rawBldr) + if err != nil { + return err + } + + arr := input.MakeArray() + defer arr.Release() + + getVal, err := binaryLikeValueAccessor(arr) + if err != nil { + return fmt.Errorf("%w: unsupported input type for view-to-binary cast: %s", + arrow.ErrNotImplemented, input.Type) + } + + var totalBytes int64 + for i := 0; i < arr.Len(); i++ { + if !arr.IsNull(i) { + totalBytes += int64(len(getVal(i))) + } + } + if totalBytes > int64(MaxOf[OutOffsetT]()) { + return fmt.Errorf("%w: failed casting from %s to %s: input array too large", + arrow.ErrInvalid, input.Type, out.Type) + } + + appendBinaryValues(arr, getVal, ba) + + result := ba.bldr.NewArray() + out.TakeOwnership(result.Data()) + return nil +} + +// CastBinaryViewToBinaryView handles casts between BinaryView and StringView +// (and same-type view casts). The cast is zero-copy; UTF-8 validation is +// performed when casting from a binary_view into a string_view unless +// CastOptions.AllowInvalidUtf8 is set. +func CastBinaryViewToBinaryView(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { + opts := ctx.State.(CastState) + input := &batch.Values[0].Array + inputType := input.Type.(arrow.BinaryDataType) + outputType := out.Type.(arrow.BinaryDataType) + + if !inputType.IsUtf8() && outputType.IsUtf8() && !opts.AllowInvalidUtf8 { + if err := validateUtf8View(input); err != nil { + return err + } + } + + return ZeroCopyCastExec(ctx, batch, out) +} diff --git a/arrow/compute/internal/kernels/string_casts.go b/arrow/compute/internal/kernels/string_casts.go index 1f4b2218..ec49ca3a 100644 --- a/arrow/compute/internal/kernels/string_casts.go +++ b/arrow/compute/internal/kernels/string_casts.go @@ -20,6 +20,7 @@ package kernels import ( "fmt" + "math" "strconv" "unicode/utf8" @@ -31,40 +32,42 @@ import ( "github.com/apache/arrow-go/v18/internal/bitutils" ) +// validateUTF8Sequence walks non-null positions in bitmap and validates +// the byte slice returned by valueAt against utf8.Valid. Shared skeleton +// for validateUtf8, validateUtf8Fsb, and validateUtf8View so they stay +// consistent on iteration, error format, and null handling. +func validateUTF8Sequence(bitmap []byte, off, n int64, valueAt func(pos int64) []byte) error { + return bitutils.VisitBitBlocksShort(bitmap, off, n, + func(pos int64) error { + v := valueAt(pos) + if !utf8.Valid(v) { + return fmt.Errorf("%w: invalid UTF8 bytes: %x", arrow.ErrInvalid, v) + } + return nil + }, func() error { return nil }) +} + func validateUtf8Fsb(input *exec.ArraySpan) error { var ( inputData = input.Buffers[1].Buf width = int64(input.Type.(*arrow.FixedSizeBinaryType).ByteWidth) - bitmap = input.Buffers[0].Buf ) - - return bitutils.VisitBitBlocksShort(bitmap, input.Offset, input.Len, - func(pos int64) error { + return validateUTF8Sequence(input.Buffers[0].Buf, input.Offset, input.Len, + func(pos int64) []byte { pos += input.Offset - beg := pos * width - end := (pos + 1) * width - if !utf8.Valid(inputData[beg:end]) { - return fmt.Errorf("%w: invalid UTF8 bytes: %x", arrow.ErrInvalid, inputData[beg:end]) - } - return nil - }, func() error { return nil }) + return inputData[pos*width : (pos+1)*width] + }) } func validateUtf8[OffsetT int32 | int64](input *exec.ArraySpan) error { var ( inputOffsets = exec.GetSpanOffsets[OffsetT](input, 1) inputData = input.Buffers[2].Buf - bitmap = input.Buffers[0].Buf ) - - return bitutils.VisitBitBlocksShort(bitmap, input.Offset, input.Len, - func(pos int64) error { - v := inputData[inputOffsets[pos]:inputOffsets[pos+1]] - if !utf8.Valid(v) { - return fmt.Errorf("%w: invalid UTF8 bytes: %x", arrow.ErrInvalid, v) - } - return nil - }, func() error { return nil }) + return validateUTF8Sequence(input.Buffers[0].Buf, input.Offset, input.Len, + func(pos int64) []byte { + return inputData[inputOffsets[pos]:inputOffsets[pos+1]] + }) } func CastFsbToFsb(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { @@ -178,17 +181,55 @@ func addBinaryToBinaryCast[InOffsetT, OutOffsetT int32 | int64](inType arrow.Typ outType, CastBinaryToBinary[InOffsetT, OutOffsetT], nil) } +func addViewToBinaryCast[OutOffsetT int32 | int64](inType arrow.Type, outType exec.OutputType) exec.ScalarKernel { + k := exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(inType)}, + outType, CastBinaryViewToBinary[OutOffsetT], nil) + k.NullHandling = exec.NullComputedNoPrealloc + k.MemAlloc = exec.MemNoPrealloc + return k +} + +func addBinaryToBinaryViewCast(inType arrow.Type, outType exec.OutputType) exec.ScalarKernel { + k := exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(inType)}, + outType, CastBinaryToBinaryView, nil) + k.NullHandling = exec.NullComputedNoPrealloc + k.MemAlloc = exec.MemNoPrealloc + return k +} + +func addViewToViewCast(inType arrow.Type, outType exec.OutputType) exec.ScalarKernel { + k := exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(inType)}, + outType, CastBinaryViewToBinaryView, nil) + k.NullHandling = exec.NullComputedNoPrealloc + k.MemAlloc = exec.MemNoPrealloc + return k +} + func addToBinaryKernels[OffsetsT int32 | int64](outType exec.OutputType, kernels []exec.ScalarKernel) []exec.ScalarKernel { return append(kernels, addBinaryToBinaryCast[int32, OffsetsT](arrow.STRING, outType), addBinaryToBinaryCast[int32, OffsetsT](arrow.BINARY, outType), addBinaryToBinaryCast[int64, OffsetsT](arrow.LARGE_STRING, outType), addBinaryToBinaryCast[int64, OffsetsT](arrow.LARGE_BINARY, outType), + addViewToBinaryCast[OffsetsT](arrow.BINARY_VIEW, outType), + addViewToBinaryCast[OffsetsT](arrow.STRING_VIEW, outType), exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(arrow.FIXED_SIZE_BINARY)}, outType, CastFsbToBinary[OffsetsT], nil), ) } +func addToBinaryViewKernels(outType exec.OutputType, kernels []exec.ScalarKernel) []exec.ScalarKernel { + return append(kernels, + addBinaryToBinaryViewCast(arrow.STRING, outType), + addBinaryToBinaryViewCast(arrow.BINARY, outType), + addBinaryToBinaryViewCast(arrow.LARGE_STRING, outType), + addBinaryToBinaryViewCast(arrow.LARGE_BINARY, outType), + addBinaryToBinaryViewCast(arrow.FIXED_SIZE_BINARY, outType), + addViewToViewCast(arrow.BINARY_VIEW, outType), + addViewToViewCast(arrow.STRING_VIEW, outType), + ) +} + func GetFsbCastKernels() []exec.ScalarKernel { outputType := exec.NewComputedOutputType(resolveOutputFromOptions) out := GetCommonCastKernels(arrow.FIXED_SIZE_BINARY, outputType) @@ -213,6 +254,27 @@ func boolToStringCastExec(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.E ) defer bldr.Release() + bldr.Reserve(int(input.Len)) + // "true" (4 bytes) and "false" (5 bytes) are both inline for view + // builders; they consume no overflow data, so skip reservation and the + // limit check. For offset-based builders, count exact formatted bytes + // (tighter than 5*nonNull and avoids rejecting valid large all-true + // casts near the int32 limit). + if !isViewBuilder(bldr) { + var total int64 + bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, + func(pos int64) { + if bitutil.BitIsSet(input.Buffers[1].Buf, int(pos)) { + total += 4 + } else { + total += 5 + } + }, func() {}) + if err := reserveFormattedDataExact(bldr, total); err != nil { + return err + } + } + bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, func(pos int64) { bldr.Append(strconv.FormatBool(bitutil.BitIsSet(input.Buffers[1].Buf, int(pos)))) @@ -237,6 +299,11 @@ func timeToStringCastExec[T timeIntrinsic](ctx *exec.KernelCtx, batch *exec.Exec ) defer bldr.Release() + bldr.Reserve(int(input.Len)) + if err := reserveFormattedData(bldr, input, maxFormattedBytes(input.Type)); err != nil { + return err + } + bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, func(pos int64) { bldr.Append(inputData[pos].FormattedString(inputType.TimeUnit())) @@ -256,6 +323,11 @@ func numericToStringCastExec[T arrow.IntType | arrow.UintType | arrow.FloatType] ) defer bldr.Release() + bldr.Reserve(int(input.Len)) + if err := reserveFormattedData(bldr, input, maxFormattedBytes(input.Type)); err != nil { + return err + } + bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, func(pos int64) { bldr.Append(formatter(inputData[pos])) @@ -267,6 +339,109 @@ func numericToStringCastExec[T arrow.IntType | arrow.UintType | arrow.FloatType] } } +// reserveFormattedData pre-reserves bldr's data buffer for at most +// (non-null count) * perValueBytes formatted bytes. Returns arrow.ErrInvalid +// when the product exceeds the destination builder's single-buffer limit, +// avoiding a panic inside BinaryViewBuilder.ReserveData or an int32 offset +// overflow in StringBuilder. For view builders whose per-value upper bound +// fits inline (arrow.IsViewInline), all values are stored inside view +// headers and consume no overflow data, so reservation and the limit +// check are skipped. See GH-184 review feedback. +func reserveFormattedData(bldr array.StringLikeBuilder, input *exec.ArraySpan, perValueBytes int) error { + if perValueBytes <= 0 { + return nil + } + if isViewBuilder(bldr) && arrow.IsViewInline(perValueBytes) { + return nil + } + total := int64(input.Len-input.Nulls) * int64(perValueBytes) + return reserveFormattedDataExact(bldr, total) +} + +// reserveFormattedDataExact reserves exactly total bytes of out-of-line +// payload on bldr, returning arrow.ErrInvalid if total exceeds the +// destination builder's single-buffer limit. Use this when the kernel can +// compute the exact payload in advance; use reserveFormattedData with a +// per-value upper bound otherwise. +func reserveFormattedDataExact(bldr array.StringLikeBuilder, total int64) error { + limit := formattedDataLimit(bldr) + if total > limit { + return fmt.Errorf("%w: formatted cast payload (%d bytes) exceeds single data buffer limit (%d bytes) for destination builder", + arrow.ErrInvalid, total, limit) + } + if total > 0 { + bldr.ReserveData(int(total)) + } + return nil +} + +// isViewBuilder reports whether bldr writes into a view-typed array whose +// out-of-line data lives in a single overflow buffer subject to the +// view_value_size_limit. Only *array.StringViewBuilder satisfies the +// StringLikeBuilder interface (BinaryViewBuilder's Append takes []byte, +// not string), so this is effectively a StringView check, kept under a +// clearer name so the inline-skip callers read as "view builder" intent. +func isViewBuilder(bldr array.StringLikeBuilder) bool { + _, ok := bldr.(*array.StringViewBuilder) + return ok +} + +// formattedDataLimit returns the largest contiguous data payload (in bytes) +// that the destination builder can hold in a single buffer, clamped so that +// reserveFormattedData's int(total) conversion cannot overflow: +// - *array.StringBuilder (utf8): MaxInt32 (int32 offsets) +// - *array.StringViewBuilder (string_view): MaxInt32 (single overflow buffer) +// - *array.LargeStringBuilder (large_utf8): MaxInt64 (int64 offsets), +// clamped to int64(math.MaxInt) so int(total) fits on 32-bit builds +// where int is 32-bit. +func formattedDataLimit(bldr array.StringLikeBuilder) int64 { + limit := int64(math.MaxInt32) + if _, ok := bldr.(*array.LargeStringBuilder); ok { + limit = math.MaxInt64 + } + if limit > int64(math.MaxInt) { + limit = int64(math.MaxInt) + } + return limit +} + +// maxFormattedBytes returns an upper bound on the textual representation +// of a single value of dt used by the numeric/temporal-to-string kernels. +// The bound is used to pre-reserve the builder's data buffer so StringView +// outputs stay within a single overflow block (compute's ArraySpan can +// carry only one view data buffer). See GH-184 review feedback. +func maxFormattedBytes(dt arrow.DataType) int { + switch dt.ID() { + case arrow.INT8, arrow.UINT8: + return 4 // "-128" / "255" + case arrow.INT16: + return 6 // "-32768" + case arrow.UINT16: + return 5 // "65535" + case arrow.INT32: + return 11 // "-2147483648" + case arrow.UINT32: + return 10 // "4294967295" + case arrow.INT64, arrow.UINT64: + return 20 // "-9223372036854775808" + case arrow.FLOAT16: + return 16 // empirical max from strconv.FormatFloat(32): "-0.000100016594" is 15 + case arrow.FLOAT32: + return 25 // scientific notation upper bound + case arrow.FLOAT64: + return 32 // scientific notation upper bound + case arrow.DATE32, arrow.DATE64: + return 10 // "YYYY-MM-DD" + case arrow.TIME32: + return 12 // "HH:MM:SS.sss" + case arrow.TIME64: + return 18 // "HH:MM:SS.nnnnnnnnn" + case arrow.TIMESTAMP: + return 35 // date + time with ns precision + short tz offset + } + return 0 +} + func castTimestampToString(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { var ( input = &batch.Values[0].Array @@ -302,7 +477,9 @@ func castTimestampToString(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec. strlen := len(fmtstring) bldr.Reserve(int(input.Len)) - bldr.ReserveData(int(input.Len-input.Nulls) * strlen) + if err := reserveFormattedData(bldr, input, strlen); err != nil { + return err + } bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, func(pos int64) { @@ -404,6 +581,11 @@ func GetToBinaryKernels(outType arrow.DataType) []exec.ScalarKernel { case arrow.LARGE_STRING: out = addToBinaryKernels[int64](outputType, out) return addNumericAndTemporalToStringCasts(outputType, out) + case arrow.BINARY_VIEW: + return addToBinaryViewKernels(outputType, out) + case arrow.STRING_VIEW: + out = addToBinaryViewKernels(outputType, out) + return addNumericAndTemporalToStringCasts(outputType, out) } return nil } diff --git a/arrow/compute/internal/kernels/string_casts_test.go b/arrow/compute/internal/kernels/string_casts_test.go new file mode 100644 index 00000000..a31dedf1 --- /dev/null +++ b/arrow/compute/internal/kernels/string_casts_test.go @@ -0,0 +1,198 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.18 + +package kernels + +import ( + "errors" + "math" + "testing" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/compute/exec" + "github.com/apache/arrow-go/v18/arrow/memory" +) + +// TestReserveFormattedDataOverflowGuard verifies that reserveFormattedData +// returns arrow.ErrInvalid (rather than panicking inside the builder's +// ReserveData) when (Len - Nulls) * perValueBytes would overflow int32. +// Tested at the kernel level so we don't need to drive >2 GB of input +// through the full executor to trigger the guard. See GH-184. +func TestReserveFormattedDataOverflowGuard(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + bldr := array.NewStringViewBuilder(mem) + defer bldr.Release() + + span := &exec.ArraySpan{ + Len: int64(math.MaxInt32/20) + 1, + Nulls: 0, + } + + err := reserveFormattedData(bldr, span, 20) + if err == nil { + t.Fatal("expected overflow guard to return an error") + } + if !errors.Is(err, arrow.ErrInvalid) { + t.Fatalf("expected arrow.ErrInvalid, got %v", err) + } +} + +// TestFormattedDataLimitPerBuilder verifies the per-destination single-buffer +// limit used by reserveFormattedData: utf8 and string_view are capped at +// MaxInt32 (int32 offsets / single overflow buffer), while large_utf8 is +// capped at min(MaxInt64, platform int max). On 64-bit builds the large_utf8 +// limit is MaxInt64; on 32-bit builds it is clamped to MaxInt32 so the +// int(total) conversion in reserveFormattedData never overflows. Regression +// test for GH-184: the original MaxInt32 guard incorrectly rejected valid +// large_utf8 casts above 2 GiB on 64-bit platforms. +func TestFormattedDataLimitPerBuilder(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + utf8Bldr := array.NewStringBuilder(mem) + defer utf8Bldr.Release() + viewBldr := array.NewStringViewBuilder(mem) + defer viewBldr.Release() + largeBldr := array.NewLargeStringBuilder(mem) + defer largeBldr.Release() + + largeWant := int64(math.MaxInt64) + if largeWant > int64(math.MaxInt) { + largeWant = int64(math.MaxInt) + } + + cases := []struct { + name string + bldr array.StringLikeBuilder + want int64 + }{ + {"utf8", utf8Bldr, math.MaxInt32}, + {"string_view", viewBldr, math.MaxInt32}, + {"large_utf8", largeBldr, largeWant}, + } + for _, tc := range cases { + if got := formattedDataLimit(tc.bldr); got != tc.want { + t.Errorf("formattedDataLimit(%s): want %d, got %d", tc.name, tc.want, got) + } + } +} + +// TestFormattedDataLimitFitsInPlatformInt is the regression test for the +// GH-184 third-round review finding: reserveFormattedData passes int(total) +// to ReserveData, so the returned limit must fit in the platform int to +// prevent overflow on 32-bit builds where int is 32-bit. The assertion +// holds on both 32- and 64-bit builds by construction. +func TestFormattedDataLimitFitsInPlatformInt(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + bldrs := []struct { + name string + bldr array.StringLikeBuilder + }{ + {"utf8", array.NewStringBuilder(mem)}, + {"string_view", array.NewStringViewBuilder(mem)}, + {"large_utf8", array.NewLargeStringBuilder(mem)}, + } + for _, tc := range bldrs { + defer tc.bldr.Release() + if limit := formattedDataLimit(tc.bldr); limit > int64(math.MaxInt) { + t.Errorf("formattedDataLimit(%s) = %d exceeds platform math.MaxInt %d", + tc.name, limit, int64(math.MaxInt)) + } + } +} + +// TestReserveFormattedDataAllowsLargeUtf8AboveInt32 is a platform-aware +// regression test for the GH-184 second-round review finding: on 64-bit +// builds, reserveFormattedData must not reject large_utf8 totals above +// MaxInt32 (int64 offsets can represent them); on 32-bit builds, the +// destination-specific limit is instead clamped to math.MaxInt32 so +// int(total) cannot overflow in reserveFormattedData. We assert the +// destination-specific limit via formattedDataLimit rather than driving +// a multi-gigabyte allocation through the full kernel path, and +// separately confirm the guard still fires for string_view at the +// int32 boundary. +func TestReserveFormattedDataAllowsLargeUtf8AboveInt32(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + largeBldr := array.NewLargeStringBuilder(mem) + defer largeBldr.Release() + + got := formattedDataLimit(largeBldr) + if int64(math.MaxInt) > math.MaxInt32 { + if got <= math.MaxInt32 { + t.Fatalf("64-bit: large_utf8 destination limit must exceed MaxInt32, got %d", got) + } + } else { + if got != math.MaxInt32 { + t.Fatalf("32-bit: large_utf8 destination limit must clamp to MaxInt32, got %d", got) + } + } + + viewBldr := array.NewStringViewBuilder(mem) + defer viewBldr.Release() + span := &exec.ArraySpan{ + Len: int64(math.MaxInt32/20) + 1, + Nulls: 0, + } + if err := reserveFormattedData(viewBldr, span, 20); !errors.Is(err, arrow.ErrInvalid) { + t.Fatalf("string_view guard should reject total > MaxInt32, got %v", err) + } +} + +// TestReserveFormattedDataInlineViewSkip is the regression test for the +// GH-184 seventh/eighth-round review findings: for view builders, values +// whose per-value upper bound fits inline (arrow.IsViewInline, <= 12) +// are stored inside view headers and never consume overflow data. +// reserveFormattedData must skip both the reservation and the +// single-buffer limit check in that case, so large casts with +// inline-bounded values (bool, int8..int32, date32, time32[ms], etc.) +// are not rejected against the overflow-buffer limit they never use. +func TestReserveFormattedDataInlineViewSkip(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + viewBldr := array.NewStringViewBuilder(mem) + defer viewBldr.Release() + + span := &exec.ArraySpan{Len: math.MaxInt32, Nulls: 0} + + if err := reserveFormattedData(viewBldr, span, 5); err != nil { + t.Fatalf("string_view with inline-bounded perValueBytes=5 must not error: %v", err) + } + if err := reserveFormattedData(viewBldr, span, 12); err != nil { + t.Fatalf("string_view with boundary perValueBytes=12 must not error (inline per arrow.IsViewInline): %v", err) + } + // Above the inline boundary (12) the overflow-buffer limit is enforced. + if err := reserveFormattedData(viewBldr, span, 13); !errors.Is(err, arrow.ErrInvalid) { + t.Fatalf("string_view with non-inline perValueBytes=13 must hit the limit, got %v", err) + } + + utf8Bldr := array.NewStringBuilder(mem) + defer utf8Bldr.Release() + // utf8 builders are offset-based; inline skipping does not apply and + // a 5-byte-per-row upper bound on MaxInt32 rows overflows int32. + if err := reserveFormattedData(utf8Bldr, span, 5); !errors.Is(err, arrow.ErrInvalid) { + t.Fatalf("utf8 inline-skip must not apply; expected ErrInvalid, got %v", err) + } +} diff --git a/arrow/datatype_viewheader.go b/arrow/datatype_viewheader.go index 691b97df..ed6ed087 100644 --- a/arrow/datatype_viewheader.go +++ b/arrow/datatype_viewheader.go @@ -31,7 +31,7 @@ const ( ) func IsViewInline(length int) bool { - return length < viewInlineSize + return length <= viewInlineSize } // ViewHeader is a variable length string (utf8) or byte slice with diff --git a/internal/types/extension_types.go b/internal/types/extension_types.go index eaffce65..0162b487 100644 --- a/internal/types/extension_types.go +++ b/internal/types/extension_types.go @@ -317,9 +317,111 @@ var ( _ arrow.ExtensionType = (*ExtStructType)(nil) _ arrow.ExtensionType = (*DictExtensionType)(nil) _ arrow.ExtensionType = (*SmallintType)(nil) + _ arrow.ExtensionType = (*StringViewExtType)(nil) + _ arrow.ExtensionType = (*DictStringViewExtType)(nil) _ array.ExtensionArray = (*Parametric1Array)(nil) _ array.ExtensionArray = (*Parametric2Array)(nil) _ array.ExtensionArray = (*ExtStructArray)(nil) _ array.ExtensionArray = (*DictExtensionArray)(nil) _ array.ExtensionArray = (*SmallintArray)(nil) + _ array.ExtensionArray = (*StringViewExtArray)(nil) + _ array.ExtensionArray = (*DictStringViewExtArray)(nil) ) + +// StringViewExtArray is an extension array backed by a string_view storage +// array. +type StringViewExtArray struct { + array.ExtensionArrayBase +} + +func (a StringViewExtArray) ValueStr(i int) string { + if a.IsNull(i) { + return array.NullValueStr + } + return a.Storage().(*array.StringView).Value(i) +} + +// StringViewExtType wraps arrow.BinaryTypes.StringView as an extension so +// tests can exercise code paths that must look through extension types +// at their storage layout. +type StringViewExtType struct { + arrow.ExtensionBase +} + +func NewStringViewExtType() *StringViewExtType { + return &StringViewExtType{ExtensionBase: arrow.ExtensionBase{ + Storage: arrow.BinaryTypes.StringView}} +} + +func (StringViewExtType) ArrayType() reflect.Type { return reflect.TypeOf(StringViewExtArray{}) } + +func (StringViewExtType) ExtensionName() string { return "string_view_ext" } + +func (StringViewExtType) Serialize() string { return "string_view_ext-serialized" } + +func (s *StringViewExtType) ExtensionEquals(other arrow.ExtensionType) bool { + return s.ExtensionName() == other.ExtensionName() +} + +func (StringViewExtType) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { + if data != "string_view_ext-serialized" { + return nil, fmt.Errorf("type identifier did not match: '%s'", data) + } + if !arrow.TypeEqual(storageType, arrow.BinaryTypes.StringView) { + return nil, fmt.Errorf("invalid storage type for StringViewExtType: %s", storageType) + } + return NewStringViewExtType(), nil +} + +// DictStringViewExtArray is an extension array backed by a +// dictionary storage array. +type DictStringViewExtArray struct { + array.ExtensionArrayBase +} + +func (a DictStringViewExtArray) ValueStr(i int) string { + if a.IsNull(i) { + return array.NullValueStr + } + dict := a.Storage().(*array.Dictionary) + vals := dict.Dictionary().(*array.StringView) + return vals.Value(dict.GetValueIndex(i)) +} + +// DictStringViewExtType wraps dictionary as an +// extension so tests can exercise code paths that must preserve a +// Dictionary() member when unwrapping/rewrapping extensions at their +// storage layer. +type DictStringViewExtType struct { + arrow.ExtensionBase +} + +func NewDictStringViewExtType() *DictStringViewExtType { + return &DictStringViewExtType{ExtensionBase: arrow.ExtensionBase{ + Storage: &arrow.DictionaryType{ + IndexType: arrow.PrimitiveTypes.Int32, + ValueType: arrow.BinaryTypes.StringView, + }}} +} + +func (DictStringViewExtType) ArrayType() reflect.Type { + return reflect.TypeOf(DictStringViewExtArray{}) +} + +func (DictStringViewExtType) ExtensionName() string { return "dict_string_view_ext" } + +func (DictStringViewExtType) Serialize() string { return "dict_string_view_ext-serialized" } + +func (d *DictStringViewExtType) ExtensionEquals(other arrow.ExtensionType) bool { + return d.ExtensionName() == other.ExtensionName() +} + +func (d *DictStringViewExtType) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { + if data != "dict_string_view_ext-serialized" { + return nil, fmt.Errorf("type identifier did not match: '%s'", data) + } + if !arrow.TypeEqual(d.StorageType(), storageType) { + return nil, fmt.Errorf("invalid storage type for DictStringViewExtType: %s", storageType) + } + return NewDictStringViewExtType(), nil +}