From 65aa957f8ee2cb5f49fa00d1025506fde53fc48d Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 12:37:02 -0400 Subject: [PATCH 01/23] feat(compute): support casts between binary/string and binary_view/string_view Adds the missing cast kernels so that compute.CastArray (and the 'cast' compute function) can convert between the view variants (binary_view, string_view) and the existing base-binary types (binary, large_binary, string, large_string, fixed_size_binary). Registers three new cast paths: * base-binary -> view (CastBinaryToBinaryView) * view -> base-binary (CastBinaryViewToBinary) * view <-> view (CastBinaryViewToBinaryView, zero-copy) UTF-8 validation mirrors the behavior of the existing binary->string kernels: when casting a non-utf8 source into a utf8 destination, every non-null value is validated unless CastOptions.AllowInvalidUtf8 is set. Also fixes two related gaps in the compute/array plumbing that otherwise prevent view types from flowing through generic code paths: * arrow/compute/exec.getNumBuffers now returns 3 for binary_view and string_view (bitmap + view headers + a single overflow data buffer), matching ArraySpan's [3]BufferSpan limit. Previously it fell through to the default case and dropped the data buffer, which crashed any cast that walked non-inline view values through input.MakeArray(). * arrow/array.getMaxBufferLen and the nullArrayFactory learn about BinaryViewDataType so that MakeArrayOfNull (called by the executor when building empty-input results) works for view outputs. Tests exercising short (<=12 byte inline) and long (out-of-line) values, null handling, UTF-8 validation, CanCast coverage, and FSB -> view are added to arrow/compute/cast_test.go. The tests use a direct array comparison helper rather than the existing checkCast -- that helper iterates scalar.GetScalar, which does not yet support view types (a separate limitation outside the scope of this change). Fixes #184 --- arrow/array/util.go | 4 + arrow/compute/cast.go | 2 + arrow/compute/cast_test.go | 227 +++++++++++++++++ arrow/compute/exec/span.go | 7 + .../internal/kernels/binary_view_casts.go | 241 ++++++++++++++++++ .../compute/internal/kernels/string_casts.go | 43 ++++ 6 files changed, 524 insertions(+) create mode 100644 arrow/compute/internal/kernels/binary_view_casts.go diff --git a/arrow/array/util.go b/arrow/array/util.go index 6a1d29cb0..5cbcbeabf 100644 --- a/arrow/array/util.go +++ b/arrow/array/util.go @@ -447,6 +447,8 @@ func getMaxBufferLen(dt arrow.DataType, length int) int { return bufferLen case arrow.OffsetsDataType: return maxOf(dt.OffsetTypeTraits().BytesRequired(length + 1)) + case arrow.BinaryViewDataType: + return maxOf(arrow.ViewHeaderSizeBytes * length) case *arrow.FixedSizeListType: return maxOf(getMaxBufferLen(dt.Elem(), int(dt.Len())*length)) case arrow.ExtensionType: @@ -496,6 +498,8 @@ func (n *nullArrayFactory) create() *Data { dictData = arr.Data() case arrow.FixedWidthDataType: bufs = append(bufs, n.buf) + case arrow.BinaryViewDataType: + bufs = append(bufs, n.buf) case arrow.BinaryDataType: bufs = append(bufs, n.buf, n.buf) case arrow.OffsetsDataType: diff --git a/arrow/compute/cast.go b/arrow/compute/cast.go index 95f2f8f1c..5764f4c38 100644 --- a/arrow/compute/cast.go +++ b/arrow/compute/cast.go @@ -537,6 +537,8 @@ func getBinaryLikeCasts() []*castFunction { addFn("cast_large_binary", arrow.LARGE_BINARY, kernels.GetToBinaryKernels(arrow.BinaryTypes.LargeBinary)) addFn("cast_string", arrow.STRING, kernels.GetToBinaryKernels(arrow.BinaryTypes.String)) addFn("cast_large_string", arrow.LARGE_STRING, kernels.GetToBinaryKernels(arrow.BinaryTypes.LargeString)) + addFn("cast_binary_view", arrow.BINARY_VIEW, kernels.GetToBinaryKernels(arrow.BinaryTypes.BinaryView)) + addFn("cast_string_view", arrow.STRING_VIEW, kernels.GetToBinaryKernels(arrow.BinaryTypes.StringView)) addFn("cast_fixed_sized_binary", arrow.FIXED_SIZE_BINARY, kernels.GetFsbCastKernels()) return out } diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index 370ced16d..108d43f52 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -1527,6 +1527,233 @@ func (c *CastSuite) TestStringToString() { } } +// TestBinaryLikeToBinaryView covers GH-184: casting between base-binary +// (binary, large_binary, string, large_string, fixed_size_binary) and the +// view variants (binary_view, string_view). It verifies round-trip fidelity +// for both short (<=12 byte, inline) and long (out-of-line) values, across +// null bitmaps, and exercises UTF-8 validation semantics. +func (c *CastSuite) TestBinaryLikeToBinaryView() { + const strInJSON = `["a", "short", "this is a string well over twelve bytes", "", null]` + + for _, from := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { + c.Run("from "+from.String(), func() { + c.Run("to string_view", func() { + c.checkCastArrayOnly(from, arrow.BinaryTypes.StringView, strInJSON, strInJSON) + c.checkCastArrayOnly(from, arrow.BinaryTypes.StringView, `[]`, `[]`) + }) + c.Run("to binary_view", func() { + exp := c.buildBinaryViewArray([][]byte{ + []byte("a"), []byte("short"), + []byte("this is a string well over twelve bytes"), + []byte(""), nil, + }) + defer exp.Release() + + in, _, err := array.FromJSON(c.mem, from, strings.NewReader(strInJSON), array.WithUseNumber()) + c.Require().NoError(err) + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.BinaryView)) + c.Require().NoError(err) + defer got.Release() + + c.Truef(array.Equal(exp, got), "expected %s, got %s", exp, got) + }) + }) + } + + const binJSON = `["aGk=", "dGhpcyBpcyBieXRlcyBiZXlvbmQgdHdlbHZlIGNoYXJz", "//4=", null]` + for _, from := range []arrow.DataType{arrow.BinaryTypes.Binary, arrow.BinaryTypes.LargeBinary} { + c.Run("from "+from.String(), func() { + c.Run("to binary_view", func() { + c.checkCastArrayOnly(from, arrow.BinaryTypes.BinaryView, binJSON, binJSON) + }) + c.Run("to string_view rejects invalid utf8", func() { + invalid := c.invalidUtf8Arr(from) + defer invalid.Release() + _, err := compute.CastArray(context.Background(), invalid, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.ErrorIs(err, arrow.ErrInvalid) + }) + c.Run("to string_view allows invalid utf8 when opted in", func() { + invalid := c.invalidUtf8Arr(from) + defer invalid.Release() + opts := compute.SafeCastOptions(arrow.BinaryTypes.StringView) + opts.AllowInvalidUtf8 = true + out, err := compute.CastArray(context.Background(), invalid, opts) + c.NoError(err) + defer out.Release() + c.Equal(invalid.Len(), out.Len()) + }) + }) + } + + c.Run("from fixed_size_binary", func() { + fsbType := &arrow.FixedSizeBinaryType{ByteWidth: 3} + in, _, err := array.FromJSON(c.mem, fsbType, + strings.NewReader(`["YWJj", "ZGVm", null]`), array.WithUseNumber()) + c.Require().NoError(err) + defer in.Release() + + exp := c.buildBinaryViewArray([][]byte{[]byte("abc"), []byte("def"), nil}) + defer exp.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.BinaryView)) + c.Require().NoError(err) + defer got.Release() + + c.Truef(array.Equal(exp, got), "expected %s, got %s", exp, got) + }) +} + +// TestBinaryViewToBinaryLike covers the reverse direction of GH-184: +// materializing view arrays back into contiguous binary/string arrays. +func (c *CastSuite) TestBinaryViewToBinaryLike() { + const strJSON = `["a", "short", "this is a string well over twelve bytes", "", null]` + + for _, to := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { + c.Run("string_view to "+to.String(), func() { + c.checkCastArrayOnly(arrow.BinaryTypes.StringView, to, strJSON, strJSON) + c.checkCastArrayOnly(arrow.BinaryTypes.StringView, to, `[]`, `[]`) + }) + } + + const binJSON = `["aGk=", "dGhpcyBpcyBieXRlcyBiZXlvbmQgdHdlbHZlIGNoYXJz", "//4=", null]` + for _, to := range []arrow.DataType{arrow.BinaryTypes.Binary, arrow.BinaryTypes.LargeBinary} { + c.Run("binary_view to "+to.String(), func() { + c.checkCastArrayOnly(arrow.BinaryTypes.BinaryView, to, binJSON, binJSON) + }) + } + + c.Run("binary_view to string rejects invalid utf8", func() { + invalid := c.buildBinaryViewArray([][]byte{[]byte("ok"), {0xff, 0xfe}}) + defer invalid.Release() + + _, err := compute.CastArray(context.Background(), invalid, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.ErrorIs(err, arrow.ErrInvalid) + }) +} + +// TestBinaryViewToBinaryView covers cross-view casts (binary_view <-> +// string_view) and identity casts. These are zero-copy except for UTF-8 +// validation in the binary_view -> string_view direction. +func (c *CastSuite) TestBinaryViewToBinaryView() { + values := [][]byte{ + []byte("a"), []byte("short"), + []byte("this is a string well over twelve bytes"), + []byte(""), nil, + } + + c.Run("string_view to binary_view", func() { + in := c.buildStringViewArray([]string{"a", "short", "this is a string well over twelve bytes", "", ""}, []bool{true, true, true, true, false}) + defer in.Release() + exp := c.buildBinaryViewArray(values) + defer exp.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.BinaryView)) + c.Require().NoError(err) + defer got.Release() + + c.Truef(array.Equal(exp, got), "expected %s, got %s", exp, got) + }) + + c.Run("binary_view to string_view valid utf8", func() { + in := c.buildBinaryViewArray(values) + defer in.Release() + exp := c.buildStringViewArray([]string{"a", "short", "this is a string well over twelve bytes", "", ""}, + []bool{true, true, true, true, false}) + defer exp.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer got.Release() + + c.Truef(array.Equal(exp, got), "expected %s, got %s", exp, got) + }) + + c.Run("binary_view to string_view rejects invalid utf8", func() { + invalid := c.buildBinaryViewArray([][]byte{[]byte("ok"), {0x80, 0x81}}) + defer invalid.Release() + + _, err := compute.CastArray(context.Background(), invalid, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.ErrorIs(err, arrow.ErrInvalid) + }) +} + +// TestCanCastViewTypes verifies CanCast advertises the new view cast paths +// registered for GH-184. +func (c *CastSuite) TestCanCastViewTypes() { + baseBinary := []arrow.DataType{ + arrow.BinaryTypes.Binary, arrow.BinaryTypes.LargeBinary, + arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString, + } + viewTypes := []arrow.DataType{arrow.BinaryTypes.BinaryView, arrow.BinaryTypes.StringView} + + for _, from := range baseBinary { + for _, to := range viewTypes { + c.Truef(compute.CanCast(from, to), "expected CanCast %s -> %s", from, to) + c.Truef(compute.CanCast(to, from), "expected CanCast %s -> %s", to, from) + } + } + + for _, from := range viewTypes { + for _, to := range viewTypes { + c.Truef(compute.CanCast(from, to), "expected CanCast %s -> %s", from, to) + } + } + + c.True(compute.CanCast(&arrow.FixedSizeBinaryType{ByteWidth: 3}, arrow.BinaryTypes.BinaryView)) + c.True(compute.CanCast(&arrow.FixedSizeBinaryType{ByteWidth: 3}, arrow.BinaryTypes.StringView)) +} + +// checkCastArrayOnly performs a cast and compares the resulting array against +// an expected array built from JSON. It deliberately avoids the per-element +// scalar iteration path used by checkCast, which relies on scalar.GetScalar +// - that helper does not yet support view types (unrelated to GH-184). +func (c *CastSuite) checkCastArrayOnly(dtIn, dtOut arrow.DataType, inJSON, outJSON string) { + inArr, _, err := array.FromJSON(c.mem, dtIn, strings.NewReader(inJSON), array.WithUseNumber()) + c.Require().NoError(err) + defer inArr.Release() + + expArr, _, err := array.FromJSON(c.mem, dtOut, strings.NewReader(outJSON), array.WithUseNumber()) + c.Require().NoError(err) + defer expArr.Release() + + opts := compute.SafeCastOptions(dtOut) + got, err := compute.CastArray(context.Background(), inArr, opts) + c.Require().NoError(err) + defer got.Release() + + c.Truef(array.Equal(expArr, got), "cast %s -> %s: expected %s, got %s", + dtIn, dtOut, expArr, got) +} + +func (c *CastSuite) buildBinaryViewArray(values [][]byte) arrow.Array { + bldr := array.NewBinaryViewBuilder(c.mem) + defer bldr.Release() + for _, v := range values { + if v == nil { + bldr.AppendNull() + continue + } + bldr.Append(v) + } + return bldr.NewArray() +} + +func (c *CastSuite) buildStringViewArray(values []string, valid []bool) arrow.Array { + bldr := array.NewStringViewBuilder(c.mem) + defer bldr.Release() + bldr.AppendValues(values, valid) + return bldr.NewArray() +} + func (c *CastSuite) TestStringToInt() { for _, stype := range []arrow.DataType{arrow.BinaryTypes.String, arrow.BinaryTypes.LargeString} { for _, dt := range signedIntTypes { diff --git a/arrow/compute/exec/span.go b/arrow/compute/exec/span.go index a235d42b7..be1a10a79 100644 --- a/arrow/compute/exec/span.go +++ b/arrow/compute/exec/span.go @@ -579,6 +579,13 @@ func getNumBuffers(dt arrow.DataType) int { return 1 case arrow.BINARY, arrow.LARGE_BINARY, arrow.STRING, arrow.LARGE_STRING, arrow.DENSE_UNION: return 3 + case arrow.BINARY_VIEW, arrow.STRING_VIEW: + // bitmap + view-header buffer + a single overflow data buffer. + // ArraySpan's fixed buffer array limits additional data buffers, + // so callers producing multi-buffer views must keep their data + // within a single block (the default 32KB allocation in the + // builder is sufficient for most use cases). + return 3 case arrow.EXTENSION: return getNumBuffers(dt.(arrow.ExtensionType).StorageType()) default: diff --git a/arrow/compute/internal/kernels/binary_view_casts.go b/arrow/compute/internal/kernels/binary_view_casts.go new file mode 100644 index 000000000..8a1814bd7 --- /dev/null +++ b/arrow/compute/internal/kernels/binary_view_casts.go @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.18 + +package kernels + +import ( + "fmt" + "unicode/utf8" + "unsafe" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/compute/exec" + "github.com/apache/arrow-go/v18/internal/bitutils" +) + +func validateUtf8View(input *exec.ArraySpan) error { + views := arrow.ViewHeaderTraits.CastFromBytes(input.Buffers[1].Buf) + dataBuffers := make([][]byte, 0, len(input.Buffers)-2) + for i := 2; i < len(input.Buffers); i++ { + dataBuffers = append(dataBuffers, input.Buffers[i].Buf) + } + + return bitutils.VisitBitBlocksShort(input.Buffers[0].Buf, input.Offset, input.Len, + func(pos int64) error { + h := &views[input.Offset+pos] + var v []byte + if h.IsInline() { + v = h.InlineBytes() + } else { + off := h.BufferOffset() + v = dataBuffers[h.BufferIndex()][off : off+int32(h.Len())] + } + if !utf8.Valid(v) { + return fmt.Errorf("%w: invalid UTF8 bytes: %x", arrow.ErrInvalid, v) + } + return nil + }, func() error { return nil }) +} + +func unsafeStringBytes(s string) []byte { + if len(s) == 0 { + return nil + } + return unsafe.Slice(unsafe.StringData(s), len(s)) +} + +type binaryAppender struct { + bldr array.Builder + appendBytes func([]byte) +} + +func newBinaryAppender(bldr array.Builder) (binaryAppender, error) { + switch b := bldr.(type) { + case *array.BinaryBuilder: + return binaryAppender{bldr: b, appendBytes: b.Append}, nil + case *array.StringBuilder: + return binaryAppender{bldr: b, appendBytes: b.BinaryBuilder.Append}, nil + case *array.LargeStringBuilder: + return binaryAppender{bldr: b, appendBytes: b.BinaryBuilder.Append}, nil + case *array.BinaryViewBuilder: + return binaryAppender{bldr: b, appendBytes: b.Append}, nil + case *array.StringViewBuilder: + return binaryAppender{bldr: b, appendBytes: b.BinaryViewBuilder.Append}, nil + default: + return binaryAppender{}, fmt.Errorf("%w: unsupported builder type %T for binary-like output", + arrow.ErrNotImplemented, bldr) + } +} + +// CastBinaryToBinaryView casts a Binary, LargeBinary, String, LargeString, +// or FixedSizeBinary array into a BinaryView or StringView array. When the +// source is a non-utf8 binary type and the destination is a utf8 view type, +// every non-null element is validated as UTF-8 unless +// CastOptions.AllowInvalidUtf8 is set. +func CastBinaryToBinaryView(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { + opts := ctx.State.(CastState) + input := &batch.Values[0].Array + outputType := out.Type.(arrow.BinaryDataType) + + inputIsUtf8 := false + if b, ok := input.Type.(arrow.BinaryDataType); ok { + inputIsUtf8 = b.IsUtf8() + } + + if !inputIsUtf8 && outputType.IsUtf8() && !opts.AllowInvalidUtf8 { + switch input.Type.ID() { + case arrow.BINARY: + if err := validateUtf8[int32](input); err != nil { + return err + } + case arrow.LARGE_BINARY: + if err := validateUtf8[int64](input); err != nil { + return err + } + case arrow.FIXED_SIZE_BINARY: + if err := validateUtf8Fsb(input); err != nil { + return err + } + } + } + + rawBldr := array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type) + defer rawBldr.Release() + rawBldr.Reserve(int(input.Len)) + + ba, err := newBinaryAppender(rawBldr) + if err != nil { + return err + } + + arr := input.MakeArray() + defer arr.Release() + + var getVal func(int) []byte + switch a := arr.(type) { + case *array.Binary: + getVal = a.Value + case *array.LargeBinary: + getVal = a.Value + case *array.String: + getVal = func(i int) []byte { return unsafeStringBytes(a.Value(i)) } + case *array.LargeString: + getVal = func(i int) []byte { return unsafeStringBytes(a.Value(i)) } + case *array.FixedSizeBinary: + getVal = a.Value + default: + return fmt.Errorf("%w: unsupported input type for cast to %s: %s", + arrow.ErrNotImplemented, out.Type, input.Type) + } + + for i := 0; i < arr.Len(); i++ { + if arr.IsNull(i) { + ba.bldr.AppendNull() + continue + } + ba.appendBytes(getVal(i)) + } + + result := ba.bldr.NewArray() + out.TakeOwnership(result.Data()) + return nil +} + +// CastBinaryViewToBinary casts a BinaryView or StringView array into a +// Binary, LargeBinary, String, or LargeString array, materializing the +// referenced byte ranges into a single contiguous data buffer. UTF-8 +// validation is performed when casting from a non-utf8 view into a utf8 +// destination unless CastOptions.AllowInvalidUtf8 is set. +func CastBinaryViewToBinary[OutOffsetT int32 | int64](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { + opts := ctx.State.(CastState) + input := &batch.Values[0].Array + inputType := input.Type.(arrow.BinaryDataType) + outputType := out.Type.(arrow.BinaryDataType) + + if !inputType.IsUtf8() && outputType.IsUtf8() && !opts.AllowInvalidUtf8 { + if err := validateUtf8View(input); err != nil { + return err + } + } + + rawBldr := array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type) + defer rawBldr.Release() + rawBldr.Reserve(int(input.Len)) + + ba, err := newBinaryAppender(rawBldr) + if err != nil { + return err + } + + arr := input.MakeArray() + defer arr.Release() + + var getVal func(int) []byte + switch a := arr.(type) { + case *array.BinaryView: + getVal = a.Value + case *array.StringView: + getVal = func(i int) []byte { return unsafeStringBytes(a.Value(i)) } + default: + return fmt.Errorf("%w: unsupported input type for view-to-binary cast: %s", + arrow.ErrNotImplemented, input.Type) + } + + var totalBytes int64 + for i := 0; i < arr.Len(); i++ { + if !arr.IsNull(i) { + totalBytes += int64(len(getVal(i))) + } + } + if totalBytes > int64(MaxOf[OutOffsetT]()) { + return fmt.Errorf("%w: failed casting from %s to %s: input array too large", + arrow.ErrInvalid, input.Type, out.Type) + } + + for i := 0; i < arr.Len(); i++ { + if arr.IsNull(i) { + ba.bldr.AppendNull() + continue + } + ba.appendBytes(getVal(i)) + } + + result := ba.bldr.NewArray() + out.TakeOwnership(result.Data()) + return nil +} + +// CastBinaryViewToBinaryView handles casts between BinaryView and StringView +// (and same-type view casts). The cast is zero-copy; UTF-8 validation is +// performed when casting from a binary_view into a string_view unless +// CastOptions.AllowInvalidUtf8 is set. +func CastBinaryViewToBinaryView(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { + opts := ctx.State.(CastState) + input := &batch.Values[0].Array + inputType := input.Type.(arrow.BinaryDataType) + outputType := out.Type.(arrow.BinaryDataType) + + if !inputType.IsUtf8() && outputType.IsUtf8() && !opts.AllowInvalidUtf8 { + if err := validateUtf8View(input); err != nil { + return err + } + } + + return ZeroCopyCastExec(ctx, batch, out) +} diff --git a/arrow/compute/internal/kernels/string_casts.go b/arrow/compute/internal/kernels/string_casts.go index 1f4b2218a..f994777d0 100644 --- a/arrow/compute/internal/kernels/string_casts.go +++ b/arrow/compute/internal/kernels/string_casts.go @@ -178,17 +178,55 @@ func addBinaryToBinaryCast[InOffsetT, OutOffsetT int32 | int64](inType arrow.Typ outType, CastBinaryToBinary[InOffsetT, OutOffsetT], nil) } +func addViewToBinaryCast[OutOffsetT int32 | int64](inType arrow.Type, outType exec.OutputType) exec.ScalarKernel { + k := exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(inType)}, + outType, CastBinaryViewToBinary[OutOffsetT], nil) + k.NullHandling = exec.NullComputedNoPrealloc + k.MemAlloc = exec.MemNoPrealloc + return k +} + +func addBinaryToBinaryViewCast(inType arrow.Type, outType exec.OutputType) exec.ScalarKernel { + k := exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(inType)}, + outType, CastBinaryToBinaryView, nil) + k.NullHandling = exec.NullComputedNoPrealloc + k.MemAlloc = exec.MemNoPrealloc + return k +} + +func addViewToViewCast(inType arrow.Type, outType exec.OutputType) exec.ScalarKernel { + k := exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(inType)}, + outType, CastBinaryViewToBinaryView, nil) + k.NullHandling = exec.NullComputedNoPrealloc + k.MemAlloc = exec.MemNoPrealloc + return k +} + func addToBinaryKernels[OffsetsT int32 | int64](outType exec.OutputType, kernels []exec.ScalarKernel) []exec.ScalarKernel { return append(kernels, addBinaryToBinaryCast[int32, OffsetsT](arrow.STRING, outType), addBinaryToBinaryCast[int32, OffsetsT](arrow.BINARY, outType), addBinaryToBinaryCast[int64, OffsetsT](arrow.LARGE_STRING, outType), addBinaryToBinaryCast[int64, OffsetsT](arrow.LARGE_BINARY, outType), + addViewToBinaryCast[OffsetsT](arrow.BINARY_VIEW, outType), + addViewToBinaryCast[OffsetsT](arrow.STRING_VIEW, outType), exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(arrow.FIXED_SIZE_BINARY)}, outType, CastFsbToBinary[OffsetsT], nil), ) } +func addToBinaryViewKernels(outType exec.OutputType, kernels []exec.ScalarKernel) []exec.ScalarKernel { + return append(kernels, + addBinaryToBinaryViewCast(arrow.STRING, outType), + addBinaryToBinaryViewCast(arrow.BINARY, outType), + addBinaryToBinaryViewCast(arrow.LARGE_STRING, outType), + addBinaryToBinaryViewCast(arrow.LARGE_BINARY, outType), + addBinaryToBinaryViewCast(arrow.FIXED_SIZE_BINARY, outType), + addViewToViewCast(arrow.BINARY_VIEW, outType), + addViewToViewCast(arrow.STRING_VIEW, outType), + ) +} + func GetFsbCastKernels() []exec.ScalarKernel { outputType := exec.NewComputedOutputType(resolveOutputFromOptions) out := GetCommonCastKernels(arrow.FIXED_SIZE_BINARY, outputType) @@ -404,6 +442,11 @@ func GetToBinaryKernels(outType arrow.DataType) []exec.ScalarKernel { case arrow.LARGE_STRING: out = addToBinaryKernels[int64](outputType, out) return addNumericAndTemporalToStringCasts(outputType, out) + case arrow.BINARY_VIEW: + return addToBinaryViewKernels(outputType, out) + case arrow.STRING_VIEW: + out = addToBinaryViewKernels(outputType, out) + return addNumericAndTemporalToStringCasts(outputType, out) } return nil } From d5f247388a3f811ce1efb1c7c4fba65f33438ca4 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 13:12:50 -0400 Subject: [PATCH 02/23] fix(compute): coalesce view payload into a single data buffer in cast CastBinaryToBinaryView relied on BinaryViewBuilder's default 32KB block size. As soon as the cumulative out-of-line (>12 byte) payload of the cast output exceeded that, the builder allocated a second overflow block, producing a 4-buffer Array. exec.ArraySpan only has three BufferSpan slots, so out.TakeOwnership(result.Data()) indexed past Buffers[2] and panicked. The kernel now walks the input once to compute the total out-of-line payload, errors out if it exceeds math.MaxInt32 (the single-buffer offset limit on ViewHeader.BufferOffset), and pre-reserves the builder's data buffer to exactly that size before appending. With the builder's first block sized to fit everything, subsequent per-value ReserveData calls return without allocating new blocks, guaranteeing a single data buffer in the output. Also wires a reserveData callback through the internal binaryAppender helper. *StringViewBuilder shadows the embedded BinaryViewBuilder's Append([]byte) with its own Append(string), so it doesn't satisfy the array.BinaryLikeBuilder interface - type-switching via binaryAppender is the same pattern already used for appendBytes. Adds TestBinaryLikeToBinaryViewLargePayload which casts 500x200-byte values (100 KB total out-of-line, well over the 32KB block default) to both string_view and binary_view, verifies no panic, correct content, and that the output fits in \u22643 buffers. Addresses review 809 feedback on #802. --- arrow/compute/cast_test.go | 47 +++++++++++++++++++ .../internal/kernels/binary_view_casts.go | 35 ++++++++++++-- 2 files changed, 77 insertions(+), 5 deletions(-) diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index 108d43f52..a64405e9f 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -1712,6 +1712,53 @@ func (c *CastSuite) TestCanCastViewTypes() { c.True(compute.CanCast(&arrow.FixedSizeBinaryType{ByteWidth: 3}, arrow.BinaryTypes.StringView)) } +// TestBinaryLikeToBinaryViewLargePayload is a regression test for the GH-184 +// review feedback: when the cast output's out-of-line payload would exceed +// the builder's default 32KB block size, the kernel must still produce a +// single-data-buffer view so the result fits in ArraySpan's fixed 3-buffer +// layout. A multi-buffer result would panic in out.TakeOwnership. +func (c *CastSuite) TestBinaryLikeToBinaryViewLargePayload() { + const ( + perValue = 200 + count = 500 // total out-of-line = 100_000 bytes, comfortably over the 32KB block default + ) + + longStr := strings.Repeat("x", perValue) + + sb := array.NewStringBuilder(c.mem) + defer sb.Release() + for i := 0; i < count; i++ { + sb.Append(longStr) + } + in := sb.NewArray() + defer in.Release() + + for _, to := range []arrow.DataType{arrow.BinaryTypes.StringView, arrow.BinaryTypes.BinaryView} { + c.Run("to "+to.String(), func() { + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(to)) + c.Require().NoError(err) + defer got.Release() + + c.Equal(count, got.Len()) + c.LessOrEqualf(len(got.Data().Buffers()), 3, + "expected <=3 buffers (bitmap + view headers + 1 data), got %d", + len(got.Data().Buffers())) + + switch v := got.(type) { + case *array.StringView: + for i := 0; i < v.Len(); i++ { + c.Equalf(longStr, v.Value(i), "row %d mismatch", i) + } + case *array.BinaryView: + for i := 0; i < v.Len(); i++ { + c.Equalf(longStr, string(v.Value(i)), "row %d mismatch", i) + } + } + }) + } +} + // checkCastArrayOnly performs a cast and compares the resulting array against // an expected array built from JSON. It deliberately avoids the per-element // scalar iteration path used by checkCast, which relies on scalar.GetScalar diff --git a/arrow/compute/internal/kernels/binary_view_casts.go b/arrow/compute/internal/kernels/binary_view_casts.go index 8a1814bd7..15298b72b 100644 --- a/arrow/compute/internal/kernels/binary_view_casts.go +++ b/arrow/compute/internal/kernels/binary_view_casts.go @@ -20,6 +20,7 @@ package kernels import ( "fmt" + "math" "unicode/utf8" "unsafe" @@ -63,20 +64,21 @@ func unsafeStringBytes(s string) []byte { type binaryAppender struct { bldr array.Builder appendBytes func([]byte) + reserveData func(int) } func newBinaryAppender(bldr array.Builder) (binaryAppender, error) { switch b := bldr.(type) { case *array.BinaryBuilder: - return binaryAppender{bldr: b, appendBytes: b.Append}, nil + return binaryAppender{bldr: b, appendBytes: b.Append, reserveData: b.ReserveData}, nil case *array.StringBuilder: - return binaryAppender{bldr: b, appendBytes: b.BinaryBuilder.Append}, nil + return binaryAppender{bldr: b, appendBytes: b.BinaryBuilder.Append, reserveData: b.BinaryBuilder.ReserveData}, nil case *array.LargeStringBuilder: - return binaryAppender{bldr: b, appendBytes: b.BinaryBuilder.Append}, nil + return binaryAppender{bldr: b, appendBytes: b.BinaryBuilder.Append, reserveData: b.BinaryBuilder.ReserveData}, nil case *array.BinaryViewBuilder: - return binaryAppender{bldr: b, appendBytes: b.Append}, nil + return binaryAppender{bldr: b, appendBytes: b.Append, reserveData: b.ReserveData}, nil case *array.StringViewBuilder: - return binaryAppender{bldr: b, appendBytes: b.BinaryViewBuilder.Append}, nil + return binaryAppender{bldr: b, appendBytes: b.BinaryViewBuilder.Append, reserveData: b.BinaryViewBuilder.ReserveData}, nil default: return binaryAppender{}, fmt.Errorf("%w: unsupported builder type %T for binary-like output", arrow.ErrNotImplemented, bldr) @@ -144,6 +146,29 @@ func CastBinaryToBinaryView(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec arrow.ErrNotImplemented, out.Type, input.Type) } + // Pre-size the out-of-line data buffer to the total required capacity so + // the builder allocates a single overflow block. ArraySpan only has three + // buffer slots (bitmap + view headers + one data buffer); if we let the + // builder spill into a second block, TakeOwnership would index past + // Buffers[2] and panic. See the review feedback on GH-184. + var outOfLineTotal int64 + for i := 0; i < arr.Len(); i++ { + if arr.IsNull(i) { + continue + } + vlen := len(getVal(i)) + if !arrow.IsViewInline(vlen) { + outOfLineTotal += int64(vlen) + } + } + if outOfLineTotal > math.MaxInt32 { + return fmt.Errorf("%w: cast from %s to %s: out-of-line payload (%d bytes) exceeds single view data buffer limit (%d bytes)", + arrow.ErrInvalid, input.Type, out.Type, outOfLineTotal, math.MaxInt32) + } + if outOfLineTotal > 0 { + ba.reserveData(int(outOfLineTotal)) + } + for i := 0; i < arr.Len(); i++ { if arr.IsNull(i) { ba.bldr.AppendNull() From 486c6122f7789935ec186623a77a9beed064ecde Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 13:27:13 -0400 Subject: [PATCH 03/23] linting --- arrow/compute/internal/kernels/binary_view_casts.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arrow/compute/internal/kernels/binary_view_casts.go b/arrow/compute/internal/kernels/binary_view_casts.go index 15298b72b..d1f3fa47a 100644 --- a/arrow/compute/internal/kernels/binary_view_casts.go +++ b/arrow/compute/internal/kernels/binary_view_casts.go @@ -72,13 +72,13 @@ func newBinaryAppender(bldr array.Builder) (binaryAppender, error) { case *array.BinaryBuilder: return binaryAppender{bldr: b, appendBytes: b.Append, reserveData: b.ReserveData}, nil case *array.StringBuilder: - return binaryAppender{bldr: b, appendBytes: b.BinaryBuilder.Append, reserveData: b.BinaryBuilder.ReserveData}, nil + return binaryAppender{bldr: b, appendBytes: b.BinaryBuilder.Append, reserveData: b.ReserveData}, nil case *array.LargeStringBuilder: - return binaryAppender{bldr: b, appendBytes: b.BinaryBuilder.Append, reserveData: b.BinaryBuilder.ReserveData}, nil + return binaryAppender{bldr: b, appendBytes: b.BinaryBuilder.Append, reserveData: b.ReserveData}, nil case *array.BinaryViewBuilder: return binaryAppender{bldr: b, appendBytes: b.Append, reserveData: b.ReserveData}, nil case *array.StringViewBuilder: - return binaryAppender{bldr: b, appendBytes: b.BinaryViewBuilder.Append, reserveData: b.BinaryViewBuilder.ReserveData}, nil + return binaryAppender{bldr: b, appendBytes: b.BinaryViewBuilder.Append, reserveData: b.ReserveData}, nil default: return binaryAppender{}, fmt.Errorf("%w: unsupported builder type %T for binary-like output", arrow.ErrNotImplemented, bldr) From a741fd864ee542482efae96fad689a1641c33364 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 13:41:18 -0400 Subject: [PATCH 04/23] fix(compute): handle multi-buffer views and size view outputs for cast Addresses two more multi-buffer-view failure modes caught in review 811 of the original GH-184 PR: 1. Multi-buffer view **inputs**. exec.ArraySpan is a fixed [3]BufferSpan, so a BinaryView/StringView array with more than one overflow data buffer (e.g. built via BinaryViewBuilder with >32KB of non-inline payload) panics in SetMembers before any kernel runs. The cast meta-function now calls coalesceMultiBufferViewDatum before dispatch; if the input datum is a view with >3 buffers, it rebuilds the array with a single overflow buffer (sized via ReserveData) and substitutes that into the argument list. 2. Numeric/temporal casts to string_view. addNumericAndTemporalToStringCasts reuses the shared numericToStringCastExec/timeToStringCastExec kernels for string_view output. Those kernels did not reserve data capacity, so StringViewBuilder could spill into a second overflow block and produce a 4-buffer result, which would then panic in TakeOwnership. Both kernels now call bldr.ReserveData(input.Len * maxFormattedBytes) based on a per-type upper bound derived from the formatters. This guarantees the builder's first block is large enough to hold the entire formatted output, so no spill occurs. castTimestampToString already handled this via its existing strlen/ReserveData pattern; boolToStringCastExec is safe because bool formats to at most 5 bytes (always inline). Adds three regression tests under arrow/compute/cast_test.go: - TestMultiBufferViewInputCoalesced: builds multi-buffer BinaryView/ StringView inputs and casts them to binary/utf8/string_view. - TestNumericToStringViewLargePayload: casts 5000 int64/timestamp/time64 values to string_view and asserts the output stays within one data buffer. - TestBinaryLikeToBinaryViewLargePayload (from the earlier fix) continues to cover the output-side pre-reservation path. Addresses review 811 feedback on #802. --- arrow/compute/cast.go | 123 ++++++++++++++ arrow/compute/cast_test.go | 158 ++++++++++++++++++ .../compute/internal/kernels/string_casts.go | 43 +++++ 3 files changed, 324 insertions(+) diff --git a/arrow/compute/cast.go b/arrow/compute/cast.go index 5764f4c38..cc30381cc 100644 --- a/arrow/compute/cast.go +++ b/arrow/compute/cast.go @@ -21,6 +21,7 @@ package compute import ( "context" "fmt" + "math" "sync" "github.com/apache/arrow-go/v18/arrow" @@ -28,6 +29,7 @@ import ( "github.com/apache/arrow-go/v18/arrow/bitutil" "github.com/apache/arrow-go/v18/arrow/compute/exec" "github.com/apache/arrow-go/v18/arrow/compute/internal/kernels" + "github.com/apache/arrow-go/v18/arrow/memory" ) var ( @@ -52,6 +54,21 @@ var ( return NewDatum(d[0]), nil } + // Coalesce multi-buffer view inputs. exec.ArraySpan's fixed + // [3]BufferSpan cannot carry BinaryView/StringView arrays with + // more than one overflow data buffer, so SetMembers would panic + // before any kernel runs. Rebuild such inputs with a single data + // buffer up front. See GH-184 review feedback. + coalescedDatum, err := coalesceMultiBufferViewDatum(ctx, d[0]) + if err != nil { + return nil, err + } + if coalescedDatum != nil { + defer coalescedDatum.Release() + d = append([]Datum(nil), d...) + d[0] = coalescedDatum + } + fn, err := getCastFunction(castOpts.ToType) if err != nil { return nil, fmt.Errorf("%w from %s", err, d[0].(ArrayLikeDatum).Type()) @@ -65,6 +82,112 @@ func RegisterScalarCast(reg FunctionRegistry) { reg.AddFunction(castMetaFunc, false) } +// coalesceMultiBufferViewDatum returns a new ArrayDatum whose view array has +// a single overflow data buffer, or nil if the input does not need +// coalescing. Non-array and non-view datums are never coalesced. Callers are +// responsible for releasing the returned datum. +func coalesceMultiBufferViewDatum(ctx context.Context, d Datum) (Datum, error) { + ad, ok := d.(*ArrayDatum) + if !ok { + return nil, nil + } + data := ad.Value + switch data.DataType().ID() { + case arrow.BINARY_VIEW, arrow.STRING_VIEW: + default: + return nil, nil + } + if len(data.Buffers()) <= 3 { + return nil, nil + } + + mem := exec.GetAllocator(ctx) + newData, err := rebuildViewSingleBuffer(mem, data) + if err != nil { + return nil, err + } + return &ArrayDatum{Value: newData}, nil +} + +// rebuildViewSingleBuffer rebuilds a BinaryView/StringView ArrayData so that +// all non-inline payload lives in a single contiguous overflow buffer. The +// caller owns the returned ArrayData. +func rebuildViewSingleBuffer(mem memory.Allocator, data arrow.ArrayData) (arrow.ArrayData, error) { + arr := array.MakeFromData(data) + defer arr.Release() + + var ( + total int64 + getLen func(int) int + getInto func(i int, dst func([]byte)) + ) + switch a := arr.(type) { + case *array.BinaryView: + getLen = a.ValueLen + getInto = func(i int, dst func([]byte)) { dst(a.Value(i)) } + case *array.StringView: + getLen = a.ValueLen + getInto = func(i int, dst func([]byte)) { + s := a.Value(i) + dst([]byte(s)) + } + default: + return nil, fmt.Errorf("%w: unexpected view array type %T", arrow.ErrInvalid, arr) + } + for i := 0; i < arr.Len(); i++ { + if arr.IsNull(i) { + continue + } + vlen := getLen(i) + if !arrow.IsViewInline(vlen) { + total += int64(vlen) + } + } + if total > math.MaxInt32 { + return nil, fmt.Errorf("%w: view array out-of-line payload (%d bytes) exceeds single-buffer limit (%d bytes)", + arrow.ErrInvalid, total, math.MaxInt32) + } + + bldr := array.NewBuilder(mem, arr.DataType()) + defer bldr.Release() + bldr.Reserve(arr.Len()) + + switch b := bldr.(type) { + case *array.BinaryViewBuilder: + if total > 0 { + b.ReserveData(int(total)) + } + case *array.StringViewBuilder: + if total > 0 { + b.ReserveData(int(total)) + } + default: + return nil, fmt.Errorf("%w: unexpected view builder type %T", arrow.ErrInvalid, bldr) + } + + appendBytes := func([]byte) {} + switch b := bldr.(type) { + case *array.BinaryViewBuilder: + appendBytes = b.Append + case *array.StringViewBuilder: + appendBytes = b.BinaryViewBuilder.Append + } + + for i := 0; i < arr.Len(); i++ { + if arr.IsNull(i) { + bldr.AppendNull() + continue + } + getInto(i, appendBytes) + } + + newArr := bldr.NewArray() + defer newArr.Release() + result := newArr.Data() + result.Retain() + return result, nil +} + type castFunction struct { ScalarFunction diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index a64405e9f..7a59db2b1 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -1759,6 +1759,164 @@ func (c *CastSuite) TestBinaryLikeToBinaryViewLargePayload() { } } +// TestMultiBufferViewInputCoalesced is a regression test for the second +// round of GH-184 review feedback: a BinaryView/StringView input whose +// payload already spans multiple builder blocks (>32KB) must be coalesced +// into a single-buffer view before entering compute, since +// exec.ArraySpan's fixed [3]BufferSpan cannot carry multi-buffer views. +func (c *CastSuite) TestMultiBufferViewInputCoalesced() { + const ( + perValue = 200 + count = 500 // ~100KB of non-inline payload, forces multi-buffer view + ) + longStr := strings.Repeat("y", perValue) + + buildMultiBufferBinaryView := func() arrow.Array { + bldr := array.NewBinaryViewBuilder(c.mem) + defer bldr.Release() + for i := 0; i < count; i++ { + bldr.Append([]byte(longStr)) + } + return bldr.NewArray() + } + + buildMultiBufferStringView := func() arrow.Array { + bldr := array.NewStringViewBuilder(c.mem) + defer bldr.Release() + for i := 0; i < count; i++ { + bldr.Append(longStr) + } + return bldr.NewArray() + } + + c.Run("binary_view input has multiple data buffers", func() { + in := buildMultiBufferBinaryView() + defer in.Release() + c.Greater(len(in.Data().Buffers()), 3, + "test precondition: input must be multi-buffer") + }) + + c.Run("string_view to utf8", func() { + in := buildMultiBufferStringView() + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.Require().NoError(err) + defer got.Release() + + sv := got.(*array.String) + c.Equal(count, sv.Len()) + for i := 0; i < sv.Len(); i++ { + c.Equalf(longStr, sv.Value(i), "row %d mismatch", i) + } + }) + + c.Run("binary_view to binary", func() { + in := buildMultiBufferBinaryView() + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.Binary)) + c.Require().NoError(err) + defer got.Release() + + bv := got.(*array.Binary) + c.Equal(count, bv.Len()) + for i := 0; i < bv.Len(); i++ { + c.Equalf(longStr, string(bv.Value(i)), "row %d mismatch", i) + } + }) + + c.Run("binary_view to string_view", func() { + in := buildMultiBufferBinaryView() + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer got.Release() + + sv := got.(*array.StringView) + c.Equal(count, sv.Len()) + c.LessOrEqualf(len(got.Data().Buffers()), 3, + "coalesced output must fit in ArraySpan's 3 buffers, got %d", + len(got.Data().Buffers())) + }) +} + +// TestNumericToStringViewLargePayload is a regression test for the second +// round of GH-184 review feedback: casting a numeric/temporal column with +// enough values to produce >32KB of non-inline formatted output must still +// produce a single-buffer string_view rather than spilling into a second +// overflow block (which would panic in out.TakeOwnership). +func (c *CastSuite) TestNumericToStringViewLargePayload() { + const count = 5000 + + c.Run("int64 to string_view", func() { + bldr := array.NewInt64Builder(c.mem) + defer bldr.Release() + for i := 0; i < count; i++ { + // use large magnitudes so formatted strings are 19-20 bytes (non-inline) + bldr.Append(int64(-9223372036854775000 + int64(i))) + } + in := bldr.NewArray() + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer got.Release() + + c.Equal(count, got.Len()) + c.LessOrEqualf(len(got.Data().Buffers()), 3, + "expected <=3 buffers for string_view output, got %d", + len(got.Data().Buffers())) + }) + + c.Run("timestamp to string_view", func() { + tsType := &arrow.TimestampType{Unit: arrow.Nanosecond, TimeZone: "UTC"} + bldr := array.NewTimestampBuilder(c.mem, tsType) + defer bldr.Release() + for i := 0; i < count; i++ { + bldr.Append(arrow.Timestamp(int64(1e18) + int64(i))) + } + in := bldr.NewArray() + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer got.Release() + + c.Equal(count, got.Len()) + c.LessOrEqualf(len(got.Data().Buffers()), 3, + "expected <=3 buffers for string_view output, got %d", + len(got.Data().Buffers())) + }) + + c.Run("time64 to string_view", func() { + t64Type := &arrow.Time64Type{Unit: arrow.Nanosecond} + bldr := array.NewTime64Builder(c.mem, t64Type) + defer bldr.Release() + for i := 0; i < count; i++ { + bldr.Append(arrow.Time64(int64(86400e9-1) - int64(i))) + } + in := bldr.NewArray() + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer got.Release() + + c.Equal(count, got.Len()) + c.LessOrEqualf(len(got.Data().Buffers()), 3, + "expected <=3 buffers for string_view output, got %d", + len(got.Data().Buffers())) + }) +} + // checkCastArrayOnly performs a cast and compares the resulting array against // an expected array built from JSON. It deliberately avoids the per-element // scalar iteration path used by checkCast, which relies on scalar.GetScalar diff --git a/arrow/compute/internal/kernels/string_casts.go b/arrow/compute/internal/kernels/string_casts.go index f994777d0..14bb3f11e 100644 --- a/arrow/compute/internal/kernels/string_casts.go +++ b/arrow/compute/internal/kernels/string_casts.go @@ -275,6 +275,9 @@ func timeToStringCastExec[T timeIntrinsic](ctx *exec.KernelCtx, batch *exec.Exec ) defer bldr.Release() + bldr.Reserve(int(input.Len)) + bldr.ReserveData(int(input.Len-input.Nulls) * maxFormattedBytes(input.Type)) + bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, func(pos int64) { bldr.Append(inputData[pos].FormattedString(inputType.TimeUnit())) @@ -294,6 +297,9 @@ func numericToStringCastExec[T arrow.IntType | arrow.UintType | arrow.FloatType] ) defer bldr.Release() + bldr.Reserve(int(input.Len)) + bldr.ReserveData(int(input.Len-input.Nulls) * maxFormattedBytes(input.Type)) + bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, func(pos int64) { bldr.Append(formatter(inputData[pos])) @@ -305,6 +311,43 @@ func numericToStringCastExec[T arrow.IntType | arrow.UintType | arrow.FloatType] } } +// maxFormattedBytes returns an upper bound on the textual representation +// of a single value of dt used by the numeric/temporal-to-string kernels. +// The bound is used to pre-reserve the builder's data buffer so StringView +// outputs stay within a single overflow block (compute's ArraySpan can +// carry only one view data buffer). See GH-184 review feedback. +func maxFormattedBytes(dt arrow.DataType) int { + switch dt.ID() { + case arrow.INT8, arrow.UINT8: + return 4 // "-128" / "255" + case arrow.INT16: + return 6 // "-32768" + case arrow.UINT16: + return 5 // "65535" + case arrow.INT32: + return 11 // "-2147483648" + case arrow.UINT32: + return 10 // "4294967295" + case arrow.INT64, arrow.UINT64: + return 20 // "-9223372036854775808" + case arrow.FLOAT16: + return 12 // covers "-NaN", scientific, and decimal forms + case arrow.FLOAT32: + return 25 // scientific notation upper bound + case arrow.FLOAT64: + return 32 // scientific notation upper bound + case arrow.DATE32, arrow.DATE64: + return 10 // "YYYY-MM-DD" + case arrow.TIME32: + return 12 // "HH:MM:SS.sss" + case arrow.TIME64: + return 18 // "HH:MM:SS.nnnnnnnnn" + case arrow.TIMESTAMP: + return 35 // date + time with ns precision + short tz offset + } + return 0 +} + func castTimestampToString(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { var ( input = &batch.Values[0].Array From 703758f93e03d69d2468831b01ed2f80535d46b7 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 14:11:47 -0400 Subject: [PATCH 05/23] fix(compute): extend view coalescing to chunked inputs and guard overflow Addresses three more gaps in view-cast support caught in review 814 of the GH-184 PR: 1. **Chunked view inputs bypassed coalescing (HIGH).** coalesceMultiBufferViewDatum only handled *ArrayDatum. A ChunkedDatum whose chunks are multi-buffer BinaryView/StringView still crashed when the executor called SetMembers on a second chunk. The meta-function helper now walks ChunkedDatum.Chunks(), rebuilds any multi-buffer chunk via rebuildViewSingleBuffer, retains single-buffer chunks as-is, and returns a new *ChunkedDatum. 2. **maxFormattedBytes(FLOAT16) bound was 12 (MEDIUM).** float16.Num.String() calls strconv.FormatFloat(..., 32) which can emit up to 15 bytes. Raised the bound to 16. (No direct cast test added because FLOAT16 -> string_view isn't registered through addNumericAndTemporalToStringCasts - numericTypes excludes Float16 - so the bound is defensive for the eventual case where it is.) 3. **Numeric/temporal ReserveData could overflow int32 (MEDIUM).** numericToStringCastExec, timeToStringCastExec, and castTimestampToString now call a new reserveFormattedData helper that computes (Len - Nulls) * perValueBytes in int64 and returns arrow.ErrInvalid when the result exceeds math.MaxInt32, rather than panicking inside BinaryViewBuilder.ReserveData for very large inputs. Tests: - TestChunkedMultiBufferViewInputCoalesced (arrow/compute/cast_test.go): 2 multi-buffer StringView chunks cast to utf8, verifies correct content in each output chunk. - TestReserveFormattedDataOverflowGuard (arrow/compute/internal/kernels/string_casts_test.go): unit-tests the overflow guard directly against a fake ArraySpan so we don't have to allocate >2GB through the full executor to trigger it. Addresses review 814 feedback on #802. --- arrow/compute/cast.go | 79 ++++++++++++++----- arrow/compute/cast_test.go | 59 ++++++++++++++ .../compute/internal/kernels/string_casts.go | 34 +++++++- .../internal/kernels/string_casts_test.go | 56 +++++++++++++ 4 files changed, 205 insertions(+), 23 deletions(-) create mode 100644 arrow/compute/internal/kernels/string_casts_test.go diff --git a/arrow/compute/cast.go b/arrow/compute/cast.go index cc30381cc..a3ee7fbcc 100644 --- a/arrow/compute/cast.go +++ b/arrow/compute/cast.go @@ -82,31 +82,72 @@ func RegisterScalarCast(reg FunctionRegistry) { reg.AddFunction(castMetaFunc, false) } -// coalesceMultiBufferViewDatum returns a new ArrayDatum whose view array has -// a single overflow data buffer, or nil if the input does not need -// coalescing. Non-array and non-view datums are never coalesced. Callers are -// responsible for releasing the returned datum. +// coalesceMultiBufferViewDatum returns a new Datum whose view arrays each +// have a single overflow data buffer, or nil if the input does not need +// coalescing. Both ArrayDatum and ChunkedDatum view inputs are handled; +// other datums are never coalesced. Callers are responsible for releasing +// the returned datum when it is non-nil. func coalesceMultiBufferViewDatum(ctx context.Context, d Datum) (Datum, error) { - ad, ok := d.(*ArrayDatum) - if !ok { - return nil, nil + switch v := d.(type) { + case *ArrayDatum: + if !needsViewCoalesce(v.Value) { + return nil, nil + } + mem := exec.GetAllocator(ctx) + newData, err := rebuildViewSingleBuffer(mem, v.Value) + if err != nil { + return nil, err + } + return &ArrayDatum{Value: newData}, nil + + case *ChunkedDatum: + chunks := v.Value.Chunks() + needAny := false + for _, c := range chunks { + if needsViewCoalesce(c.Data()) { + needAny = true + break + } + } + if !needAny { + return nil, nil + } + mem := exec.GetAllocator(ctx) + newChunks := make([]arrow.Array, len(chunks)) + for i, c := range chunks { + if !needsViewCoalesce(c.Data()) { + c.Retain() + newChunks[i] = c + continue + } + newData, err := rebuildViewSingleBuffer(mem, c.Data()) + if err != nil { + for j := 0; j < i; j++ { + newChunks[j].Release() + } + return nil, err + } + newChunks[i] = array.MakeFromData(newData) + newData.Release() + } + chunked := arrow.NewChunked(v.Value.DataType(), newChunks) + for _, nc := range newChunks { + nc.Release() + } + return &ChunkedDatum{Value: chunked}, nil } - data := ad.Value + return nil, nil +} + +// needsViewCoalesce reports whether data is a view array whose payload +// spans more than the single overflow data buffer exec.ArraySpan can carry. +func needsViewCoalesce(data arrow.ArrayData) bool { switch data.DataType().ID() { case arrow.BINARY_VIEW, arrow.STRING_VIEW: default: - return nil, nil - } - if len(data.Buffers()) <= 3 { - return nil, nil - } - - mem := exec.GetAllocator(ctx) - newData, err := rebuildViewSingleBuffer(mem, data) - if err != nil { - return nil, err + return false } - return &ArrayDatum{Value: newData}, nil + return len(data.Buffers()) > 3 } // rebuildViewSingleBuffer rebuilds a BinaryView/StringView ArrayData so that diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index 7a59db2b1..6e242460e 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -1917,6 +1917,65 @@ func (c *CastSuite) TestNumericToStringViewLargePayload() { }) } +// TestChunkedMultiBufferViewInputCoalesced is a regression test for the +// third round of GH-184 review feedback: a *ChunkedDatum whose chunks are +// multi-buffer view arrays must also be coalesced by the cast meta function +// before dispatch; otherwise executor.SetMembers on the second chunk panics. +func (c *CastSuite) TestChunkedMultiBufferViewInputCoalesced() { + const ( + perValue = 200 + count = 500 // ~100KB non-inline payload per chunk + ) + chunkStr1 := strings.Repeat("a", perValue) + chunkStr2 := strings.Repeat("b", perValue) + + buildChunk := func(s string) arrow.Array { + b := array.NewStringViewBuilder(c.mem) + defer b.Release() + for i := 0; i < count; i++ { + b.Append(s) + } + return b.NewArray() + } + + chunk1 := buildChunk(chunkStr1) + defer chunk1.Release() + chunk2 := buildChunk(chunkStr2) + defer chunk2.Release() + + c.Greater(len(chunk1.Data().Buffers()), 3, + "test precondition: chunk1 must be multi-buffer") + c.Greater(len(chunk2.Data().Buffers()), 3, + "test precondition: chunk2 must be multi-buffer") + + chunked := arrow.NewChunked(arrow.BinaryTypes.StringView, []arrow.Array{chunk1, chunk2}) + defer chunked.Release() + + in := compute.NewDatum(chunked) + defer in.Release() + + out, err := compute.CallFunction(context.Background(), "cast", + compute.SafeCastOptions(arrow.BinaryTypes.String), in) + c.Require().NoError(err) + defer out.Release() + + chunks := out.(compute.ArrayLikeDatum).Chunks() + total := 0 + for idx, ch := range chunks { + defer ch.Release() + sv := ch.(*array.String) + for i := 0; i < sv.Len(); i++ { + want := chunkStr1 + if idx == 1 { + want = chunkStr2 + } + c.Equalf(want, sv.Value(i), "chunk %d row %d mismatch", idx, i) + } + total += sv.Len() + } + c.Equal(count*2, total) +} + // checkCastArrayOnly performs a cast and compares the resulting array against // an expected array built from JSON. It deliberately avoids the per-element // scalar iteration path used by checkCast, which relies on scalar.GetScalar diff --git a/arrow/compute/internal/kernels/string_casts.go b/arrow/compute/internal/kernels/string_casts.go index 14bb3f11e..57310d61e 100644 --- a/arrow/compute/internal/kernels/string_casts.go +++ b/arrow/compute/internal/kernels/string_casts.go @@ -20,6 +20,7 @@ package kernels import ( "fmt" + "math" "strconv" "unicode/utf8" @@ -276,7 +277,9 @@ func timeToStringCastExec[T timeIntrinsic](ctx *exec.KernelCtx, batch *exec.Exec defer bldr.Release() bldr.Reserve(int(input.Len)) - bldr.ReserveData(int(input.Len-input.Nulls) * maxFormattedBytes(input.Type)) + if err := reserveFormattedData(bldr, input, maxFormattedBytes(input.Type)); err != nil { + return err + } bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, func(pos int64) { @@ -298,7 +301,9 @@ func numericToStringCastExec[T arrow.IntType | arrow.UintType | arrow.FloatType] defer bldr.Release() bldr.Reserve(int(input.Len)) - bldr.ReserveData(int(input.Len-input.Nulls) * maxFormattedBytes(input.Type)) + if err := reserveFormattedData(bldr, input, maxFormattedBytes(input.Type)); err != nil { + return err + } bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, func(pos int64) { @@ -311,6 +316,25 @@ func numericToStringCastExec[T arrow.IntType | arrow.UintType | arrow.FloatType] } } +// reserveFormattedData pre-reserves bldr's data buffer for at most +// (non-null count) * perValueBytes formatted bytes. Returns arrow.ErrInvalid +// when the product exceeds the int32 single-buffer limit, avoiding a panic +// inside BinaryViewBuilder.ReserveData. See GH-184 review feedback. +func reserveFormattedData(bldr array.StringLikeBuilder, input *exec.ArraySpan, perValueBytes int) error { + if perValueBytes <= 0 { + return nil + } + total := int64(input.Len-input.Nulls) * int64(perValueBytes) + if total > math.MaxInt32 { + return fmt.Errorf("%w: formatted cast payload (%d bytes) exceeds single view data buffer limit (%d bytes)", + arrow.ErrInvalid, total, math.MaxInt32) + } + if total > 0 { + bldr.ReserveData(int(total)) + } + return nil +} + // maxFormattedBytes returns an upper bound on the textual representation // of a single value of dt used by the numeric/temporal-to-string kernels. // The bound is used to pre-reserve the builder's data buffer so StringView @@ -331,7 +355,7 @@ func maxFormattedBytes(dt arrow.DataType) int { case arrow.INT64, arrow.UINT64: return 20 // "-9223372036854775808" case arrow.FLOAT16: - return 12 // covers "-NaN", scientific, and decimal forms + return 16 // empirical max from strconv.FormatFloat(32): "-0.000100016594" is 15 case arrow.FLOAT32: return 25 // scientific notation upper bound case arrow.FLOAT64: @@ -383,7 +407,9 @@ func castTimestampToString(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec. strlen := len(fmtstring) bldr.Reserve(int(input.Len)) - bldr.ReserveData(int(input.Len-input.Nulls) * strlen) + if err := reserveFormattedData(bldr, input, strlen); err != nil { + return err + } bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, func(pos int64) { diff --git a/arrow/compute/internal/kernels/string_casts_test.go b/arrow/compute/internal/kernels/string_casts_test.go new file mode 100644 index 000000000..d08e8f5c8 --- /dev/null +++ b/arrow/compute/internal/kernels/string_casts_test.go @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.18 + +package kernels + +import ( + "errors" + "math" + "testing" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/compute/exec" + "github.com/apache/arrow-go/v18/arrow/memory" +) + +// TestReserveFormattedDataOverflowGuard verifies that reserveFormattedData +// returns arrow.ErrInvalid (rather than panicking inside the builder's +// ReserveData) when (Len - Nulls) * perValueBytes would overflow int32. +// Tested at the kernel level so we don't need to drive >2 GB of input +// through the full executor to trigger the guard. See GH-184. +func TestReserveFormattedDataOverflowGuard(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + bldr := array.NewStringViewBuilder(mem) + defer bldr.Release() + + span := &exec.ArraySpan{ + Len: int64(math.MaxInt32/20) + 1, + Nulls: 0, + } + + err := reserveFormattedData(bldr, span, 20) + if err == nil { + t.Fatal("expected overflow guard to return an error") + } + if !errors.Is(err, arrow.ErrInvalid) { + t.Fatalf("expected arrow.ErrInvalid, got %v", err) + } +} From f78713506d9e4c5a8cfacbe3b94622ccb7716eb8 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 14:29:35 -0400 Subject: [PATCH 06/23] fix(compute): respect destination builder limits in formatted-data guard reserveFormattedData hard-failed any payload above MaxInt32 regardless of the destination builder, rejecting valid large_utf8 casts that LargeStringBuilder could represent via int64 offsets. Split the limit per builder: MaxInt32 for StringBuilder/StringViewBuilder (int32 offsets / single overflow buffer), MaxInt64 for LargeStringBuilder. Also fix a double-release in TestChunkedMultiBufferViewInputCoalesced: ChunkedDatum.Chunks() returns the slice owned by the datum, so per-chunk Release() under defer out.Release() double-freed the arrays. --- arrow/compute/cast_test.go | 1 - .../compute/internal/kernels/string_casts.go | 24 +++++-- .../internal/kernels/string_casts_test.go | 62 +++++++++++++++++++ 3 files changed, 81 insertions(+), 6 deletions(-) diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index 6e242460e..bc389a52e 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -1962,7 +1962,6 @@ func (c *CastSuite) TestChunkedMultiBufferViewInputCoalesced() { chunks := out.(compute.ArrayLikeDatum).Chunks() total := 0 for idx, ch := range chunks { - defer ch.Release() sv := ch.(*array.String) for i := 0; i < sv.Len(); i++ { want := chunkStr1 diff --git a/arrow/compute/internal/kernels/string_casts.go b/arrow/compute/internal/kernels/string_casts.go index 57310d61e..3c60af608 100644 --- a/arrow/compute/internal/kernels/string_casts.go +++ b/arrow/compute/internal/kernels/string_casts.go @@ -318,16 +318,18 @@ func numericToStringCastExec[T arrow.IntType | arrow.UintType | arrow.FloatType] // reserveFormattedData pre-reserves bldr's data buffer for at most // (non-null count) * perValueBytes formatted bytes. Returns arrow.ErrInvalid -// when the product exceeds the int32 single-buffer limit, avoiding a panic -// inside BinaryViewBuilder.ReserveData. See GH-184 review feedback. +// when the product exceeds the destination builder's single-buffer limit, +// avoiding a panic inside BinaryViewBuilder.ReserveData or an int32 offset +// overflow in StringBuilder. See GH-184 review feedback. func reserveFormattedData(bldr array.StringLikeBuilder, input *exec.ArraySpan, perValueBytes int) error { if perValueBytes <= 0 { return nil } total := int64(input.Len-input.Nulls) * int64(perValueBytes) - if total > math.MaxInt32 { - return fmt.Errorf("%w: formatted cast payload (%d bytes) exceeds single view data buffer limit (%d bytes)", - arrow.ErrInvalid, total, math.MaxInt32) + limit := formattedDataLimit(bldr) + if total > limit { + return fmt.Errorf("%w: formatted cast payload (%d bytes) exceeds single data buffer limit (%d bytes) for destination builder", + arrow.ErrInvalid, total, limit) } if total > 0 { bldr.ReserveData(int(total)) @@ -335,6 +337,18 @@ func reserveFormattedData(bldr array.StringLikeBuilder, input *exec.ArraySpan, p return nil } +// formattedDataLimit returns the largest contiguous data payload (in bytes) +// that the destination builder can hold in a single buffer: +// - *array.StringBuilder (utf8): MaxInt32 (int32 offsets) +// - *array.StringViewBuilder (string_view): MaxInt32 (single overflow buffer) +// - *array.LargeStringBuilder (large_utf8): MaxInt64 (int64 offsets) +func formattedDataLimit(bldr array.StringLikeBuilder) int64 { + if _, ok := bldr.(*array.LargeStringBuilder); ok { + return math.MaxInt64 + } + return math.MaxInt32 +} + // maxFormattedBytes returns an upper bound on the textual representation // of a single value of dt used by the numeric/temporal-to-string kernels. // The bound is used to pre-reserve the builder's data buffer so StringView diff --git a/arrow/compute/internal/kernels/string_casts_test.go b/arrow/compute/internal/kernels/string_casts_test.go index d08e8f5c8..fe03ad81e 100644 --- a/arrow/compute/internal/kernels/string_casts_test.go +++ b/arrow/compute/internal/kernels/string_casts_test.go @@ -54,3 +54,65 @@ func TestReserveFormattedDataOverflowGuard(t *testing.T) { t.Fatalf("expected arrow.ErrInvalid, got %v", err) } } + +// TestFormattedDataLimitPerBuilder verifies the per-destination single-buffer +// limit used by reserveFormattedData: utf8 and string_view are capped at +// MaxInt32 (int32 offsets / single overflow buffer), while large_utf8 is +// capped at MaxInt64 (int64 offsets). Regression test for GH-184: the +// previous hardcoded MaxInt32 guard incorrectly rejected valid large_utf8 +// casts above 2 GiB. +func TestFormattedDataLimitPerBuilder(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + utf8Bldr := array.NewStringBuilder(mem) + defer utf8Bldr.Release() + viewBldr := array.NewStringViewBuilder(mem) + defer viewBldr.Release() + largeBldr := array.NewLargeStringBuilder(mem) + defer largeBldr.Release() + + cases := []struct { + name string + bldr array.StringLikeBuilder + want int64 + }{ + {"utf8", utf8Bldr, math.MaxInt32}, + {"string_view", viewBldr, math.MaxInt32}, + {"large_utf8", largeBldr, math.MaxInt64}, + } + for _, tc := range cases { + if got := formattedDataLimit(tc.bldr); got != tc.want { + t.Errorf("formattedDataLimit(%s): want %d, got %d", tc.name, tc.want, got) + } + } +} + +// TestReserveFormattedDataAllowsLargeUtf8AboveInt32 is the regression test +// for the GH-184 second-round review finding: reserveFormattedData must not +// reject totals above MaxInt32 when the destination is large_utf8 +// (LargeStringBuilder), because int64 offsets can represent them. We assert +// the destination-specific limit via formattedDataLimit rather than driving +// a multi-gigabyte allocation through the full kernel path, and separately +// confirm the guard still fires for string_view at the int32 boundary. +func TestReserveFormattedDataAllowsLargeUtf8AboveInt32(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + largeBldr := array.NewLargeStringBuilder(mem) + defer largeBldr.Release() + + if got := formattedDataLimit(largeBldr); got <= math.MaxInt32 { + t.Fatalf("large_utf8 destination limit must exceed MaxInt32, got %d", got) + } + + viewBldr := array.NewStringViewBuilder(mem) + defer viewBldr.Release() + span := &exec.ArraySpan{ + Len: int64(math.MaxInt32/20) + 1, + Nulls: 0, + } + if err := reserveFormattedData(viewBldr, span, 20); !errors.Is(err, arrow.ErrInvalid) { + t.Fatalf("string_view guard should reject total > MaxInt32, got %v", err) + } +} From 27e6af85f1fc881205943dff205c813aca957825 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 15:53:40 -0400 Subject: [PATCH 07/23] fix(compute): clamp formatted-data limit to platform int on 32-bit builds formattedDataLimit returned math.MaxInt64 for LargeStringBuilder, but reserveFormattedData later casts total to int for ReserveData. On 32-bit builds where int is 32-bit, any total above math.MaxInt32 would overflow the conversion, bypassing the guard and producing a negative or wrapped value inside BinaryBuilder.ReserveData. Clamp the returned limit to int64(math.MaxInt) so it always fits the platform's int. On 64-bit builds this is a no-op (MaxInt == MaxInt64); on 32-bit it caps large_utf8 at MaxInt32. Added TestFormattedDataLimitFitsInPlatformInt regression test that holds on both platforms by construction. --- .../compute/internal/kernels/string_casts.go | 15 +++++-- .../internal/kernels/string_casts_test.go | 41 +++++++++++++++++-- 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/arrow/compute/internal/kernels/string_casts.go b/arrow/compute/internal/kernels/string_casts.go index 3c60af608..49aebf7dd 100644 --- a/arrow/compute/internal/kernels/string_casts.go +++ b/arrow/compute/internal/kernels/string_casts.go @@ -338,15 +338,22 @@ func reserveFormattedData(bldr array.StringLikeBuilder, input *exec.ArraySpan, p } // formattedDataLimit returns the largest contiguous data payload (in bytes) -// that the destination builder can hold in a single buffer: +// that the destination builder can hold in a single buffer, clamped so that +// reserveFormattedData's int(total) conversion cannot overflow: // - *array.StringBuilder (utf8): MaxInt32 (int32 offsets) // - *array.StringViewBuilder (string_view): MaxInt32 (single overflow buffer) -// - *array.LargeStringBuilder (large_utf8): MaxInt64 (int64 offsets) +// - *array.LargeStringBuilder (large_utf8): MaxInt64 (int64 offsets), +// clamped to int64(math.MaxInt) so int(total) fits on 32-bit builds +// where int is 32-bit. func formattedDataLimit(bldr array.StringLikeBuilder) int64 { + limit := int64(math.MaxInt32) if _, ok := bldr.(*array.LargeStringBuilder); ok { - return math.MaxInt64 + limit = math.MaxInt64 } - return math.MaxInt32 + if limit > int64(math.MaxInt) { + limit = int64(math.MaxInt) + } + return limit } // maxFormattedBytes returns an upper bound on the textual representation diff --git a/arrow/compute/internal/kernels/string_casts_test.go b/arrow/compute/internal/kernels/string_casts_test.go index fe03ad81e..38430d27c 100644 --- a/arrow/compute/internal/kernels/string_casts_test.go +++ b/arrow/compute/internal/kernels/string_casts_test.go @@ -58,9 +58,11 @@ func TestReserveFormattedDataOverflowGuard(t *testing.T) { // TestFormattedDataLimitPerBuilder verifies the per-destination single-buffer // limit used by reserveFormattedData: utf8 and string_view are capped at // MaxInt32 (int32 offsets / single overflow buffer), while large_utf8 is -// capped at MaxInt64 (int64 offsets). Regression test for GH-184: the -// previous hardcoded MaxInt32 guard incorrectly rejected valid large_utf8 -// casts above 2 GiB. +// capped at min(MaxInt64, platform int max). On 64-bit builds the large_utf8 +// limit is MaxInt64; on 32-bit builds it is clamped to MaxInt32 so the +// int(total) conversion in reserveFormattedData never overflows. Regression +// test for GH-184: the original MaxInt32 guard incorrectly rejected valid +// large_utf8 casts above 2 GiB on 64-bit platforms. func TestFormattedDataLimitPerBuilder(t *testing.T) { mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) defer mem.AssertSize(t, 0) @@ -72,6 +74,11 @@ func TestFormattedDataLimitPerBuilder(t *testing.T) { largeBldr := array.NewLargeStringBuilder(mem) defer largeBldr.Release() + largeWant := int64(math.MaxInt64) + if largeWant > int64(math.MaxInt) { + largeWant = int64(math.MaxInt) + } + cases := []struct { name string bldr array.StringLikeBuilder @@ -79,7 +86,7 @@ func TestFormattedDataLimitPerBuilder(t *testing.T) { }{ {"utf8", utf8Bldr, math.MaxInt32}, {"string_view", viewBldr, math.MaxInt32}, - {"large_utf8", largeBldr, math.MaxInt64}, + {"large_utf8", largeBldr, largeWant}, } for _, tc := range cases { if got := formattedDataLimit(tc.bldr); got != tc.want { @@ -88,6 +95,32 @@ func TestFormattedDataLimitPerBuilder(t *testing.T) { } } +// TestFormattedDataLimitFitsInPlatformInt is the regression test for the +// GH-184 third-round review finding: reserveFormattedData passes int(total) +// to ReserveData, so the returned limit must fit in the platform int to +// prevent overflow on 32-bit builds where int is 32-bit. The assertion +// holds on both 32- and 64-bit builds by construction. +func TestFormattedDataLimitFitsInPlatformInt(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + bldrs := []struct { + name string + bldr array.StringLikeBuilder + }{ + {"utf8", array.NewStringBuilder(mem)}, + {"string_view", array.NewStringViewBuilder(mem)}, + {"large_utf8", array.NewLargeStringBuilder(mem)}, + } + for _, tc := range bldrs { + defer tc.bldr.Release() + if limit := formattedDataLimit(tc.bldr); limit > int64(math.MaxInt) { + t.Errorf("formattedDataLimit(%s) = %d exceeds platform math.MaxInt %d", + tc.name, limit, int64(math.MaxInt)) + } + } +} + // TestReserveFormattedDataAllowsLargeUtf8AboveInt32 is the regression test // for the GH-184 second-round review finding: reserveFormattedData must not // reject totals above MaxInt32 when the destination is large_utf8 From fee033f55380c0793b901c98566f2837d2642c45 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 16:06:29 -0400 Subject: [PATCH 08/23] test(compute): make large_utf8 limit assertion platform-aware TestReserveFormattedDataAllowsLargeUtf8AboveInt32 previously asserted formattedDataLimit(large_utf8) > MaxInt32 unconditionally. After the 32-bit clamp landed, formattedDataLimit clamps the large_utf8 limit to math.MaxInt32 on 32-bit builds, so the test would fail on the very platform the clamp is meant to protect. Split the assertion: on 64-bit require > MaxInt32 (MaxInt64 exposes the int64 offset capacity); on 32-bit require == MaxInt32 (the clamp-enforced ceiling). --- .../internal/kernels/string_casts_test.go | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/arrow/compute/internal/kernels/string_casts_test.go b/arrow/compute/internal/kernels/string_casts_test.go index 38430d27c..38783dbf7 100644 --- a/arrow/compute/internal/kernels/string_casts_test.go +++ b/arrow/compute/internal/kernels/string_casts_test.go @@ -121,13 +121,16 @@ func TestFormattedDataLimitFitsInPlatformInt(t *testing.T) { } } -// TestReserveFormattedDataAllowsLargeUtf8AboveInt32 is the regression test -// for the GH-184 second-round review finding: reserveFormattedData must not -// reject totals above MaxInt32 when the destination is large_utf8 -// (LargeStringBuilder), because int64 offsets can represent them. We assert -// the destination-specific limit via formattedDataLimit rather than driving -// a multi-gigabyte allocation through the full kernel path, and separately -// confirm the guard still fires for string_view at the int32 boundary. +// TestReserveFormattedDataAllowsLargeUtf8AboveInt32 is a platform-aware +// regression test for the GH-184 second-round review finding: on 64-bit +// builds, reserveFormattedData must not reject large_utf8 totals above +// MaxInt32 (int64 offsets can represent them); on 32-bit builds, the +// destination-specific limit is instead clamped to math.MaxInt32 so +// int(total) cannot overflow in reserveFormattedData. We assert the +// destination-specific limit via formattedDataLimit rather than driving +// a multi-gigabyte allocation through the full kernel path, and +// separately confirm the guard still fires for string_view at the +// int32 boundary. func TestReserveFormattedDataAllowsLargeUtf8AboveInt32(t *testing.T) { mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) defer mem.AssertSize(t, 0) @@ -135,8 +138,15 @@ func TestReserveFormattedDataAllowsLargeUtf8AboveInt32(t *testing.T) { largeBldr := array.NewLargeStringBuilder(mem) defer largeBldr.Release() - if got := formattedDataLimit(largeBldr); got <= math.MaxInt32 { - t.Fatalf("large_utf8 destination limit must exceed MaxInt32, got %d", got) + got := formattedDataLimit(largeBldr) + if int64(math.MaxInt) > math.MaxInt32 { + if got <= math.MaxInt32 { + t.Fatalf("64-bit: large_utf8 destination limit must exceed MaxInt32, got %d", got) + } + } else { + if got != math.MaxInt32 { + t.Fatalf("32-bit: large_utf8 destination limit must clamp to MaxInt32, got %d", got) + } } viewBldr := array.NewStringViewBuilder(mem) From 41e5abf1cd493fcbfa5673ea76276d87a155883c Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 16:30:26 -0400 Subject: [PATCH 09/23] fix(compute): unpack view-typed dictionaries without TakeArray unpackDictionary previously expanded every dictionary through TakeArray, but array_take registers no kernel for binary_view/string_view, so casts whose input was a dictionary with view-typed values failed with 'function array_take has no kernel matching input types (string_view, int32)'. Detect view-typed dictionary values and materialize them directly with StringViewBuilder/BinaryViewBuilder, preserving index-level nulls. Regression test TestCasts/TestViewDictionaryUnpackCast covers string_view and binary_view dictionaries through cast to utf8, string_view, binary, and with nulls. array_take itself still lacks view-type kernels; that is tracked separately. --- arrow/compute/cast.go | 50 +++++++++++++++++- arrow/compute/cast_test.go | 104 +++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+), 1 deletion(-) diff --git a/arrow/compute/cast.go b/arrow/compute/cast.go index a3ee7fbcc..73da8102b 100644 --- a/arrow/compute/cast.go +++ b/arrow/compute/cast.go @@ -314,7 +314,18 @@ func unpackDictionary(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecR arrow.ErrInvalid, toType, dictType) } - unpacked, err := TakeArray(ctx.Ctx, dictArr.Dictionary(), dictArr.Indices()) + var ( + unpacked arrow.Array + err error + ) + switch dictArr.Dictionary().DataType().ID() { + case arrow.STRING_VIEW, arrow.BINARY_VIEW: + // array_take has no view kernel, so unpack view-typed dictionaries + // directly into a fresh view array instead of going through TakeArray. + unpacked, err = unpackViewDictionary(exec.GetAllocator(ctx.Ctx), dictArr) + default: + unpacked, err = TakeArray(ctx.Ctx, dictArr.Dictionary(), dictArr.Indices()) + } if err != nil { return err } @@ -332,6 +343,43 @@ func unpackDictionary(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecR return nil } +// unpackViewDictionary materializes a dictionary whose values are a view +// type (string_view or binary_view) into a flat view array of the same +// type, preserving per-element nulls from the dictionary indices. This +// exists because array_take has no view kernel; the cast meta function +// still needs to expand dictionaries as part of DICTIONARY -> X casts. +func unpackViewDictionary(mem memory.Allocator, dictArr *array.Dictionary) (arrow.Array, error) { + switch vals := dictArr.Dictionary().(type) { + case *array.StringView: + bldr := array.NewStringViewBuilder(mem) + defer bldr.Release() + bldr.Reserve(dictArr.Len()) + for i := 0; i < dictArr.Len(); i++ { + if dictArr.IsNull(i) { + bldr.AppendNull() + continue + } + bldr.Append(vals.Value(dictArr.GetValueIndex(i))) + } + return bldr.NewArray(), nil + case *array.BinaryView: + bldr := array.NewBinaryViewBuilder(mem) + defer bldr.Release() + bldr.Reserve(dictArr.Len()) + for i := 0; i < dictArr.Len(); i++ { + if dictArr.IsNull(i) { + bldr.AppendNull() + continue + } + bldr.Append(vals.Value(dictArr.GetValueIndex(i))) + } + return bldr.NewArray(), nil + default: + return nil, fmt.Errorf("%w: unpackViewDictionary: expected view-typed dictionary values, got %s", + arrow.ErrInvalid, vals.DataType()) + } +} + func CastFromExtension(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { opts := ctx.State.(kernels.CastState) diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index bc389a52e..8721b147c 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -1975,6 +1975,110 @@ func (c *CastSuite) TestChunkedMultiBufferViewInputCoalesced() { c.Equal(count*2, total) } +// TestViewDictionaryUnpackCast is a regression test for the GH-184 +// fifth-round review finding: a cast whose input is a dictionary whose +// values are string_view or binary_view went through unpackDictionary's +// TakeArray path, which has no view kernel and errored out. Dictionary +// unpacking now handles view-typed values directly. Covers dict +// -> utf8, dict -> string_view (dict removal), dict +// -> binary, and null/empty preservation. +func (c *CastSuite) TestViewDictionaryUnpackCast() { + buildDict := func(values arrow.Array, indices []int32, valid []bool) *array.Dictionary { + ibldr := array.NewInt32Builder(c.mem) + defer ibldr.Release() + ibldr.AppendValues(indices, valid) + idx := ibldr.NewArray() + defer idx.Release() + + dictType := &arrow.DictionaryType{ + IndexType: arrow.PrimitiveTypes.Int32, + ValueType: values.DataType(), + } + d := array.NewData(dictType, idx.Len(), idx.Data().Buffers(), nil, + idx.NullN(), 0) + d.SetDictionary(values.Data()) + da := array.NewDictionaryData(d) + d.Release() + return da + } + + c.Run("string_view dict to utf8", func() { + vals := c.buildStringViewArray([]string{"foo", "bar", "baz"}, nil) + defer vals.Release() + dict := buildDict(vals, []int32{0, 1, 0, 2}, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.Require().NoError(err) + defer out.Release() + + sv := out.(*array.String) + c.Equal(4, sv.Len()) + c.Equal("foo", sv.Value(0)) + c.Equal("bar", sv.Value(1)) + c.Equal("foo", sv.Value(2)) + c.Equal("baz", sv.Value(3)) + }) + + c.Run("string_view dict to string_view", func() { + vals := c.buildStringViewArray([]string{"alpha", "beta"}, nil) + defer vals.Release() + dict := buildDict(vals, []int32{1, 0, 1}, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer out.Release() + + sv := out.(*array.StringView) + c.Equal(3, sv.Len()) + c.Equal("beta", sv.Value(0)) + c.Equal("alpha", sv.Value(1)) + c.Equal("beta", sv.Value(2)) + }) + + c.Run("binary_view dict to binary", func() { + vals := c.buildBinaryViewArray([][]byte{[]byte("\x01\x02"), []byte("\x03")}) + defer vals.Release() + dict := buildDict(vals, []int32{0, 1, 0}, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.Binary)) + c.Require().NoError(err) + defer out.Release() + + bv := out.(*array.Binary) + c.Equal(3, bv.Len()) + c.Equal([]byte("\x01\x02"), bv.Value(0)) + c.Equal([]byte("\x03"), bv.Value(1)) + c.Equal([]byte("\x01\x02"), bv.Value(2)) + }) + + c.Run("string_view dict with nulls", func() { + vals := c.buildStringViewArray([]string{"x", "y"}, nil) + defer vals.Release() + dict := buildDict(vals, []int32{0, 0, 1}, []bool{true, false, true}) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.Require().NoError(err) + defer out.Release() + + sv := out.(*array.String) + c.Equal(3, sv.Len()) + c.Equal(1, sv.NullN()) + c.False(sv.IsNull(0)) + c.True(sv.IsNull(1)) + c.False(sv.IsNull(2)) + c.Equal("x", sv.Value(0)) + c.Equal("y", sv.Value(2)) + }) +} + // checkCastArrayOnly performs a cast and compares the resulting array against // an expected array built from JSON. It deliberately avoids the per-element // scalar iteration path used by checkCast, which relies on scalar.GetScalar From 8dbf269a0ecb7f8ea8daf4db4c836e630a70556b Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 16:33:33 -0400 Subject: [PATCH 10/23] fix(compute): preserve value-level nulls when unpacking view dictionaries unpackViewDictionary only checked index-level nulls (dictArr.IsNull(i)), so a dictionary whose values array contained a null entry would materialize that slot as an empty string/byte slice instead of a null. Also check vals.IsNull(idx) for each referenced dictionary slot, which matches TakeArray's behavior on the non-view path. Extended the existing regression test with a 'null in values array' subtest. --- arrow/compute/cast.go | 21 ++++++++++++++++----- arrow/compute/cast_test.go | 20 ++++++++++++++++++++ 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/arrow/compute/cast.go b/arrow/compute/cast.go index 73da8102b..b5905277a 100644 --- a/arrow/compute/cast.go +++ b/arrow/compute/cast.go @@ -345,9 +345,10 @@ func unpackDictionary(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecR // unpackViewDictionary materializes a dictionary whose values are a view // type (string_view or binary_view) into a flat view array of the same -// type, preserving per-element nulls from the dictionary indices. This -// exists because array_take has no view kernel; the cast meta function -// still needs to expand dictionaries as part of DICTIONARY -> X casts. +// type, preserving per-element nulls from the dictionary indices and +// per-slot nulls from the dictionary values themselves. This exists +// because array_take has no view kernel; the cast meta function still +// needs to expand dictionaries as part of DICTIONARY -> X casts. func unpackViewDictionary(mem memory.Allocator, dictArr *array.Dictionary) (arrow.Array, error) { switch vals := dictArr.Dictionary().(type) { case *array.StringView: @@ -359,7 +360,12 @@ func unpackViewDictionary(mem memory.Allocator, dictArr *array.Dictionary) (arro bldr.AppendNull() continue } - bldr.Append(vals.Value(dictArr.GetValueIndex(i))) + idx := dictArr.GetValueIndex(i) + if vals.IsNull(idx) { + bldr.AppendNull() + continue + } + bldr.Append(vals.Value(idx)) } return bldr.NewArray(), nil case *array.BinaryView: @@ -371,7 +377,12 @@ func unpackViewDictionary(mem memory.Allocator, dictArr *array.Dictionary) (arro bldr.AppendNull() continue } - bldr.Append(vals.Value(dictArr.GetValueIndex(i))) + idx := dictArr.GetValueIndex(i) + if vals.IsNull(idx) { + bldr.AppendNull() + continue + } + bldr.Append(vals.Value(idx)) } return bldr.NewArray(), nil default: diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index 8721b147c..ac7712c80 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -2077,6 +2077,26 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { c.Equal("x", sv.Value(0)) c.Equal("y", sv.Value(2)) }) + + c.Run("string_view dict with null in values array", func() { + vals := c.buildStringViewArray([]string{"a", "", "c"}, []bool{true, false, true}) + defer vals.Release() + dict := buildDict(vals, []int32{0, 1, 2, 1}, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.Require().NoError(err) + defer out.Release() + + sv := out.(*array.String) + c.Equal(4, sv.Len()) + c.Equal(2, sv.NullN()) + c.Equal("a", sv.Value(0)) + c.True(sv.IsNull(1)) + c.Equal("c", sv.Value(2)) + c.True(sv.IsNull(3)) + }) } // checkCastArrayOnly performs a cast and compares the resulting array against From a5e93fa735b8d3b0711a44b5791e32ed462060bd Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 16:44:12 -0400 Subject: [PATCH 11/23] fix(compute): keep view outputs in a single buffer for dict and bool casts Two remaining view-producing paths could emit multi-buffer view arrays and blow past exec.ArraySpan's fixed [3]BufferSpan. unpackViewDictionary now pre-scans the dictionary to sum non-inline payload, rejects totals above MaxInt32, and calls ReserveData up-front so the unpacked view array lives in a single overflow buffer even on the dict -> view type-equal fast path. boolToStringCastExec now matches the numeric and temporal kernels: it Reserves header slots and pre-reserves up to 5 bytes per row (the longer of 'true'/'false') via reserveFormattedData. Added a large-non-inline-payload regression subtest to TestViewDictionaryUnpackCast and TestBoolToStringViewStaysSingleBuffer asserting <= 3 output buffers. --- arrow/compute/cast.go | 44 ++++++++++++- arrow/compute/cast_test.go | 62 +++++++++++++++++++ .../compute/internal/kernels/string_casts.go | 8 +++ 3 files changed, 113 insertions(+), 1 deletion(-) diff --git a/arrow/compute/cast.go b/arrow/compute/cast.go index b5905277a..6e84fb41e 100644 --- a/arrow/compute/cast.go +++ b/arrow/compute/cast.go @@ -346,15 +346,25 @@ func unpackDictionary(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecR // unpackViewDictionary materializes a dictionary whose values are a view // type (string_view or binary_view) into a flat view array of the same // type, preserving per-element nulls from the dictionary indices and -// per-slot nulls from the dictionary values themselves. This exists +// per-slot nulls from the dictionary values themselves. Out-of-line +// payload is pre-reserved on the builder so the resulting array lives +// in a single overflow data buffer; exec.ArraySpan's fixed [3]BufferSpan +// cannot carry view arrays that span multiple data buffers. This exists // because array_take has no view kernel; the cast meta function still // needs to expand dictionaries as part of DICTIONARY -> X casts. func unpackViewDictionary(mem memory.Allocator, dictArr *array.Dictionary) (arrow.Array, error) { switch vals := dictArr.Dictionary().(type) { case *array.StringView: + outOfLine, err := sumViewDictionaryOutOfLine(dictArr, vals.IsNull, vals.ValueLen) + if err != nil { + return nil, err + } bldr := array.NewStringViewBuilder(mem) defer bldr.Release() bldr.Reserve(dictArr.Len()) + if outOfLine > 0 { + bldr.ReserveData(int(outOfLine)) + } for i := 0; i < dictArr.Len(); i++ { if dictArr.IsNull(i) { bldr.AppendNull() @@ -369,9 +379,16 @@ func unpackViewDictionary(mem memory.Allocator, dictArr *array.Dictionary) (arro } return bldr.NewArray(), nil case *array.BinaryView: + outOfLine, err := sumViewDictionaryOutOfLine(dictArr, vals.IsNull, vals.ValueLen) + if err != nil { + return nil, err + } bldr := array.NewBinaryViewBuilder(mem) defer bldr.Release() bldr.Reserve(dictArr.Len()) + if outOfLine > 0 { + bldr.ReserveData(int(outOfLine)) + } for i := 0; i < dictArr.Len(); i++ { if dictArr.IsNull(i) { bldr.AppendNull() @@ -391,6 +408,31 @@ func unpackViewDictionary(mem memory.Allocator, dictArr *array.Dictionary) (arro } } +// sumViewDictionaryOutOfLine returns the total non-inline payload (in bytes) +// that will be materialized when dictArr is unpacked against vals, returning +// arrow.ErrInvalid if the total exceeds the view single-data-buffer limit. +func sumViewDictionaryOutOfLine(dictArr *array.Dictionary, valIsNull func(int) bool, valLen func(int) int) (int64, error) { + var total int64 + for i := 0; i < dictArr.Len(); i++ { + if dictArr.IsNull(i) { + continue + } + idx := dictArr.GetValueIndex(i) + if valIsNull(idx) { + continue + } + vlen := valLen(idx) + if !arrow.IsViewInline(vlen) { + total += int64(vlen) + if total > math.MaxInt32 { + return 0, fmt.Errorf("%w: view dictionary out-of-line payload (%d bytes) exceeds single-buffer limit (%d bytes)", + arrow.ErrInvalid, total, math.MaxInt32) + } + } + } + return total, nil +} + func CastFromExtension(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { opts := ctx.State.(kernels.CastState) diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index ac7712c80..878ae89ca 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -2097,6 +2097,68 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { c.Equal("c", sv.Value(2)) c.True(sv.IsNull(3)) }) + + c.Run("string_view dict to string_view with large non-inline payload", func() { + const ( + valBytes = 200 + count = 500 + ) + a := strings.Repeat("a", valBytes) + b := strings.Repeat("b", valBytes) + vals := c.buildStringViewArray([]string{a, b}, nil) + defer vals.Release() + + indices := make([]int32, count) + for i := range indices { + indices[i] = int32(i & 1) + } + dict := buildDict(vals, indices, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer out.Release() + + c.LessOrEqualf(len(out.Data().Buffers()), 3, + "expected <=3 buffers for unpacked view dictionary, got %d", + len(out.Data().Buffers())) + + sv := out.(*array.StringView) + c.Equal(count, sv.Len()) + c.Equal(a, sv.Value(0)) + c.Equal(b, sv.Value(1)) + c.Equal(a, sv.Value(count-2)) + c.Equal(b, sv.Value(count-1)) + }) +} + +// TestBoolToStringViewStaysSingleBuffer is a regression test for the +// GH-184 sixth-round review finding: boolToStringCastExec joined the +// numeric/temporal-to-string kernels without the matching pre-reservation +// path, so large bool -> string_view casts could allocate multiple +// overflow data buffers and exceed exec.ArraySpan's fixed [3]BufferSpan. +// The kernel now reserves up to 5 bytes per row ("false") on the builder, +// so the output always has <= 3 buffers regardless of input length. +func (c *CastSuite) TestBoolToStringViewStaysSingleBuffer() { + const count = 1 << 14 + bldr := array.NewBooleanBuilder(c.mem) + defer bldr.Release() + for i := 0; i < count; i++ { + bldr.Append(i%2 == 0) + } + in := bldr.NewArray() + defer in.Release() + + out, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer out.Release() + + c.LessOrEqualf(len(out.Data().Buffers()), 3, + "expected <=3 buffers for bool->string_view, got %d", + len(out.Data().Buffers())) + c.Equal(count, out.Len()) } // checkCastArrayOnly performs a cast and compares the resulting array against diff --git a/arrow/compute/internal/kernels/string_casts.go b/arrow/compute/internal/kernels/string_casts.go index 49aebf7dd..585c3d11f 100644 --- a/arrow/compute/internal/kernels/string_casts.go +++ b/arrow/compute/internal/kernels/string_casts.go @@ -252,6 +252,14 @@ func boolToStringCastExec(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.E ) defer bldr.Release() + bldr.Reserve(int(input.Len)) + // 5-byte upper bound covers "false"; keeps string_view output inside a + // single overflow data buffer so exec.ArraySpan's fixed [3]BufferSpan + // can carry it. + if err := reserveFormattedData(bldr, input, 5); err != nil { + return err + } + bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, func(pos int64) { bldr.Append(strconv.FormatBool(bitutil.BitIsSet(input.Buffers[1].Buf, int(pos)))) From 9ad9caee2a50c5c3a6c880f60d59cc852c4cdf48 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 16:59:57 -0400 Subject: [PATCH 12/23] fix(compute): use exact formatted payload for bool casts; add binary_view dict test Two follow-ups from the GH-184 review rotation: 1) boolToStringCastExec previously reserved 5*nonNull bytes unconditionally, which treated every row as 'false'. For large all-true casts the true payload is 4*nonNull, and the looser bound would reject valid inputs near MaxInt32. Count exact bytes by walking the data bitmap on non-null positions ('true'=4, 'false'=5) and pre-reserve exactly that total. Extracted reserveFormattedDataExact(bldr, total) so the per-value and exact paths share the single-buffer limit check. 2) TestViewDictionaryUnpackCast only exercised the string_view fast path; added a mirror binary_view-dict-to-binary_view subtest with 100KB of non-inline payload, asserting <=3 output buffers and value correctness. Extended TestBoolToStringViewStaysSingleBuffer with all-true, all-false, and alternating cases that assert the output data buffer equals the exact formatted payload size. --- arrow/compute/cast_test.go | 101 +++++++++++++++--- .../compute/internal/kernels/string_casts.go | 28 ++++- 2 files changed, 109 insertions(+), 20 deletions(-) diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index 878ae89ca..9e7b474c1 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -19,6 +19,7 @@ package compute_test import ( + "bytes" "context" "fmt" "math" @@ -2131,6 +2132,40 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { c.Equal(a, sv.Value(count-2)) c.Equal(b, sv.Value(count-1)) }) + + c.Run("binary_view dict to binary_view with large non-inline payload", func() { + const ( + valBytes = 200 + count = 500 + ) + a := bytes.Repeat([]byte{0xAA}, valBytes) + b := bytes.Repeat([]byte{0xBB}, valBytes) + vals := c.buildBinaryViewArray([][]byte{a, b}) + defer vals.Release() + + indices := make([]int32, count) + for i := range indices { + indices[i] = int32(i & 1) + } + dict := buildDict(vals, indices, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.BinaryView)) + c.Require().NoError(err) + defer out.Release() + + c.LessOrEqualf(len(out.Data().Buffers()), 3, + "expected <=3 buffers for unpacked binary_view dictionary, got %d", + len(out.Data().Buffers())) + + bv := out.(*array.BinaryView) + c.Equal(count, bv.Len()) + c.Equal(a, bv.Value(0)) + c.Equal(b, bv.Value(1)) + c.Equal(a, bv.Value(count-2)) + c.Equal(b, bv.Value(count-1)) + }) } // TestBoolToStringViewStaysSingleBuffer is a regression test for the @@ -2138,27 +2173,61 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { // numeric/temporal-to-string kernels without the matching pre-reservation // path, so large bool -> string_view casts could allocate multiple // overflow data buffers and exceed exec.ArraySpan's fixed [3]BufferSpan. -// The kernel now reserves up to 5 bytes per row ("false") on the builder, -// so the output always has <= 3 buffers regardless of input length. +// The kernel now counts exact formatted bytes ("true"=4, "false"=5) and +// pre-reserves that amount on the builder, so the output always has +// <= 3 buffers and all-true casts near the int32 limit are not +// over-rejected relative to the tight bound. func (c *CastSuite) TestBoolToStringViewStaysSingleBuffer() { const count = 1 << 14 - bldr := array.NewBooleanBuilder(c.mem) - defer bldr.Release() - for i := 0; i < count; i++ { - bldr.Append(i%2 == 0) + + check := func(name string, pick func(i int) bool, wantBytes int64) { + c.Run(name, func() { + bldr := array.NewBooleanBuilder(c.mem) + defer bldr.Release() + for i := 0; i < count; i++ { + bldr.Append(pick(i)) + } + in := bldr.NewArray() + defer in.Release() + + out, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.Require().NoError(err) + defer out.Release() + + c.Equal(count, out.Len()) + // Data buffer (index 2 for string) holds exactly the formatted + // bytes, validating the exact-count payload logic. + dataBuf := out.Data().Buffers()[2] + c.Require().NotNil(dataBuf) + c.Equalf(wantBytes, int64(dataBuf.Len()), + "%s: data buffer size should equal exact formatted payload", + name) + }) } - in := bldr.NewArray() - defer in.Release() + check("all true", func(int) bool { return true }, 4*count) + check("all false", func(int) bool { return false }, 5*count) + check("alternating", func(i int) bool { return i%2 == 0 }, 4*(count/2)+5*(count/2)) - out, err := compute.CastArray(context.Background(), in, - compute.SafeCastOptions(arrow.BinaryTypes.StringView)) - c.Require().NoError(err) - defer out.Release() + c.Run("string_view stays within 3 buffers", func() { + bldr := array.NewBooleanBuilder(c.mem) + defer bldr.Release() + for i := 0; i < count; i++ { + bldr.Append(i%2 == 0) + } + in := bldr.NewArray() + defer in.Release() - c.LessOrEqualf(len(out.Data().Buffers()), 3, - "expected <=3 buffers for bool->string_view, got %d", - len(out.Data().Buffers())) - c.Equal(count, out.Len()) + out, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer out.Release() + + c.LessOrEqualf(len(out.Data().Buffers()), 3, + "expected <=3 buffers for bool->string_view, got %d", + len(out.Data().Buffers())) + c.Equal(count, out.Len()) + }) } // checkCastArrayOnly performs a cast and compares the resulting array against diff --git a/arrow/compute/internal/kernels/string_casts.go b/arrow/compute/internal/kernels/string_casts.go index 585c3d11f..5c58d6ff9 100644 --- a/arrow/compute/internal/kernels/string_casts.go +++ b/arrow/compute/internal/kernels/string_casts.go @@ -252,11 +252,22 @@ func boolToStringCastExec(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.E ) defer bldr.Release() + // Count exact formatted payload: "true" is 4 bytes, "false" is 5. + // Only non-null rows contribute bytes. This is tighter than a 5*nonNull + // upper bound and avoids rejecting valid large all-true casts near the + // MaxInt32 destination-buffer limit. + var total int64 + bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, + func(pos int64) { + if bitutil.BitIsSet(input.Buffers[1].Buf, int(pos)) { + total += 4 + } else { + total += 5 + } + }, func() {}) + bldr.Reserve(int(input.Len)) - // 5-byte upper bound covers "false"; keeps string_view output inside a - // single overflow data buffer so exec.ArraySpan's fixed [3]BufferSpan - // can carry it. - if err := reserveFormattedData(bldr, input, 5); err != nil { + if err := reserveFormattedDataExact(bldr, total); err != nil { return err } @@ -334,6 +345,15 @@ func reserveFormattedData(bldr array.StringLikeBuilder, input *exec.ArraySpan, p return nil } total := int64(input.Len-input.Nulls) * int64(perValueBytes) + return reserveFormattedDataExact(bldr, total) +} + +// reserveFormattedDataExact reserves exactly total bytes of out-of-line +// payload on bldr, returning arrow.ErrInvalid if total exceeds the +// destination builder's single-buffer limit. Use this when the kernel can +// compute the exact payload in advance; use reserveFormattedData with a +// per-value upper bound otherwise. +func reserveFormattedDataExact(bldr array.StringLikeBuilder, total int64) error { limit := formattedDataLimit(bldr) if total > limit { return fmt.Errorf("%w: formatted cast payload (%d bytes) exceeds single data buffer limit (%d bytes) for destination builder", From b8d9b963ee7bded5785793aca608a2e2fa08a087 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 17:01:40 -0400 Subject: [PATCH 13/23] test(compute): cover binary_view dict value-level null preservation Job 824 flagged that TestViewDictionaryUnpackCast/string_view_dict_with_null_in_values_array only exercised the string_view branch of unpackViewDictionary; the binary_view branch had the same vals.IsNull(idx) logic but no test, so it could regress independently. Added a mirror subtest with a *BinaryView dictionary whose second value is null and indices that reference it twice, asserting the output null count and per-element null mask match. --- arrow/compute/cast_test.go | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index 9e7b474c1..993aca62e 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -2099,6 +2099,32 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { c.True(sv.IsNull(3)) }) + c.Run("binary_view dict with null in values array", func() { + bldr := array.NewBinaryViewBuilder(c.mem) + defer bldr.Release() + bldr.Append([]byte{0x01, 0x02}) + bldr.AppendNull() + bldr.Append([]byte{0x03}) + vals := bldr.NewArray() + defer vals.Release() + + dict := buildDict(vals, []int32{0, 1, 2, 1}, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.Binary)) + c.Require().NoError(err) + defer out.Release() + + bv := out.(*array.Binary) + c.Equal(4, bv.Len()) + c.Equal(2, bv.NullN()) + c.Equal([]byte{0x01, 0x02}, bv.Value(0)) + c.True(bv.IsNull(1)) + c.Equal([]byte{0x03}, bv.Value(2)) + c.True(bv.IsNull(3)) + }) + c.Run("string_view dict to string_view with large non-inline payload", func() { const ( valBytes = 200 From b54a657a6c24942a7fdbbace49790208b86dc03a Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 17:11:56 -0400 Subject: [PATCH 14/23] fix(compute): skip view overflow-buffer reservation for inline-bound values For view destinations (string_view / binary_view), values that fit inside the view header's inline slot consume no overflow data, so there is no overflow buffer to reserve and no per-buffer limit to enforce. Add an inline-skip in reserveFormattedData so large casts whose per-value upper bound is inline (bool via 5 bytes, int8..int32 via <=11, date32, etc.) are not rejected against a limit they never touch. Apply the same logic in boolToStringCastExec's exact-count path. Added TestReserveFormattedDataInlineViewSkip asserting inline pass-through at the bool / boundary (11) and the overflow-buffer limit firing at 12+. Also restored TestReserveFormattedDataAllowsLargeUtf8AboveInt32's platform-aware branching which an earlier edit accidentally reverted. --- .../compute/internal/kernels/string_casts.go | 53 +++++++++++++------ .../internal/kernels/string_casts_test.go | 38 +++++++++++++ 2 files changed, 74 insertions(+), 17 deletions(-) diff --git a/arrow/compute/internal/kernels/string_casts.go b/arrow/compute/internal/kernels/string_casts.go index 5c58d6ff9..f3756095f 100644 --- a/arrow/compute/internal/kernels/string_casts.go +++ b/arrow/compute/internal/kernels/string_casts.go @@ -252,23 +252,25 @@ func boolToStringCastExec(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.E ) defer bldr.Release() - // Count exact formatted payload: "true" is 4 bytes, "false" is 5. - // Only non-null rows contribute bytes. This is tighter than a 5*nonNull - // upper bound and avoids rejecting valid large all-true casts near the - // MaxInt32 destination-buffer limit. - var total int64 - bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, - func(pos int64) { - if bitutil.BitIsSet(input.Buffers[1].Buf, int(pos)) { - total += 4 - } else { - total += 5 - } - }, func() {}) - bldr.Reserve(int(input.Len)) - if err := reserveFormattedDataExact(bldr, total); err != nil { - return err + // "true" (4 bytes) and "false" (5 bytes) are both inline for view + // builders; they consume no overflow data, so skip reservation and the + // limit check. For offset-based builders, count exact formatted bytes + // (tighter than 5*nonNull and avoids rejecting valid large all-true + // casts near the int32 limit). + if !isViewBuilder(bldr) { + var total int64 + bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, + func(pos int64) { + if bitutil.BitIsSet(input.Buffers[1].Buf, int(pos)) { + total += 4 + } else { + total += 5 + } + }, func() {}) + if err := reserveFormattedDataExact(bldr, total); err != nil { + return err + } } bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, @@ -339,11 +341,17 @@ func numericToStringCastExec[T arrow.IntType | arrow.UintType | arrow.FloatType] // (non-null count) * perValueBytes formatted bytes. Returns arrow.ErrInvalid // when the product exceeds the destination builder's single-buffer limit, // avoiding a panic inside BinaryViewBuilder.ReserveData or an int32 offset -// overflow in StringBuilder. See GH-184 review feedback. +// overflow in StringBuilder. For view builders whose per-value upper bound +// is ≤ 12 bytes, all values are stored inline in view headers and consume +// no overflow data, so reservation and the limit check are skipped. See +// GH-184 review feedback. func reserveFormattedData(bldr array.StringLikeBuilder, input *exec.ArraySpan, perValueBytes int) error { if perValueBytes <= 0 { return nil } + if isViewBuilder(bldr) && arrow.IsViewInline(perValueBytes) { + return nil + } total := int64(input.Len-input.Nulls) * int64(perValueBytes) return reserveFormattedDataExact(bldr, total) } @@ -365,6 +373,17 @@ func reserveFormattedDataExact(bldr array.StringLikeBuilder, total int64) error return nil } +// isViewBuilder reports whether bldr writes into a view-typed array whose +// out-of-line data lives in a single overflow buffer subject to the +// view_value_size_limit. Only *array.StringViewBuilder satisfies the +// StringLikeBuilder interface (BinaryViewBuilder's Append takes []byte, +// not string), so this is effectively a StringView check, kept under a +// clearer name so the inline-skip callers read as "view builder" intent. +func isViewBuilder(bldr array.StringLikeBuilder) bool { + _, ok := bldr.(*array.StringViewBuilder) + return ok +} + // formattedDataLimit returns the largest contiguous data payload (in bytes) // that the destination builder can hold in a single buffer, clamped so that // reserveFormattedData's int(total) conversion cannot overflow: diff --git a/arrow/compute/internal/kernels/string_casts_test.go b/arrow/compute/internal/kernels/string_casts_test.go index 38783dbf7..9972bd8e1 100644 --- a/arrow/compute/internal/kernels/string_casts_test.go +++ b/arrow/compute/internal/kernels/string_casts_test.go @@ -159,3 +159,41 @@ func TestReserveFormattedDataAllowsLargeUtf8AboveInt32(t *testing.T) { t.Fatalf("string_view guard should reject total > MaxInt32, got %v", err) } } + +// TestReserveFormattedDataInlineViewSkip is the regression test for the +// GH-184 seventh-round review finding: for view builders, values whose +// per-value upper bound passes arrow.IsViewInline (strictly < 12 bytes) +// are stored inline in view headers and never consume overflow data. +// reserveFormattedData must skip both the reservation and the +// single-buffer limit check in that case, so large casts with +// inline-bounded values (bool, int8..int32, date32, etc.) are not +// rejected against the overflow-buffer limit they never use. +func TestReserveFormattedDataInlineViewSkip(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + viewBldr := array.NewStringViewBuilder(mem) + defer viewBldr.Release() + + span := &exec.ArraySpan{Len: math.MaxInt32, Nulls: 0} + + if err := reserveFormattedData(viewBldr, span, 5); err != nil { + t.Fatalf("string_view with inline-bounded perValueBytes=5 must not error: %v", err) + } + if err := reserveFormattedData(viewBldr, span, 11); err != nil { + t.Fatalf("string_view with boundary perValueBytes=11 must not error: %v", err) + } + // arrow.IsViewInline uses strict '<' against the 12-byte inline slot, + // so 12 is already non-inline and the overflow-buffer limit applies. + if err := reserveFormattedData(viewBldr, span, 12); !errors.Is(err, arrow.ErrInvalid) { + t.Fatalf("string_view with non-inline perValueBytes=12 must hit the limit, got %v", err) + } + + utf8Bldr := array.NewStringBuilder(mem) + defer utf8Bldr.Release() + // utf8 builders are offset-based; inline skipping does not apply and + // a 5-byte-per-row upper bound on MaxInt32 rows overflows int32. + if err := reserveFormattedData(utf8Bldr, span, 5); !errors.Is(err, arrow.ErrInvalid) { + t.Fatalf("utf8 inline-skip must not apply; expected ErrInvalid, got %v", err) + } +} From f15582b878d7a5ce8cbfcf6b0986bd36988b8fad Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 17:19:02 -0400 Subject: [PATCH 15/23] fix(compute): treat 12-byte values as inline to match ViewHeader.IsInline ViewHeader.IsInline uses size <= 12 but arrow.IsViewInline uses strict < 12, so a 12-byte formatted value (time32[ms] 'HH:MM:SS.sss') is stored inline by the builder yet my earlier guard treated it as non-inline and rejected the cast against the overflow-buffer limit. Introduce a local viewHeaderStoresInline helper (n <= 12) that matches the storage rule, use it in reserveFormattedData's inline-skip, and add TestCasts/ TestNumericToStringViewLargePayload/time32[ms]_to_string_view_stays_inline as the cast-level smoke test. TestReserveFormattedDataInlineViewSkip now asserts that perValueBytes=12 passes inline-skip and 13 hits the limit. --- arrow/compute/cast_test.go | 29 +++++++++++++++++++ .../compute/internal/kernels/string_casts.go | 18 +++++++++--- .../internal/kernels/string_casts_test.go | 26 ++++++++--------- 3 files changed, 56 insertions(+), 17 deletions(-) diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index 993aca62e..f7a1d594c 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -1916,6 +1916,35 @@ func (c *CastSuite) TestNumericToStringViewLargePayload() { "expected <=3 buffers for string_view output, got %d", len(got.Data().Buffers())) }) + + // Regression test for GH-184 eighth-round review finding: time32[ms] + // formats to exactly 12 bytes ("HH:MM:SS.sss") which ViewHeader stores + // inline (size <= 12), so the inline-skip in reserveFormattedData must + // treat 12 as inline (<=12) rather than strictly < 12 via + // arrow.IsViewInline. The boundary itself is exercised directly in + // TestReserveFormattedDataInlineViewSkip; this is the cast-level + // smoke test verifying the end-to-end path stays within a single + // overflow buffer. + c.Run("time32[ms] to string_view stays inline", func() { + t32Type := &arrow.Time32Type{Unit: arrow.Millisecond} + bldr := array.NewTime32Builder(c.mem, t32Type) + defer bldr.Release() + for i := 0; i < count; i++ { + bldr.Append(arrow.Time32(int32(i))) + } + in := bldr.NewArray() + defer in.Release() + + got, err := compute.CastArray(context.Background(), in, + compute.SafeCastOptions(arrow.BinaryTypes.StringView)) + c.Require().NoError(err) + defer got.Release() + + c.Equal(count, got.Len()) + c.LessOrEqualf(len(got.Data().Buffers()), 3, + "expected <=3 buffers for string_view output, got %d", + len(got.Data().Buffers())) + }) } // TestChunkedMultiBufferViewInputCoalesced is a regression test for the diff --git a/arrow/compute/internal/kernels/string_casts.go b/arrow/compute/internal/kernels/string_casts.go index f3756095f..339314a1e 100644 --- a/arrow/compute/internal/kernels/string_casts.go +++ b/arrow/compute/internal/kernels/string_casts.go @@ -342,14 +342,14 @@ func numericToStringCastExec[T arrow.IntType | arrow.UintType | arrow.FloatType] // when the product exceeds the destination builder's single-buffer limit, // avoiding a panic inside BinaryViewBuilder.ReserveData or an int32 offset // overflow in StringBuilder. For view builders whose per-value upper bound -// is ≤ 12 bytes, all values are stored inline in view headers and consume -// no overflow data, so reservation and the limit check are skipped. See -// GH-184 review feedback. +// fits inline (<= 12 bytes, matching ViewHeader.IsInline), all values are +// stored inline in view headers and consume no overflow data, so +// reservation and the limit check are skipped. See GH-184 review feedback. func reserveFormattedData(bldr array.StringLikeBuilder, input *exec.ArraySpan, perValueBytes int) error { if perValueBytes <= 0 { return nil } - if isViewBuilder(bldr) && arrow.IsViewInline(perValueBytes) { + if isViewBuilder(bldr) && viewHeaderStoresInline(perValueBytes) { return nil } total := int64(input.Len-input.Nulls) * int64(perValueBytes) @@ -384,6 +384,16 @@ func isViewBuilder(bldr array.StringLikeBuilder) bool { return ok } +// viewHeaderStoresInline reports whether a value of size n fits inside a +// view header's 12-byte inline slot, matching ViewHeader.IsInline (<= 12). +// arrow.IsViewInline uses strict '<' which would mis-classify 12-byte +// values like time32[ms]'s "HH:MM:SS.sss" as non-inline and reject them +// against the overflow-buffer limit even though the builder stores them +// inline. +func viewHeaderStoresInline(n int) bool { + return n <= 12 +} + // formattedDataLimit returns the largest contiguous data payload (in bytes) // that the destination builder can hold in a single buffer, clamped so that // reserveFormattedData's int(total) conversion cannot overflow: diff --git a/arrow/compute/internal/kernels/string_casts_test.go b/arrow/compute/internal/kernels/string_casts_test.go index 9972bd8e1..89618eb8b 100644 --- a/arrow/compute/internal/kernels/string_casts_test.go +++ b/arrow/compute/internal/kernels/string_casts_test.go @@ -161,13 +161,14 @@ func TestReserveFormattedDataAllowsLargeUtf8AboveInt32(t *testing.T) { } // TestReserveFormattedDataInlineViewSkip is the regression test for the -// GH-184 seventh-round review finding: for view builders, values whose -// per-value upper bound passes arrow.IsViewInline (strictly < 12 bytes) -// are stored inline in view headers and never consume overflow data. -// reserveFormattedData must skip both the reservation and the -// single-buffer limit check in that case, so large casts with -// inline-bounded values (bool, int8..int32, date32, etc.) are not -// rejected against the overflow-buffer limit they never use. +// GH-184 seventh/eighth-round review findings: for view builders, values +// whose per-value upper bound fits inline (<= 12 bytes, matching +// ViewHeader.IsInline) are stored inside the view header and never +// consume overflow data. reserveFormattedData must skip both the +// reservation and the single-buffer limit check in that case, so large +// casts with inline-bounded values (bool, int8..int32, date32, +// time32[ms], etc.) are not rejected against the overflow-buffer limit +// they never use. func TestReserveFormattedDataInlineViewSkip(t *testing.T) { mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) defer mem.AssertSize(t, 0) @@ -180,13 +181,12 @@ func TestReserveFormattedDataInlineViewSkip(t *testing.T) { if err := reserveFormattedData(viewBldr, span, 5); err != nil { t.Fatalf("string_view with inline-bounded perValueBytes=5 must not error: %v", err) } - if err := reserveFormattedData(viewBldr, span, 11); err != nil { - t.Fatalf("string_view with boundary perValueBytes=11 must not error: %v", err) + if err := reserveFormattedData(viewBldr, span, 12); err != nil { + t.Fatalf("string_view with boundary perValueBytes=12 must not error (inline per ViewHeader.IsInline): %v", err) } - // arrow.IsViewInline uses strict '<' against the 12-byte inline slot, - // so 12 is already non-inline and the overflow-buffer limit applies. - if err := reserveFormattedData(viewBldr, span, 12); !errors.Is(err, arrow.ErrInvalid) { - t.Fatalf("string_view with non-inline perValueBytes=12 must hit the limit, got %v", err) + // Above the inline boundary (12) the overflow-buffer limit is enforced. + if err := reserveFormattedData(viewBldr, span, 13); !errors.Is(err, arrow.ErrInvalid) { + t.Fatalf("string_view with non-inline perValueBytes=13 must hit the limit, got %v", err) } utf8Bldr := array.NewStringBuilder(mem) From 45760d19688e465ca058e4270297c84e24914195 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 17:25:44 -0400 Subject: [PATCH 16/23] fix(arrow): align IsViewInline with ViewHeader.IsInline and docs (<=12) ViewHeader documents inlining for '12 bytes or fewer' and ViewHeader.IsInline implements that as size <= viewInlineSize, but the exported arrow.IsViewInline used strict '<'. The inconsistency caused downstream view-payload accounting (reserveFormattedData, rebuildViewSingleBuffer, sumViewDictionaryOutOfLine, the string/binary->view cast kernel) to treat 12-byte values as non-inline and count them against the single-buffer limit even though the builder stores them inline, so large time32[ms] -> string_view casts were falsely rejected. Fixing IsViewInline at the source removes the mismatch without touching downstream call sites or introducing a parallel helper. Removed my local viewHeaderStoresInline helper added earlier and routed the inline-skip in reserveFormattedData back through arrow.IsViewInline. Tightened TestCasts/TestNumericToStringViewLargePayload/ time32[ms]_to_string_view_stays_inline to assert Buffers()[2] is empty/absent so the test actually exercises the inline path. --- arrow/compute/cast_test.go | 27 ++++++++++++------- .../compute/internal/kernels/string_casts.go | 18 +++---------- .../internal/kernels/string_casts_test.go | 15 +++++------ arrow/datatype_viewheader.go | 2 +- 4 files changed, 29 insertions(+), 33 deletions(-) diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index f7a1d594c..099c15ddd 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -1918,13 +1918,13 @@ func (c *CastSuite) TestNumericToStringViewLargePayload() { }) // Regression test for GH-184 eighth-round review finding: time32[ms] - // formats to exactly 12 bytes ("HH:MM:SS.sss") which ViewHeader stores - // inline (size <= 12), so the inline-skip in reserveFormattedData must - // treat 12 as inline (<=12) rather than strictly < 12 via - // arrow.IsViewInline. The boundary itself is exercised directly in - // TestReserveFormattedDataInlineViewSkip; this is the cast-level - // smoke test verifying the end-to-end path stays within a single - // overflow buffer. + // formats to exactly 12 bytes ("HH:MM:SS.sss") which ViewHeader + // stores inline (size <= 12), so the inline-skip in + // reserveFormattedData must treat 12 as inline. This is the + // cast-level smoke test verifying the end-to-end path; the boundary + // itself is exercised in TestReserveFormattedDataInlineViewSkip. + // We additionally assert the output has no overflow data buffer, i.e. + // all values stayed inline in view headers. c.Run("time32[ms] to string_view stays inline", func() { t32Type := &arrow.Time32Type{Unit: arrow.Millisecond} bldr := array.NewTime32Builder(c.mem, t32Type) @@ -1941,9 +1941,16 @@ func (c *CastSuite) TestNumericToStringViewLargePayload() { defer got.Release() c.Equal(count, got.Len()) - c.LessOrEqualf(len(got.Data().Buffers()), 3, - "expected <=3 buffers for string_view output, got %d", - len(got.Data().Buffers())) + bufs := got.Data().Buffers() + c.LessOrEqualf(len(bufs), 3, + "expected <=3 buffers for string_view output, got %d", len(bufs)) + // All values must be inline; overflow buffer (index 2) should be + // absent or empty. + if len(bufs) >= 3 && bufs[2] != nil { + c.Equalf(0, bufs[2].Len(), + "time32[ms] values are <=12 bytes and must stay inline; "+ + "found %d bytes in overflow buffer", bufs[2].Len()) + } }) } diff --git a/arrow/compute/internal/kernels/string_casts.go b/arrow/compute/internal/kernels/string_casts.go index 339314a1e..20354ef8e 100644 --- a/arrow/compute/internal/kernels/string_casts.go +++ b/arrow/compute/internal/kernels/string_casts.go @@ -342,14 +342,14 @@ func numericToStringCastExec[T arrow.IntType | arrow.UintType | arrow.FloatType] // when the product exceeds the destination builder's single-buffer limit, // avoiding a panic inside BinaryViewBuilder.ReserveData or an int32 offset // overflow in StringBuilder. For view builders whose per-value upper bound -// fits inline (<= 12 bytes, matching ViewHeader.IsInline), all values are -// stored inline in view headers and consume no overflow data, so -// reservation and the limit check are skipped. See GH-184 review feedback. +// fits inline (arrow.IsViewInline), all values are stored inside view +// headers and consume no overflow data, so reservation and the limit +// check are skipped. See GH-184 review feedback. func reserveFormattedData(bldr array.StringLikeBuilder, input *exec.ArraySpan, perValueBytes int) error { if perValueBytes <= 0 { return nil } - if isViewBuilder(bldr) && viewHeaderStoresInline(perValueBytes) { + if isViewBuilder(bldr) && arrow.IsViewInline(perValueBytes) { return nil } total := int64(input.Len-input.Nulls) * int64(perValueBytes) @@ -384,16 +384,6 @@ func isViewBuilder(bldr array.StringLikeBuilder) bool { return ok } -// viewHeaderStoresInline reports whether a value of size n fits inside a -// view header's 12-byte inline slot, matching ViewHeader.IsInline (<= 12). -// arrow.IsViewInline uses strict '<' which would mis-classify 12-byte -// values like time32[ms]'s "HH:MM:SS.sss" as non-inline and reject them -// against the overflow-buffer limit even though the builder stores them -// inline. -func viewHeaderStoresInline(n int) bool { - return n <= 12 -} - // formattedDataLimit returns the largest contiguous data payload (in bytes) // that the destination builder can hold in a single buffer, clamped so that // reserveFormattedData's int(total) conversion cannot overflow: diff --git a/arrow/compute/internal/kernels/string_casts_test.go b/arrow/compute/internal/kernels/string_casts_test.go index 89618eb8b..a31dedf19 100644 --- a/arrow/compute/internal/kernels/string_casts_test.go +++ b/arrow/compute/internal/kernels/string_casts_test.go @@ -162,13 +162,12 @@ func TestReserveFormattedDataAllowsLargeUtf8AboveInt32(t *testing.T) { // TestReserveFormattedDataInlineViewSkip is the regression test for the // GH-184 seventh/eighth-round review findings: for view builders, values -// whose per-value upper bound fits inline (<= 12 bytes, matching -// ViewHeader.IsInline) are stored inside the view header and never -// consume overflow data. reserveFormattedData must skip both the -// reservation and the single-buffer limit check in that case, so large -// casts with inline-bounded values (bool, int8..int32, date32, -// time32[ms], etc.) are not rejected against the overflow-buffer limit -// they never use. +// whose per-value upper bound fits inline (arrow.IsViewInline, <= 12) +// are stored inside view headers and never consume overflow data. +// reserveFormattedData must skip both the reservation and the +// single-buffer limit check in that case, so large casts with +// inline-bounded values (bool, int8..int32, date32, time32[ms], etc.) +// are not rejected against the overflow-buffer limit they never use. func TestReserveFormattedDataInlineViewSkip(t *testing.T) { mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) defer mem.AssertSize(t, 0) @@ -182,7 +181,7 @@ func TestReserveFormattedDataInlineViewSkip(t *testing.T) { t.Fatalf("string_view with inline-bounded perValueBytes=5 must not error: %v", err) } if err := reserveFormattedData(viewBldr, span, 12); err != nil { - t.Fatalf("string_view with boundary perValueBytes=12 must not error (inline per ViewHeader.IsInline): %v", err) + t.Fatalf("string_view with boundary perValueBytes=12 must not error (inline per arrow.IsViewInline): %v", err) } // Above the inline boundary (12) the overflow-buffer limit is enforced. if err := reserveFormattedData(viewBldr, span, 13); !errors.Is(err, arrow.ErrInvalid) { diff --git a/arrow/datatype_viewheader.go b/arrow/datatype_viewheader.go index 691b97df5..ed6ed0876 100644 --- a/arrow/datatype_viewheader.go +++ b/arrow/datatype_viewheader.go @@ -31,7 +31,7 @@ const ( ) func IsViewInline(length int) bool { - return length < viewInlineSize + return length <= viewInlineSize } // ViewHeader is a variable length string (utf8) or byte slice with From 3100f3234665bc4dd514457f6c457aa6a86072f1 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 17:41:59 -0400 Subject: [PATCH 17/23] fix(compute): coalesce multi-buffer view values inside dictionary inputs exec.ArraySpan.SetMembers recurses into dictionary children, so a dict/dict whose dictionary values are multi-buffer trips the [3]BufferSpan limit before unpackViewDictionary gets a chance to run. needsViewCoalesce now recurses into dictionary values, and coalesceMultiBufferViewDatum dispatches via a new coalesceArrayData helper that rebuilds either a top-level view array or a dictionary whose values are a view array (reusing the existing index buffers via array.NewDataWithDictionary). Added TestCasts/TestViewDictionaryUnpackCast/dict_to_utf8 regression test that builds a dictionary with 2x20KB unique values (guaranteed multi-buffer with the default 32KB block size) and asserts the cast completes with the expected values. --- arrow/compute/cast.go | 50 ++++++++++++++++++++++++++++++-------- arrow/compute/cast_test.go | 41 +++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 10 deletions(-) diff --git a/arrow/compute/cast.go b/arrow/compute/cast.go index 6e84fb41e..386077a3c 100644 --- a/arrow/compute/cast.go +++ b/arrow/compute/cast.go @@ -84,9 +84,11 @@ func RegisterScalarCast(reg FunctionRegistry) { // coalesceMultiBufferViewDatum returns a new Datum whose view arrays each // have a single overflow data buffer, or nil if the input does not need -// coalescing. Both ArrayDatum and ChunkedDatum view inputs are handled; -// other datums are never coalesced. Callers are responsible for releasing -// the returned datum when it is non-nil. +// coalescing. Both ArrayDatum and ChunkedDatum view inputs are handled, +// including dictionaries whose values are multi-buffer view arrays (the +// recursive check matches what exec.ArraySpan.SetMembers traverses). +// Other datums are never coalesced. Callers are responsible for +// releasing the returned datum when it is non-nil. func coalesceMultiBufferViewDatum(ctx context.Context, d Datum) (Datum, error) { switch v := d.(type) { case *ArrayDatum: @@ -94,7 +96,7 @@ func coalesceMultiBufferViewDatum(ctx context.Context, d Datum) (Datum, error) { return nil, nil } mem := exec.GetAllocator(ctx) - newData, err := rebuildViewSingleBuffer(mem, v.Value) + newData, err := coalesceArrayData(mem, v.Value) if err != nil { return nil, err } @@ -120,7 +122,7 @@ func coalesceMultiBufferViewDatum(ctx context.Context, d Datum) (Datum, error) { newChunks[i] = c continue } - newData, err := rebuildViewSingleBuffer(mem, c.Data()) + newData, err := coalesceArrayData(mem, c.Data()) if err != nil { for j := 0; j < i; j++ { newChunks[j].Release() @@ -139,15 +141,43 @@ func coalesceMultiBufferViewDatum(ctx context.Context, d Datum) (Datum, error) { return nil, nil } -// needsViewCoalesce reports whether data is a view array whose payload -// spans more than the single overflow data buffer exec.ArraySpan can carry. +// needsViewCoalesce reports whether data carries a view array whose +// payload spans more than the single overflow data buffer exec.ArraySpan +// can carry. The check recurses into dictionary values because +// exec.ArraySpan.SetMembers recursively spans the dictionary child and +// hits the [3]BufferSpan limit there too. func needsViewCoalesce(data arrow.ArrayData) bool { switch data.DataType().ID() { case arrow.BINARY_VIEW, arrow.STRING_VIEW: - default: - return false + return len(data.Buffers()) > 3 + case arrow.DICTIONARY: + return needsViewCoalesce(data.Dictionary()) + } + return false +} + +// coalesceArrayData coalesces data for a view or dictionary-of-view array +// so it fits the single-overflow-buffer invariant. For view data it +// delegates to rebuildViewSingleBuffer; for dictionary data it rebuilds +// the dictionary values and reuses the existing index buffers. +func coalesceArrayData(mem memory.Allocator, data arrow.ArrayData) (arrow.ArrayData, error) { + switch data.DataType().ID() { + case arrow.BINARY_VIEW, arrow.STRING_VIEW: + return rebuildViewSingleBuffer(mem, data) + case arrow.DICTIONARY: + newValues, err := coalesceArrayData(mem, data.Dictionary()) + if err != nil { + return nil, err + } + defer newValues.Release() + newDict, ok := newValues.(*array.Data) + if !ok { + return nil, fmt.Errorf("%w: unexpected dictionary values data type %T", arrow.ErrInvalid, newValues) + } + return array.NewDataWithDictionary(data.DataType(), data.Len(), + data.Buffers(), data.NullN(), data.Offset(), newDict), nil } - return len(data.Buffers()) > 3 + return nil, fmt.Errorf("%w: coalesceArrayData: unsupported type %s", arrow.ErrInvalid, data.DataType()) } // rebuildViewSingleBuffer rebuilds a BinaryView/StringView ArrayData so that diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index 099c15ddd..6b679e544 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -2161,6 +2161,47 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { c.True(bv.IsNull(3)) }) + // Regression for GH-184 ninth-round review finding: when the top-level + // input is a dictionary whose *values* are a multi-buffer view array, + // exec.ArraySpan.SetMembers recurses into the dictionary child and + // hits the fixed [3]BufferSpan before unpackViewDictionary ever runs. + // coalesceMultiBufferViewDatum must therefore rebuild the dictionary + // values into a single overflow buffer up front. Build dictionary + // values that are large enough to span multiple overflow buffers via + // the default 32KB block size (two 20,000-byte entries are enough to + // trip the multi-buffer condition). + c.Run("dict to utf8", func() { + const valBytes = 20000 + a := strings.Repeat("a", valBytes) + b := strings.Repeat("b", valBytes) + vbldr := array.NewStringViewBuilder(c.mem) + defer vbldr.Release() + // Force two separate overflow buffers by appending values individually + // through the default builder (no pre-reservation). + vbldr.Append(a) + vbldr.Append(b) + vals := vbldr.NewArray() + defer vals.Release() + c.Greaterf(len(vals.Data().Buffers()), 3, + "test precondition: dictionary values must be multi-buffer; got %d", + len(vals.Data().Buffers())) + + dict := buildDict(vals, []int32{0, 1, 0, 1}, nil) + defer dict.Release() + + out, err := compute.CastArray(context.Background(), dict, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.Require().NoError(err) + defer out.Release() + + sv := out.(*array.String) + c.Equal(4, sv.Len()) + c.Equal(a, sv.Value(0)) + c.Equal(b, sv.Value(1)) + c.Equal(a, sv.Value(2)) + c.Equal(b, sv.Value(3)) + }) + c.Run("string_view dict to string_view with large non-inline payload", func() { const ( valBytes = 200 From 02afee2d5f913710eb5178e27f61fecef00078bb Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 17:52:27 -0400 Subject: [PATCH 18/23] fix(compute): recurse into nested children when coalescing view payloads exec.ArraySpan.SetMembers recurses through both Dictionary() and Children(), so a nested type (list, struct{f: binary_view}, list>, etc.) whose descendant view array is multi-buffer still hits the fixed [3]BufferSpan limit before dispatch. Extended needsViewCoalesce to walk Children() as well, and coalesceArrayData now rebuilds non-view/non-dictionary nested arrays by keeping their own buffers, recursively coalescing only the children that need it (retaining the others), and reconstructing via array.NewData. Added TestCasts/TestViewDictionaryUnpackCast/list_to_list regression test that builds a list with a multi-buffer values array (4x20KB unique entries) and asserts the list cast completes end-to-end with the expected values. --- arrow/compute/cast.go | 50 +++++++++++++++++++++++++++++++------ arrow/compute/cast_test.go | 51 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 8 deletions(-) diff --git a/arrow/compute/cast.go b/arrow/compute/cast.go index 386077a3c..b52ec99a9 100644 --- a/arrow/compute/cast.go +++ b/arrow/compute/cast.go @@ -143,9 +143,10 @@ func coalesceMultiBufferViewDatum(ctx context.Context, d Datum) (Datum, error) { // needsViewCoalesce reports whether data carries a view array whose // payload spans more than the single overflow data buffer exec.ArraySpan -// can carry. The check recurses into dictionary values because -// exec.ArraySpan.SetMembers recursively spans the dictionary child and -// hits the [3]BufferSpan limit there too. +// can carry. The check recurses into both dictionary values and ordinary +// child arrays because exec.ArraySpan.SetMembers recursively spans both, +// so any view descendant anywhere in the shape hits the [3]BufferSpan +// limit during dispatch. func needsViewCoalesce(data arrow.ArrayData) bool { switch data.DataType().ID() { case arrow.BINARY_VIEW, arrow.STRING_VIEW: @@ -153,13 +154,20 @@ func needsViewCoalesce(data arrow.ArrayData) bool { case arrow.DICTIONARY: return needsViewCoalesce(data.Dictionary()) } + for _, c := range data.Children() { + if needsViewCoalesce(c) { + return true + } + } return false } -// coalesceArrayData coalesces data for a view or dictionary-of-view array -// so it fits the single-overflow-buffer invariant. For view data it -// delegates to rebuildViewSingleBuffer; for dictionary data it rebuilds -// the dictionary values and reuses the existing index buffers. +// coalesceArrayData rebuilds data so that every view descendant lives in +// a single overflow buffer. View arrays are rebuilt via +// rebuildViewSingleBuffer; dictionaries keep their index buffers and +// recursively rebuild their values; nested types (list, struct, etc.) +// keep their own buffers and recursively rebuild each child. Shares +// already-compliant children by retaining them rather than copying. func coalesceArrayData(mem memory.Allocator, data arrow.ArrayData) (arrow.ArrayData, error) { switch data.DataType().ID() { case arrow.BINARY_VIEW, arrow.STRING_VIEW: @@ -177,7 +185,33 @@ func coalesceArrayData(mem memory.Allocator, data arrow.ArrayData) (arrow.ArrayD return array.NewDataWithDictionary(data.DataType(), data.Len(), data.Buffers(), data.NullN(), data.Offset(), newDict), nil } - return nil, fmt.Errorf("%w: coalesceArrayData: unsupported type %s", arrow.ErrInvalid, data.DataType()) + + children := data.Children() + if len(children) == 0 { + return nil, fmt.Errorf("%w: coalesceArrayData: no view descendants in %s", arrow.ErrInvalid, data.DataType()) + } + newChildren := make([]arrow.ArrayData, len(children)) + for i, c := range children { + if !needsViewCoalesce(c) { + c.Retain() + newChildren[i] = c + continue + } + nc, err := coalesceArrayData(mem, c) + if err != nil { + for j := 0; j < i; j++ { + newChildren[j].Release() + } + return nil, err + } + newChildren[i] = nc + } + result := array.NewData(data.DataType(), data.Len(), data.Buffers(), + newChildren, data.NullN(), data.Offset()) + for _, nc := range newChildren { + nc.Release() + } + return result, nil } // rebuildViewSingleBuffer rebuilds a BinaryView/StringView ArrayData so that diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index 6b679e544..f74b20e15 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -2202,6 +2202,57 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { c.Equal(b, sv.Value(3)) }) + // Regression for GH-184 tenth-round review finding: exec.ArraySpan + // .SetMembers recurses through ordinary nested children as well as + // dictionary children, so a list (or struct-of-view) + // whose descendant is multi-buffer also panics before dispatch. + // coalesceMultiBufferViewDatum must therefore recurse through + // Children() to rebuild any descendant view array in place. + c.Run("list to list", func() { + const valBytes = 20000 + a := strings.Repeat("a", valBytes) + b := strings.Repeat("b", valBytes) + vbldr := array.NewStringViewBuilder(c.mem) + defer vbldr.Release() + vbldr.Append(a) + vbldr.Append(b) + vbldr.Append(a) + vbldr.Append(b) + values := vbldr.NewArray() + defer values.Release() + c.Greaterf(len(values.Data().Buffers()), 3, + "test precondition: list values must be multi-buffer; got %d", + len(values.Data().Buffers())) + + // Build list with two rows: [a, b] and [a, b]. + listType := arrow.ListOf(arrow.BinaryTypes.StringView) + offsetsBuf := memory.NewBufferBytes([]byte{ + 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, + }) + listData := array.NewData(listType, 2, + []*memory.Buffer{nil, offsetsBuf}, + []arrow.ArrayData{values.Data()}, 0, 0) + defer listData.Release() + listArr := array.MakeFromData(listData) + defer listArr.Release() + + out, err := compute.CastArray(context.Background(), listArr, + compute.SafeCastOptions(arrow.ListOf(arrow.BinaryTypes.String))) + c.Require().NoError(err) + defer out.Release() + + got := out.(*array.List) + c.Equal(2, got.Len()) + child := got.ListValues().(*array.String) + c.Equal(4, child.Len()) + c.Equal(a, child.Value(0)) + c.Equal(b, child.Value(1)) + c.Equal(a, child.Value(2)) + c.Equal(b, child.Value(3)) + }) + c.Run("string_view dict to string_view with large non-inline payload", func() { const ( valBytes = 200 From 76b3ff55fe8c6e15045e73f350fdcf3c6639a566 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 18:00:12 -0400 Subject: [PATCH 19/23] fix(compute): treat extension inputs as their storage type when coalescing Extension arrays reuse their underlying storage buffers/children in place, so exec.ArraySpan.SetMembers iterates the storage layout directly and trips the [3]BufferSpan limit before CastFromExtension unwraps anything. needsViewCoalesce now reads through ExtensionType.StorageType() during the view/dictionary switch, and coalesceArrayData unwraps extension inputs, recursively coalesces the storage-typed ArrayData, and rewraps with the original extension datatype so downstream dispatch still sees the extension. Added StringViewExtType to the internal test types package (extension backed by string_view) and TestCasts/TestViewDictionaryUnpackCast/ extension_to_utf8 regression test that registers the extension, builds a multi-buffer storage array, wraps it, and casts to utf8 end-to-end. --- arrow/compute/cast.go | 33 +++++++++++++++++----- arrow/compute/cast_test.go | 43 ++++++++++++++++++++++++++++ internal/types/extension_types.go | 47 +++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 7 deletions(-) diff --git a/arrow/compute/cast.go b/arrow/compute/cast.go index b52ec99a9..fb5c9ee22 100644 --- a/arrow/compute/cast.go +++ b/arrow/compute/cast.go @@ -143,12 +143,16 @@ func coalesceMultiBufferViewDatum(ctx context.Context, d Datum) (Datum, error) { // needsViewCoalesce reports whether data carries a view array whose // payload spans more than the single overflow data buffer exec.ArraySpan -// can carry. The check recurses into both dictionary values and ordinary -// child arrays because exec.ArraySpan.SetMembers recursively spans both, -// so any view descendant anywhere in the shape hits the [3]BufferSpan -// limit during dispatch. +// can carry. The check recurses into dictionary values and ordinary +// child arrays because exec.ArraySpan.SetMembers recursively spans both; +// extension inputs reuse their storage layout in place, so they are +// treated as their StorageType for this traversal. func needsViewCoalesce(data arrow.ArrayData) bool { - switch data.DataType().ID() { + dt := data.DataType() + if ext, ok := dt.(arrow.ExtensionType); ok { + dt = ext.StorageType() + } + switch dt.ID() { case arrow.BINARY_VIEW, arrow.STRING_VIEW: return len(data.Buffers()) > 3 case arrow.DICTIONARY: @@ -166,9 +170,24 @@ func needsViewCoalesce(data arrow.ArrayData) bool { // a single overflow buffer. View arrays are rebuilt via // rebuildViewSingleBuffer; dictionaries keep their index buffers and // recursively rebuild their values; nested types (list, struct, etc.) -// keep their own buffers and recursively rebuild each child. Shares -// already-compliant children by retaining them rather than copying. +// keep their own buffers and recursively rebuild each child. Extension +// inputs are rebuilt at their storage layer and then re-wrapped so the +// original extension datatype survives. Shares already-compliant +// children by retaining them rather than copying. func coalesceArrayData(mem memory.Allocator, data arrow.ArrayData) (arrow.ArrayData, error) { + if ext, ok := data.DataType().(arrow.ExtensionType); ok { + storageData := array.NewData(ext.StorageType(), data.Len(), data.Buffers(), + data.Children(), data.NullN(), data.Offset()) + defer storageData.Release() + newStorage, err := coalesceArrayData(mem, storageData) + if err != nil { + return nil, err + } + defer newStorage.Release() + return array.NewData(data.DataType(), newStorage.Len(), newStorage.Buffers(), + newStorage.Children(), newStorage.NullN(), newStorage.Offset()), nil + } + switch data.DataType().ID() { case arrow.BINARY_VIEW, arrow.STRING_VIEW: return rebuildViewSingleBuffer(mem, data) diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index f74b20e15..ae3e4cb39 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -2253,6 +2253,49 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { c.Equal(b, child.Value(3)) }) + // Regression for GH-184 eleventh-round review finding: extension + // arrays reuse their storage buffers/children in place, so an + // extension over a multi-buffer view hit exec.ArraySpan.SetMembers + // before CastFromExtension unwrapped the storage. coalesce must + // therefore traverse extensions through their StorageType and + // re-wrap after rebuilding so the extension datatype survives. + c.Run("extension to utf8", func() { + extType := types.NewStringViewExtType() + arrow.RegisterExtensionType(extType) + defer arrow.UnregisterExtensionType("string_view_ext") + + const valBytes = 20000 + a := strings.Repeat("a", valBytes) + b := strings.Repeat("b", valBytes) + vbldr := array.NewStringViewBuilder(c.mem) + defer vbldr.Release() + vbldr.Append(a) + vbldr.Append(b) + vbldr.Append(a) + storage := vbldr.NewArray() + defer storage.Release() + c.Greaterf(len(storage.Data().Buffers()), 3, + "test precondition: extension storage must be multi-buffer; got %d", + len(storage.Data().Buffers())) + + extData := array.NewData(extType, storage.Len(), storage.Data().Buffers(), + storage.Data().Children(), storage.NullN(), storage.Data().Offset()) + defer extData.Release() + extArr := array.MakeFromData(extData) + defer extArr.Release() + + out, err := compute.CastArray(context.Background(), extArr, + compute.SafeCastOptions(arrow.BinaryTypes.String)) + c.Require().NoError(err) + defer out.Release() + + sv := out.(*array.String) + c.Equal(3, sv.Len()) + c.Equal(a, sv.Value(0)) + c.Equal(b, sv.Value(1)) + c.Equal(a, sv.Value(2)) + }) + c.Run("string_view dict to string_view with large non-inline payload", func() { const ( valBytes = 200 diff --git a/internal/types/extension_types.go b/internal/types/extension_types.go index eaffce653..3e68ab1aa 100644 --- a/internal/types/extension_types.go +++ b/internal/types/extension_types.go @@ -317,9 +317,56 @@ var ( _ arrow.ExtensionType = (*ExtStructType)(nil) _ arrow.ExtensionType = (*DictExtensionType)(nil) _ arrow.ExtensionType = (*SmallintType)(nil) + _ arrow.ExtensionType = (*StringViewExtType)(nil) _ array.ExtensionArray = (*Parametric1Array)(nil) _ array.ExtensionArray = (*Parametric2Array)(nil) _ array.ExtensionArray = (*ExtStructArray)(nil) _ array.ExtensionArray = (*DictExtensionArray)(nil) _ array.ExtensionArray = (*SmallintArray)(nil) + _ array.ExtensionArray = (*StringViewExtArray)(nil) ) + +// StringViewExtArray is an extension array backed by a string_view storage +// array. +type StringViewExtArray struct { + array.ExtensionArrayBase +} + +func (a StringViewExtArray) ValueStr(i int) string { + if a.IsNull(i) { + return array.NullValueStr + } + return a.Storage().(*array.StringView).Value(i) +} + +// StringViewExtType wraps arrow.BinaryTypes.StringView as an extension so +// tests can exercise code paths that must look through extension types +// at their storage layout. +type StringViewExtType struct { + arrow.ExtensionBase +} + +func NewStringViewExtType() *StringViewExtType { + return &StringViewExtType{ExtensionBase: arrow.ExtensionBase{ + Storage: arrow.BinaryTypes.StringView}} +} + +func (StringViewExtType) ArrayType() reflect.Type { return reflect.TypeOf(StringViewExtArray{}) } + +func (StringViewExtType) ExtensionName() string { return "string_view_ext" } + +func (StringViewExtType) Serialize() string { return "string_view_ext-serialized" } + +func (s *StringViewExtType) ExtensionEquals(other arrow.ExtensionType) bool { + return s.ExtensionName() == other.ExtensionName() +} + +func (StringViewExtType) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { + if data != "string_view_ext-serialized" { + return nil, fmt.Errorf("type identifier did not match: '%s'", data) + } + if !arrow.TypeEqual(storageType, arrow.BinaryTypes.StringView) { + return nil, fmt.Errorf("invalid storage type for StringViewExtType: %s", storageType) + } + return NewStringViewExtType(), nil +} From 09a857441a60ce603f43ba4a27b4b653118fb8d2 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 18:08:23 -0400 Subject: [PATCH 20/23] fix(compute): preserve Dictionary() across extension unwrap/rewrap array.NewData does not carry a Data's Dictionary() field, so the previous extension-type unwrap/rewrap path dropped the dictionary for extension>: the recursive DICTIONARY branch inside coalesceArrayData then saw a nil Dictionary() and panicked or produced malformed data. Routed both the unwrap and the rewrap through a new reshapeArrayDataType helper that honours Dictionary() via array.NewDataWithDictionary when present. Added TestCoalesceArrayDataExtensionOverDictOfView as an internal-package regression (direct coalesceArrayData call) because the full cast pipeline for extension> hits a pre-existing SetMembers gap (exec.ArraySpan.SetMembers does not recognize a dictionary under an extension type) that is unrelated to view coalescing. Added DictStringViewExtType to internal/types to support the regression. --- arrow/compute/cast.go | 27 +++++++-- arrow/compute/cast_internal_test.go | 93 +++++++++++++++++++++++++++++ internal/types/extension_types.go | 55 +++++++++++++++++ 3 files changed, 171 insertions(+), 4 deletions(-) create mode 100644 arrow/compute/cast_internal_test.go diff --git a/arrow/compute/cast.go b/arrow/compute/cast.go index fb5c9ee22..85521ebee 100644 --- a/arrow/compute/cast.go +++ b/arrow/compute/cast.go @@ -176,16 +176,17 @@ func needsViewCoalesce(data arrow.ArrayData) bool { // children by retaining them rather than copying. func coalesceArrayData(mem memory.Allocator, data arrow.ArrayData) (arrow.ArrayData, error) { if ext, ok := data.DataType().(arrow.ExtensionType); ok { - storageData := array.NewData(ext.StorageType(), data.Len(), data.Buffers(), - data.Children(), data.NullN(), data.Offset()) + storageData, err := reshapeArrayDataType(data, ext.StorageType()) + if err != nil { + return nil, err + } defer storageData.Release() newStorage, err := coalesceArrayData(mem, storageData) if err != nil { return nil, err } defer newStorage.Release() - return array.NewData(data.DataType(), newStorage.Len(), newStorage.Buffers(), - newStorage.Children(), newStorage.NullN(), newStorage.Offset()), nil + return reshapeArrayDataType(newStorage, data.DataType()) } switch data.DataType().ID() { @@ -233,6 +234,24 @@ func coalesceArrayData(mem memory.Allocator, data arrow.ArrayData) (arrow.ArrayD return result, nil } +// reshapeArrayDataType returns a new ArrayData that shares src's buffers, +// children, nulls, offset, and dictionary but reports the given datatype. +// Used by the extension unwrap/rewrap path in coalesceArrayData so the +// Dictionary() member survives both directions (extension<->storage) +// when the extension's storage type is a dictionary. +func reshapeArrayDataType(src arrow.ArrayData, dt arrow.DataType) (arrow.ArrayData, error) { + if dict := src.Dictionary(); dict != nil { + d, ok := dict.(*array.Data) + if !ok { + return nil, fmt.Errorf("%w: unexpected dictionary data type %T", arrow.ErrInvalid, dict) + } + return array.NewDataWithDictionary(dt, src.Len(), src.Buffers(), + src.NullN(), src.Offset(), d), nil + } + return array.NewData(dt, src.Len(), src.Buffers(), + src.Children(), src.NullN(), src.Offset()), nil +} + // rebuildViewSingleBuffer rebuilds a BinaryView/StringView ArrayData so that // all non-inline payload lives in a single contiguous overflow buffer. The // caller owns the returned ArrayData. diff --git a/arrow/compute/cast_internal_test.go b/arrow/compute/cast_internal_test.go new file mode 100644 index 000000000..1bfc5dfb6 --- /dev/null +++ b/arrow/compute/cast_internal_test.go @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.18 + +package compute + +import ( + "strings" + "testing" + + "github.com/apache/arrow-go/v18/arrow" + "github.com/apache/arrow-go/v18/arrow/array" + "github.com/apache/arrow-go/v18/arrow/memory" + "github.com/apache/arrow-go/v18/internal/types" +) + +// TestCoalesceArrayDataExtensionOverDictOfView is the regression test for +// the GH-184 twelfth-round review finding: when an extension's storage is +// dictionary<...view...>, coalesceArrayData must preserve the +// Dictionary() member when it unwraps and rewraps the extension so the +// recursive DICTIONARY branch does not see a nil dictionary. Driven +// directly at the coalesce helper because the full cast pipeline for +// extension> hits a pre-existing SetMembers gap unrelated to +// view coalescing (exec.ArraySpan.SetMembers does not recognize a +// dictionary stored under an extension type). +func TestCoalesceArrayDataExtensionOverDictOfView(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + extType := types.NewDictStringViewExtType() + + const valBytes = 20000 + a := strings.Repeat("a", valBytes) + b := strings.Repeat("b", valBytes) + vbldr := array.NewStringViewBuilder(mem) + defer vbldr.Release() + vbldr.Append(a) + vbldr.Append(b) + vals := vbldr.NewArray() + defer vals.Release() + if n := len(vals.Data().Buffers()); n <= 3 { + t.Fatalf("test precondition: dict values must be multi-buffer; got %d", n) + } + + ibldr := array.NewInt32Builder(mem) + defer ibldr.Release() + ibldr.AppendValues([]int32{0, 1, 0, 1, 0}, nil) + idx := ibldr.NewArray() + defer idx.Release() + + valsData, ok := vals.Data().(*array.Data) + if !ok { + t.Fatalf("values data not *array.Data, got %T", vals.Data()) + } + extData := array.NewDataWithDictionary(extType, idx.Len(), + idx.Data().Buffers(), 0, 0, valsData) + defer extData.Release() + + if !needsViewCoalesce(extData) { + t.Fatal("needsViewCoalesce should return true for extension>") + } + + coalesced, err := coalesceArrayData(mem, extData) + if err != nil { + t.Fatalf("coalesceArrayData returned an error: %v", err) + } + defer coalesced.Release() + + if !arrow.TypeEqual(coalesced.DataType(), extType) { + t.Errorf("coalesced datatype mismatch: want %s, got %s", extType, coalesced.DataType()) + } + dict := coalesced.Dictionary() + if dict == nil { + t.Fatal("coalesced extension> must preserve Dictionary()") + } + if got := len(dict.Buffers()); got > 3 { + t.Errorf("coalesced dictionary values must be single-buffer; got %d buffers", got) + } +} diff --git a/internal/types/extension_types.go b/internal/types/extension_types.go index 3e68ab1aa..0162b4876 100644 --- a/internal/types/extension_types.go +++ b/internal/types/extension_types.go @@ -318,12 +318,14 @@ var ( _ arrow.ExtensionType = (*DictExtensionType)(nil) _ arrow.ExtensionType = (*SmallintType)(nil) _ arrow.ExtensionType = (*StringViewExtType)(nil) + _ arrow.ExtensionType = (*DictStringViewExtType)(nil) _ array.ExtensionArray = (*Parametric1Array)(nil) _ array.ExtensionArray = (*Parametric2Array)(nil) _ array.ExtensionArray = (*ExtStructArray)(nil) _ array.ExtensionArray = (*DictExtensionArray)(nil) _ array.ExtensionArray = (*SmallintArray)(nil) _ array.ExtensionArray = (*StringViewExtArray)(nil) + _ array.ExtensionArray = (*DictStringViewExtArray)(nil) ) // StringViewExtArray is an extension array backed by a string_view storage @@ -370,3 +372,56 @@ func (StringViewExtType) Deserialize(storageType arrow.DataType, data string) (a } return NewStringViewExtType(), nil } + +// DictStringViewExtArray is an extension array backed by a +// dictionary storage array. +type DictStringViewExtArray struct { + array.ExtensionArrayBase +} + +func (a DictStringViewExtArray) ValueStr(i int) string { + if a.IsNull(i) { + return array.NullValueStr + } + dict := a.Storage().(*array.Dictionary) + vals := dict.Dictionary().(*array.StringView) + return vals.Value(dict.GetValueIndex(i)) +} + +// DictStringViewExtType wraps dictionary as an +// extension so tests can exercise code paths that must preserve a +// Dictionary() member when unwrapping/rewrapping extensions at their +// storage layer. +type DictStringViewExtType struct { + arrow.ExtensionBase +} + +func NewDictStringViewExtType() *DictStringViewExtType { + return &DictStringViewExtType{ExtensionBase: arrow.ExtensionBase{ + Storage: &arrow.DictionaryType{ + IndexType: arrow.PrimitiveTypes.Int32, + ValueType: arrow.BinaryTypes.StringView, + }}} +} + +func (DictStringViewExtType) ArrayType() reflect.Type { + return reflect.TypeOf(DictStringViewExtArray{}) +} + +func (DictStringViewExtType) ExtensionName() string { return "dict_string_view_ext" } + +func (DictStringViewExtType) Serialize() string { return "dict_string_view_ext-serialized" } + +func (d *DictStringViewExtType) ExtensionEquals(other arrow.ExtensionType) bool { + return d.ExtensionName() == other.ExtensionName() +} + +func (d *DictStringViewExtType) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { + if data != "dict_string_view_ext-serialized" { + return nil, fmt.Errorf("type identifier did not match: '%s'", data) + } + if !arrow.TypeEqual(d.StorageType(), storageType) { + return nil, fmt.Errorf("invalid storage type for DictStringViewExtType: %s", storageType) + } + return NewDictStringViewExtType(), nil +} From d05c46ba961b5e2a7e60f6d6d8d595038aeef673 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 6 May 2026 11:18:56 -0400 Subject: [PATCH 21/23] refactor(compute): consolidate branch-introduced duplication Pulls the three branch-introduced duplicate patterns identified by roborev analyze duplication into shared helpers. 1) validateUtf8, validateUtf8Fsb, and validateUtf8View (in kernels/binary_view_casts.go) shared the same VisitBitBlocksShort + utf8.Valid + error-format skeleton and differed only in how they obtained the byte slice per row. Extracted validateUTF8Sequence in kernels/string_casts.go; each validator now composes it with its own valueAt closure. Removed the now-unused utf8/bitutils imports in binary_view_casts.go. 2) rebuildViewSingleBuffer and sumViewDictionaryOutOfLine duplicated the 'sum out-of-line bytes, error on > MaxInt32' loop. Collapsed them into a single sumOutOfLineBytes(n, isNull, valueLen) helper. unpackViewDictionary now calls the helper with closures that route dictionary positions through GetValueIndex. 3) unpackViewDictionary's StringView and BinaryView branches were ~95% similar copies of the same null-aware append loop. Extracted buildFromDictionary(dictArr, valIsNull, appendValue, appendNull) so both branches share the walk and the two-level null rule. 4) Promoted the TestViewDictionaryUnpackCast local buildDict closure (9 call sites within one test) to CastSuite.buildInt32Dictionary so it is discoverable and reusable across the suite. Net LOC change is close to zero (code moved into named helpers rather than removed), but the cognitive and copy/paste-drift surface area drops substantially. All GH-184 regression tests still pass. --- arrow/compute/cast.go | 94 ++++++++++--------- arrow/compute/cast_test.go | 59 ++++++------ .../internal/kernels/binary_view_casts.go | 21 ++--- .../compute/internal/kernels/string_casts.go | 44 ++++----- 4 files changed, 109 insertions(+), 109 deletions(-) diff --git a/arrow/compute/cast.go b/arrow/compute/cast.go index 85521ebee..5a5c44715 100644 --- a/arrow/compute/cast.go +++ b/arrow/compute/cast.go @@ -260,7 +260,6 @@ func rebuildViewSingleBuffer(mem memory.Allocator, data arrow.ArrayData) (arrow. defer arr.Release() var ( - total int64 getLen func(int) int getInto func(i int, dst func([]byte)) ) @@ -277,18 +276,9 @@ func rebuildViewSingleBuffer(mem memory.Allocator, data arrow.ArrayData) (arrow. default: return nil, fmt.Errorf("%w: unexpected view array type %T", arrow.ErrInvalid, arr) } - for i := 0; i < arr.Len(); i++ { - if arr.IsNull(i) { - continue - } - vlen := getLen(i) - if !arrow.IsViewInline(vlen) { - total += int64(vlen) - } - } - if total > math.MaxInt32 { - return nil, fmt.Errorf("%w: view array out-of-line payload (%d bytes) exceeds single-buffer limit (%d bytes)", - arrow.ErrInvalid, total, math.MaxInt32) + total, err := sumOutOfLineBytes(arr.Len(), arr.IsNull, getLen) + if err != nil { + return nil, err } bldr := array.NewBuilder(mem, arr.DataType()) @@ -455,9 +445,21 @@ func unpackDictionary(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecR // because array_take has no view kernel; the cast meta function still // needs to expand dictionaries as part of DICTIONARY -> X casts. func unpackViewDictionary(mem memory.Allocator, dictArr *array.Dictionary) (arrow.Array, error) { + isNullAt := func(valIsNull func(int) bool) func(int) bool { + return func(i int) bool { + if dictArr.IsNull(i) { + return true + } + return valIsNull(dictArr.GetValueIndex(i)) + } + } + lenAt := func(valLen func(int) int) func(int) int { + return func(i int) int { return valLen(dictArr.GetValueIndex(i)) } + } + switch vals := dictArr.Dictionary().(type) { case *array.StringView: - outOfLine, err := sumViewDictionaryOutOfLine(dictArr, vals.IsNull, vals.ValueLen) + outOfLine, err := sumOutOfLineBytes(dictArr.Len(), isNullAt(vals.IsNull), lenAt(vals.ValueLen)) if err != nil { return nil, err } @@ -467,21 +469,12 @@ func unpackViewDictionary(mem memory.Allocator, dictArr *array.Dictionary) (arro if outOfLine > 0 { bldr.ReserveData(int(outOfLine)) } - for i := 0; i < dictArr.Len(); i++ { - if dictArr.IsNull(i) { - bldr.AppendNull() - continue - } - idx := dictArr.GetValueIndex(i) - if vals.IsNull(idx) { - bldr.AppendNull() - continue - } - bldr.Append(vals.Value(idx)) - } + buildFromDictionary(dictArr, vals.IsNull, + func(idx int) { bldr.Append(vals.Value(idx)) }, + bldr.AppendNull) return bldr.NewArray(), nil case *array.BinaryView: - outOfLine, err := sumViewDictionaryOutOfLine(dictArr, vals.IsNull, vals.ValueLen) + outOfLine, err := sumOutOfLineBytes(dictArr.Len(), isNullAt(vals.IsNull), lenAt(vals.ValueLen)) if err != nil { return nil, err } @@ -491,18 +484,9 @@ func unpackViewDictionary(mem memory.Allocator, dictArr *array.Dictionary) (arro if outOfLine > 0 { bldr.ReserveData(int(outOfLine)) } - for i := 0; i < dictArr.Len(); i++ { - if dictArr.IsNull(i) { - bldr.AppendNull() - continue - } - idx := dictArr.GetValueIndex(i) - if vals.IsNull(idx) { - bldr.AppendNull() - continue - } - bldr.Append(vals.Value(idx)) - } + buildFromDictionary(dictArr, vals.IsNull, + func(idx int) { bldr.Append(vals.Value(idx)) }, + bldr.AppendNull) return bldr.NewArray(), nil default: return nil, fmt.Errorf("%w: unpackViewDictionary: expected view-typed dictionary values, got %s", @@ -510,24 +494,42 @@ func unpackViewDictionary(mem memory.Allocator, dictArr *array.Dictionary) (arro } } -// sumViewDictionaryOutOfLine returns the total non-inline payload (in bytes) -// that will be materialized when dictArr is unpacked against vals, returning -// arrow.ErrInvalid if the total exceeds the view single-data-buffer limit. -func sumViewDictionaryOutOfLine(dictArr *array.Dictionary, valIsNull func(int) bool, valLen func(int) int) (int64, error) { - var total int64 +// buildFromDictionary walks dictArr and routes each position to either +// appendValue(valueIndex) or appendNull(). A position is null when the +// dictionary index itself is null or when the referenced value is null. +func buildFromDictionary(dictArr *array.Dictionary, valIsNull func(int) bool, appendValue func(idx int), appendNull func()) { for i := 0; i < dictArr.Len(); i++ { if dictArr.IsNull(i) { + appendNull() continue } idx := dictArr.GetValueIndex(i) if valIsNull(idx) { + appendNull() + continue + } + appendValue(idx) + } +} + +// sumOutOfLineBytes returns the total non-inline payload (in bytes) that +// would be written to a view array's overflow buffer when iterating n +// positions with the given predicates, and arrow.ErrInvalid if the total +// exceeds the single-overflow-buffer limit (math.MaxInt32). Used by +// rebuildViewSingleBuffer (iterating top-level view rows), the +// binary-to-view cast kernel, and unpackViewDictionary (where the +// closures translate dictionary positions into value-level lookups). +func sumOutOfLineBytes(n int, isNull func(int) bool, valueLen func(int) int) (int64, error) { + var total int64 + for i := 0; i < n; i++ { + if isNull(i) { continue } - vlen := valLen(idx) + vlen := valueLen(i) if !arrow.IsViewInline(vlen) { total += int64(vlen) if total > math.MaxInt32 { - return 0, fmt.Errorf("%w: view dictionary out-of-line payload (%d bytes) exceeds single-buffer limit (%d bytes)", + return 0, fmt.Errorf("%w: view out-of-line payload (%d bytes) exceeds single-buffer limit (%d bytes)", arrow.ErrInvalid, total, math.MaxInt32) } } diff --git a/arrow/compute/cast_test.go b/arrow/compute/cast_test.go index ae3e4cb39..330f33fde 100644 --- a/arrow/compute/cast_test.go +++ b/arrow/compute/cast_test.go @@ -2012,6 +2012,29 @@ func (c *CastSuite) TestChunkedMultiBufferViewInputCoalesced() { c.Equal(count*2, total) } +// buildInt32Dictionary builds an int32-indexed *array.Dictionary from the +// given values array, indices slice, and optional per-index validity. Used +// by view-dictionary cast regression tests. Indices must be valid against +// values; the helper handles retain/release bookkeeping internally. +func (c *CastSuite) buildInt32Dictionary(values arrow.Array, indices []int32, valid []bool) *array.Dictionary { + ibldr := array.NewInt32Builder(c.mem) + defer ibldr.Release() + ibldr.AppendValues(indices, valid) + idx := ibldr.NewArray() + defer idx.Release() + + dictType := &arrow.DictionaryType{ + IndexType: arrow.PrimitiveTypes.Int32, + ValueType: values.DataType(), + } + d := array.NewData(dictType, idx.Len(), idx.Data().Buffers(), nil, + idx.NullN(), 0) + d.SetDictionary(values.Data()) + da := array.NewDictionaryData(d) + d.Release() + return da +} + // TestViewDictionaryUnpackCast is a regression test for the GH-184 // fifth-round review finding: a cast whose input is a dictionary whose // values are string_view or binary_view went through unpackDictionary's @@ -2020,29 +2043,11 @@ func (c *CastSuite) TestChunkedMultiBufferViewInputCoalesced() { // -> utf8, dict -> string_view (dict removal), dict // -> binary, and null/empty preservation. func (c *CastSuite) TestViewDictionaryUnpackCast() { - buildDict := func(values arrow.Array, indices []int32, valid []bool) *array.Dictionary { - ibldr := array.NewInt32Builder(c.mem) - defer ibldr.Release() - ibldr.AppendValues(indices, valid) - idx := ibldr.NewArray() - defer idx.Release() - - dictType := &arrow.DictionaryType{ - IndexType: arrow.PrimitiveTypes.Int32, - ValueType: values.DataType(), - } - d := array.NewData(dictType, idx.Len(), idx.Data().Buffers(), nil, - idx.NullN(), 0) - d.SetDictionary(values.Data()) - da := array.NewDictionaryData(d) - d.Release() - return da - } c.Run("string_view dict to utf8", func() { vals := c.buildStringViewArray([]string{"foo", "bar", "baz"}, nil) defer vals.Release() - dict := buildDict(vals, []int32{0, 1, 0, 2}, nil) + dict := c.buildInt32Dictionary(vals, []int32{0, 1, 0, 2}, nil) defer dict.Release() out, err := compute.CastArray(context.Background(), dict, @@ -2061,7 +2066,7 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { c.Run("string_view dict to string_view", func() { vals := c.buildStringViewArray([]string{"alpha", "beta"}, nil) defer vals.Release() - dict := buildDict(vals, []int32{1, 0, 1}, nil) + dict := c.buildInt32Dictionary(vals, []int32{1, 0, 1}, nil) defer dict.Release() out, err := compute.CastArray(context.Background(), dict, @@ -2079,7 +2084,7 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { c.Run("binary_view dict to binary", func() { vals := c.buildBinaryViewArray([][]byte{[]byte("\x01\x02"), []byte("\x03")}) defer vals.Release() - dict := buildDict(vals, []int32{0, 1, 0}, nil) + dict := c.buildInt32Dictionary(vals, []int32{0, 1, 0}, nil) defer dict.Release() out, err := compute.CastArray(context.Background(), dict, @@ -2097,7 +2102,7 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { c.Run("string_view dict with nulls", func() { vals := c.buildStringViewArray([]string{"x", "y"}, nil) defer vals.Release() - dict := buildDict(vals, []int32{0, 0, 1}, []bool{true, false, true}) + dict := c.buildInt32Dictionary(vals, []int32{0, 0, 1}, []bool{true, false, true}) defer dict.Release() out, err := compute.CastArray(context.Background(), dict, @@ -2118,7 +2123,7 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { c.Run("string_view dict with null in values array", func() { vals := c.buildStringViewArray([]string{"a", "", "c"}, []bool{true, false, true}) defer vals.Release() - dict := buildDict(vals, []int32{0, 1, 2, 1}, nil) + dict := c.buildInt32Dictionary(vals, []int32{0, 1, 2, 1}, nil) defer dict.Release() out, err := compute.CastArray(context.Background(), dict, @@ -2144,7 +2149,7 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { vals := bldr.NewArray() defer vals.Release() - dict := buildDict(vals, []int32{0, 1, 2, 1}, nil) + dict := c.buildInt32Dictionary(vals, []int32{0, 1, 2, 1}, nil) defer dict.Release() out, err := compute.CastArray(context.Background(), dict, @@ -2186,7 +2191,7 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { "test precondition: dictionary values must be multi-buffer; got %d", len(vals.Data().Buffers())) - dict := buildDict(vals, []int32{0, 1, 0, 1}, nil) + dict := c.buildInt32Dictionary(vals, []int32{0, 1, 0, 1}, nil) defer dict.Release() out, err := compute.CastArray(context.Background(), dict, @@ -2310,7 +2315,7 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { for i := range indices { indices[i] = int32(i & 1) } - dict := buildDict(vals, indices, nil) + dict := c.buildInt32Dictionary(vals, indices, nil) defer dict.Release() out, err := compute.CastArray(context.Background(), dict, @@ -2344,7 +2349,7 @@ func (c *CastSuite) TestViewDictionaryUnpackCast() { for i := range indices { indices[i] = int32(i & 1) } - dict := buildDict(vals, indices, nil) + dict := c.buildInt32Dictionary(vals, indices, nil) defer dict.Release() out, err := compute.CastArray(context.Background(), dict, diff --git a/arrow/compute/internal/kernels/binary_view_casts.go b/arrow/compute/internal/kernels/binary_view_casts.go index d1f3fa47a..d4550ea8a 100644 --- a/arrow/compute/internal/kernels/binary_view_casts.go +++ b/arrow/compute/internal/kernels/binary_view_casts.go @@ -21,13 +21,11 @@ package kernels import ( "fmt" "math" - "unicode/utf8" "unsafe" "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/array" "github.com/apache/arrow-go/v18/arrow/compute/exec" - "github.com/apache/arrow-go/v18/internal/bitutils" ) func validateUtf8View(input *exec.ArraySpan) error { @@ -36,22 +34,15 @@ func validateUtf8View(input *exec.ArraySpan) error { for i := 2; i < len(input.Buffers); i++ { dataBuffers = append(dataBuffers, input.Buffers[i].Buf) } - - return bitutils.VisitBitBlocksShort(input.Buffers[0].Buf, input.Offset, input.Len, - func(pos int64) error { + return validateUTF8Sequence(input.Buffers[0].Buf, input.Offset, input.Len, + func(pos int64) []byte { h := &views[input.Offset+pos] - var v []byte if h.IsInline() { - v = h.InlineBytes() - } else { - off := h.BufferOffset() - v = dataBuffers[h.BufferIndex()][off : off+int32(h.Len())] - } - if !utf8.Valid(v) { - return fmt.Errorf("%w: invalid UTF8 bytes: %x", arrow.ErrInvalid, v) + return h.InlineBytes() } - return nil - }, func() error { return nil }) + off := h.BufferOffset() + return dataBuffers[h.BufferIndex()][off : off+int32(h.Len())] + }) } func unsafeStringBytes(s string) []byte { diff --git a/arrow/compute/internal/kernels/string_casts.go b/arrow/compute/internal/kernels/string_casts.go index 20354ef8e..ec49ca3a1 100644 --- a/arrow/compute/internal/kernels/string_casts.go +++ b/arrow/compute/internal/kernels/string_casts.go @@ -32,40 +32,42 @@ import ( "github.com/apache/arrow-go/v18/internal/bitutils" ) +// validateUTF8Sequence walks non-null positions in bitmap and validates +// the byte slice returned by valueAt against utf8.Valid. Shared skeleton +// for validateUtf8, validateUtf8Fsb, and validateUtf8View so they stay +// consistent on iteration, error format, and null handling. +func validateUTF8Sequence(bitmap []byte, off, n int64, valueAt func(pos int64) []byte) error { + return bitutils.VisitBitBlocksShort(bitmap, off, n, + func(pos int64) error { + v := valueAt(pos) + if !utf8.Valid(v) { + return fmt.Errorf("%w: invalid UTF8 bytes: %x", arrow.ErrInvalid, v) + } + return nil + }, func() error { return nil }) +} + func validateUtf8Fsb(input *exec.ArraySpan) error { var ( inputData = input.Buffers[1].Buf width = int64(input.Type.(*arrow.FixedSizeBinaryType).ByteWidth) - bitmap = input.Buffers[0].Buf ) - - return bitutils.VisitBitBlocksShort(bitmap, input.Offset, input.Len, - func(pos int64) error { + return validateUTF8Sequence(input.Buffers[0].Buf, input.Offset, input.Len, + func(pos int64) []byte { pos += input.Offset - beg := pos * width - end := (pos + 1) * width - if !utf8.Valid(inputData[beg:end]) { - return fmt.Errorf("%w: invalid UTF8 bytes: %x", arrow.ErrInvalid, inputData[beg:end]) - } - return nil - }, func() error { return nil }) + return inputData[pos*width : (pos+1)*width] + }) } func validateUtf8[OffsetT int32 | int64](input *exec.ArraySpan) error { var ( inputOffsets = exec.GetSpanOffsets[OffsetT](input, 1) inputData = input.Buffers[2].Buf - bitmap = input.Buffers[0].Buf ) - - return bitutils.VisitBitBlocksShort(bitmap, input.Offset, input.Len, - func(pos int64) error { - v := inputData[inputOffsets[pos]:inputOffsets[pos+1]] - if !utf8.Valid(v) { - return fmt.Errorf("%w: invalid UTF8 bytes: %x", arrow.ErrInvalid, v) - } - return nil - }, func() error { return nil }) + return validateUTF8Sequence(input.Buffers[0].Buf, input.Offset, input.Len, + func(pos int64) []byte { + return inputData[inputOffsets[pos]:inputOffsets[pos+1]] + }) } func CastFsbToFsb(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { From e1414c414684710c994764a0a1875930fd5229d3 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 6 May 2026 11:31:26 -0400 Subject: [PATCH 22/23] refactor(compute): share structure between view cast paths Second pass driven by the re-run roborev analyze duplication job (842): two residual branch-introduced patterns were still 60-97% similar after the first cleanup. Both are consolidated without changing behavior. 1) unpackViewDictionary's StringView and BinaryView branches shared the sum-outOfLine + builder.Reserve + ReserveData + buildFromDictionary pipeline verbatim; only the concrete builder type and Value() return type differed. Extracted unpackViewDictionaryIntoBuilder which takes a viewValuesNull (the part of the dictionary-values interface that can be unified in Go) plus closures for reserveData / appendValue / appendNull keyed to the concrete builder. The two switch arms are now just 'new builder + release + call helper + NewArray'. 2) CastBinaryToBinaryView and CastBinaryViewToBinary shared a newBinaryAppender + per-type value accessor + null-preserving append-loop skeleton. Extracted binaryLikeValueAccessor for the input type dispatch (covers all 7 binary-like arrow array types in one place) and appendBinaryValues for the null-routing loop. Each cast kernel still owns its own direction-specific validation and size-limit check (view->binary uses MaxOf[OutOffsetT]; binary->view uses the overflow-buffer MaxInt32 limit), so the correctness- sensitive pieces stay explicit and auditable. Net LOC is slightly higher because the new helpers carry docstrings, but the copy-paste-drift surface is substantially smaller and the six-fold similarity the reviewer flagged is gone. --- arrow/compute/cast.go | 85 ++++++++++++------- .../internal/kernels/binary_view_casts.go | 77 +++++++++-------- 2 files changed, 96 insertions(+), 66 deletions(-) diff --git a/arrow/compute/cast.go b/arrow/compute/cast.go index 5a5c44715..bfbac1d8e 100644 --- a/arrow/compute/cast.go +++ b/arrow/compute/cast.go @@ -445,48 +445,26 @@ func unpackDictionary(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecR // because array_take has no view kernel; the cast meta function still // needs to expand dictionaries as part of DICTIONARY -> X casts. func unpackViewDictionary(mem memory.Allocator, dictArr *array.Dictionary) (arrow.Array, error) { - isNullAt := func(valIsNull func(int) bool) func(int) bool { - return func(i int) bool { - if dictArr.IsNull(i) { - return true - } - return valIsNull(dictArr.GetValueIndex(i)) - } - } - lenAt := func(valLen func(int) int) func(int) int { - return func(i int) int { return valLen(dictArr.GetValueIndex(i)) } - } - switch vals := dictArr.Dictionary().(type) { case *array.StringView: - outOfLine, err := sumOutOfLineBytes(dictArr.Len(), isNullAt(vals.IsNull), lenAt(vals.ValueLen)) - if err != nil { - return nil, err - } bldr := array.NewStringViewBuilder(mem) defer bldr.Release() - bldr.Reserve(dictArr.Len()) - if outOfLine > 0 { - bldr.ReserveData(int(outOfLine)) - } - buildFromDictionary(dictArr, vals.IsNull, + if err := unpackViewDictionaryIntoBuilder(dictArr, vals, vals.ValueLen, bldr, + bldr.ReserveData, func(idx int) { bldr.Append(vals.Value(idx)) }, - bldr.AppendNull) - return bldr.NewArray(), nil - case *array.BinaryView: - outOfLine, err := sumOutOfLineBytes(dictArr.Len(), isNullAt(vals.IsNull), lenAt(vals.ValueLen)) - if err != nil { + bldr.AppendNull); err != nil { return nil, err } + return bldr.NewArray(), nil + case *array.BinaryView: bldr := array.NewBinaryViewBuilder(mem) defer bldr.Release() - bldr.Reserve(dictArr.Len()) - if outOfLine > 0 { - bldr.ReserveData(int(outOfLine)) - } - buildFromDictionary(dictArr, vals.IsNull, + if err := unpackViewDictionaryIntoBuilder(dictArr, vals, vals.ValueLen, bldr, + bldr.ReserveData, func(idx int) { bldr.Append(vals.Value(idx)) }, - bldr.AppendNull) + bldr.AppendNull); err != nil { + return nil, err + } return bldr.NewArray(), nil default: return nil, fmt.Errorf("%w: unpackViewDictionary: expected view-typed dictionary values, got %s", @@ -494,6 +472,49 @@ func unpackViewDictionary(mem memory.Allocator, dictArr *array.Dictionary) (arro } } +// viewValuesNull is the null-check half of the view dictionary values +// interface; exposed separately because *array.StringView and +// *array.BinaryView both satisfy it while their Value() return types +// differ (string vs []byte) and cannot be expressed in one interface. +type viewValuesNull interface { + IsNull(int) bool +} + +// unpackViewDictionaryIntoBuilder runs the shared reserve/walk pipeline +// for both view-typed dictionary branches of unpackViewDictionary. The +// caller owns the builder lifecycle and supplies the reserveData / +// appendValue / appendNull closures so Append can stay typed to the +// concrete builder (StringViewBuilder.Append takes string, +// BinaryViewBuilder.Append takes []byte). +func unpackViewDictionaryIntoBuilder( + dictArr *array.Dictionary, + vals viewValuesNull, + valLen func(int) int, + builder array.Builder, + reserveData func(int), + appendValue func(idx int), + appendNull func(), +) error { + outOfLine, err := sumOutOfLineBytes(dictArr.Len(), + func(i int) bool { + if dictArr.IsNull(i) { + return true + } + return vals.IsNull(dictArr.GetValueIndex(i)) + }, + func(i int) int { return valLen(dictArr.GetValueIndex(i)) }, + ) + if err != nil { + return err + } + builder.Reserve(dictArr.Len()) + if outOfLine > 0 { + reserveData(int(outOfLine)) + } + buildFromDictionary(dictArr, vals.IsNull, appendValue, appendNull) + return nil +} + // buildFromDictionary walks dictArr and routes each position to either // appendValue(valueIndex) or appendNull(). A position is null when the // dictionary index itself is null or when the referenced value is null. diff --git a/arrow/compute/internal/kernels/binary_view_casts.go b/arrow/compute/internal/kernels/binary_view_casts.go index d4550ea8a..d3cb4dbe4 100644 --- a/arrow/compute/internal/kernels/binary_view_casts.go +++ b/arrow/compute/internal/kernels/binary_view_casts.go @@ -76,6 +76,43 @@ func newBinaryAppender(bldr array.Builder) (binaryAppender, error) { } } +// binaryLikeValueAccessor returns a per-index byte slice accessor for any +// binary-like arrow array. Shared by CastBinaryToBinaryView and +// CastBinaryViewToBinary so the input-type switch has one source of truth. +func binaryLikeValueAccessor(arr arrow.Array) (func(int) []byte, error) { + switch a := arr.(type) { + case *array.Binary: + return a.Value, nil + case *array.LargeBinary: + return a.Value, nil + case *array.String: + return func(i int) []byte { return unsafeStringBytes(a.Value(i)) }, nil + case *array.LargeString: + return func(i int) []byte { return unsafeStringBytes(a.Value(i)) }, nil + case *array.FixedSizeBinary: + return a.Value, nil + case *array.BinaryView: + return a.Value, nil + case *array.StringView: + return func(i int) []byte { return unsafeStringBytes(a.Value(i)) }, nil + default: + return nil, fmt.Errorf("%w: unsupported binary-like type: %s", + arrow.ErrNotImplemented, arr.DataType()) + } +} + +// appendBinaryValues drives the null-preserving append loop shared by +// both directions of the binary<->view cast kernels. +func appendBinaryValues(arr arrow.Array, getVal func(int) []byte, ba binaryAppender) { + for i := 0; i < arr.Len(); i++ { + if arr.IsNull(i) { + ba.bldr.AppendNull() + continue + } + ba.appendBytes(getVal(i)) + } +} + // CastBinaryToBinaryView casts a Binary, LargeBinary, String, LargeString, // or FixedSizeBinary array into a BinaryView or StringView array. When the // source is a non-utf8 binary type and the destination is a utf8 view type, @@ -120,19 +157,8 @@ func CastBinaryToBinaryView(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec arr := input.MakeArray() defer arr.Release() - var getVal func(int) []byte - switch a := arr.(type) { - case *array.Binary: - getVal = a.Value - case *array.LargeBinary: - getVal = a.Value - case *array.String: - getVal = func(i int) []byte { return unsafeStringBytes(a.Value(i)) } - case *array.LargeString: - getVal = func(i int) []byte { return unsafeStringBytes(a.Value(i)) } - case *array.FixedSizeBinary: - getVal = a.Value - default: + getVal, err := binaryLikeValueAccessor(arr) + if err != nil { return fmt.Errorf("%w: unsupported input type for cast to %s: %s", arrow.ErrNotImplemented, out.Type, input.Type) } @@ -160,13 +186,7 @@ func CastBinaryToBinaryView(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec ba.reserveData(int(outOfLineTotal)) } - for i := 0; i < arr.Len(); i++ { - if arr.IsNull(i) { - ba.bldr.AppendNull() - continue - } - ba.appendBytes(getVal(i)) - } + appendBinaryValues(arr, getVal, ba) result := ba.bldr.NewArray() out.TakeOwnership(result.Data()) @@ -202,13 +222,8 @@ func CastBinaryViewToBinary[OutOffsetT int32 | int64](ctx *exec.KernelCtx, batch arr := input.MakeArray() defer arr.Release() - var getVal func(int) []byte - switch a := arr.(type) { - case *array.BinaryView: - getVal = a.Value - case *array.StringView: - getVal = func(i int) []byte { return unsafeStringBytes(a.Value(i)) } - default: + getVal, err := binaryLikeValueAccessor(arr) + if err != nil { return fmt.Errorf("%w: unsupported input type for view-to-binary cast: %s", arrow.ErrNotImplemented, input.Type) } @@ -224,13 +239,7 @@ func CastBinaryViewToBinary[OutOffsetT int32 | int64](ctx *exec.KernelCtx, batch arrow.ErrInvalid, input.Type, out.Type) } - for i := 0; i < arr.Len(); i++ { - if arr.IsNull(i) { - ba.bldr.AppendNull() - continue - } - ba.appendBytes(getVal(i)) - } + appendBinaryValues(arr, getVal, ba) result := ba.bldr.NewArray() out.TakeOwnership(result.Data()) From 074dc5e0d84d08ee19b1a2c6957290a096a77f5f Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 6 May 2026 11:41:40 -0400 Subject: [PATCH 23/23] refactor(compute): box unpackViewDictionaryIntoBuilder closures in an adapter Third-round roborev analyze complexity job (848) flagged the helper's 7-parameter signature with 3 callback parameters as harder to read than its ~30 LOC warrants. Packaged the builder + reserveData + appendValue + appendNull closures into a viewDictBuilderAdapter struct, dropping the signature to 4 parameters. Each switch arm now constructs the adapter with a named-field initializer, making the role of each closure obvious at the call site. No behaviour change; all existing regression tests pass unchanged. --- arrow/compute/cast.go | 54 ++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/arrow/compute/cast.go b/arrow/compute/cast.go index bfbac1d8e..df95a0f04 100644 --- a/arrow/compute/cast.go +++ b/arrow/compute/cast.go @@ -449,20 +449,24 @@ func unpackViewDictionary(mem memory.Allocator, dictArr *array.Dictionary) (arro case *array.StringView: bldr := array.NewStringViewBuilder(mem) defer bldr.Release() - if err := unpackViewDictionaryIntoBuilder(dictArr, vals, vals.ValueLen, bldr, - bldr.ReserveData, - func(idx int) { bldr.Append(vals.Value(idx)) }, - bldr.AppendNull); err != nil { + if err := unpackViewDictionaryIntoBuilder(dictArr, vals, vals.ValueLen, viewDictBuilderAdapter{ + builder: bldr, + reserveData: bldr.ReserveData, + appendValue: func(idx int) { bldr.Append(vals.Value(idx)) }, + appendNull: bldr.AppendNull, + }); err != nil { return nil, err } return bldr.NewArray(), nil case *array.BinaryView: bldr := array.NewBinaryViewBuilder(mem) defer bldr.Release() - if err := unpackViewDictionaryIntoBuilder(dictArr, vals, vals.ValueLen, bldr, - bldr.ReserveData, - func(idx int) { bldr.Append(vals.Value(idx)) }, - bldr.AppendNull); err != nil { + if err := unpackViewDictionaryIntoBuilder(dictArr, vals, vals.ValueLen, viewDictBuilderAdapter{ + builder: bldr, + reserveData: bldr.ReserveData, + appendValue: func(idx int) { bldr.Append(vals.Value(idx)) }, + appendNull: bldr.AppendNull, + }); err != nil { return nil, err } return bldr.NewArray(), nil @@ -480,21 +484,23 @@ type viewValuesNull interface { IsNull(int) bool } +// viewDictBuilderAdapter packages the concrete builder + typed closures +// needed by unpackViewDictionaryIntoBuilder; boxing them keeps the +// helper's parameter list tight and lets each caller express the +// builder-specific Append in one place. +type viewDictBuilderAdapter struct { + builder array.Builder + reserveData func(int) + appendValue func(idx int) + appendNull func() +} + // unpackViewDictionaryIntoBuilder runs the shared reserve/walk pipeline // for both view-typed dictionary branches of unpackViewDictionary. The -// caller owns the builder lifecycle and supplies the reserveData / -// appendValue / appendNull closures so Append can stay typed to the -// concrete builder (StringViewBuilder.Append takes string, -// BinaryViewBuilder.Append takes []byte). -func unpackViewDictionaryIntoBuilder( - dictArr *array.Dictionary, - vals viewValuesNull, - valLen func(int) int, - builder array.Builder, - reserveData func(int), - appendValue func(idx int), - appendNull func(), -) error { +// caller owns the builder lifecycle and supplies the adapter so Append +// can stay typed to the concrete builder (StringViewBuilder.Append takes +// string, BinaryViewBuilder.Append takes []byte). +func unpackViewDictionaryIntoBuilder(dictArr *array.Dictionary, vals viewValuesNull, valLen func(int) int, adapter viewDictBuilderAdapter) error { outOfLine, err := sumOutOfLineBytes(dictArr.Len(), func(i int) bool { if dictArr.IsNull(i) { @@ -507,11 +513,11 @@ func unpackViewDictionaryIntoBuilder( if err != nil { return err } - builder.Reserve(dictArr.Len()) + adapter.builder.Reserve(dictArr.Len()) if outOfLine > 0 { - reserveData(int(outOfLine)) + adapter.reserveData(int(outOfLine)) } - buildFromDictionary(dictArr, vals.IsNull, appendValue, appendNull) + buildFromDictionary(dictArr, vals.IsNull, adapter.appendValue, adapter.appendNull) return nil }