Describe the bug, including details regarding any error messages, version, and platform.
The panic appears to come from an optimization that hashes 2-byte fixed-size binary values as uint16: the set-lookup state expects a hashing.TypedMemoTable[uint16] but is handed a *hashing.BinaryMemoTable, which does not implement that interface (it lacks the Exists method), so the interface conversion fails.
Query from TPC-DS (scale factor 1):
SELECT count(*)
FROM customer_address
WHERE ca_state IN ('MS', 'IN', 'ND', 'OK', 'NM', 'VA');
Actual:
panic: interface conversion: *hashing.BinaryMemoTable is not hashing.TypedMemoTable[uint16]: missing method Exists
goroutine 243 [running]:
://github.com.(*SetLookupState[...]).Init(0x1f76000, {{0x1f7e180, 0x84a9f29e070}, 0x6, {0x84aa34e4340, 0x1, 0x1}, 0x0})
/vendor/://github.com/scalar_set_lookup.go:176 +0x18f
://github.com.CreateSetLookupState({{0x1f7e180, 0x84a9f29e070}, 0x6, {0x84aa34e4340, 0x1, 0x1}, 0x0}, {0x1f6ab50, 0x84a93f74c08})
/vendor/://github.com/scalar_set_lookup.go:144 +0x553
://github.com(0x84a94059cb0, {{0x1f649e0, 0x84a94050a88}, {0x84a943d0360, 0x1, 0x1}, {0x1b9ce40, 0x84a9f31e168}})
/vendor/://github.com +0xe6e
://github.com({0x1f726d8, 0x84a94193d40}, {{0x84aa34e4270?, 0x84a943d00c0?, 0x84aa1bff628?}, 0x48daa5?}, {0x1f8e538, 0x84a946e69b0}, {0x1f83be8, 0x84a94193d10})
/vendor/://github.com +0x1806
Steps to Reproduce:
package main
import (
"context"
"fmt"
"testing"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/array"
"github.com/apache/arrow-go/v18/arrow/compute"
"github.com/apache/arrow-go/v18/arrow/memory"
)
// TestReproduceArrowPanicWithSetOptions is a minimal reproduction for the
// IsIn failure on 2-byte fixed-size binary data described in this report.
//
// It builds a 100-row FixedSizeBinary(2) column filled with "MS" and a value
// set containing "MS" and "VA", then calls compute.IsIn on them. Any panic
// raised while the test runs is recovered and printed; a returned error is
// printed as well. The test makes no assertions, so it never fails by itself.
func TestReproduceArrowPanicWithSetOptions(t *testing.T) {
	const rowCount = 100

	alloc := memory.NewGoAllocator()
	stateType := &arrow.FixedSizeBinaryType{ByteWidth: 2}

	// Input column: rowCount copies of the two-byte value "MS".
	columnBuilder := array.NewFixedSizeBinaryBuilder(alloc, stateType)
	for remaining := rowCount; remaining > 0; remaining-- {
		columnBuilder.Append([]byte("MS"))
	}
	column := columnBuilder.NewArray()
	defer column.Release()

	// Value set to test membership against.
	setBuilder := array.NewFixedSizeBinaryBuilder(alloc, stateType)
	for _, state := range []string{"MS", "VA"} {
		setBuilder.Append([]byte(state))
	}
	lookup := setBuilder.NewArray()
	defer lookup.Release()

	var opts compute.SetOptions
	opts.ValueSet = compute.NewDatumWithoutOwning(lookup)
	opts.NullBehavior = compute.NullMatchingMatch

	input := compute.NewDatumWithoutOwning(column)

	// Registered after the arrays are built so that it runs first on unwind
	// and reports the panic before the deferred Release calls execute.
	defer func() {
		recovered := recover()
		if recovered == nil {
			return
		}
		fmt.Printf("Caught expected panic: %v\n", recovered)
	}()

	if _, err := compute.IsIn(context.Background(), opts, input); err != nil {
		fmt.Printf("Got error instead of panic: %v\n", err)
	}
}
Component(s)
Parquet
Describe the bug, including details regarding any error messages, version, and platform.
The panic appears to come from an optimization that hashes 2-byte fixed-size binary values as uint16: the set-lookup state expects a hashing.TypedMemoTable[uint16] but is handed a *hashing.BinaryMemoTable, which does not implement that interface (it lacks the Exists method), so the interface conversion fails.
Query from TPC-DS (scale factor 1):
SELECT count(*)
FROM customer_address
WHERE ca_state IN ('MS', 'IN', 'ND', 'OK', 'NM', 'VA');
Actual:
panic: interface conversion: *hashing.BinaryMemoTable is not hashing.TypedMemoTable[uint16]: missing method Exists
goroutine 243 [running]:
://github.com.(*SetLookupState[...]).Init(0x1f76000, {{0x1f7e180, 0x84a9f29e070}, 0x6, {0x84aa34e4340, 0x1, 0x1}, 0x0})
/vendor/://github.com/scalar_set_lookup.go:176 +0x18f
://github.com.CreateSetLookupState({{0x1f7e180, 0x84a9f29e070}, 0x6, {0x84aa34e4340, 0x1, 0x1}, 0x0}, {0x1f6ab50, 0x84a93f74c08})
/vendor/://github.com/scalar_set_lookup.go:144 +0x553
://github.com(0x84a94059cb0, {{0x1f649e0, 0x84a94050a88}, {0x84a943d0360, 0x1, 0x1}, {0x1b9ce40, 0x84a9f31e168}})
/vendor/://github.com +0xe6e
://github.com({0x1f726d8, 0x84a94193d40}, {{0x84aa34e4270?, 0x84a943d00c0?, 0x84aa1bff628?}, 0x48daa5?}, {0x1f8e538, 0x84a946e69b0}, {0x1f83be8, 0x84a94193d10})
/vendor/://github.com +0x1806
Steps to Reproduce:
Component(s)
Parquet