From dbab7094b90fd7bb9c119f16215c4d6b0ad039a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Tue, 21 Apr 2026 16:52:52 +0000 Subject: [PATCH] GH-3520: Cleanup binary write path Two small cleanups on the binary write side: 1. DeltaByteArrayWriter: replace v.getBytes() with v.copy().getBytesUnsafe() to avoid the unconditional Arrays.copyOf that getBytes() performs for ByteArrayBackedBinary. copy() is a no-op for constant Binaries, and getBytesUnsafe() returns the backing array directly. For reused-buffer Binaries (e.g. ByteBufferBackedBinary over a slab being mutated), copy() still snapshots them so correctness is preserved. 2. FixedLenByteArrayPlainValuesWriter: drop the unused LittleEndianDataOutputStream wrapper (only used to call Binary.writeTo(), which works directly with the underlying CapacityByteArrayOutputStream). The trailing out.flush() in getBytes() is also dead. Same pattern as #3517 fixed in DeltaLengthByteArrayValuesWriter. No public API change. No file format change. Validation: parquet-column 573 tests pass. Built with -Dspotless.check.skip=true -Drat.skip=true -Djapicmp.skip=true. --- .../values/deltastrings/DeltaByteArrayWriter.java | 5 ++++- .../plain/FixedLenByteArrayPlainValuesWriter.java | 10 +--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java index c234108613..7828ffc3f5 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java @@ -89,7 +89,10 @@ public String memUsageString(String prefix) { @Override public void writeBytes(Binary v) { int i = 0; - byte[] vb = v.getBytes(); + // copy() is a no-op for constant (non-reused) Binaries, and getBytesUnsafe() + // returns the backing array directly for ByteArrayBackedBinary — avoiding + // the unconditional array copy that getBytes() always performs. + byte[] vb = v.copy().getBytesUnsafe(); int length = previous.length < vb.length ? previous.length : vb.length; // find the number of matching prefix bytes between this value and the previous one for (i = 0; (i < length) && (previous[i] == vb[i]); i++) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java index dec4d1be1b..c170ad8e90 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/plain/FixedLenByteArrayPlainValuesWriter.java @@ -22,7 +22,6 @@ import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.bytes.CapacityByteArrayOutputStream; -import org.apache.parquet.bytes.LittleEndianDataOutputStream; import org.apache.parquet.column.Encoding; import org.apache.parquet.column.values.ValuesWriter; import org.apache.parquet.io.ParquetEncodingException; @@ -37,7 +36,6 @@ public class FixedLenByteArrayPlainValuesWriter extends ValuesWriter { private static final Logger LOG = LoggerFactory.getLogger(PlainValuesWriter.class); private CapacityByteArrayOutputStream arrayOut; - private LittleEndianDataOutputStream out; private int length; private ByteBufferAllocator allocator; @@ -46,7 +44,6 @@ public FixedLenByteArrayPlainValuesWriter( this.length = length; this.allocator = allocator; this.arrayOut = new CapacityByteArrayOutputStream(initialSize, pageSize, this.allocator); - this.out = new LittleEndianDataOutputStream(arrayOut); } @Override @@ -56,7 +53,7 @@ public final void writeBytes(Binary v) { "Fixed Binary size " + v.length() + " does not match field type length " + length); } try { - v.writeTo(out); + v.writeTo(arrayOut); } catch (IOException e) { throw new ParquetEncodingException("could not write fixed bytes", e); } @@ -69,11 +66,6 @@ public long getBufferedSize() { @Override public BytesInput getBytes() { - try { - out.flush(); - } catch (IOException e) { - throw new ParquetEncodingException("could not write page", e); - } LOG.debug("writing a buffer of size {}", arrayOut.size()); return BytesInput.from(arrayOut); }