Make the build more forward-compatible (in preparation for moving to and testing with Java 21/25)

iemejia · iemejia · commit 7d25a18bebd6 · 2026-05-23T10:44:07.000+02:00
Hadoop upgrade (3.3.0 -> 3.4.3):
- Hadoop 3.3.x calls javax.security.auth.Subject.getSubject(), which
  throws UnsupportedOperationException by default since JDK 23 (and
  always since JDK 24, JEP 486). Hadoop 3.4.x uses
  Subject.current() instead, restoring JDK 25 compatibility.

Spotless upgrade (2.46.1 -> 3.5.1):
- Spotless 2.46.1 calls com.sun.tools.javac.util.Log methods that
  were removed in JDK 25, causing NoSuchMethodError at format time.
  Spotless 3.5.1 is compatible with JDK 25. The minor formatting
  changes to switch/case comment indentation are from the new version.

Fix ByteBuffer leak in vectored I/O reads:
- Hadoop's readVectored() API accepts an IntFunction<ByteBuffer> for
  allocation but has no corresponding release callback. When a
  wrapping filesystem like ChecksumFileSystem is in the path, Hadoop
  allocates a buffer through the caller's allocator, uses it
  internally for checksum verification, then creates a different
  buffer for the CompletableFuture result. The originally allocated
  buffer is abandoned without release.
- This caused TrackingByteBufferAllocator (used in tests) to throw
  LeakedByteBufferException for tests using vectored I/O:
  TestRecordLevelFilters, TestColumnIndexFiltering, TestParquetReader.
- Fix: wrap the allocator in a capturing decorator that tracks every
  buffer allocated during readVectored(), then registers them all
  for release via ByteBufferReleaser. A try-finally ensures buffers
  are registered even if a read future times out or fails.
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -763,7 +763,7 @@ public EncodingStats convertEncodingStats(List<PageEncodingStats> stats) {
       switch (stat.getPage_type()) {
         case DATA_PAGE_V2:
           builder.withV2Pages();
-          // falls through
+        // falls through
         case DATA_PAGE:
           builder.addDataEncoding(getEncoding(stat.getEncoding()), stat.getCount());
           break;
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/CodecFactory.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/CodecFactory.java
@@ -325,7 +325,7 @@ private String cacheKey(CompressionCodecName codecName) {
         level = conf.get("parquet.compression.codec.zstd.level");
         break;
       default:
-        // compression level is not supported; ignore it
+      // compression level is not supported; ignore it
     }
     String codecClass = codecName.getHadoopCompressionCodecClassName();
     return level == null ? codecClass : codecClass + ":" + level;
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexValidator.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexValidator.java
@@ -546,7 +546,7 @@ private void validateBoundaryOrder(
               prevMaxValue::toString);
           break;
         case UNORDERED:
-          // No checks necessary.
+        // No checks necessary.
       }
     }
   }
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DirectCodecFactory.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DirectCodecFactory.java
@@ -103,8 +103,8 @@ protected BytesCompressor createCompressor(final CompressionCodecName codecName)
         return new SnappyCompressor();
       case ZSTD:
         return new ZstdCompressor();
-        // todo: create class similar to the SnappyCompressor for zlib and exclude it as
-        // snappy is above since it also generates allocateDirect calls.
+      // todo: create class similar to the SnappyCompressor for zlib and exclude it as
+      // snappy is above since it also generates allocateDirect calls.
       default:
         return super.createCompressor(codecName);
     }
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
@@ -65,6 +65,7 @@
 import org.apache.parquet.HadoopReadOptions;
 import org.apache.parquet.ParquetReadOptions;
 import org.apache.parquet.Preconditions;
+import org.apache.parquet.bytes.ByteBufferAllocator;
 import org.apache.parquet.bytes.ByteBufferInputStream;
 import org.apache.parquet.bytes.ByteBufferReleaser;
 import org.apache.parquet.bytes.BytesInput;
@@ -1361,12 +1362,42 @@ private void readVectored(List<ConsecutivePartList> allParts, ChunkListBuilder b
       totalSize += len;
     }
     LOG.debug("Reading {} bytes of data with vectored IO in {} ranges", totalSize, ranges.size());
-    // Request a vectored read;
-    f.readVectored(ranges, options.getAllocator());
-    int k = 0;
-    for (ConsecutivePartList consecutivePart : allParts) {
-      ParquetFileRange currRange = ranges.get(k++);
-      consecutivePart.readFromVectoredRange(currRange, builder);
+    // Use a capturing allocator to track all buffers allocated by Hadoop during vectored reads.
+    // The buffer returned from the read future may differ from the one originally allocated
+    // (e.g., ChecksumFileSystem wraps/copies buffers), so we must track the actual allocations.
+    List<ByteBuffer> allocatedBuffers = new ArrayList<>();
+    ByteBufferAllocator capturingAllocator = new ByteBufferAllocator() {
+      @Override
+      public ByteBuffer allocate(int size) {
+        ByteBuffer buf = options.getAllocator().allocate(size);
+        allocatedBuffers.add(buf);
+        return buf;
+      }
+
+      @Override
+      public void release(ByteBuffer b) {
+        // Use identity comparison; ByteBuffer.equals() is content-based and could match wrong buffer
+        allocatedBuffers.removeIf(buf -> buf == b);
+        options.getAllocator().release(b);
+      }
+
+      @Override
+      public boolean isDirect() {
+        return options.getAllocator().isDirect();
+      }
+    };
+    try {
+      // Request a vectored read;
+      f.readVectored(ranges, capturingAllocator);
+      int k = 0;
+      for (ConsecutivePartList consecutivePart : allParts) {
+        ParquetFileRange currRange = ranges.get(k++);
+        consecutivePart.readFromVectoredRange(currRange, builder);
+      }
+    } finally {
+      // Register all buffers allocated during vectored reads for release.
+      // In a finally block so buffers are not leaked on read failures.
+      builder.addBuffersToRelease(allocatedBuffers);
     }
   }
 
diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/BufferedProtocolReadToWrite.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/BufferedProtocolReadToWrite.java
@@ -226,7 +226,7 @@ private boolean readOneValue(TProtocol in, byte type, List<Action> buffer, Thrif
         writeShortAction(buffer, s);
         break;
       case TType.ENUM: // same as i32 => actually never seen in the protocol layer as enums are written as a i32
-        // field
+      // field
       case TType.I32:
         final int i = in.readI32();
         checkEnum(expectedType, i);
diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ProtocolReadToWrite.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ProtocolReadToWrite.java
@@ -74,7 +74,7 @@ void readOneValue(TProtocol in, TProtocol out, byte type) throws TException {
         out.writeI16(in.readI16());
         break;
       case TType.ENUM: // same as i32 => actually never seen in the protocol layer as enums are written as a i32
-        // field
+      // field
       case TType.I32:
         out.writeI32(in.readI32());
         break;
diff --git a/pom.xml b/pom.xml
@@ -80,10 +80,10 @@
     <jackson-annotations.version>2.21</jackson-annotations.version>
     <japicmp.version>0.25.7</japicmp.version>
     <javax.annotation.version>1.3.2</javax.annotation.version>
-    <spotless.version>2.46.1</spotless.version>
+    <spotless.version>3.5.1</spotless.version>
     <shade.prefix>shaded.parquet</shade.prefix>
     <!-- Guarantees no newer classes/methods/constants are used by parquet. -->
-    <hadoop.version>3.3.0</hadoop.version>
+    <hadoop.version>3.4.3</hadoop.version>
     <parquet.format.version>2.12.0</parquet.format.version>
     <previous.version>1.17.0</previous.version>
     <thrift.executable>thrift</thrift.executable>

Original file line number	Diff line number	Diff line change
`@@ -325,7 +325,7 @@ private String cacheKey(CompressionCodecName codecName) {`
`325`	`325`	`level = conf.get("parquet.compression.codec.zstd.level");`
`326`	`326`	`break;`
`327`	`327`	`default:`
`328`		`- // compression level is not supported; ignore it`
	`328`	`+ // compression level is not supported; ignore it`
`329`	`329`	`}`
`330`	`330`	`String codecClass = codecName.getHadoopCompressionCodecClassName();`
`331`	`331`	`return level == null ? codecClass : codecClass + ":" + level;`
Original file line number	Diff line number	Diff line change
`@@ -546,7 +546,7 @@ private void validateBoundaryOrder(`
`546`	`546`	`prevMaxValue::toString);`
`547`	`547`	`break;`
`548`	`548`	`case UNORDERED:`
`549`		`- // No checks necessary.`
	`549`	`+ // No checks necessary.`
`550`	`550`	`}`
`551`	`551`	`}`
`552`	`552`	`}`