From 914b1c89a637c52661e09d84185b96edae01328c Mon Sep 17 00:00:00 2001
From: CapMoon
Date: Mon, 30 Mar 2026 16:26:25 +0800
Subject: [PATCH 1/2] HDFS-17902. StripedBlockChecksumReconstructor leaks
 connections on init failure

---
 .../erasurecode/StripedBlockChecksumReconstructor.java    | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/erasurecode/StripedBlockChecksumReconstructor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/erasurecode/StripedBlockChecksumReconstructor.java
index a196935219ec5..d5115c9827e04 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/erasurecode/StripedBlockChecksumReconstructor.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/erasurecode/StripedBlockChecksumReconstructor.java
@@ -52,7 +52,13 @@ protected StripedBlockChecksumReconstructor(ErasureCodingWorker worker,
     assert targetIndices != null;
     this.checksumWriter = checksumWriter;
     this.requestedLen = requestedBlockLength;
-    init();
+    try {
+      init();
+    } catch (IOException e) {
+      getStripedReader().close();
+      cleanup();
+      throw e;
+    }
   }
 
   private void init() throws IOException {

From e0b5f66ef453a2a838de9c20f7fd49a1b8ddd3c9 Mon Sep 17 00:00:00 2001
From: CapMoon
Date: Tue, 31 Mar 2026 10:17:38 +0800
Subject: [PATCH 2/2] HDFS-17902. Add a unit test

---
 .../apache/hadoop/hdfs/TestFileChecksum.java | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileChecksum.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileChecksum.java
index afadbeccac790..c889574ee83e9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileChecksum.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileChecksum.java
@@ -717,6 +717,48 @@ public void testMixedBytesPerChecksum(String pMode) throws Exception {
     }
   }
 
+  @MethodSource("getParameters")
+  @ParameterizedTest
+  @Timeout(value = 90)
+  public void testStripedChecksumReconCleanupOnInitFailure(String pMode)
+      throws Exception {
+    initTestFileChecksum(pMode);
+    String stripedFile = ecDir + "/stripedFileChecksumInitFail";
+    prepareTestFiles(fileSize, new String[]{stripedFile});
+
+    LocatedBlocks locatedBlocks = client.getLocatedBlocks(stripedFile, 0);
+    LocatedBlock locatedBlock = locatedBlocks.get(0);
+    DatanodeInfo[] blockDns = locatedBlock.getLocations();
+
+    int numToKill = parityBlocks + 1;
+    int[] killedIdx = new int[numToKill];
+    int killed = 0;
+    for (int i = 0; i < blockDns.length && killed < numToKill; i++) {
+      int idx = 0;
+      for (DataNode dn : cluster.getDataNodes()) {
+        if (dn.getInfoPort() == blockDns[i].getInfoPort()) {
+          shutdownDataNode(dn);
+          killedIdx[killed++] = idx;
+          break;
+        }
+        idx++;
+      }
+    }
+
+    try {
+      Exception ex = assertThrows(Exception.class, () -> {
+        fs.getFileChecksum(new Path(stripedFile));
+      });
+      LOG.info("Got expected failure when too many DNs are down: {}",
+          ex.getMessage());
+    } finally {
+      for (int i = 0; i < killed; i++) {
+        cluster.restartDataNode(killedIdx[i]);
+      }
+      cluster.waitActive();
+    }
+  }
+
 private FileChecksum getFileChecksum(String filePath, int range,
     boolean killDn) throws Exception {
   int dnIdxToDie = -1;