Skip to content

Commit e1ce8ad

Browse files
committed
GH-3601: Cache shouldIgnoreStatistics version parsing result
1 parent 9d9ddca commit e1ce8ad

2 files changed

Lines changed: 71 additions & 0 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*/
1919
package org.apache.parquet;
2020

21+
import java.util.concurrent.ConcurrentHashMap;
2122
import java.util.concurrent.atomic.AtomicBoolean;
2223
import org.apache.parquet.SemanticVersion.SemanticVersionParseException;
2324
import org.apache.parquet.VersionParser.ParsedVersion;
@@ -35,6 +36,8 @@
3536
*/
3637
public class CorruptStatistics {
3738
private static final AtomicBoolean alreadyLogged = new AtomicBoolean(false);
39+
private static final ConcurrentHashMap<String, Boolean> CACHE = new ConcurrentHashMap<>();
40+
private static final int MAX_CACHE_SIZE = 64;
3841

3942
private static final Logger LOG = LoggerFactory.getLogger(CorruptStatistics.class);
4043

@@ -68,6 +71,12 @@ public static boolean shouldIgnoreStatistics(String createdBy, PrimitiveTypeName
6871
return true;
6972
}
7073

74+
return CACHE.size() >= MAX_CACHE_SIZE
75+
? shouldIgnoreStatisticsForBinary(createdBy)
76+
: CACHE.computeIfAbsent(createdBy, CorruptStatistics::shouldIgnoreStatisticsForBinary);
77+
}
78+
79+
private static boolean shouldIgnoreStatisticsForBinary(String createdBy) {
7180
try {
7281
ParsedVersion version = VersionParser.parse(createdBy);
7382

@@ -114,4 +123,14 @@ private static void warnOnce(String message) {
114123
LOG.warn(message);
115124
}
116125
}
126+
127+
// Visible for testing
128+
static int cacheSize() {
129+
return CACHE.size();
130+
}
131+
132+
// Visible for testing
133+
static void clearCache() {
134+
CACHE.clear();
135+
}
117136
}

parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,21 @@
1818
*/
1919
package org.apache.parquet;
2020

21+
import static org.junit.Assert.assertEquals;
2122
import static org.junit.Assert.assertFalse;
2223
import static org.junit.Assert.assertTrue;
2324

2425
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
26+
import org.junit.Before;
2527
import org.junit.Test;
2628

2729
public class CorruptStatisticsTest {
2830

31+
@Before
32+
public void setUp() {
33+
CorruptStatistics.clearCache();
34+
}
35+
2936
@Test
3037
public void testOnlyAppliesToBinary() {
3138
assertTrue(CorruptStatistics.shouldIgnoreStatistics(
@@ -124,4 +131,49 @@ public void testDistributionCorruptStatistics() {
124131
assertTrue(CorruptStatistics.shouldIgnoreStatistics(
125132
"parquet-mr version 1.7.0 (build abcd)", PrimitiveTypeName.BINARY));
126133
}
134+
135+
@Test
136+
public void testCachingBehavior() {
137+
assertEquals(0, CorruptStatistics.cacheSize());
138+
139+
// Call many times with the same createdBy — cache should store exactly one entry
140+
String createdBy = "parquet-mr version 1.6.0 (build cache-test)";
141+
for (int i = 0; i < 100; i++) {
142+
CorruptStatistics.shouldIgnoreStatistics(createdBy, PrimitiveTypeName.BINARY);
143+
}
144+
assertEquals(1, CorruptStatistics.cacheSize());
145+
146+
// A different createdBy should add a second entry
147+
CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.9.0 (build abcd)", PrimitiveTypeName.BINARY);
148+
assertEquals(2, CorruptStatistics.cacheSize());
149+
}
150+
151+
@Test
152+
public void testCorrectnessWhenCacheIsFull() {
153+
CorruptStatistics.clearCache();
154+
155+
// Fill cache to capacity with 64 distinct corrupt-version strings
156+
for (int i = 0; i < 64; i++) {
157+
assertTrue(
158+
"Corrupt version should be ignored",
159+
CorruptStatistics.shouldIgnoreStatistics(
160+
"parquet-mr version 1.6." + i + " (build x)", PrimitiveTypeName.BINARY));
161+
}
162+
assertEquals(64, CorruptStatistics.cacheSize());
163+
164+
// 65th distinct string bypasses cache -- must still return correct result
165+
assertTrue(
166+
"Cache-bypass path must still return correct result for corrupt version",
167+
CorruptStatistics.shouldIgnoreStatistics(
168+
"parquet-mr version 1.6.99 (build bypass)", PrimitiveTypeName.BINARY));
169+
170+
// Non-corrupt version also returns correct result when cache is full
171+
assertFalse(
172+
"Non-corrupt version must return false even when cache is full",
173+
CorruptStatistics.shouldIgnoreStatistics(
174+
"parquet-mr version 1.10.0 (build bypass2)", PrimitiveTypeName.BINARY));
175+
176+
// Cache did not grow beyond cap
177+
assertEquals(64, CorruptStatistics.cacheSize());
178+
}
127179
}

0 commit comments

Comments
 (0)