From cbd31f7da8f8aca587ee20c10bfae2637b555479 Mon Sep 17 00:00:00 2001 From: Anupam Yadav Date: Sun, 7 Jun 2026 02:05:24 +0000 Subject: [PATCH 1/2] GH-3601: Compute shouldIgnoreStatistics once per file in ParquetMetadataConverter --- .../converter/ParquetMetadataConverter.java | 60 ++++++++++++++++++- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 3597898c30..074138a1ed 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -970,6 +970,46 @@ public org.apache.parquet.column.statistics.Statistics fromParquetStatistics( return fromParquetStatisticsInternal(createdBy, statistics, type, expectedOrder); } + // Overload that uses a pre-computed shouldIgnoreCorruptStats flag to avoid redundant parsing + private org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal( + String createdBy, Statistics formatStats, PrimitiveType type, boolean shouldIgnoreCorruptStats) { + SortOrder typeSortOrder = overrideSortOrderToSigned(type) ? SortOrder.SIGNED : sortOrder(type); + org.apache.parquet.column.statistics.Statistics.Builder statsBuilder = + org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type); + + if (formatStats != null) { + if (formatStats.isSetMin_value() && formatStats.isSetMax_value()) { + byte[] min = formatStats.min_value.array(); + byte[] max = formatStats.max_value.array(); + if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) { + statsBuilder.withMin(min); + statsBuilder.withMax(max); + } + } else { + boolean isSet = formatStats.isSetMax() && formatStats.isSetMin(); + boolean maxEqualsMin = isSet ? Arrays.equals(formatStats.getMin(), formatStats.getMax()) : false; + boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder; + // The shouldIgnoreCorruptStats flag applies only to BINARY and FIXED_LEN_BYTE_ARRAY. + // For other types, shouldIgnoreStatistics always returns false, so we only guard those. + PrimitiveTypeName primitiveTypeName = type.getPrimitiveTypeName(); + boolean ignoreForThisColumn = shouldIgnoreCorruptStats + && (primitiveTypeName == PrimitiveTypeName.BINARY + || primitiveTypeName == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY); + if (!ignoreForThisColumn && (sortOrdersMatch || maxEqualsMin)) { + if (isSet) { + statsBuilder.withMin(formatStats.min.array()); + statsBuilder.withMax(formatStats.max.array()); + } + } + } + + if (formatStats.isSetNull_count()) { + statsBuilder.withNumNulls(formatStats.null_count); + } + } + return statsBuilder.build(); + } + GeospatialStatistics toParquetGeospatialStatistics( org.apache.parquet.column.statistics.geospatial.GeospatialStatistics geospatialStatistics) { if (geospatialStatistics == null) { @@ -1794,13 +1834,24 @@ public FileMetaDataAndRowGroupOffsetInfo visit(RangeMetadataFilter filter) throw public ColumnChunkMetaData buildColumnChunkMetaData( ColumnMetaData metaData, ColumnPath columnPath, PrimitiveType type, String createdBy) { + boolean shouldIgnoreCorruptStats = + CorruptStatistics.shouldIgnoreStatistics(createdBy, PrimitiveTypeName.BINARY); + return buildColumnChunkMetaData(metaData, columnPath, type, createdBy, shouldIgnoreCorruptStats); + } + + ColumnChunkMetaData buildColumnChunkMetaData( + ColumnMetaData metaData, + ColumnPath columnPath, + PrimitiveType type, + String createdBy, + boolean shouldIgnoreCorruptStats) { return ColumnChunkMetaData.get( columnPath, type, fromFormatCodec(metaData.codec), convertEncodingStats(metaData.getEncoding_stats()), fromFormatEncodings(metaData.encodings), - fromParquetStatistics(createdBy, metaData.statistics, type), + fromParquetStatisticsInternal(createdBy, metaData.statistics, type, shouldIgnoreCorruptStats), metaData.data_page_offset, metaData.dictionary_page_offset, metaData.num_values, @@ -1829,6 +1880,10 @@ public ParquetMetadata fromParquetMetadata( MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders()); List blocks = new ArrayList(); List row_groups = parquetMetadata.getRow_groups(); + // Compute once per file: the result is the same for BINARY and FIXED_LEN_BYTE_ARRAY + // (the only types affected by PARQUET-251), and always false for other types. + boolean shouldIgnoreCorruptStats = + CorruptStatistics.shouldIgnoreStatistics(parquetMetadata.getCreated_by(), PrimitiveTypeName.BINARY); if (row_groups != null) { for (RowGroup rowGroup : row_groups) { @@ -1909,7 +1964,8 @@ public ParquetMetadata fromParquetMetadata( metaData, columnPath, messageType.getType(columnPath.toArray()).asPrimitiveType(), - createdBy); + createdBy, + shouldIgnoreCorruptStats); column.setRowGroupOrdinal(rowGroup.getOrdinal()); if (metaData.isSetBloom_filter_offset()) { column.setBloomFilterOffset(metaData.getBloom_filter_offset()); From c6f66e36f919323d60c101f56c214f79307d761d Mon Sep 17 00:00:00 2001 From: Anupam Yadav Date: Sat, 20 Jun 2026 07:24:00 +0000 Subject: [PATCH 2/2] GH-3601: Address review - factor CorruptStatistics into file-level + column-type checks; precompute once per file (schema-gated) --- .../org/apache/parquet/CorruptStatistics.java | 43 ++++++-- .../apache/parquet/CorruptStatisticsTest.java | 30 ++++++ .../converter/ParquetMetadataConverter.java | 89 ++++++--------- .../TestParquetMetadataConverter.java | 101 ++++++++++++++++++ 4 files changed, 198 insertions(+), 65 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java b/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java index c5846f9efa..79db8ccd2a 100644 --- a/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java @@ -47,20 +47,25 @@ public class CorruptStatistics { private static final SemanticVersion CDH_5_PARQUET_251_FIXED_END = new SemanticVersion(1, 5, 0); /** - * Decides if the statistics from a file created by createdBy (the created_by field from parquet format) - * should be ignored because they are potentially corrupt. + * Returns whether the given column type is one of the types affected by the PARQUET-251 bug + * (BINARY or FIXED_LEN_BYTE_ARRAY). * - * @param createdBy the created-by string from a file footer - * @param columnType the type of the column that this is checking - * @return true if the statistics may be invalid and should be ignored, false otherwise + * @param columnType the primitive type of the column + * @return true if this column type could have corrupt statistics */ - public static boolean shouldIgnoreStatistics(String createdBy, PrimitiveTypeName columnType) { - - if (columnType != PrimitiveTypeName.BINARY && columnType != PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) { - // the bug only applies to binary columns - return false; - } + public static boolean isCorruptStatisticsColumnType(PrimitiveTypeName columnType) { + return columnType == PrimitiveTypeName.BINARY || columnType == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; + } + /** + * Determines whether a file (identified by its created_by string) was written by a version of + * parquet-mr that had the PARQUET-251 statistics bug. This is a file-level check that does not + * consider column type. + * + * @param createdBy the created-by string from a file footer + * @return true if the file was written by a version with the corrupt statistics bug + */ + public static boolean fileHasCorruptStatistics(String createdBy) { if (Strings.isNullOrEmpty(createdBy)) { // created_by is not populated, which could have been caused by // parquet-mr during the same time as PARQUET-251, see PARQUET-297 @@ -103,6 +108,22 @@ public static boolean shouldIgnoreStatistics(String createdBy, PrimitiveTypeName } } + /** + * Decides if the statistics from a file created by createdBy (the created_by field from parquet format) + * should be ignored because they are potentially corrupt. + * + * @param createdBy the created-by string from a file footer + * @param columnType the type of the column that this is checking + * @return true if the statistics may be invalid and should be ignored, false otherwise + */ + public static boolean shouldIgnoreStatistics(String createdBy, PrimitiveTypeName columnType) { + if (!isCorruptStatisticsColumnType(columnType)) { + // the bug only applies to binary columns + return false; + } + return fileHasCorruptStatistics(createdBy); + } + private static void warnParseErrorOnce(String createdBy, Throwable e) { if (!alreadyLogged.getAndSet(true)) { LOG.warn("Ignoring statistics because created_by could not be parsed (see PARQUET-251): " + createdBy, e); diff --git a/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java b/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java index e897b38512..4287c11e73 100644 --- a/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java +++ b/parquet-column/src/test/java/org/apache/parquet/CorruptStatisticsTest.java @@ -124,4 +124,34 @@ public void testDistributionCorruptStatistics() { assertTrue(CorruptStatistics.shouldIgnoreStatistics( "parquet-mr version 1.7.0 (build abcd)", PrimitiveTypeName.BINARY)); } + + @Test + public void testIsCorruptStatisticsColumnType() { + assertTrue(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.BINARY)); + assertTrue(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY)); + assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.INT32)); + assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.INT64)); + assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.DOUBLE)); + assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.FLOAT)); + assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.BOOLEAN)); + assertFalse(CorruptStatistics.isCorruptStatisticsColumnType(PrimitiveTypeName.INT96)); + } + + @Test + public void testFileHasCorruptStatistics() { + // Corrupt versions + assertTrue(CorruptStatistics.fileHasCorruptStatistics("parquet-mr version 1.6.0 (build abcd)")); + assertTrue(CorruptStatistics.fileHasCorruptStatistics("parquet-mr version 1.4.2 (build abcd)")); + assertTrue(CorruptStatistics.fileHasCorruptStatistics("parquet-mr version 1.7.999 (build abcd)")); + // Null/empty + assertTrue(CorruptStatistics.fileHasCorruptStatistics(null)); + assertTrue(CorruptStatistics.fileHasCorruptStatistics("")); + // Unparseable + assertTrue(CorruptStatistics.fileHasCorruptStatistics("unparseable string")); + // Fixed versions + assertFalse(CorruptStatistics.fileHasCorruptStatistics("parquet-mr version 1.8.0 (build abcd)")); + assertFalse(CorruptStatistics.fileHasCorruptStatistics("parquet-mr version 2.0.0 (build abcd)")); + // Non-parquet-mr applications + assertFalse(CorruptStatistics.fileHasCorruptStatistics("impala version 1.6.0 (build abcd)")); + } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 074138a1ed..a0fd34a9cf 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -47,6 +47,7 @@ import org.apache.parquet.CorruptStatistics; import org.apache.parquet.ParquetReadOptions; import org.apache.parquet.Preconditions; +import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.EncodingStats; import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.column.statistics.BinaryStatistics; @@ -926,6 +927,13 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist // Visible for testing static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal( String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder) { + boolean fileHasCorruptStats = CorruptStatistics.fileHasCorruptStatistics(createdBy); + return fromParquetStatisticsInternal(formatStats, type, typeSortOrder, fileHasCorruptStats); + } + + // Core implementation using pre-computed file-level corrupt stats flag + static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal( + Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder, boolean fileHasCorruptStats) { // create stats object based on the column type org.apache.parquet.column.statistics.Statistics.Builder statsBuilder = org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type); @@ -948,8 +956,10 @@ static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInte // valid with the type's sort order. In previous releases, all stats were // aggregated using a signed byte-wise ordering, which isn't valid for all the // types (e.g. strings, decimals etc.). - if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) - && (sortOrdersMatch || maxEqualsMin)) { + // The fileHasCorruptStats flag applies only to BINARY and FIXED_LEN_BYTE_ARRAY columns. + boolean ignoreForThisColumn = + fileHasCorruptStats && CorruptStatistics.isCorruptStatisticsColumnType(type.getPrimitiveTypeName()); + if (!ignoreForThisColumn && (sortOrdersMatch || maxEqualsMin)) { if (isSet) { statsBuilder.withMin(formatStats.min.array()); statsBuilder.withMax(formatStats.max.array()); @@ -970,46 +980,6 @@ public org.apache.parquet.column.statistics.Statistics fromParquetStatistics( return fromParquetStatisticsInternal(createdBy, statistics, type, expectedOrder); } - // Overload that uses a pre-computed shouldIgnoreCorruptStats flag to avoid redundant parsing - private org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal( - String createdBy, Statistics formatStats, PrimitiveType type, boolean shouldIgnoreCorruptStats) { - SortOrder typeSortOrder = overrideSortOrderToSigned(type) ? SortOrder.SIGNED : sortOrder(type); - org.apache.parquet.column.statistics.Statistics.Builder statsBuilder = - org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type); - - if (formatStats != null) { - if (formatStats.isSetMin_value() && formatStats.isSetMax_value()) { - byte[] min = formatStats.min_value.array(); - byte[] max = formatStats.max_value.array(); - if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) { - statsBuilder.withMin(min); - statsBuilder.withMax(max); - } - } else { - boolean isSet = formatStats.isSetMax() && formatStats.isSetMin(); - boolean maxEqualsMin = isSet ? Arrays.equals(formatStats.getMin(), formatStats.getMax()) : false; - boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder; - // The shouldIgnoreCorruptStats flag applies only to BINARY and FIXED_LEN_BYTE_ARRAY. - // For other types, shouldIgnoreStatistics always returns false, so we only guard those. - PrimitiveTypeName primitiveTypeName = type.getPrimitiveTypeName(); - boolean ignoreForThisColumn = shouldIgnoreCorruptStats - && (primitiveTypeName == PrimitiveTypeName.BINARY - || primitiveTypeName == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY); - if (!ignoreForThisColumn && (sortOrdersMatch || maxEqualsMin)) { - if (isSet) { - statsBuilder.withMin(formatStats.min.array()); - statsBuilder.withMax(formatStats.max.array()); - } - } - } - - if (formatStats.isSetNull_count()) { - statsBuilder.withNumNulls(formatStats.null_count); - } - } - return statsBuilder.build(); - } - GeospatialStatistics toParquetGeospatialStatistics( org.apache.parquet.column.statistics.geospatial.GeospatialStatistics geospatialStatistics) { if (geospatialStatistics == null) { @@ -1834,24 +1804,23 @@ public FileMetaDataAndRowGroupOffsetInfo visit(RangeMetadataFilter filter) throw public ColumnChunkMetaData buildColumnChunkMetaData( ColumnMetaData metaData, ColumnPath columnPath, PrimitiveType type, String createdBy) { - boolean shouldIgnoreCorruptStats = - CorruptStatistics.shouldIgnoreStatistics(createdBy, PrimitiveTypeName.BINARY); - return buildColumnChunkMetaData(metaData, columnPath, type, createdBy, shouldIgnoreCorruptStats); + boolean fileHasCorruptStats = CorruptStatistics.fileHasCorruptStatistics(createdBy); + return buildColumnChunkMetaData(metaData, columnPath, type, fileHasCorruptStats); } ColumnChunkMetaData buildColumnChunkMetaData( ColumnMetaData metaData, ColumnPath columnPath, PrimitiveType type, - String createdBy, - boolean shouldIgnoreCorruptStats) { + boolean fileHasCorruptStats) { + SortOrder typeSortOrder = overrideSortOrderToSigned(type) ? SortOrder.SIGNED : sortOrder(type); return ColumnChunkMetaData.get( columnPath, type, fromFormatCodec(metaData.codec), convertEncodingStats(metaData.getEncoding_stats()), fromFormatEncodings(metaData.encodings), - fromParquetStatisticsInternal(createdBy, metaData.statistics, type, shouldIgnoreCorruptStats), + fromParquetStatisticsInternal(metaData.statistics, type, typeSortOrder, fileHasCorruptStats), metaData.data_page_offset, metaData.dictionary_page_offset, metaData.num_values, @@ -1880,10 +1849,11 @@ public ParquetMetadata fromParquetMetadata( MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders()); List blocks = new ArrayList(); List row_groups = parquetMetadata.getRow_groups(); - // Compute once per file: the result is the same for BINARY and FIXED_LEN_BYTE_ARRAY - // (the only types affected by PARQUET-251), and always false for other types. - boolean shouldIgnoreCorruptStats = - CorruptStatistics.shouldIgnoreStatistics(parquetMetadata.getCreated_by(), PrimitiveTypeName.BINARY); + // Compute once per file: whether this file was written by a version with the PARQUET-251 bug. + // Only parse created_by if the schema has columns affected by the bug (BINARY/FIXED_LEN_BYTE_ARRAY). + // The per-column type check is applied later when statistics are actually read. + boolean fileHasCorruptStats = schemaHasCorruptStatisticsColumnType(messageType) + && CorruptStatistics.fileHasCorruptStatistics(parquetMetadata.getCreated_by()); if (row_groups != null) { for (RowGroup rowGroup : row_groups) { @@ -1964,8 +1934,7 @@ public ParquetMetadata fromParquetMetadata( metaData, columnPath, messageType.getType(columnPath.toArray()).asPrimitiveType(), - createdBy, - shouldIgnoreCorruptStats); + fileHasCorruptStats); column.setRowGroupOrdinal(rowGroup.getOrdinal()); if (metaData.isSetBloom_filter_offset()) { column.setBloomFilterOffset(metaData.getBloom_filter_offset()); @@ -2044,6 +2013,18 @@ private static ColumnPath getPath(ColumnMetaData metaData) { return ColumnPath.get(path); } + /** + * Returns true if the schema contains at least one column with a type affected by the PARQUET-251 bug. + */ + private static boolean schemaHasCorruptStatisticsColumnType(MessageType schema) { + for (ColumnDescriptor column : schema.getColumns()) { + if (CorruptStatistics.isCorruptStatisticsColumnType(column.getPrimitiveType().getPrimitiveTypeName())) { + return true; + } + } + return false; + } + // Visible for testing MessageType fromParquetSchema(List schema, List columnOrders) { Iterator iterator = schema.iterator(); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index e4851d8ec4..d4cd9c49d4 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -1977,4 +1977,105 @@ public void testEdgeInterpolationAlgorithmConversion() { assertNull(ParquetMetadataConverter.fromParquetEdgeInterpolationAlgorithm(null)); assertNull(ParquetMetadataConverter.toParquetEdgeInterpolationAlgorithm(null)); } + + @Test + public void testCorruptStatsPerColumnGate() { + // A created_by string from a version known to have the PARQUET-251 bug + String corruptCreatedBy = "parquet-mr version 1.6.0 (build abcd)"; + + // Set up legacy V1 statistics with min/max (not min_value/max_value) + org.apache.parquet.format.Statistics formatStats = new org.apache.parquet.format.Statistics(); + byte[] minBytes = new byte[] {0, 1, 2, 3}; + byte[] maxBytes = new byte[] {4, 5, 6, 7}; + formatStats.setMin(minBytes); + formatStats.setMax(maxBytes); + formatStats.setNull_count(5); + + // For BINARY column: stats should be ignored (only null_count preserved) + PrimitiveType binaryType = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "bin_col"); + Statistics binaryStats = ParquetMetadataConverter.fromParquetStatisticsInternal( + corruptCreatedBy, formatStats, binaryType, ParquetMetadataConverter.SortOrder.SIGNED); + assertFalse("BINARY min/max should be ignored for corrupt file", binaryStats.hasNonNullValue()); + assertEquals(5, binaryStats.getNumNulls()); + + // For FIXED_LEN_BYTE_ARRAY column: stats should also be ignored + PrimitiveType fixedType = + new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, 4, "fixed_col"); + Statistics fixedStats = ParquetMetadataConverter.fromParquetStatisticsInternal( + corruptCreatedBy, formatStats, fixedType, ParquetMetadataConverter.SortOrder.SIGNED); + assertFalse("FIXED_LEN_BYTE_ARRAY min/max should be ignored for corrupt file", fixedStats.hasNonNullValue()); + assertEquals(5, fixedStats.getNumNulls()); + + // For INT32 column: stats should NOT be ignored (per-column gate) + PrimitiveType int32Type = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT32, "int_col"); + Statistics int32Stats = ParquetMetadataConverter.fromParquetStatisticsInternal( + corruptCreatedBy, formatStats, int32Type, ParquetMetadataConverter.SortOrder.SIGNED); + assertTrue("INT32 min/max should NOT be ignored for corrupt file", int32Stats.hasNonNullValue()); + assertEquals(5, int32Stats.getNumNulls()); + + // For INT64 column: stats should NOT be ignored + org.apache.parquet.format.Statistics formatStatsLong = new org.apache.parquet.format.Statistics(); + byte[] minLong = new byte[] {0, 0, 0, 0, 0, 0, 0, 1}; + byte[] maxLong = new byte[] {0, 0, 0, 0, 0, 0, 0, 7}; + formatStatsLong.setMin(minLong); + formatStatsLong.setMax(maxLong); + formatStatsLong.setNull_count(5); + PrimitiveType int64Type = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.INT64, "long_col"); + Statistics int64Stats = ParquetMetadataConverter.fromParquetStatisticsInternal( + corruptCreatedBy, formatStatsLong, int64Type, ParquetMetadataConverter.SortOrder.SIGNED); + assertTrue("INT64 min/max should NOT be ignored for corrupt file", int64Stats.hasNonNullValue()); + assertEquals(5, int64Stats.getNumNulls()); + + // For a non-corrupt file, BINARY stats should be preserved + String goodCreatedBy = "parquet-mr version 1.8.0 (build abcd)"; + Statistics binaryStatsGood = ParquetMetadataConverter.fromParquetStatisticsInternal( + goodCreatedBy, formatStats, binaryType, ParquetMetadataConverter.SortOrder.SIGNED); + assertTrue("BINARY min/max should be kept for non-corrupt file", binaryStatsGood.hasNonNullValue()); + } + + @Test + public void testSchemaGateSkipsCorruptStatsCheckForNonBinarySchema() throws Exception { + // A file with ONLY INT32/INT64 columns should never trigger corrupt stats detection, + // even with a known-corrupt createdBy string. + String corruptCreatedBy = "parquet-mr version 1.6.0 (build abcd)"; + + MessageType intOnlySchema = Types.buildMessage() + .required(PrimitiveTypeName.INT32).named("id") + .required(PrimitiveTypeName.INT64).named("ts") + .named("msg"); + + ParquetMetadataConverter converter = new ParquetMetadataConverter(); + List schemaElements = converter.toParquetSchema(intOnlySchema); + + // Build ColumnMetaData with V1 statistics for the INT32 column + org.apache.parquet.format.Statistics stats = new org.apache.parquet.format.Statistics(); + stats.setMin(new byte[] {0, 0, 0, 1}); + stats.setMax(new byte[] {0, 0, 0, 7}); + stats.setNull_count(0); + + ColumnMetaData cmd = new ColumnMetaData( + Type.INT32, + Collections.emptyList(), + Collections.singletonList("id"), + UNCOMPRESSED, + 100L, 200L, 100L, 0L); + cmd.setStatistics(stats); + + ColumnChunk cc = new ColumnChunk(0L); + cc.setMeta_data(cmd); + RowGroup rg = new RowGroup(List.of(cc), 100L, 1); + + FileMetaData fmd = new FileMetaData(1, schemaElements, 1, List.of(rg)); + fmd.setCreated_by(corruptCreatedBy); + + // Parse via fromParquetMetadata – the schema gate should prevent fileHasCorruptStats + ParquetMetadata metadata = converter.fromParquetMetadata(fmd); + Statistics columnStats = + metadata.getBlocks().get(0).getColumns().get(0).getStatistics(); + + // Stats should be preserved (not ignored) because schema has no BINARY/FIXED columns + assertTrue( + "INT32 stats should be preserved when schema has no corrupt-stats-affected columns", + columnStats.hasNonNullValue()); + } }