From 6ae31bb622a4f0b3d80d55ec1abc857163b60147 Mon Sep 17 00:00:00 2001
From: Eduard Tudenhoefner
Date: Wed, 1 Jul 2026 11:26:55 +0200
Subject: [PATCH] GH-3639: Don't drop row groups for IN(..., null) when num_nulls is not set

---
 .../statisticslevel/StatisticsFilter.java     |  5 +++
 .../statisticslevel/TestStatisticsFilter.java | 39 +++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
index 2574c8ea34..d06ea825d3 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
@@ -305,6 +305,11 @@ public <T extends Comparable<T>> Boolean visit(In<T> in) {
       } else {
         if (values.contains(null)) return BLOCK_MIGHT_MATCH;
       }
+    } else if (values.contains(null)) {
+      // the number of nulls is unknown, so this chunk might contain nulls that match the null
+      // literal in the IN set. we cannot fall through to the min/max check (which only considers
+      // the non-null values) or we might incorrectly drop a chunk containing matching null rows.
+      return BLOCK_MIGHT_MATCH;
     }
 
     // If any value in the IN set is NaN, be conservative
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
index e9756fa820..675dcf9149 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
@@ -389,6 +389,45 @@ public void testInNotIn() {
         List.of(getIntColumnMeta(statsSomeNulls, 177L), getDoubleColumnMeta(doubleStats, 177L))));
   }
 
+  @Test
+  public void testInWithNullLiteralAndUnsetNumNulls() {
+    // Reproduces the bug where StatisticsFilter drops a row group for IN (..., null) when num_nulls
+    // is unset. min/max are present but the number of nulls is unknown, so we must not fall through
+    // to the min/max-only check (which only considers the non-null literals) and drop a chunk that
+    // may contain matching null rows
+    org.apache.parquet.column.statistics.Statistics<?> statsUnsetNulls =
+        org.apache.parquet.column.statistics.Statistics.getBuilderForReading(
+                Types.required(PrimitiveTypeName.INT32).named("test_int32"))
+            .withMin(BytesUtils.intToBytes(10))
+            .withMax(BytesUtils.intToBytes(100))
+            .build();
+    // min/max are available but num_nulls is not
+    assertTrue(statsUnsetNulls.hasNonNullValue());
+    assertFalse(statsUnsetNulls.isNumNullsSet());
+
+    List<ColumnChunkMetaData> metas =
+        List.of(getIntColumnMeta(statsUnsetNulls, 177L), getDoubleColumnMeta(doubleStats, 177L));
+
+    // IN (200, null) where 200 is outside [10, 100]. The chunk might contain null rows matching the
+    // null literal, so it must NOT be dropped
+    Set<Integer> valuesNullAndOutOfRange = new HashSet<>();
+    valuesNullAndOutOfRange.add(null);
+    valuesNullAndOutOfRange.add(200);
+    assertFalse(canDrop(in(intColumn, valuesNullAndOutOfRange), metas));
+
+    // IN (200) without a null literal can still be dropped based on min/max even if num_nulls is
+    // unknown, confirming the fix does not over-restrict pruning.
+    Set<Integer> valuesOutOfRange = new HashSet<>();
+    valuesOutOfRange.add(200);
+    assertTrue(canDrop(in(intColumn, valuesOutOfRange), metas));
+
+    // IN (50, null) where 50 is inside [10, 100] must also not be dropped.
+    Set<Integer> valuesNullAndInRange = new HashSet<>();
+    valuesNullAndInRange.add(null);
+    valuesNullAndInRange.add(50);
+    assertFalse(canDrop(in(intColumn, valuesNullAndInRange), metas));
+  }
+
   @Test
   public void testContainsEqNonNull() {
     assertTrue(canDrop(contains(eq(intColumn, 9)), columnMetas));