From 6ae31bb622a4f0b3d80d55ec1abc857163b60147 Mon Sep 17 00:00:00 2001
From: Eduard Tudenhoefner <etudenhoefner@gmail.com>
Date: Wed, 1 Jul 2026 11:26:55 +0200
Subject: [PATCH] GH-3639: Don't drop row groups for IN(..., null) when
 num_nulls is not set

---
 .../statisticslevel/StatisticsFilter.java     |  5 +++
 .../statisticslevel/TestStatisticsFilter.java | 39 +++++++++++++++++++
 2 files changed, 44 insertions(+)
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
index 2574c8ea34..d06ea825d3 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
@@ -305,6 +305,11 @@ public <T extends Comparable<T>> Boolean visit(In<T> in) {
       } else {
         if (values.contains(null)) return BLOCK_MIGHT_MATCH;
       }
+    } else if (values.contains(null)) {
+      // The number of nulls is unknown, so this chunk might contain nulls that match the null
+      // literal in the IN set. We cannot fall through to the min/max check (which only considers
+      // the non-null values) or we might incorrectly drop a chunk containing matching null rows.
+      return BLOCK_MIGHT_MATCH;
     }
 
     // If any value in the IN set is NaN, be conservative
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
index e9756fa820..675dcf9149 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
@@ -389,6 +389,45 @@ public void testInNotIn() {
         List.of(getIntColumnMeta(statsSomeNulls, 177L), getDoubleColumnMeta(doubleStats, 177L))));
   }
 
+  @Test
+  public void testInWithNullLiteralAndUnsetNumNulls() {
+    // Reproduces the bug where StatisticsFilter drops a row group for IN (..., null) when num_nulls
+    // is unset. min/max are present but the number of nulls is unknown, so we must not fall through
+    // to the min/max-only check (which only considers the non-null literals) and drop a chunk that
+    // may contain matching null rows.
+    org.apache.parquet.column.statistics.Statistics<?> statsUnsetNulls =
+        org.apache.parquet.column.statistics.Statistics.getBuilderForReading(
+                Types.required(PrimitiveTypeName.INT32).named("test_int32"))
+            .withMin(BytesUtils.intToBytes(10))
+            .withMax(BytesUtils.intToBytes(100))
+            .build();
+    // Precondition: min/max are available but num_nulls is not.
+    assertTrue(statsUnsetNulls.hasNonNullValue());
+    assertFalse(statsUnsetNulls.isNumNullsSet());
+
+    List<ColumnChunkMetaData> metas =
+        List.of(getIntColumnMeta(statsUnsetNulls, 177L), getDoubleColumnMeta(doubleStats, 177L));
+
+    // IN (200, null) where 200 is outside [10, 100]. The chunk might contain null rows matching the
+    // null literal, so it must NOT be dropped.
+    Set<Integer> valuesNullAndOutOfRange = new HashSet<>();
+    valuesNullAndOutOfRange.add(null);
+    valuesNullAndOutOfRange.add(200);
+    assertFalse(canDrop(in(intColumn, valuesNullAndOutOfRange), metas));
+
+    // IN (200) without a null literal can still be dropped based on min/max even if num_nulls is
+    // unknown, confirming the fix does not disable min/max pruning for null-free IN sets.
+    Set<Integer> valuesOutOfRange = new HashSet<>();
+    valuesOutOfRange.add(200);
+    assertTrue(canDrop(in(intColumn, valuesOutOfRange), metas));
+
+    // IN (50, null) where 50 is inside [10, 100] must also not be dropped.
+    Set<Integer> valuesNullAndInRange = new HashSet<>();
+    valuesNullAndInRange.add(null);
+    valuesNullAndInRange.add(50);
+    assertFalse(canDrop(in(intColumn, valuesNullAndInRange), metas));
+  }
+
   @Test
   public void testContainsEqNonNull() {
     assertTrue(canDrop(contains(eq(intColumn, 9)), columnMetas));