IBM
diff --git a/‎velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.cpp‎
Lines changed: 143 additions & 12 deletions b/‎velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.cpp‎
Lines changed: 143 additions & 12 deletions
diff --git a/‎velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.h‎
Lines changed: 18 additions & 1 deletion b/‎velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.h‎
Lines changed: 18 additions & 1 deletion
diff --git a/‎velox/dwio/dwrf/test/E2EFilterTest.cpp‎
Lines changed: 56 additions & 0 deletions b/‎velox/dwio/dwrf/test/E2EFilterTest.cpp‎
Lines changed: 56 additions & 0 deletions
@@ -75,16 +75,17 @@ void SelectiveDecimalColumnReader<DataT>::seekToRowGroup(int64_t index) {
 
 template <typename DataT>
 template <bool kDense>
-void SelectiveDecimalColumnReader<DataT>::readHelper(RowSet rows) {
-  vector_size_t numRows = rows.back() + 1;
+void SelectiveDecimalColumnReader<DataT>::readHelper(
+    const common::Filter* filter,
+    RowSet rows) {
   ExtractToReader extractValues(this);
-  common::AlwaysTrue filter;
+  common::AlwaysTrue alwaysTrue;
   DirectRleColumnVisitor<
       int64_t,
       common::AlwaysTrue,
       decltype(extractValues),
       kDense>
-      visitor(filter, this, rows, extractValues);
+      visitor(alwaysTrue, this, rows, extractValues);
 
   // decode scale stream
   if (version_ == velox::dwrf::RleVersion_1) {
@@ -104,46 +105,176 @@ void SelectiveDecimalColumnReader<DataT>::readHelper(RowSet rows) {
   // reset numValues_ before reading values
   numValues_ = 0;
   valueSize_ = sizeof(DataT);
+  vector_size_t numRows = rows.back() + 1;
   ensureValuesCapacity<DataT>(numRows);
 
   // decode value stream
   facebook::velox::dwio::common::
       ColumnVisitor<DataT, common::AlwaysTrue, decltype(extractValues), kDense>
-          valueVisitor(filter, this, rows, extractValues);
+          valueVisitor(alwaysTrue, this, rows, extractValues);
   decodeWithVisitor<DirectDecoder<true>>(valueDecoder_.get(), valueVisitor);
   readOffset_ += numRows;
+
+  // Fill decimals before applying filter.
+  fillDecimals();
+
+  // 'nullsInReadRange_' is the nulls for the entire read range, and if the row
+  // set is not dense, result nulls should be allocated, which represents the
+  // nulls for the selected rows before filtering.
+  const auto rawNulls = nullsInReadRange_
+      ? (kDense ? nullsInReadRange_->as<uint64_t>() : rawResultNulls_)
+      : nullptr;
+  // Process filter.
+  process(filter, rows, rawNulls);
+}
+
+template <typename DataT>
+void SelectiveDecimalColumnReader<DataT>::processNulls(
+    bool isNull,
+    const RowSet& rows,
+    const uint64_t* rawNulls) {
+  if (!rawNulls) {
+    return;
+  }
+  returnReaderNulls_ = false;
+  anyNulls_ = !isNull;
+  allNull_ = isNull;
+
+  auto rawDecimal = values_->asMutable<DataT>();
+  auto rawScale = scaleBuffer_->asMutable<int64_t>();
+
+  vector_size_t idx = 0;
+  if (isNull) {
+    for (vector_size_t i = 0; i < numValues_; i++) {
+      if (bits::isBitNull(rawNulls, i)) {
+        bits::setNull(rawResultNulls_, idx);
+        addOutputRow(rows[i]);
+        idx++;
+      }
+    }
+  } else {
+    for (vector_size_t i = 0; i < numValues_; i++) {
+      if (!bits::isBitNull(rawNulls, i)) {
+        bits::setNull(rawResultNulls_, idx, false);
+        rawDecimal[idx] = rawDecimal[i];
+        rawScale[idx] = rawScale[i];
+        addOutputRow(rows[i]);
+        idx++;
+      }
+    }
+  }
+}
+
+template <typename DataT>
+void SelectiveDecimalColumnReader<DataT>::processFilter(
+    const common::Filter* filter,
+    const RowSet& rows,
+    const uint64_t* rawNulls) {
+  VELOX_CHECK_NOT_NULL(filter, "Filter must not be null.");
+  returnReaderNulls_ = false;
+  anyNulls_ = false;
+  allNull_ = true;
+
+  vector_size_t idx = 0;
+  auto rawDecimal = values_->asMutable<DataT>();
+  for (vector_size_t i = 0; i < numValues_; i++) {
+    if (rawNulls && bits::isBitNull(rawNulls, i)) {
+      if (filter->testNull()) {
+        bits::setNull(rawResultNulls_, idx);
+        addOutputRow(rows[i]);
+        anyNulls_ = true;
+        idx++;
+      }
+    } else {
+      bool tested;
+      if constexpr (std::is_same_v<DataT, int64_t>) {
+        tested = filter->testInt64(rawDecimal[i]);
+      } else {
+        tested = filter->testInt128(rawDecimal[i]);
+      }
+
+      if (tested) {
+        if (rawNulls) {
+          bits::setNull(rawResultNulls_, idx, false);
+        }
+        rawDecimal[idx] = rawDecimal[i];
+        addOutputRow(rows[i]);
+        allNull_ = false;
+        idx++;
+      }
+    }
+  }
+}
+
+template <typename DataT>
+void SelectiveDecimalColumnReader<DataT>::process(
+    const common::Filter* filter,
+    const RowSet& rows,
+    const uint64_t* rawNulls) {
+  if (!filter) {
+    // No filter and "hasDeletion" is false so input rows will be
+    // reused.
+    return;
+  }
+
+  switch (filter->kind()) {
+    case common::FilterKind::kIsNull:
+      processNulls(true, rows, rawNulls);
+      break;
+    case common::FilterKind::kIsNotNull: {
+      if (rawNulls) {
+        processNulls(false, rows, rawNulls);
+      } else {
+        for (vector_size_t i = 0; i < numValues_; i++) {
+          addOutputRow(rows[i]);
+        }
+      }
+      break;
+    }
+    default:
+      processFilter(filter, rows, rawNulls);
+  }
 }
 
 template <typename DataT>
 void SelectiveDecimalColumnReader<DataT>::read(
     int64_t offset,
     const RowSet& rows,
     const uint64_t* incomingNulls) {
-  VELOX_CHECK(!scanSpec_->filter());
   VELOX_CHECK(!scanSpec_->valueHook());
   prepareRead<int64_t>(offset, rows, incomingNulls);
+  if (DictionaryValues::hasFilter(scanSpec_->filter()) &&
+      (!resultNulls_ || !resultNulls_->unique() ||
+       resultNulls_->capacity() * 8 < rows.size())) {
+    // Make sure a dedicated resultNulls_ is allocated with enough capacity as
+    // RleDecoder always assumes it is available.
+    resultNulls_ = AlignedBuffer::allocate<bool>(rows.size(), memoryPool_);
+    rawResultNulls_ = resultNulls_->asMutable<uint64_t>();
+  }
+  rawValues_ = values_->asMutable<char>();
   bool isDense = rows.back() == rows.size() - 1;
   if (isDense) {
-    readHelper<true>(rows);
+    readHelper<true>(scanSpec_->filter(), rows);
   } else {
-    readHelper<false>(rows);
+    readHelper<false>(scanSpec_->filter(), rows);
   }
 }
 
 template <typename DataT>
 void SelectiveDecimalColumnReader<DataT>::getValues(
     const RowSet& rows,
     VectorPtr* result) {
+  getIntValues(rows, requestedType_, result);
+}
+
+template <typename DataT>
+void SelectiveDecimalColumnReader<DataT>::fillDecimals() {
   auto nullsPtr =
       resultNulls() ? resultNulls()->template as<uint64_t>() : nullptr;
   auto scales = scaleBuffer_->as<int64_t>();
   auto values = values_->asMutable<DataT>();
-
   DecimalUtil::fillDecimals<DataT>(
       values, nullsPtr, values, scales, numValues_, scale_);
-
-  rawValues_ = values_->asMutable<char>();
-  getIntValues(rows, requestedType_, result);
 }
 
 template class SelectiveDecimalColumnReader<int64_t>;
 
@@ -49,7 +49,24 @@ class SelectiveDecimalColumnReader : public SelectiveColumnReader {
 
  private:
   template <bool kDense>
-  void readHelper(RowSet rows);
+  void readHelper(const common::Filter* filter, RowSet rows);
+
+  // Process IsNull and IsNotNull filters.
+  void processNulls(bool isNull, const RowSet& rows, const uint64_t* rawNulls);
+
+  // Process filters on decimal values.
+  void processFilter(
+      const common::Filter* filter,
+      const RowSet& rows,
+      const uint64_t* rawNulls);
+
+  // Dispatch to the respective filter processing based on the filter type.
+  void process(
+      const common::Filter* filter,
+      const RowSet& rows,
+      const uint64_t* rawNulls);
+
+  void fillDecimals();
 
   std::unique_ptr<IntDecoder<true>> valueDecoder_;
   std::unique_ptr<IntDecoder<true>> scaleDecoder_;
 
@@ -241,6 +241,62 @@ TEST_F(E2EFilterTest, floatAndDouble) {
       false);
 }
 
+TEST_F(E2EFilterTest, DISABLED_shortDecimal) {
+  // ORC write functionality is not yet supported. Enable this test once it
+  // becomes available and set the file format to ORC at that time.
+  // options.format = DwrfFormat::kOrc;
+  const std::unordered_map<std::string, TypePtr> types = {
+      {"shortdecimal_val:decimal(8, 5)", DECIMAL(8, 5)},
+      {"shortdecimal_val:decimal(10, 5)", DECIMAL(10, 5)},
+      {"shortdecimal_val:decimal(17, 5)", DECIMAL(17, 5)}};
+
+  for (const auto& pair : types) {
+    testWithTypes(
+        pair.first,
+        [&]() {
+          makeIntDistribution<int64_t>(
+              "shortdecimal_val",
+              10, // min
+              100, // max
+              22, // repeats
+              19, // rareFrequency
+              -999, // rareMin
+              30000, // rareMax
+              true);
+        },
+        false,
+        {"shortdecimal_val"},
+        20);
+  }
+}
+
+TEST_F(E2EFilterTest, DISABLED_longDecimal) {
+  // ORC write functionality is not yet supported. Enable this test once it
+  // becomes available and set the file format to ORC at that time.
+  // options.format = DwrfFormat::kOrc;
+  const std::unordered_map<std::string, TypePtr> types = {
+      {"longdecimal_val:decimal(30, 10)", DECIMAL(30, 10)},
+      {"longdecimal_val:decimal(37, 15)", DECIMAL(37, 15)}};
+  for (const auto& pair : types) {
+    testWithTypes(
+        pair.first,
+        [&]() {
+          makeIntDistribution<int128_t>(
+              "longdecimal_val",
+              10, // min
+              100, // max
+              22, // repeats
+              19, // rareFrequency
+              -999, // rareMin
+              30000, // rareMax
+              true);
+        },
+        false,
+        {"longdecimal_val"},
+        20);
+  }
+}
+
 TEST_F(E2EFilterTest, stringDirect) {
   testutil::TestValue::enable();
   bool coverage[2][2]{};