Skip to content

Commit 4cffa4b

Browse files
kevinwilfongfacebook-github-bot
authored andcommitted
fix: Array_intersect with a single argument doesn't correctly handle nulls in dictionary encoded inner arrays (#11807)
Summary: Pull Request resolved: #11807 The single argument flavor of array_intersect computes the intersection of the inner arrays in an array of arrays. It currently does not correctly handle nulls in the dictionary if the inner arrays are dictionary encoded. There are 2 bugs: 1) toElementRows requires that nulls be deselected in the SelectivityVector, array_intersect does not do this when calling it on the inner arrays. This means it will attempt to use the index from the DecodedVector's indices() to get the offset and size of the null arrays, which can lead to accessing arbitrary memory as the index may point off the end of the buffers getting the size and offset, which can further lead to writing to arbitrary memory as we can write off the end of the SelectivityVector. 2) When actually doing the intersection we get the innerOffset and innerSize before checking if the inner array is null. We don't end up using these values if the inner array is null, so this just results in an error when ASAN is enabled. To fix 1) I created separate APIs for toElementRows for use with DecodedVectors and otherwise. It forces the user to pass in the nulls from the DecodedVector in addition to the indices to help avoid mistakes like this. Note that this worked in PrestoHasher because it was correctly deselecting nulls in the SelectivityVector, and in WidthBucketArray because it has default null behavior so the nulls would have been automatically deselected by the expression eval code. Reviewed By: spershin Differential Revision: D66994602 fbshipit-source-id: 1c14bfaab886e692aae09b555cd6d2971e3deb72
1 parent 1bd480e commit 4cffa4b

File tree

5 files changed

+59
-31
lines changed

5 files changed

+59
-31
lines changed

velox/functions/lib/RowsTranslationUtil.h

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,22 +23,20 @@
2323
namespace facebook::velox::functions {
2424

2525
/// This function returns a SelectivityVector for ARRAY/MAP vectors that selects
26-
/// all rows corresponding to the specified rows. If the base vector was
27-
/// dictionary encoded, an optional rowMapping parameter can be used to pass
28-
/// the dictionary indices. In this case, it is important to ensure that the
29-
/// topLevelRows parameter has already filtered out any null rows added by the
30-
/// dictionary encoding.
26+
/// all rows corresponding to the specified rows. This flavor is intended for
27+
/// use with the base vectors of decoded vectors. Use `nulls` and `rowMapping`
28+
/// to pass in the null and index mappings from the DecodedVector.
3129
template <typename T>
3230
SelectivityVector toElementRows(
3331
vector_size_t size,
34-
const SelectivityVector& topLevelNonNullRows,
32+
const SelectivityVector& topLevelRows,
3533
const T* arrayBaseVector,
36-
const vector_size_t* rowMapping = nullptr) {
34+
const uint64_t* nulls,
35+
const vector_size_t* rowMapping) {
3736
VELOX_CHECK(
3837
arrayBaseVector->encoding() == VectorEncoding::Simple::MAP ||
3938
arrayBaseVector->encoding() == VectorEncoding::Simple::ARRAY);
4039

41-
auto rawNulls = arrayBaseVector->rawNulls();
4240
auto rawSizes = arrayBaseVector->rawSizes();
4341
auto rawOffsets = arrayBaseVector->rawOffsets();
4442
VELOX_DEBUG_ONLY const auto sizeRange =
@@ -47,9 +45,9 @@ SelectivityVector toElementRows(
4745
arrayBaseVector->offsets()->template asRange<vector_size_t>().end();
4846

4947
SelectivityVector elementRows(size, false);
50-
topLevelNonNullRows.applyToSelected([&](vector_size_t row) {
48+
topLevelRows.applyToSelected([&](vector_size_t row) {
5149
auto index = rowMapping ? rowMapping[row] : row;
52-
if (rawNulls && bits::isBitNull(rawNulls, index)) {
50+
if (nulls && bits::isBitNull(nulls, row)) {
5351
return;
5452
}
5553

@@ -64,6 +62,21 @@ SelectivityVector toElementRows(
6462
return elementRows;
6563
}
6664

65+
/// This function returns a SelectivityVector for ARRAY/MAP vectors that selects
66+
/// all rows corresponding to the specified rows.
67+
template <typename T>
68+
SelectivityVector toElementRows(
69+
vector_size_t size,
70+
const SelectivityVector& topLevelRows,
71+
const T* arrayBaseVector) {
72+
return toElementRows(
73+
size,
74+
topLevelRows,
75+
arrayBaseVector,
76+
arrayBaseVector->rawNulls(),
77+
nullptr);
78+
}
79+
6780
/// Returns a buffer of vector_size_t that represents the mapping from
6881
/// topLevelRows's element rows to its top-level rows. For example, suppose
6982
/// `result` is the returned buffer, result[i] == j means the value at index i

velox/functions/prestosql/ArrayIntersectExcept.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,11 @@ DecodedVector* decodeArrayElements(
146146
// Decode and acquire array elements vector.
147147
auto elementsVector = baseArrayVector->elements();
148148
*elementRows = toElementRows(
149-
elementsVector->size(), rows, baseArrayVector, decodedVector->indices());
149+
elementsVector->size(),
150+
rows,
151+
baseArrayVector,
152+
arrayDecoder->nulls(&rows),
153+
arrayDecoder->indices());
150154
elementsDecoder.get()->decode(*elementsVector, *elementRows);
151155
auto decodedElementsVector = elementsDecoder.get();
152156
return decodedElementsVector;
@@ -217,10 +221,6 @@ class ArraysIntersectSingleParam : public exec::VectorFunction {
217221
auto size = outerArray->sizeAt(idx);
218222
bool setInitialized = false;
219223
for (auto i = offset; i < (offset + size); ++i) {
220-
auto innerIdx = decodedInnerArray->index(i);
221-
auto innerOffset = innerArray->offsetAt(innerIdx);
222-
auto innerSize = innerArray->sizeAt(innerIdx);
223-
224224
// 1. prepare for next iteration
225225
indicesCursor = rawNewOffsets[row];
226226
SetWithNull<T> intermediateSet;
@@ -234,6 +234,10 @@ class ArraysIntersectSingleParam : public exec::VectorFunction {
234234
break;
235235
}
236236

237+
auto innerIdx = decodedInnerArray->index(i);
238+
auto innerOffset = innerArray->offsetAt(innerIdx);
239+
auto innerSize = innerArray->sizeAt(innerIdx);
240+
237241
// 3. Regular array
238242
for (auto j = innerOffset; j < (innerOffset + innerSize); ++j) {
239243
// null element

velox/functions/prestosql/WidthBucketArray.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,12 @@ class WidthBucketArrayFunction : public exec::VectorFunction {
8080
auto rawSizes = binsArray->rawSizes();
8181
auto rawOffsets = binsArray->rawOffsets();
8282
auto elementsVector = binsArray->elements();
83-
auto elementsRows =
84-
toElementRows(elementsVector->size(), rows, binsArray, bins->indices());
83+
auto elementsRows = toElementRows(
84+
elementsVector->size(),
85+
rows,
86+
binsArray,
87+
bins->nulls(&rows),
88+
bins->indices());
8589
exec::LocalDecodedVector elementsHolder(
8690
context, *elementsVector, elementsRows);
8791

velox/functions/prestosql/aggregates/PrestoHasher.cpp

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -245,14 +245,10 @@ void PrestoHasher::hash<TypeKind::ARRAY>(
245245
BufferPtr& hashes) {
246246
auto baseArray = vector_->base()->as<ArrayVector>();
247247
auto indices = vector_->indices();
248-
249-
auto nonNullRows = SelectivityVector(rows);
250-
if (vector_->nulls(&nonNullRows)) {
251-
nonNullRows.deselectNulls(vector_->nulls(), 0, nonNullRows.end());
252-
}
248+
auto decodedNulls = vector_->nulls(&rows);
253249

254250
auto elementRows = functions::toElementRows(
255-
baseArray->elements()->size(), nonNullRows, baseArray, indices);
251+
baseArray->elements()->size(), rows, baseArray, decodedNulls, indices);
256252

257253
BufferPtr elementHashes =
258254
AlignedBuffer::allocate<int64_t>(elementRows.end(), baseArray->pool());
@@ -263,7 +259,6 @@ void PrestoHasher::hash<TypeKind::ARRAY>(
263259
auto rawOffsets = baseArray->rawOffsets();
264260
auto rawElementHashes = elementHashes->as<int64_t>();
265261
auto rawHashes = hashes->asMutable<int64_t>();
266-
auto decodedNulls = vector_->nulls();
267262

268263
rows.applyToSelected([&](auto row) {
269264
int64_t hash = 0;
@@ -285,15 +280,11 @@ void PrestoHasher::hash<TypeKind::MAP>(
285280
BufferPtr& hashes) {
286281
auto baseMap = vector_->base()->as<MapVector>();
287282
auto indices = vector_->indices();
283+
auto decodedNulls = vector_->nulls(&rows);
288284
VELOX_CHECK_EQ(children_.size(), 2);
289285

290-
auto nonNullRows = SelectivityVector(rows);
291-
if (vector_->nulls(&nonNullRows)) {
292-
nonNullRows.deselectNulls(vector_->nulls(), 0, nonNullRows.end());
293-
}
294-
295286
auto elementRows = functions::toElementRows(
296-
baseMap->mapKeys()->size(), nonNullRows, baseMap, indices);
287+
baseMap->mapKeys()->size(), rows, baseMap, decodedNulls, indices);
297288
BufferPtr keyHashes =
298289
AlignedBuffer::allocate<int64_t>(elementRows.end(), baseMap->pool());
299290

@@ -309,7 +300,6 @@ void PrestoHasher::hash<TypeKind::MAP>(
309300

310301
auto rawSizes = baseMap->rawSizes();
311302
auto rawOffsets = baseMap->rawOffsets();
312-
auto decodedNulls = vector_->nulls();
313303

314304
rows.applyToSelected([&](auto row) {
315305
int64_t hash = 0;

velox/functions/prestosql/tests/ArrayIntersectTest.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -718,3 +718,20 @@ TEST_F(ArrayIntersectTest, timestampWithTimezone) {
718718
{pack(1, 0), pack(8, 6)},
719719
{pack(8, 10), pack(1, 11)});
720720
}
721+
722+
TEST_F(ArrayIntersectTest, nullInDictionary) {
723+
// Test that if the array elements are dictionary encoded, nulls in the
724+
// dictionary are propagated correctly.
725+
auto elements =
726+
makeArrayVector<int64_t>({{1, 2, 3}, {1, 5, 6}, {7, 8, 9}, {7}});
727+
auto indices = makeIndices({0, 1, 50, 2, 3});
728+
auto nulls = makeNulls({false, false, true, false, false});
729+
auto encodedElements =
730+
BaseVector::wrapInDictionary(nulls, indices, 5, elements);
731+
732+
auto arrays = makeArrayVector({0, 2, 3}, encodedElements);
733+
734+
auto expected =
735+
makeNullableArrayVector<int64_t>({{{1}}, std::nullopt, {{7}}});
736+
testExpr(expected, "array_intersect(c0)", {arrays});
737+
}

0 commit comments

Comments
 (0)