Skip to content

Commit 579f13d

Browse files
jagillmeta-codesync[bot]
authored andcommitted
refactor(geo): Implement RTree SpatialIndex for faster spatial joins (facebookincubator#15492)
Summary: Pull Request resolved: facebookincubator#15492 The previous flat spatial index required us to scan all bounding boxes (minus the ones we could discard by binary search). RTrees have a hierarchical set of bounding boxes, allow us to skip many bounding boxes with a single check. This results in a faster spatial index Results from the SpatialJoinBenchmark: Strategy | 1,1 Unf | 1,1 Cls | 50,5 Unf | 50,5 Cls | 25,5 L Unf | 25,5 L Cls | 200,50 Unf -------------|---------|---------|----------|--------|------|------|-------- Flat | 364.00 | 559.28 | 1.99 | 2.12 | 3.99 | 4.94 | 0.03027 Hilbert-32 | 383.03 | 545.45 | 4.50 | 2.62 | 8.78 | 5.76 | 0.13577 Hilbert-64 | 388.38 | 534.55 | 4.44 | 2.76 | 8.63 | 5.69 | 0.13694 Hilbert-128 | 364.52 | 520.60 | 4.31 | 2.76 | 8.42 | 5.38 | 0.12061 Hilbert-256 | 351.38 | 498.73 | 4.08 | 2.64 | 8.04 | 5.92 | 0.13350 Entries are in iterations per second, larger is better. Legend: Hilbert-N: Hilbert sorted RTree with branch size N N,M: Probe/build size, in thousands of rows. This 50,5 means 50k probe rows, 5k build Unf: Uniform distribution Cls: Clustered distribution L: Left join (otherwise inner) Reviewed By: kgpai Differential Revision: D86127954 fbshipit-source-id: 4b5473215db47318a729dc26c574e75ace1e55c3
1 parent 349dad3 commit 579f13d

File tree

3 files changed

+269
-65
lines changed

3 files changed

+269
-65
lines changed

velox/exec/SpatialIndex.cpp

Lines changed: 132 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -18,54 +18,157 @@
1818
#include <vector>
1919

2020
#include "velox/common/base/Exceptions.h"
21+
#include "velox/exec/HilbertIndex.h"
2122
#include "velox/exec/SpatialIndex.h"
2223

2324
namespace facebook::velox::exec {
2425

25-
SpatialIndex::SpatialIndex(Envelope bounds, std::vector<Envelope> envelopes)
26-
: bounds_(std::move(bounds)) {
27-
std::ranges::sort(envelopes, {}, &Envelope::minX);
26+
std::vector<size_t> RTreeLevel::query(
27+
const Envelope& queryEnv,
28+
const std::vector<size_t>& branchIndices) const {
29+
std::vector<size_t> result;
2830

29-
minXs_.reserve(envelopes.size());
30-
minYs_.reserve(envelopes.size());
31-
maxXs_.reserve(envelopes.size());
32-
maxYs_.reserve(envelopes.size());
33-
rowIndices_.reserve(envelopes.size());
31+
for (size_t branchIdx : branchIndices) {
32+
size_t startIdx = branchIdx * branchSize_;
33+
size_t endIdx = std::min(startIdx + branchSize_, minXs_.size());
34+
for (size_t idx = startIdx; idx < endIdx; ++idx) {
35+
bool intersects = (queryEnv.maxX >= minXs_[idx]) &&
36+
(queryEnv.maxY >= minYs_[idx]) && (queryEnv.minX <= maxXs_[idx]) &&
37+
(queryEnv.minY <= maxYs_[idx]);
38+
if (intersects) {
39+
result.push_back(idx);
40+
}
41+
}
42+
}
3443

44+
return result;
45+
}
46+
47+
namespace {
48+
std::pair<RTreeLevel, std::vector<Envelope>> buildLevel(
49+
uint32_t branchSize,
50+
const std::vector<Envelope>& envelopes) {
51+
std::vector<float> minXs;
52+
minXs.reserve(envelopes.size());
53+
std::vector<float> minYs;
54+
minYs.reserve(envelopes.size());
55+
std::vector<float> maxXs;
56+
maxXs.reserve(envelopes.size());
57+
std::vector<float> maxYs;
58+
maxYs.reserve(envelopes.size());
59+
60+
std::vector<Envelope> parentEnvelopes;
61+
parentEnvelopes.reserve((envelopes.size() + branchSize - 1) / branchSize);
62+
Envelope currentBounds = Envelope::empty();
63+
64+
uint32_t idx = 0;
3565
for (const auto& env : envelopes) {
36-
minXs_.push_back(env.minX);
37-
minYs_.push_back(env.minY);
38-
maxXs_.push_back(env.maxX);
39-
maxYs_.push_back(env.maxY);
40-
rowIndices_.push_back(env.rowIndex);
66+
++idx;
67+
currentBounds.maxX = std::max(currentBounds.maxX, env.maxX);
68+
currentBounds.maxY = std::max(currentBounds.maxY, env.maxY);
69+
currentBounds.minX = std::min(currentBounds.minX, env.minX);
70+
currentBounds.minY = std::min(currentBounds.minY, env.minY);
71+
if (idx % branchSize == 0) {
72+
parentEnvelopes.push_back(currentBounds);
73+
currentBounds = Envelope::empty();
74+
}
75+
76+
minXs.push_back(env.minX);
77+
minYs.push_back(env.minY);
78+
maxXs.push_back(env.maxX);
79+
maxYs.push_back(env.maxY);
80+
}
81+
82+
if (!currentBounds.isEmpty()) {
83+
parentEnvelopes.push_back(currentBounds);
4184
}
85+
86+
return {
87+
RTreeLevel(
88+
branchSize,
89+
std::move(minXs),
90+
std::move(minYs),
91+
std::move(maxXs),
92+
std::move(maxYs)),
93+
std::move(parentEnvelopes)};
4294
}
95+
} // namespace
4396

44-
std::vector<int32_t> SpatialIndex::query(const Envelope& queryEnv) const {
45-
std::vector<int32_t> result;
46-
if (!Envelope::intersects(queryEnv, bounds_)) {
47-
return result;
97+
SpatialIndex::SpatialIndex(
98+
Envelope bounds,
99+
std::vector<Envelope> envelopes,
100+
uint32_t branchSize)
101+
: branchSize_(branchSize), bounds_(std::move(bounds)) {
102+
VELOX_CHECK_GT(branchSize_, 1);
103+
104+
if (!bounds_.isEmpty()) {
105+
HilbertIndex hilbert(
106+
bounds_.minX, bounds_.minY, bounds_.maxX, bounds_.maxY);
107+
108+
std::sort(
109+
envelopes.begin(), envelopes.end(), [&](const auto& a, const auto& b) {
110+
return hilbert.indexOf(a.minX, a.minY) <
111+
hilbert.indexOf(b.minX, b.minY);
112+
});
48113
}
49114

50-
// Find the last minX that is <= queryEnv.maxX . These first envelopes
51-
// are the only ones that can intersect the query envelope.
52-
// `it` is _one past_ the last element, so we iterate up to it - 1.
53-
auto it = std::upper_bound(minXs_.begin(), minXs_.end(), queryEnv.maxX);
54-
if (it == minXs_.begin()) {
115+
rowIndices_.reserve(envelopes.size());
116+
for (const auto& env : envelopes) {
117+
VELOX_CHECK(env.minX >= bounds_.minX);
118+
VELOX_CHECK(env.minY >= bounds_.minY);
119+
VELOX_CHECK(env.maxX <= bounds_.maxX);
120+
VELOX_CHECK(env.maxY <= bounds_.maxY);
121+
rowIndices_.push_back(env.rowIndex);
122+
}
123+
124+
if (envelopes.size() > 0) {
125+
size_t numLevels =
126+
std::ceil(std::log(envelopes.size()) / std::log(branchSize_));
127+
levels_.reserve(numLevels);
128+
}
129+
130+
while (envelopes.size() > branchSize_) {
131+
auto [level, parentEnvelopes] = buildLevel(branchSize_, envelopes);
132+
levels_.push_back(std::move(level));
133+
envelopes = std::move(parentEnvelopes);
134+
}
135+
136+
if (envelopes.size() > 1 || levels_.empty()) {
137+
levels_.push_back(buildLevel(branchSize_, envelopes).first);
138+
}
139+
140+
VELOX_CHECK_GT(branchSize_ + 1, levels_.back().size());
141+
}
142+
143+
std::vector<vector_size_t> SpatialIndex::query(const Envelope& queryEnv) const {
144+
std::vector<vector_size_t> result;
145+
if (!Envelope::intersects(queryEnv, bounds_)) {
55146
return result;
56147
}
57148

58-
auto lastIdx = std::distance(minXs_.begin(), it);
59-
VELOX_CHECK_GT(lastIdx, 0);
149+
size_t thisLevel = levels_.size() - 1;
150+
VELOX_CHECK_GT(levels_[thisLevel].size(), 0);
151+
VELOX_CHECK_GT(branchSize_ + 1, levels_[thisLevel].size());
60152

61-
for (size_t idx = 0; idx < lastIdx; ++idx) {
62-
bool intersects = (queryEnv.maxY >= minYs_[idx]) &&
63-
(queryEnv.minX <= maxXs_[idx]) && (queryEnv.minY <= maxYs_[idx]);
64-
if (intersects) {
65-
result.push_back(rowIndices_[idx]);
153+
// The top level should have only one branch.
154+
std::vector<size_t> childIndices = {0};
155+
for (; thisLevel > 0; --thisLevel) {
156+
// Avoiding thisLevel = 0 due to int underflow
157+
childIndices = levels_[thisLevel].query(queryEnv, childIndices);
158+
// If we have no matches, return.
159+
if (childIndices.empty()) {
160+
return result;
66161
}
67162
}
68163

164+
// We're at level 0 now. The indices index into rowIndices.
165+
VELOX_DCHECK_EQ(thisLevel, 0);
166+
childIndices = levels_[thisLevel].query(queryEnv, childIndices);
167+
result.reserve(childIndices.size());
168+
for (auto idx : childIndices) {
169+
result.push_back(rowIndices_[idx]);
170+
}
171+
69172
return result;
70173
}
71174

velox/exec/SpatialIndex.h

Lines changed: 65 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <cstdint>
2121
#include <limits>
2222
#include <vector>
23+
#include "velox/vector/TypeAliases.h"
2324

2425
namespace facebook::velox::exec {
2526

@@ -73,11 +74,11 @@ namespace facebook::velox::exec {
7374
/// nextUp to maxX/Ys, the float32 envelope intersects in all cases that the
7475
/// float64 envelope would (but not necessarily the converse).
7576
struct Envelope {
76-
float minX;
77-
float minY;
78-
float maxX;
79-
float maxY;
80-
int32_t rowIndex = -1;
77+
float minX{std::numeric_limits<float>::infinity()};
78+
float minY{std::numeric_limits<float>::infinity()};
79+
float maxX{-std::numeric_limits<float>::infinity()};
80+
float maxY{-std::numeric_limits<float>::infinity()};
81+
vector_size_t rowIndex = -1;
8182

8283
/// Returns true if the intersection of two envelopes is not empty.
8384
static inline bool intersects(const Envelope& left, const Envelope& right) {
@@ -114,7 +115,7 @@ struct Envelope {
114115
double minY,
115116
double maxX,
116117
double maxY,
117-
int32_t rowIndex = -1) {
118+
vector_size_t rowIndex = -1) {
118119
return Envelope{
119120
.minX = std::nextafterf(
120121
static_cast<float>(minX), -std::numeric_limits<float>::infinity()),
@@ -136,6 +137,52 @@ struct Envelope {
136137
}
137138
};
138139

140+
/// A single level of an R-tree. It is a set of envelopes that can be linearly
141+
/// scanned for envelope intersection.
142+
class RTreeLevel {
143+
public:
144+
RTreeLevel(const RTreeLevel&) = delete;
145+
RTreeLevel& operator=(const RTreeLevel&) = delete;
146+
147+
RTreeLevel() = default;
148+
RTreeLevel(RTreeLevel&&) = default;
149+
RTreeLevel& operator=(RTreeLevel&&) = default;
150+
~RTreeLevel() = default;
151+
152+
explicit RTreeLevel(
153+
size_t branchSize,
154+
std::vector<float> minXs,
155+
std::vector<float> minYs,
156+
std::vector<float> maxXs,
157+
std::vector<float> maxYs)
158+
: branchSize_{branchSize},
159+
minXs_(std::move(minXs)),
160+
minYs_(std::move(minYs)),
161+
maxXs_(std::move(maxXs)),
162+
maxYs_(std::move(maxYs)) {}
163+
164+
/// Returns the internal indices of all envelopes that probeEnv intersects.
165+
/// Order of the returned indices is an implementation detail and cannot be
166+
/// relied upon.
167+
/// This does not do a short-circuit bounds check: the caller should do that
168+
/// first.
169+
std::vector<size_t> query(
170+
const Envelope& queryEnv,
171+
const std::vector<size_t>& branchIndices) const;
172+
173+
size_t size() const {
174+
return minXs_.size();
175+
}
176+
177+
private:
178+
size_t branchSize_{};
179+
Envelope bounds_;
180+
std::vector<float> minXs_{};
181+
std::vector<float> minYs_{};
182+
std::vector<float> maxXs_{};
183+
std::vector<float> maxYs_{};
184+
};
185+
139186
/// A spatial index for a set of geometries. The index only cares about the
140187
/// envelopes of the geometries, and an index into the geometries (not stored in
141188
/// SpatialIndex).
@@ -154,29 +201,31 @@ class SpatialIndex {
154201
SpatialIndex& operator=(SpatialIndex&&) = default;
155202
~SpatialIndex() = default;
156203

204+
static const uint32_t kDefaultRTreeBranchSize = 32;
205+
157206
/// Constructs a spatial index from envelopes contained with `bounds`.
158207
/// `bounds` must contain all envelopes in `envelopes`, otherwise the
159-
/// an assertio will fail. Envelopes should not contains
160-
/// and NaN coordinates.
161-
explicit SpatialIndex(Envelope bounds, std::vector<Envelope> envelopes);
208+
/// an assertio will fail. Envelopes should not contain NaN coordinates.
209+
explicit SpatialIndex(
210+
Envelope bounds,
211+
std::vector<Envelope> envelopes,
212+
uint32_t branchSize = kDefaultRTreeBranchSize);
162213

163214
/// Returns the row indices of all envelopes that probeEnv intersects.
164215
/// Order of the returned indices is an implementation detail and cannot be
165216
/// relied upon.
166-
std::vector<int32_t> query(const Envelope& queryEnv) const;
217+
std::vector<vector_size_t> query(const Envelope& queryEnv) const;
167218

168219
/// Returns the envelope of the all envelopes in the index.
169220
/// The returned envelope will have index = -1.
170221
Envelope bounds() const;
171222

172223
private:
173-
Envelope bounds_ = Envelope::empty();
224+
uint32_t branchSize_ = kDefaultRTreeBranchSize;
174225

175-
std::vector<double> minXs_{};
176-
std::vector<double> minYs_{};
177-
std::vector<double> maxXs_{};
178-
std::vector<double> maxYs_{};
179-
std::vector<int32_t> rowIndices_{};
226+
Envelope bounds_ = Envelope::empty();
227+
std::vector<RTreeLevel> levels_{};
228+
std::vector<vector_size_t> rowIndices_{};
180229
};
181230

182231
} // namespace facebook::velox::exec

0 commit comments

Comments
 (0)