Skip to content

Commit a3e1bb4

Browse files
rui-moglutenperfbot
authored andcommitted
Support struct schema evolution matching by name
Alchemy-item: [ [5962] Support struct schema evolution matching by name](#805 (comment)) commit 1/1 - e3d77fd
1 parent 19f95dc commit a3e1bb4

17 files changed

+321
-33
lines changed

velox/connectors/hive/SplitReader.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -416,11 +416,17 @@ std::vector<TypePtr> SplitReader::adaptColumns(
416416
auto fileTypeIdx = fileType->getChildIdxIfExists(fieldName);
417417
if (!fileTypeIdx.has_value()) {
418418
// Column is missing. Most likely due to schema evolution.
419-
VELOX_CHECK(tableSchema, "Unable to resolve column '{}'", fieldName);
419+
auto outputTypeIdx = readerOutputType_->getChildIdxIfExists(fieldName);
420+
TypePtr fieldType;
421+
if (outputTypeIdx.has_value()) {
422+
// Field name exists in the user-specified output type.
423+
fieldType = readerOutputType_->childAt(outputTypeIdx.value());
424+
} else {
425+
VELOX_CHECK(tableSchema, "Unable to resolve column '{}'", fieldName);
426+
fieldType = tableSchema->findChild(fieldName);
427+
}
420428
childSpec->setConstantValue(BaseVector::createNullConstant(
421-
tableSchema->findChild(fieldName),
422-
1,
423-
connectorQueryCtx_->memoryPool()));
429+
fieldType, 1, connectorQueryCtx_->memoryPool()));
424430
} else {
425431
// Column no longer missing, reset constant value set on the spec.
426432
childSpec->setConstantValue(nullptr);

velox/dwio/common/ScanSpec.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ bool ScanSpec::hasFilter() const {
157157
if (hasFilter_.has_value()) {
158158
return hasFilter_.value();
159159
}
160-
if (!isConstant() && filter()) {
160+
if (filter()) {
161161
hasFilter_ = true;
162162
return true;
163163
}

velox/dwio/common/SelectiveFlatMapColumnReader.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,13 @@ namespace facebook::velox::dwio::common {
2424
class SelectiveFlatMapColumnReader : public SelectiveStructColumnReaderBase {
2525
protected:
2626
SelectiveFlatMapColumnReader(
27+
const dwio::common::ColumnReaderOptions& columnReaderOptions,
2728
const TypePtr& requestedType,
2829
const std::shared_ptr<const dwio::common::TypeWithId>& fileType,
2930
FormatParams& params,
3031
velox::common::ScanSpec& scanSpec)
3132
: SelectiveStructColumnReaderBase(
33+
columnReaderOptions,
3234
requestedType,
3335
fileType,
3436
params,

velox/dwio/common/SelectiveStructColumnReader.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,6 @@ void SelectiveStructColumnReaderBase::read(
426426
}
427427

428428
const auto& childSpecs = scanSpec_->children();
429-
VELOX_CHECK(!childSpecs.empty());
430429
for (size_t i = 0; i < childSpecs.size(); ++i) {
431430
const auto& childSpec = childSpecs[i];
432431
VELOX_TRACE_HISTORY_PUSH("read %s", childSpec->fieldName().c_str());
@@ -526,15 +525,17 @@ bool SelectiveStructColumnReaderBase::isChildMissing(
526525
// row type that doesn't exist
527526
// in the output.
528527
fileType_->type()->kind() !=
529-
TypeKind::MAP && // If this is the case it means this is a flat map,
530-
// so it can't have "missing" fields.
531-
childSpec.channel() >= fileType_->size());
528+
TypeKind::MAP // If this is the case it means this is a flat map,
529+
// so it can't have "missing" fields.
530+
) &&
531+
(columnReaderOptions_.useColumnNamesForColumnMapping_
532+
? !asRowType(fileType_->type())->containsChild(childSpec.fieldName())
533+
: childSpec.channel() >= fileType_->size());
532534
}
533535

534536
void SelectiveStructColumnReaderBase::getValues(
535537
const RowSet& rows,
536538
VectorPtr* result) {
537-
VELOX_CHECK(!scanSpec_->children().empty());
538539
VELOX_CHECK_NOT_NULL(
539540
*result, "SelectiveStructColumnReaderBase expects a non-null result");
540541
VELOX_CHECK(

velox/dwio/common/SelectiveStructColumnReader.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
#pragma once
1818

19+
#include "velox/dwio/common/Options.h"
1920
#include "velox/dwio/common/SelectiveColumnReaderInternal.h"
2021

2122
namespace facebook::velox::dwio::common {
@@ -111,13 +112,15 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader {
111112
static constexpr int32_t kConstantChildSpecSubscript = -1;
112113

113114
SelectiveStructColumnReaderBase(
115+
const dwio::common::ColumnReaderOptions& columnReaderOptions,
114116
const TypePtr& requestedType,
115117
const std::shared_ptr<const dwio::common::TypeWithId>& fileType,
116118
FormatParams& params,
117119
velox::common::ScanSpec& scanSpec,
118120
bool isRoot = false,
119121
bool generateLazyChildren = true)
120122
: SelectiveColumnReader(requestedType, fileType, params, scanSpec),
123+
columnReaderOptions_(columnReaderOptions),
121124
debugString_(
122125
getExceptionContext().message(VeloxException::Type::kSystem)),
123126
isRoot_(isRoot),
@@ -172,6 +175,8 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader {
172175
}
173176
}
174177

178+
const dwio::common::ColumnReaderOptions& columnReaderOptions_;
179+
175180
// Context information obtained from ExceptionContext. Stored here
176181
// so that LazyVector readers under this can add this to their
177182
// ExceptionContext. Allows contextualizing reader errors to split

velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ class SelectiveFlatMapAsStructReader : public SelectiveStructColumnReaderBase {
203203
DwrfParams& params,
204204
common::ScanSpec& scanSpec)
205205
: SelectiveStructColumnReaderBase(
206+
columnReaderOptions,
206207
requestedType,
207208
fileType,
208209
params,
@@ -241,6 +242,7 @@ class SelectiveFlatMapAsMapReader : public SelectiveStructColumnReaderBase {
241242
DwrfParams& params,
242243
common::ScanSpec& scanSpec)
243244
: SelectiveStructColumnReaderBase(
245+
columnReaderOptions,
244246
requestedType,
245247
fileType,
246248
params,
@@ -280,6 +282,7 @@ class SelectiveFlatMapReader
280282
DwrfParams& params,
281283
common::ScanSpec& scanSpec)
282284
: dwio::common::SelectiveFlatMapColumnReader(
285+
columnReaderOptions,
283286
requestedType,
284287
fileType,
285288
params,

velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ SelectiveStructColumnReader::SelectiveStructColumnReader(
3131
common::ScanSpec& scanSpec,
3232
bool isRoot)
3333
: SelectiveStructColumnReaderBase(
34+
columnReaderOptions,
3435
requestedType,
3536
fileType,
3637
params,

velox/dwio/dwrf/reader/SelectiveStructColumnReader.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,15 @@ class SelectiveStructColumnReaderBase
2525
: public dwio::common::SelectiveStructColumnReaderBase {
2626
public:
2727
SelectiveStructColumnReaderBase(
28+
const dwio::common::ColumnReaderOptions& columnReaderOptions,
2829
const TypePtr& requestedType,
2930
const std::shared_ptr<const dwio::common::TypeWithId>& fileType,
3031
DwrfParams& params,
3132
common::ScanSpec& scanSpec,
3233
bool isRoot = false,
3334
bool generateLazyChildren = true)
3435
: dwio::common::SelectiveStructColumnReaderBase(
36+
columnReaderOptions,
3537
requestedType,
3638
fileType,
3739
params,

velox/dwio/parquet/reader/ParquetColumnReader.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ std::unique_ptr<dwio::common::SelectiveColumnReader> ParquetColumnReader::build(
3838
const TypePtr& requestedType,
3939
const std::shared_ptr<const dwio::common::TypeWithId>& fileType,
4040
ParquetParams& params,
41-
common::ScanSpec& scanSpec) {
41+
common::ScanSpec& scanSpec,
42+
memory::MemoryPool& pool) {
4243
auto colName = scanSpec.fieldName();
4344

4445
switch (fileType->type()->kind()) {
@@ -59,19 +60,19 @@ std::unique_ptr<dwio::common::SelectiveColumnReader> ParquetColumnReader::build(
5960

6061
case TypeKind::ROW:
6162
return std::make_unique<StructColumnReader>(
62-
columnReaderOptions, requestedType, fileType, params, scanSpec);
63+
columnReaderOptions, requestedType, fileType, params, scanSpec, pool);
6364

6465
case TypeKind::VARBINARY:
6566
case TypeKind::VARCHAR:
6667
return std::make_unique<StringColumnReader>(fileType, params, scanSpec);
6768

6869
case TypeKind::ARRAY:
6970
return std::make_unique<ListColumnReader>(
70-
columnReaderOptions, requestedType, fileType, params, scanSpec);
71+
columnReaderOptions, requestedType, fileType, params, scanSpec, pool);
7172

7273
case TypeKind::MAP:
7374
return std::make_unique<MapColumnReader>(
74-
columnReaderOptions, requestedType, fileType, params, scanSpec);
75+
columnReaderOptions, requestedType, fileType, params, scanSpec, pool);
7576

7677
case TypeKind::BOOLEAN:
7778
return std::make_unique<BooleanColumnReader>(

velox/dwio/parquet/reader/ParquetColumnReader.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ class ParquetColumnReader {
4747
const TypePtr& requestedType,
4848
const std::shared_ptr<const dwio::common::TypeWithId>& fileType,
4949
ParquetParams& params,
50-
common::ScanSpec& scanSpec);
50+
common::ScanSpec& scanSpec,
51+
memory::MemoryPool& pool);
5152
};
5253
} // namespace facebook::velox::parquet

0 commit comments

Comments
 (0)