Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from scripts.data_generators.tests.base import IcebergTest
import pathlib


@IcebergTest.register()
class Test(IcebergTest):
    """Registers the SQL scripts in this directory as an Iceberg generator test.

    The test name passed to the base class is simply the name of the
    directory that contains this file (e.g. ``count_star_optimizer``).
    """

    def __init__(self):
        # Derive the test name from this file's parent directory.
        directory_name = pathlib.PurePath(__file__).parent.name
        super().__init__(directory_name)
13 changes: 13 additions & 0 deletions scripts/data_generators/tests/count_star_optimizer/q00.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- Create the Iceberg v2 table exercised by the count(*) optimizer test.
CREATE OR REPLACE TABLE default.count_star_optimizer (
id INT,
name STRING
)
USING iceberg
TBLPROPERTIES (
-- Statistics-based count(*) is only attempted for format version 2 tables.
'format-version' = '2',
-- Merge-on-read writes separate delete files instead of rewriting data
-- files; the optimizer must detect those and fall back to a full scan.
'write.delete.mode' = 'merge-on-read',
'write.merge.mode' = 'merge-on-read',
'write.update.mode' = 'merge-on-read',
'write.data.partition-columns' = false,
'write.parquet.write-partition-values' = false
);
1 change: 1 addition & 0 deletions scripts/data_generators/tests/count_star_optimizer/q01.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-- Seed two rows so the test can assert an exact manifest-derived count of 2.
INSERT INTO default.count_star_optimizer VALUES (1,'aaa'), (2,'bbb');
30 changes: 30 additions & 0 deletions src/iceberg_functions/iceberg_multi_file_list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,36 @@ unique_ptr<NodeStatistics> IcebergMultiFileList::GetCardinality(ClientContext &c
return make_uniq<NodeStatistics>(cardinality, cardinality);
}

//! Collect an exact row count for this table from manifest metadata, if one
//! can be trusted. On success a single PartitionStatistics entry with
//! CountType::COUNT_EXACT is appended to 'result'; otherwise 'result' is left
//! untouched and the caller falls back to scanning.
void IcebergMultiFileList::GetStatistics(vector<PartitionStatistics> &result) const {
	if (GetMetadata().iceberg_version == 1) {
		//! We collect no statistics information from manifests for V1 tables.
		return;
	}

	if (!transaction_delete_manifests.empty() || !delete_manifests.empty()) {
		//! Delete manifests can invalidate the manifest-level row counts
		//! (deleted rows are still counted there), so report nothing and let
		//! the scan produce the count instead.
		return;
	}

	//! Sum the row counts recorded in the committed data manifests.
	idx_t count = 0;
	for (idx_t i = 0; i < data_manifests.size(); i++) {
		count += data_manifests[i].existing_rows_count;
		count += data_manifests[i].added_rows_count;
	}

	//! Add rows written by the current (uncommitted) transaction.
	for (idx_t i = 0; i < transaction_data_manifests.size(); i++) {
		//! Bind by reference - the original copied the entire data_files
		//! vector on every iteration.
		auto &files = transaction_data_manifests[i].get().data_files;
		for (idx_t j = 0; j < files.size(); j++) {
			count += files[j].record_count;
		}
	}

	PartitionStatistics partition_stats;
	partition_stats.count = count;
	partition_stats.count_type = CountType::COUNT_EXACT;
	result.push_back(partition_stats);
}

IcebergPredicateStats IcebergPredicateStats::DeserializeBounds(const Value &lower_bound, const Value &upper_bound,
const string &name, const LogicalType &type) {
IcebergPredicateStats res;
Expand Down
9 changes: 9 additions & 0 deletions src/iceberg_functions/iceberg_multi_file_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -467,4 +467,13 @@ unique_ptr<Expression> IcebergMultiFileReader::GetVirtualColumnExpression(
global_column_reference);
}

//! Callback for TableFunction::get_partition_stats: delegates to the Iceberg
//! file list, which decides whether an exact row count can be derived from
//! the manifest metadata. Returns an empty vector when no stats are available.
vector<PartitionStatistics> IcebergMultiFileReader::IcebergGetPartitionStats(ClientContext &context,
                                                                             GetPartitionStatsInput &input) {
	auto &bind_data = input.bind_data->Cast<MultiFileBindData>();
	auto &file_list = bind_data.file_list->Cast<IcebergMultiFileList>();
	vector<PartitionStatistics> stats;
	file_list.GetStatistics(stats);
	return stats;
}

} // namespace duckdb
1 change: 1 addition & 0 deletions src/iceberg_functions/iceberg_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ TableFunctionSet IcebergFunctions::GetIcebergScanFunction(ExtensionLoader &loade
function.table_scan_progress = nullptr;
function.get_bind_info = nullptr;
function.get_virtual_columns = IcebergVirtualColumns;
function.get_partition_stats = IcebergMultiFileReader::IcebergGetPartitionStats;

// Schema param is just confusing here
function.named_parameters.erase("schema");
Expand Down
1 change: 1 addition & 0 deletions src/include/iceberg_multi_file_list.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ struct IcebergMultiFileList : public MultiFileList {
unique_ptr<DeleteFilter> GetPositionalDeletesForFile(const string &file_path) const;
void ProcessDeletes(const vector<MultiFileColumnDefinition> &global_columns,
const vector<ColumnIndex> &column_indexes) const;
void GetStatistics(vector<PartitionStatistics> &result) const;

public:
//! MultiFileList API
Expand Down
1 change: 1 addition & 0 deletions src/include/iceberg_multi_file_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ struct IcebergMultiFileReader : public MultiFileReader {

public:
static unique_ptr<MultiFileReader> CreateInstance(const TableFunction &table);
static vector<PartitionStatistics> IcebergGetPartitionStats(ClientContext &context, GetPartitionStatsInput &input);

public:
shared_ptr<MultiFileList> CreateFileList(ClientContext &context, const vector<string> &paths,
Expand Down
83 changes: 83 additions & 0 deletions test/sql/local/irc/insert/count_star_optimizer.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# name: test/sql/local/irc/insert/count_star_optimizer.test
# description: Test count star optimizer in an iceberg scan
# group: [insert]

require-env ICEBERG_SERVER_AVAILABLE

require avro

require parquet

require iceberg

require httpfs

# Do not ignore 'HTTP' error messages!
set ignore_error_messages

statement ok
set enable_logging=true

statement ok
set logging_level='debug'

statement ok
CREATE SECRET (
TYPE S3,
KEY_ID 'admin',
SECRET 'password',
ENDPOINT '127.0.0.1:9000',
URL_STYLE 'path',
USE_SSL 0
);

statement ok
ATTACH '' AS my_datalake (
TYPE ICEBERG,
CLIENT_ID 'admin',
CLIENT_SECRET 'password',
ENDPOINT 'http://127.0.0.1:8181'
);

# The count(*) can be answered from manifest statistics, so the plan collapses to a COLUMN_DATA_SCAN.
query II
explain SELECT count(*) FROM my_datalake.default.count_star_optimizer;
----
physical_plan <REGEX>:.*COLUMN_DATA_SCAN.*~1.*


statement ok
BEGIN;

query I
select count(*) from my_datalake.default.count_star_optimizer;
----
2

statement ok
insert into my_datalake.default.count_star_optimizer select 3,'ccc';

query I
select count(*) from my_datalake.default.count_star_optimizer;
----
3

statement ok
ABORT;

statement ok
delete from my_datalake.default.count_star_optimizer where id = 1;

query I
select count(*) from my_datalake.default.count_star_optimizer;
----
1

# The table now contains a delete file, so the statistics cannot be used and a full ICEBERG_SCAN is planned.
query II
explain SELECT count(*) FROM my_datalake.default.count_star_optimizer;
----
physical_plan <REGEX>:.*ICEBERG_SCAN.*rows.*



Loading