diff --git a/scripts/data_generators/tests/count_star_optimizer/__init__.py b/scripts/data_generators/tests/count_star_optimizer/__init__.py new file mode 100644 index 00000000..b4dc6bcd --- /dev/null +++ b/scripts/data_generators/tests/count_star_optimizer/__init__.py @@ -0,0 +1,9 @@ +from scripts.data_generators.tests.base import IcebergTest +import pathlib + + +@IcebergTest.register() +class Test(IcebergTest): + def __init__(self): + path = pathlib.PurePath(__file__) + super().__init__(path.parent.name) diff --git a/scripts/data_generators/tests/count_star_optimizer/q00.sql b/scripts/data_generators/tests/count_star_optimizer/q00.sql new file mode 100644 index 00000000..8b52c75b --- /dev/null +++ b/scripts/data_generators/tests/count_star_optimizer/q00.sql @@ -0,0 +1,13 @@ +CREATE OR REPLACE TABLE default.count_star_optimizer ( + id INT, + name STRING +) +USING iceberg +TBLPROPERTIES ( + 'format-version' = '2', + 'write.delete.mode' = 'merge-on-read', + 'write.merge.mode' = 'merge-on-read', + 'write.update.mode' = 'merge-on-read', + 'write.data.partition-columns' = false, + 'write.parquet.write-partition-values' = false +); \ No newline at end of file diff --git a/scripts/data_generators/tests/count_star_optimizer/q01.sql b/scripts/data_generators/tests/count_star_optimizer/q01.sql new file mode 100644 index 00000000..730adabb --- /dev/null +++ b/scripts/data_generators/tests/count_star_optimizer/q01.sql @@ -0,0 +1 @@ +INSERT INTO default.count_star_optimizer VALUES (1,'aaa'), (2,'bbb'); \ No newline at end of file diff --git a/src/iceberg_functions/iceberg_multi_file_list.cpp b/src/iceberg_functions/iceberg_multi_file_list.cpp index b907c071..4065546c 100644 --- a/src/iceberg_functions/iceberg_multi_file_list.cpp +++ b/src/iceberg_functions/iceberg_multi_file_list.cpp @@ -235,6 +235,36 @@ unique_ptr IcebergMultiFileList::GetCardinality(ClientContext &c return make_uniq(cardinality, cardinality); } +void IcebergMultiFileList::GetStatistics(vector &result) const { + if (GetMetadata().iceberg_version == 1) { + //! We collect no statistics information from manifests for V1 tables. + return; + } + + if (!transaction_delete_manifests.empty() || !delete_manifests.empty()) { + //! if exist delete_manifests , return; + return; + } + + idx_t count = 0; + for (idx_t i = 0; i < data_manifests.size(); i++) { + count += data_manifests[i].existing_rows_count; + count += data_manifests[i].added_rows_count; + } + + for (idx_t i = 0; i < transaction_data_manifests.size(); i++) { + auto files = transaction_data_manifests[i].get().data_files; + for (idx_t j = 0; j < files.size(); j++) { + count += files[j].record_count; + } + } + + PartitionStatistics partition_stats; + partition_stats.count = count; + partition_stats.count_type = CountType::COUNT_EXACT; + result.push_back(partition_stats); +} + IcebergPredicateStats IcebergPredicateStats::DeserializeBounds(const Value &lower_bound, const Value &upper_bound, const string &name, const LogicalType &type) { IcebergPredicateStats res; diff --git a/src/iceberg_functions/iceberg_multi_file_reader.cpp b/src/iceberg_functions/iceberg_multi_file_reader.cpp index e458c3ed..5a318b4a 100644 --- a/src/iceberg_functions/iceberg_multi_file_reader.cpp +++ b/src/iceberg_functions/iceberg_multi_file_reader.cpp @@ -467,4 +467,13 @@ unique_ptr IcebergMultiFileReader::GetVirtualColumnExpression( global_column_reference); } +vector IcebergMultiFileReader::IcebergGetPartitionStats(ClientContext &context, + GetPartitionStatsInput &input) { + auto &bind_data = input.bind_data->Cast(); + vector result; + auto &multi_file_list = bind_data.file_list->Cast(); + multi_file_list.GetStatistics(result); + return result; +} + } // namespace duckdb diff --git a/src/iceberg_functions/iceberg_scan.cpp b/src/iceberg_functions/iceberg_scan.cpp index 8376b112..70e79bc7 100644 --- a/src/iceberg_functions/iceberg_scan.cpp +++ b/src/iceberg_functions/iceberg_scan.cpp @@ -74,6 +74,7 @@ TableFunctionSet IcebergFunctions::GetIcebergScanFunction(ExtensionLoader &loade function.table_scan_progress = nullptr; function.get_bind_info = nullptr; function.get_virtual_columns = IcebergVirtualColumns; + function.get_partition_stats = IcebergMultiFileReader::IcebergGetPartitionStats; // Schema param is just confusing here function.named_parameters.erase("schema"); diff --git a/src/include/iceberg_multi_file_list.hpp b/src/include/iceberg_multi_file_list.hpp index 52256e19..2d13300d 100644 --- a/src/include/iceberg_multi_file_list.hpp +++ b/src/include/iceberg_multi_file_list.hpp @@ -54,6 +54,7 @@ struct IcebergMultiFileList : public MultiFileList { unique_ptr GetPositionalDeletesForFile(const string &file_path) const; void ProcessDeletes(const vector &global_columns, const vector &column_indexes) const; + void GetStatistics(vector &result) const; public: //! MultiFileList API diff --git a/src/include/iceberg_multi_file_reader.hpp b/src/include/iceberg_multi_file_reader.hpp index 06a9d414..9da7503f 100644 --- a/src/include/iceberg_multi_file_reader.hpp +++ b/src/include/iceberg_multi_file_reader.hpp @@ -37,6 +37,7 @@ struct IcebergMultiFileReader : public MultiFileReader { public: static unique_ptr CreateInstance(const TableFunction &table); + static vector IcebergGetPartitionStats(ClientContext &context, GetPartitionStatsInput &input); public: shared_ptr CreateFileList(ClientContext &context, const vector &paths, diff --git a/test/sql/local/irc/insert/count_star_optimizer.test b/test/sql/local/irc/insert/count_star_optimizer.test new file mode 100644 index 00000000..2fdb3d52 --- /dev/null +++ b/test/sql/local/irc/insert/count_star_optimizer.test @@ -0,0 +1,83 @@ +# name: test/sql/local/irc/insert/count_star_optimizer.test +# description: Test count star optimizer in an iceberg scan +# group: [insert] + +require-env ICEBERG_SERVER_AVAILABLE + +require avro + +require parquet + +require iceberg + +require httpfs + +# Do not ignore 'HTTP' error messages! +set ignore_error_messages + +statement ok +set enable_logging=true + +statement ok +set logging_level='debug' + +statement ok +CREATE SECRET ( + TYPE S3, + KEY_ID 'admin', + SECRET 'password', + ENDPOINT '127.0.0.1:9000', + URL_STYLE 'path', + USE_SSL 0 +); + +statement ok +ATTACH '' AS my_datalake ( + TYPE ICEBERG, + CLIENT_ID 'admin', + CLIENT_SECRET 'password', + ENDPOINT 'http://127.0.0.1:8181' +); + +# use statistics information. +query II +explain SELECT count(*) FROM my_datalake.default.count_star_optimizer; +---- +physical_plan :.*COLUMN_DATA_SCAN.*~1.* + + +statement ok +BEGIN; + +query I +select count(*) from my_datalake.default.count_star_optimizer; +---- +2 + +statement ok +insert into my_datalake.default.count_star_optimizer select 3,'ccc'; + +query I +select count(*) from my_datalake.default.count_star_optimizer; +---- +3 + +statement ok +ABORT; + +statement ok +delete from my_datalake.default.count_star_optimizer where id = 1; + +query I +select count(*) from my_datalake.default.count_star_optimizer; +---- +1 + +# contain delete file, do not use the statistics information. +query II +explain SELECT count(*) FROM my_datalake.default.count_star_optimizer; +---- +physical_plan :.*ICEBERG_SCAN.*rows.* + + +