diff --git a/velox/connectors/hive/iceberg/CMakeLists.txt b/velox/connectors/hive/iceberg/CMakeLists.txt index 0c6662595097..5822cb7b4d2f 100644 --- a/velox/connectors/hive/iceberg/CMakeLists.txt +++ b/velox/connectors/hive/iceberg/CMakeLists.txt @@ -15,6 +15,7 @@ velox_add_library( velox_hive_iceberg_splitreader IcebergDataSink.cpp + IcebergPartitionPath.cpp IcebergSplit.cpp IcebergSplitReader.cpp PartitionSpec.cpp diff --git a/velox/connectors/hive/iceberg/IcebergPartitionPath.cpp b/velox/connectors/hive/iceberg/IcebergPartitionPath.cpp new file mode 100644 index 000000000000..2e6429597197 --- /dev/null +++ b/velox/connectors/hive/iceberg/IcebergPartitionPath.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/connectors/hive/iceberg/IcebergPartitionPath.h" +#include "velox/common/encode/Base64.h" + +namespace facebook::velox::connector::hive::iceberg { + +std::string IcebergPartitionPath::toPartitionString( + int32_t value, + const TypePtr& type) const { + constexpr int32_t kEpochYear = 1970; + switch (transformType_) { + case TransformType::kIdentity: { + if (type->isDate()) { + return DATE()->toString(value); + } + return fmt::to_string(value); + } + case TransformType::kDay: + return DATE()->toString(value); + case TransformType::kYear: + return fmt::format("{:04d}", kEpochYear + value); + case TransformType::kMonth: { + int32_t year = kEpochYear + value / 12; + int32_t month = 1 + value % 12; + if (month <= 0) { + month += 12; + year -= 1; + } + return fmt::format("{:04d}-{:02d}", year, month); + } + case TransformType::kHour: { + int64_t seconds = static_cast(value) * 3600; + std::tm tmValue; + VELOX_USER_CHECK( + Timestamp::epochToCalendarUtc(seconds, tmValue), + "Failed to convert seconds to time: {}", + seconds); + return fmt::format( + "{:04d}-{:02d}-{:02d}-{:02d}", + tmValue.tm_year + 1900, + tmValue.tm_mon + 1, + tmValue.tm_mday, + tmValue.tm_hour); + } + default: + return fmt::to_string(value); + } +} + +std::string IcebergPartitionPath::toPartitionString( + Timestamp value, + const TypePtr& type) const { + VELOX_CHECK(transformType_ == TransformType::kIdentity); + TimestampToStringOptions options; + options.precision = TimestampPrecision::kMilliseconds; + options.zeroPaddingYear = true; + options.skipTrailingZeros = true; + options.leadingPositiveSign = true; + return value.toString(options); +} + +std::string IcebergPartitionPath::toPartitionString( + StringView value, + const TypePtr& type) const { + if (type->isVarbinary()) { + return encoding::Base64::encode(value.data(), value.size()); + } + return std::string(value); +} + +} // namespace facebook::velox::connector::hive::iceberg diff --git a/velox/connectors/hive/iceberg/IcebergPartitionPath.h b/velox/connectors/hive/iceberg/IcebergPartitionPath.h new file mode 100644 index 000000000000..7e6108b074f5 --- /dev/null +++ b/velox/connectors/hive/iceberg/IcebergPartitionPath.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "velox/connectors/hive/HivePartitionUtil.h" +#include "velox/connectors/hive/iceberg/PartitionSpec.h" + +namespace facebook::velox::connector::hive::iceberg { + +/// Converts a partition value to its string representation for use in +/// partition directory path. The implementation follows the behavior of +/// the Apache Iceberg Java library for partition path name. +class IcebergPartitionPath : public HivePartitionUtil { + public: + explicit IcebergPartitionPath(TransformType transformType) + : transformType_(transformType) {} + + using HivePartitionUtil::toPartitionString; + + /// Converts an int32_t partition key to its string representation based on + /// the transform type: + /// - kIdentity: For DATE type return "YYYY-MM-DD" format (e.g., + /// "2025-11-07"). + /// For other types return the value as-is (e.g., "-123"). + /// - kDay: Returns date in "YYYY-MM-DD" format (e.g., "2025-11-07"). + /// - kYear: Returns 4-digit year "YYYY" (e.g., "2025"). + /// - kMonth: Returns "YYYY-MM" format (e.g., "2025-01"). + /// - kHour: Returns "YYYY-MM-DD-HH" format (e.g., "2025-11-07-21"). + std::string toPartitionString(int32_t value, const TypePtr& type) + const override; + + /// Returns timestamp formatted with milliseconds precision, zero-padded year, + /// trailing zeros skipped, and leading positive sign for years >= 10000. + /// Examples: + /// - Timestamp(0, 0) -> "1970-01-01T00:00:00". + /// - Timestamp(1609459200, 999000000) -> "2021-01-01T00:00:00.999". + /// - Timestamp(1640995200, 500000000) -> "2022-01-01T00:00:00.5". + /// - Timestamp(-1, 999000000) -> "1969-12-31T23:59:59.999". + /// - Timestamp(253402300800, 100000000) -> "+10000-01-01T00:00:00.1". + std::string toPartitionString(Timestamp value, const TypePtr& type) + const override; + + /// Converts a StringView partition key to its string representation. + /// - For VARBINARY type returns Base64-encoded string. + /// - For VARCHAR type returns the string value as-is. + std::string toPartitionString(StringView value, const TypePtr& type) + const override; + + private: + const TransformType transformType_; +}; + +using IcebergPartitionPathPtr = std::shared_ptr; + +} // namespace facebook::velox::connector::hive::iceberg diff --git a/velox/connectors/hive/iceberg/tests/CMakeLists.txt b/velox/connectors/hive/iceberg/tests/CMakeLists.txt index 8abc11f7b9a1..bed84d138c1b 100644 --- a/velox/connectors/hive/iceberg/tests/CMakeLists.txt +++ b/velox/connectors/hive/iceberg/tests/CMakeLists.txt @@ -60,6 +60,7 @@ if(NOT VELOX_DISABLE_GOOGLETEST) add_executable( velox_hive_iceberg_insert_test IcebergInsertTest.cpp + IcebergPartitionPathTest.cpp IcebergTestBase.cpp Main.cpp PartitionSpecTest.cpp diff --git a/velox/connectors/hive/iceberg/tests/IcebergPartitionPathTest.cpp b/velox/connectors/hive/iceberg/tests/IcebergPartitionPathTest.cpp new file mode 100644 index 000000000000..01c20d321e8f --- /dev/null +++ b/velox/connectors/hive/iceberg/tests/IcebergPartitionPathTest.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "velox/connectors/hive/iceberg/IcebergPartitionPath.h" +#include "velox/type/Type.h" + +namespace facebook::velox::connector::hive::iceberg { + +namespace { + +template +std::string toPath(TransformType transform, T value, const TypePtr& type) { + return IcebergPartitionPath(transform).toPartitionString(value, type); +} + +std::string timestampToPath(const Timestamp& timestamp) { + return toPath(TransformType::kIdentity, timestamp, TIMESTAMP()); +} + +std::string testString( + const std::string& value, + const TypePtr& typePtr = VARCHAR()) { + auto identityResult = + toPath(TransformType::kIdentity, StringView(value), typePtr); + auto truncateResult = + toPath(TransformType::kTruncate, StringView(value), typePtr); + EXPECT_EQ(identityResult, truncateResult); + return identityResult; +} + +std::string testVarbinary(const std::string& value) { + return testString(value, VARBINARY()); +} + +std::string testInteger(int32_t value) { + auto identity = toPath(TransformType::kIdentity, value, INTEGER()); + auto bucket = toPath(TransformType::kBucket, value, INTEGER()); + auto trunc = toPath(TransformType::kTruncate, value, INTEGER()); + EXPECT_EQ(identity, trunc); + EXPECT_EQ(bucket, trunc); + return trunc; +} + +TEST(IcebergPartitionPathTest, integer) { + EXPECT_EQ(testInteger(0), "0"); + EXPECT_EQ(testInteger(1), "1"); + EXPECT_EQ(testInteger(100), "100"); + EXPECT_EQ(testInteger(-100), "-100"); + EXPECT_EQ(testInteger(128), "128"); + EXPECT_EQ(testInteger(1024), "1024"); +} + +TEST(IcebergPartitionPathTest, date) { + EXPECT_EQ(toPath(TransformType::kIdentity, 18'262, DATE()), "2020-01-01"); + EXPECT_EQ(toPath(TransformType::kIdentity, 0, DATE()), "1970-01-01"); + EXPECT_EQ(toPath(TransformType::kIdentity, -1, DATE()), "1969-12-31"); + EXPECT_EQ(toPath(TransformType::kIdentity, 2'932'897, DATE()), "10000-01-01"); +} + +TEST(IcebergPartitionPathTest, boolean) { + EXPECT_EQ(toPath(TransformType::kIdentity, true, BOOLEAN()), "true"); + EXPECT_EQ(toPath(TransformType::kIdentity, false, BOOLEAN()), "false"); +} + +TEST(IcebergPartitionPathTest, string) { + EXPECT_EQ(testString("a/b/c=d"), "a/b/c=d"); + EXPECT_EQ(testString(""), ""); + EXPECT_EQ(testString("abc"), "abc"); +} + +TEST(IcebergPartitionPathTest, varbinary) { + EXPECT_EQ(testVarbinary("\x48\x65\x6c\x6c\x6f"), "SGVsbG8="); + EXPECT_EQ(testVarbinary("\x1\x2\x3"), "AQID"); + EXPECT_EQ(testVarbinary(""), ""); +} + +TEST(IcebergPartitionPathTest, timestamp) { + EXPECT_EQ(timestampToPath(Timestamp(0, 0)), "1970-01-01T00:00:00"); + EXPECT_EQ( + timestampToPath(Timestamp(1'609'459'200, 999'000'000)), + "2021-01-01T00:00:00.999"); + EXPECT_EQ( + timestampToPath(Timestamp(1'640'995'200, 500'000'000)), + "2022-01-01T00:00:00.5"); + EXPECT_EQ( + timestampToPath(Timestamp(-1, 999'000'000)), "1969-12-31T23:59:59.999"); + EXPECT_EQ( + timestampToPath(Timestamp(253'402'300'800, 100'000'000)), + "+10000-01-01T00:00:00.1"); + EXPECT_EQ( + timestampToPath(Timestamp(-62'170'000'000, 0)), "-0001-11-29T19:33:20"); + EXPECT_EQ( + timestampToPath(Timestamp(-62'167'219'199, 0)), "0000-01-01T00:00:01"); +} + +TEST(IcebergPartitionPathTest, year) { + EXPECT_EQ(toPath(TransformType::kYear, 0, INTEGER()), "1970"); + EXPECT_EQ(toPath(TransformType::kYear, 1, INTEGER()), "1971"); + EXPECT_EQ(toPath(TransformType::kYear, 8'030, INTEGER()), "10000"); + EXPECT_EQ(toPath(TransformType::kYear, -1, INTEGER()), "1969"); + EXPECT_EQ(toPath(TransformType::kYear, -50, INTEGER()), "1920"); +} + +TEST(IcebergPartitionPathTest, month) { + EXPECT_EQ(toPath(TransformType::kMonth, 0, INTEGER()), "1970-01"); + EXPECT_EQ(toPath(TransformType::kMonth, 1, INTEGER()), "1970-02"); + EXPECT_EQ(toPath(TransformType::kMonth, 11, INTEGER()), "1970-12"); + EXPECT_EQ(toPath(TransformType::kMonth, 612, INTEGER()), "2021-01"); + EXPECT_EQ(toPath(TransformType::kMonth, -1, INTEGER()), "1969-12"); + EXPECT_EQ(toPath(TransformType::kMonth, -13, INTEGER()), "1968-12"); +} + +TEST(IcebergPartitionPathTest, day) { + EXPECT_EQ(toPath(TransformType::kDay, 0, DATE()), "1970-01-01"); + EXPECT_EQ(toPath(TransformType::kDay, 1, DATE()), "1970-01-02"); + EXPECT_EQ(toPath(TransformType::kDay, 18'262, DATE()), "2020-01-01"); + EXPECT_EQ(toPath(TransformType::kDay, -1, DATE()), "1969-12-31"); +} + +TEST(IcebergPartitionPathTest, hour) { + EXPECT_EQ(toPath(TransformType::kHour, 0, INTEGER()), "1970-01-01-00"); + EXPECT_EQ(toPath(TransformType::kHour, 1, INTEGER()), "1970-01-01-01"); + EXPECT_EQ(toPath(TransformType::kHour, 24, INTEGER()), "1970-01-02-00"); + EXPECT_EQ(toPath(TransformType::kHour, 438'288, INTEGER()), "2020-01-01-00"); + EXPECT_EQ(toPath(TransformType::kHour, -1, INTEGER()), "1969-12-31-23"); +} + +} // namespace + +} // namespace facebook::velox::connector::hive::iceberg