Skip to content

Commit 219ec2a

Browse files
committed
Add Iceberg partition path utility
1 parent 3debc3b commit 219ec2a

File tree

5 files changed

+317
-0
lines changed

5 files changed

+317
-0
lines changed

velox/connectors/hive/iceberg/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
velox_add_library(
1616
velox_hive_iceberg_splitreader
1717
IcebergDataSink.cpp
18+
IcebergPartitionPath.cpp
1819
IcebergSplit.cpp
1920
IcebergSplitReader.cpp
2021
PartitionSpec.cpp
velox/connectors/hive/iceberg/IcebergPartitionPath.cpp

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#include "velox/connectors/hive/iceberg/IcebergPartitionPath.h"
17+
#include "velox/common/encode/Base64.h"
18+
19+
namespace facebook::velox::connector::hive::iceberg {
20+
21+
// Calendar year of the Unix epoch. Year and month offsets handled by
// toPartitionString() are counted from this year.
constexpr int32_t kEpochYear = 1970;
22+
23+
std::string IcebergPartitionPath::toPartitionString(
24+
int32_t value,
25+
const TypePtr& type) const {
26+
switch (transformType_) {
27+
case TransformType::kIdentity: {
28+
if (type->isDate()) {
29+
return DATE()->toString(value);
30+
}
31+
return folly::to<std::string>(value);
32+
}
33+
case TransformType::kDay:
34+
return DATE()->toString(value);
35+
case TransformType::kYear:
36+
return fmt::format("{:04d}", kEpochYear + value);
37+
case TransformType::kMonth: {
38+
int32_t year = kEpochYear + value / 12;
39+
int32_t month = 1 + value % 12;
40+
if (month <= 0) {
41+
month += 12;
42+
year -= 1;
43+
}
44+
return fmt::format("{:04d}-{:02d}", year, month);
45+
}
46+
case TransformType::kHour: {
47+
int64_t seconds = static_cast<int64_t>(value) * 3600;
48+
std::tm tmValue;
49+
VELOX_USER_CHECK(
50+
Timestamp::epochToCalendarUtc(seconds, tmValue),
51+
"Failed to convert seconds to time: {}",
52+
seconds);
53+
return fmt::format(
54+
"{:04d}-{:02d}-{:02d}-{:02d}",
55+
tmValue.tm_year + 1900,
56+
tmValue.tm_mon + 1,
57+
tmValue.tm_mday,
58+
tmValue.tm_hour);
59+
}
60+
default:
61+
return folly::to<std::string>(value);
62+
}
63+
}
64+
65+
/// Renders a Timestamp partition value as partition-path text.
///
/// The timestamp is printed with millisecond precision, a zero-padded year,
/// trailing fractional zeros removed and a leading '+' enabled, e.g.
/// Timestamp(0, 0) -> "1970-01-01T00:00:00" and
/// Timestamp(1640995200, 500000000) -> "2022-01-01T00:00:00.5".
///
/// @param value Timestamp to format.
/// @param type Not consulted by this overload.
/// @return Formatted timestamp text.
std::string IcebergPartitionPath::toPartitionString(
    Timestamp value,
    const TypePtr& type) const {
  TimestampToStringOptions formatOptions;
  formatOptions.leadingPositiveSign = true;
  formatOptions.skipTrailingZeros = true;
  formatOptions.zeroPaddingYear = true;
  formatOptions.precision = TimestampPrecision::kMilliseconds;
  return value.toString(formatOptions);
}
75+
76+
/// Renders a string-like partition value as partition-path text.
///
/// @param value Bytes of the partition value.
/// @param type Type of the partition value; VARBINARY selects Base64 output.
/// @return Base64 encoding of 'value' for VARBINARY, otherwise a copy of
/// 'value' unchanged.
std::string IcebergPartitionPath::toPartitionString(
    StringView value,
    const TypePtr& type) const {
  if (!type->isVarbinary()) {
    // Non-binary strings are used verbatim.
    return std::string(value);
  }
  return encoding::Base64::encode(value.data(), value.size());
}
84+
85+
} // namespace facebook::velox::connector::hive::iceberg
velox/connectors/hive/iceberg/IcebergPartitionPath.h

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include "velox/connectors/hive/HivePartitionUtil.h"
20+
#include "velox/connectors/hive/iceberg/PartitionSpec.h"
21+
22+
namespace facebook::velox::connector::hive::iceberg {
23+
24+
/// Converts a partition value to its string representation for use in
25+
/// partition directory path. The implementation follows the behavior of
26+
/// the Apache Iceberg Java library for partition path name.
27+
class IcebergPartitionPath : public HivePartitionUtil {
28+
public:
29+
explicit IcebergPartitionPath(TransformType transformType)
30+
: transformType_(transformType) {}
31+
32+
using HivePartitionUtil::toPartitionString;
33+
34+
/// Converts an int32_t partition key to its string representation based on
35+
/// the transform type:
36+
/// - kIdentity: For DATE type return "YYYY-MM-DD" format (e.g.,
37+
/// "2025-11-07").
38+
/// For other types return the value as-is (e.g., "-123").
39+
/// - kDay: Returns date in "YYYY-MM-DD" format (e.g., "2025-11-07").
40+
/// - kYear: Returns 4-digit year "YYYY" (e.g., "2025").
41+
/// - kMonth: Returns "YYYY-MM" format (e.g., "2025-01").
42+
/// - kHour: Returns "YYYY-MM-DD-HH" format (e.g., "2025-11-07-21").
43+
std::string toPartitionString(int32_t value, const TypePtr& type)
44+
const override;
45+
46+
/// Converts a Timestamp partition key to its string representation.
47+
/// Returns timestamp formatted with milliseconds precision, zero-padded year,
48+
/// trailing zeros skipped, and leading positive sign for years >= 10000.
49+
/// Examples:
50+
/// - Timestamp(0, 0) -> "1970-01-01T00:00:00".
51+
/// - Timestamp(1609459200, 999000000) -> "2021-01-01T00:00:00.999".
52+
/// - Timestamp(1640995200, 500000000) -> "2022-01-01T00:00:00.5".
53+
/// - Timestamp(-1, 999000000) -> "1969-12-31T23:59:59.999".
54+
/// - Timestamp(253402300800, 100000000) -> "+10000-01-01T00:00:00.1".
55+
std::string toPartitionString(Timestamp value, const TypePtr& type)
56+
const override;
57+
58+
/// Converts a StringView partition key to its string representation.
59+
/// - For VARBINARY type returns Base64-encoded string.
60+
/// - For VARCHAR type returns the string value as-is.
61+
std::string toPartitionString(StringView value, const TypePtr& type)
62+
const override;
63+
64+
private:
65+
const TransformType transformType_;
66+
};
67+
68+
/// Shared-ownership handle to an immutable IcebergPartitionPath.
using IcebergPartitionPathPtr = std::shared_ptr<const IcebergPartitionPath>;
69+
70+
} // namespace facebook::velox::connector::hive::iceberg

velox/connectors/hive/iceberg/tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ if(NOT VELOX_DISABLE_GOOGLETEST)
6060
add_executable(
6161
velox_hive_iceberg_insert_test
6262
IcebergInsertTest.cpp
63+
IcebergPartitionPathTest.cpp
6364
IcebergTestBase.cpp
6465
Main.cpp
6566
PartitionSpecTest.cpp
velox/connectors/hive/iceberg/tests/IcebergPartitionPathTest.cpp

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <gtest/gtest.h>
18+
19+
#include "velox/connectors/hive/iceberg/IcebergPartitionPath.h"
20+
#include "velox/type/Type.h"
21+
22+
namespace facebook::velox::connector::hive::iceberg {
23+
24+
namespace {
25+
26+
template <typename T>
27+
std::string test(TransformType transform, T value, const TypePtr& type) {
28+
return IcebergPartitionPath(transform).toPartitionString(value, type);
29+
}
30+
31+
// Identity transform on INTEGER prints the plain decimal value.
TEST(IcebergPartitionPathTest, identityInt32) {
  const auto format = [](int32_t value) {
    return test(TransformType::kIdentity, value, INTEGER());
  };

  EXPECT_EQ(format(100), "100");
  EXPECT_EQ(format(-100), "-100");
  EXPECT_EQ(format(0), "0");
}
36+
37+
// Identity transform on DATE prints days-since-epoch as a calendar date,
// including dates before the epoch and years with five digits.
TEST(IcebergPartitionPathTest, identityDate) {
  const auto format = [](int32_t daysSinceEpoch) {
    return test(TransformType::kIdentity, daysSinceEpoch, DATE());
  };

  EXPECT_EQ(format(18'262), "2020-01-01");
  EXPECT_EQ(format(0), "1970-01-01");
  EXPECT_EQ(format(-1), "1969-12-31");
  EXPECT_EQ(format(2'932'897), "10000-01-01");
}
43+
44+
// Identity transform on BOOLEAN prints "true" / "false".
TEST(IcebergPartitionPathTest, identityBoolean) {
  const auto format = [](bool value) {
    return test(TransformType::kIdentity, value, BOOLEAN());
  };

  EXPECT_EQ(format(true), "true");
  EXPECT_EQ(format(false), "false");
}
48+
49+
// Identity transform on VARCHAR returns the string unchanged, including
// characters such as '/' and '=' and the empty string.
TEST(IcebergPartitionPathTest, identityVarchar) {
  const auto format = [](StringView value) {
    return test(TransformType::kIdentity, value, VARCHAR());
  };

  EXPECT_EQ(format(StringView("a/b/c=d")), "a/b/c=d");
  EXPECT_EQ(format(StringView("")), "");
}
55+
56+
// Identity transform on VARBINARY returns the Base64 encoding of the bytes.
TEST(IcebergPartitionPathTest, identityVarbinary) {
  const auto format = [](StringView bytes) {
    return test(TransformType::kIdentity, bytes, VARBINARY());
  };

  // The bytes of "Hello"; the encoded form needs one padding character.
  EXPECT_EQ(format(StringView("\x48\x65\x6c\x6c\x6f")), "SGVsbG8=");
  // Three bytes encode without padding.
  EXPECT_EQ(format(StringView("\x1\x2\x3")), "AQID");
  // Empty input yields empty output.
  EXPECT_EQ(format(StringView("")), "");
}
68+
69+
// Identity transform on TIMESTAMP: millisecond precision, trailing fractional
// zeros dropped, four-digit zero-padded year, explicit sign outside 0..9999.
TEST(IcebergPartitionPathTest, identityTimestamp) {
  const auto format = [](Timestamp value) {
    return test(TransformType::kIdentity, value, TIMESTAMP());
  };

  // No fractional part is printed when the fraction is zero.
  EXPECT_EQ(format(Timestamp(0, 0)), "1970-01-01T00:00:00");
  EXPECT_EQ(
      format(Timestamp(1609459200, 999000000)), "2021-01-01T00:00:00.999");
  // Trailing zeros of the fraction are removed.
  EXPECT_EQ(format(Timestamp(1640995200, 500000000)), "2022-01-01T00:00:00.5");
  // Just before the epoch.
  EXPECT_EQ(format(Timestamp(-1, 999000000)), "1969-12-31T23:59:59.999");
  // Five-digit year gets a leading '+'.
  EXPECT_EQ(
      format(Timestamp(253402300800, 100000000)), "+10000-01-01T00:00:00.1");
  // Negative year.
  EXPECT_EQ(format(Timestamp(-62170000000, 0)), "-0001-11-29T19:33:20");
  // Year zero is padded to four digits.
  EXPECT_EQ(format(Timestamp(-62167219199, 0)), "0000-01-01T00:00:01");
}
101+
102+
// Year transform: the value is an offset in years from 1970.
TEST(IcebergPartitionPathTest, year) {
  struct Case {
    int32_t yearsSinceEpoch;
    const char* expected;
  };
  const Case kCases[] = {
      {0, "1970"},
      {1, "1971"},
      {8030, "10000"},
      {-1, "1969"},
      {-50, "1920"},
  };

  for (const auto& testCase : kCases) {
    EXPECT_EQ(
        test(TransformType::kYear, testCase.yearsSinceEpoch, INTEGER()),
        testCase.expected)
        << "yearsSinceEpoch=" << testCase.yearsSinceEpoch;
  }
}
109+
110+
// Month transform: the value is an offset in months from 1970-01; negative
// offsets roll back into earlier years.
TEST(IcebergPartitionPathTest, month) {
  struct Case {
    int32_t monthsSinceEpoch;
    const char* expected;
  };
  const Case kCases[] = {
      {0, "1970-01"},
      {1, "1970-02"},
      {11, "1970-12"},
      {612, "2021-01"},
      {-1, "1969-12"},
      {-13, "1968-12"},
  };

  for (const auto& testCase : kCases) {
    EXPECT_EQ(
        test(TransformType::kMonth, testCase.monthsSinceEpoch, INTEGER()),
        testCase.expected)
        << "monthsSinceEpoch=" << testCase.monthsSinceEpoch;
  }
}
118+
119+
// Day transform: the value is an offset in days from 1970-01-01.
TEST(IcebergPartitionPathTest, day) {
  struct Case {
    int32_t daysSinceEpoch;
    const char* expected;
  };
  const Case kCases[] = {
      {0, "1970-01-01"},
      {1, "1970-01-02"},
      {18'262, "2020-01-01"},
      {-1, "1969-12-31"},
  };

  for (const auto& testCase : kCases) {
    EXPECT_EQ(
        test(TransformType::kDay, testCase.daysSinceEpoch, DATE()),
        testCase.expected)
        << "daysSinceEpoch=" << testCase.daysSinceEpoch;
  }
}
125+
126+
// Hour transform: the value is an offset in hours from 1970-01-01 00:00.
TEST(IcebergPartitionPathTest, hour) {
  struct Case {
    int32_t hoursSinceEpoch;
    const char* expected;
  };
  const Case kCases[] = {
      {0, "1970-01-01-00"},
      {1, "1970-01-01-01"},
      {24, "1970-01-02-00"},
      {438'288, "2020-01-01-00"},
      {-1, "1969-12-31-23"},
  };

  for (const auto& testCase : kCases) {
    EXPECT_EQ(
        test(TransformType::kHour, testCase.hoursSinceEpoch, INTEGER()),
        testCase.expected)
        << "hoursSinceEpoch=" << testCase.hoursSinceEpoch;
  }
}
133+
134+
// Bucket transform: the bucket number is printed as a plain integer.
TEST(IcebergPartitionPathTest, bucket) {
  struct Case {
    int32_t bucket;
    const char* expected;
  };
  const Case kCases[] = {
      {0, "0"},
      {1, "1"},
      {128, "128"},
  };

  for (const auto& testCase : kCases) {
    EXPECT_EQ(
        test(TransformType::kBucket, testCase.bucket, INTEGER()),
        testCase.expected)
        << "bucket=" << testCase.bucket;
  }
}
139+
140+
// Truncate transform on INTEGER: the value is printed as a plain integer,
// including negative values.
TEST(IcebergPartitionPathTest, truncateInteger) {
  struct Case {
    int32_t value;
    const char* expected;
  };
  const Case kCases[] = {
      {0, "0"},
      {10, "10"},
      {100, "100"},
      {-10, "-10"},
      {-100, "-100"},
      {-1000, "-1000"},
  };

  for (const auto& testCase : kCases) {
    EXPECT_EQ(
        test(TransformType::kTruncate, testCase.value, INTEGER()),
        testCase.expected)
        << "value=" << testCase.value;
  }
}
148+
149+
// Truncate transform on string-like values: VARCHAR is returned unchanged and
// VARBINARY is Base64-encoded.
TEST(IcebergPartitionPathTest, truncateString) {
  const auto format = [](StringView value, const TypePtr& type) {
    return test(TransformType::kTruncate, value, type);
  };

  EXPECT_EQ(format(StringView("abc"), VARCHAR()), "abc");
  EXPECT_EQ(format(StringView(""), VARCHAR()), "");
  EXPECT_EQ(format(StringView("\x1\x2\x3"), VARBINARY()), "AQID");
}
157+
158+
} // namespace
159+
160+
} // namespace facebook::velox::connector::hive::iceberg

0 commit comments

Comments
 (0)