Skip to content

Commit 1bd480e

Browse files
kgpaifacebook-github-bot
authored andcommitted
fix: Add support to parse illegal unicode in json_parse (#11744)
Summary: Currently json_parse will throw if it encounters bad / illegal unicode in json whereas Presto java will add placeholder chars (�) and carry on parsing the string. This PR attempts to replicate this behavior. Pull Request resolved: #11744 Reviewed By: kevinwilfong Differential Revision: D66774226 Pulled By: kgpai fbshipit-source-id: 3048584017732a647d23f1d85245dd59024a7311
1 parent c12207a commit 1bd480e

File tree

7 files changed

+221
-39
lines changed

7 files changed

+221
-39
lines changed

velox/functions/lib/Utf8Utils.cpp

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,4 +173,75 @@ tryGetUtf8CharLength(const char* input, int64_t size, int32_t& codePoint) {
173173
return -1;
174174
}
175175

176+
bool hasInvalidUTF8(const char* input, int32_t len) {
177+
for (size_t inputIndex = 0; inputIndex < len;) {
178+
if (IS_ASCII(input[inputIndex])) {
179+
// Ascii
180+
inputIndex++;
181+
} else {
182+
// Unicode
183+
int32_t codePoint;
184+
auto charLength =
185+
tryGetUtf8CharLength(input + inputIndex, len - inputIndex, codePoint);
186+
if (charLength < 0) {
187+
return true;
188+
}
189+
inputIndex += charLength;
190+
}
191+
}
192+
193+
return false;
194+
}
195+
196+
size_t replaceInvalidUTF8Characters(
197+
char* outputBuffer,
198+
const char* input,
199+
int32_t len) {
200+
size_t inputIndex = 0;
201+
size_t outputIndex = 0;
202+
203+
while (inputIndex < len) {
204+
if (IS_ASCII(input[inputIndex])) {
205+
outputBuffer[outputIndex++] = input[inputIndex++];
206+
} else {
207+
// Unicode
208+
int32_t codePoint;
209+
const auto charLength =
210+
tryGetUtf8CharLength(input + inputIndex, len - inputIndex, codePoint);
211+
if (charLength > 0) {
212+
std::memcpy(outputBuffer + outputIndex, input + inputIndex, charLength);
213+
outputIndex += charLength;
214+
inputIndex += charLength;
215+
} else {
216+
size_t replaceCharactersToWriteOut = inputIndex < len - 1 &&
217+
isMultipleInvalidSequences(input, inputIndex)
218+
? -charLength
219+
: 1;
220+
const auto& replacementCharacterString =
221+
kReplacementCharacterStrings[replaceCharactersToWriteOut - 1];
222+
std::memcpy(
223+
outputBuffer + outputIndex,
224+
replacementCharacterString.data(),
225+
replacementCharacterString.size());
226+
outputIndex += replacementCharacterString.size();
227+
inputIndex += -charLength;
228+
}
229+
}
230+
}
231+
232+
return outputIndex;
233+
}
234+
235+
template <>
236+
void replaceInvalidUTF8Characters(
237+
std::string& out,
238+
const char* input,
239+
int32_t len) {
240+
auto maxLen = len * kReplacementCharacterStrings[0].size();
241+
out.resize(maxLen);
242+
auto outputBuffer = out.data();
243+
auto outputIndex = replaceInvalidUTF8Characters(outputBuffer, input, len);
244+
out.resize(outputIndex);
245+
}
246+
176247
} // namespace facebook::velox::functions

velox/functions/lib/Utf8Utils.h

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323

2424
namespace facebook::velox::functions {
2525

26+
#define IS_ASCII(x) !((x) & 0x80)
27+
2628
/// This function is not part of the original utf8proc.
2729
/// Tries to get the length of UTF-8 encoded code point. A
2830
/// positive return value means the UTF-8 sequence is valid, and
@@ -86,4 +88,75 @@ FOLLY_ALWAYS_INLINE int validateAndGetNextUtf8Length(
8688
/// -1 for invalid UTF-8 first byte.
8789
int firstByteCharLength(const char* u_input);
8890

91+
/// Invalid character replacement matrix.
92+
constexpr std::array<std::string_view, 6> kReplacementCharacterStrings{
93+
"\xef\xbf\xbd",
94+
"\xef\xbf\xbd\xef\xbf\xbd",
95+
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
96+
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
97+
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
98+
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd"};
99+
100+
/// Returns true if there are multiple UTF-8 invalid sequences.
101+
template <typename T>
102+
FOLLY_ALWAYS_INLINE bool isMultipleInvalidSequences(
103+
const T& inputBuffer,
104+
size_t inputIndex) {
105+
return
106+
// 0xe0 followed by a value less than 0xe0 or 0xf0 followed by a
107+
// value less than 0x90 is considered an overlong encoding.
108+
(inputBuffer[inputIndex] == '\xe0' &&
109+
(inputBuffer[inputIndex + 1] & 0xe0) == 0x80) ||
110+
(inputBuffer[inputIndex] == '\xf0' &&
111+
(inputBuffer[inputIndex + 1] & 0xf0) == 0x80) ||
112+
// 0xf4 followed by a byte >= 0x90 looks valid to
113+
// tryGetUtf8CharLength, but is actually outside the range of valid
114+
// code points.
115+
(inputBuffer[inputIndex] == '\xf4' &&
116+
(inputBuffer[inputIndex + 1] & 0xf0) != 0x80) ||
117+
// The bytes 0xf5-0xff, 0xc0, and 0xc1 look like the start of
118+
// multi-byte code points to tryGetUtf8CharLength, but are not part of
119+
// any valid code point.
120+
(unsigned char)inputBuffer[inputIndex] > 0xf4 ||
121+
inputBuffer[inputIndex] == '\xc0' || inputBuffer[inputIndex] == '\xc1';
122+
}
123+
124+
/// Returns true only if invalid UTF-8 is present in the input string.
125+
bool hasInvalidUTF8(const char* input, int32_t len);
126+
127+
/// Replaces invalid UTF-8 characters with replacement characters similar to
128+
/// that produced by Presto java. The function requires that output have
129+
/// sufficient capacity for the output string.
130+
/// @param out Pointer to output string
131+
/// @param input Pointer to input string
132+
/// @param len Length of input string
133+
/// @return number of bytes written
134+
size_t
135+
replaceInvalidUTF8Characters(char* output, const char* input, int32_t len);
136+
137+
/// Replaces invalid UTF-8 characters with replacement characters similar to
138+
/// that produced by Presto java. The function will allocate 1 byte for each
139+
/// orininal character plus extra 2 bytes for each maximal subpart of an
140+
/// ill-formed subsequence for an upper bound of 3x size of the input string.
141+
/// @param out Reference to output string
142+
/// @param input Pointer to input string
143+
/// @param len Length of input string
144+
template <typename TOutString>
145+
void replaceInvalidUTF8Characters(
146+
TOutString& out,
147+
const char* input,
148+
int32_t len) {
149+
auto maxLen = len * kReplacementCharacterStrings[0].size();
150+
out.reserve(maxLen);
151+
auto outputBuffer = out.data();
152+
auto outputIndex = replaceInvalidUTF8Characters(outputBuffer, input, len);
153+
out.resize(outputIndex);
154+
}
155+
156+
template <>
157+
void replaceInvalidUTF8Characters(
158+
std::string& out,
159+
const char* input,
160+
int32_t len);
161+
89162
} // namespace facebook::velox::functions

velox/functions/lib/tests/Utf8Test.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,5 +104,48 @@ TEST(Utf8Test, tryCharLength) {
104104
ASSERT_EQ(-1, tryCharLength({0xBF}));
105105
}
106106

107+
TEST(UTF8Test, validUtf8) {
108+
auto tryHasInvalidUTF8 = [](const std::vector<unsigned char>& bytes) {
109+
return hasInvalidUTF8(
110+
reinterpret_cast<const char*>(bytes.data()), bytes.size());
111+
};
112+
113+
ASSERT_FALSE(tryHasInvalidUTF8({0x5c, 0x19, 0x7A}));
114+
ASSERT_TRUE(tryHasInvalidUTF8({0x5c, 0x19, 0x7A, 0xBF}));
115+
ASSERT_TRUE(tryHasInvalidUTF8({0x64, 0x65, 0x1A, 0b11100000, 0x81, 0xBF}));
116+
}
117+
118+
TEST(UTF8Test, replaceInvalidUTF8Characters) {
119+
auto testReplaceInvalidUTF8Chars = [](const std::string& input,
120+
const std::string& expected) {
121+
std::string output;
122+
replaceInvalidUTF8Characters(output, input.data(), input.size());
123+
ASSERT_EQ(expected, output);
124+
};
125+
126+
// Good case
127+
testReplaceInvalidUTF8Chars("Hello World", "Hello World");
128+
// Bad encoding
129+
testReplaceInvalidUTF8Chars("hello \xBF world", "hello � world");
130+
// Bad encoding with 3 byte char
131+
testReplaceInvalidUTF8Chars("hello \xe0\x94\x83 world", "hello ��� world");
132+
// Bad encoding with 4 byte char
133+
testReplaceInvalidUTF8Chars(
134+
"hello \xf0\x80\x80\x80\x80 world", "hello ����� world");
135+
136+
// Overlong 4 byte utf8 character.
137+
testReplaceInvalidUTF8Chars(
138+
"hello \xef\xbf\xbd\xef\xbf\xbd world", "hello �� world");
139+
140+
// Test invalid byte 0xC0
141+
testReplaceInvalidUTF8Chars(
142+
"hello \xef\xbf\xbd\xef\xbf\xbd world", "hello �� world");
143+
144+
// Test long 4 byte utf8 with continuation byte
145+
testReplaceInvalidUTF8Chars(
146+
"hello \xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd world",
147+
"hello ���� world");
148+
}
149+
107150
} // namespace
108151
} // namespace facebook::velox::functions

velox/functions/prestosql/JsonFunctions.cpp

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
* limitations under the License.
1515
*/
1616
#include "velox/expression/VectorFunction.h"
17+
#include "velox/functions/lib/Utf8Utils.h"
1718
#include "velox/functions/prestosql/json/JsonStringUtil.h"
1819
#include "velox/functions/prestosql/json/SIMDJsonUtil.h"
1920
#include "velox/functions/prestosql/types/JsonType.h"
@@ -166,15 +167,23 @@ class JsonParseFunction : public exec::VectorFunction {
166167
const auto& arg = args[0];
167168
if (arg->isConstantEncoding()) {
168169
auto value = arg->as<ConstantVector<StringView>>()->valueAt(0);
169-
paddedInput_.resize(value.size() + simdjson::SIMDJSON_PADDING);
170-
memcpy(paddedInput_.data(), value.data(), value.size());
171-
auto escapeSize = escapedStringSize(value.data(), value.size());
170+
auto size = value.size();
171+
if (FOLLY_UNLIKELY(hasInvalidUTF8(value.data(), value.size()))) {
172+
size = replaceInvalidUTF8Characters(
173+
paddedInput_.data(), value.data(), value.size());
174+
paddedInput_.resize(size + simdjson::SIMDJSON_PADDING);
175+
} else {
176+
paddedInput_.resize(size + simdjson::SIMDJSON_PADDING);
177+
memcpy(paddedInput_.data(), value.data(), size);
178+
}
179+
180+
auto escapeSize = escapedStringSize(value.data(), size);
172181
auto buffer = AlignedBuffer::allocate<char>(escapeSize, context.pool());
173182
BufferTracker bufferTracker{buffer};
174183

175184
JsonViews jsonViews;
176185

177-
if (auto error = parse(value.size(), jsonViews)) {
186+
if (auto error = parse(size, jsonViews)) {
178187
context.setErrors(rows, errors_[error]);
179188
return;
180189
}
@@ -219,8 +228,18 @@ class JsonParseFunction : public exec::VectorFunction {
219228
rows.applyToSelected([&](auto row) {
220229
JsonViews jsonViews;
221230
auto value = flatInput->valueAt(row);
222-
memcpy(paddedInput_.data(), value.data(), value.size());
223-
if (auto error = parse(value.size(), jsonViews)) {
231+
auto size = value.size();
232+
if (FOLLY_UNLIKELY(hasInvalidUTF8(value.data(), size))) {
233+
size = replaceInvalidUTF8Characters(
234+
paddedInput_.data(), value.data(), size);
235+
if (maxSize < size) {
236+
paddedInput_.resize(size + simdjson::SIMDJSON_PADDING);
237+
}
238+
} else {
239+
memcpy(paddedInput_.data(), value.data(), size);
240+
}
241+
242+
if (auto error = parse(size, jsonViews)) {
224243
context.setVeloxExceptionError(row, errors_[error]);
225244
} else {
226245
auto canonicalString = bufferTracker.getCanonicalString(jsonViews);

velox/functions/prestosql/URIParser.h

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#pragma once
1717

1818
#include <boost/regex.hpp>
19+
#include "velox/functions/lib/Utf8Utils.h"
1920
#include "velox/type/StringView.h"
2021

2122
namespace facebook::velox::functions {
@@ -51,29 +52,6 @@ bool parseUri(const StringView& uriStr, URI& uri);
5152
/// false and pos is unchanged.
5253
bool tryConsumeIPV6Address(const char* str, const size_t len, int32_t& pos);
5354

54-
template <typename T>
55-
FOLLY_ALWAYS_INLINE bool isMultipleInvalidSequences(
56-
const T& inputBuffer,
57-
size_t inputIndex) {
58-
return
59-
// 0xe0 followed by a value less than 0xe0 or 0xf0 followed by a
60-
// value less than 0x90 is considered an overlong encoding.
61-
(inputBuffer[inputIndex] == '\xe0' &&
62-
(inputBuffer[inputIndex + 1] & 0xe0) == 0x80) ||
63-
(inputBuffer[inputIndex] == '\xf0' &&
64-
(inputBuffer[inputIndex + 1] & 0xf0) == 0x80) ||
65-
// 0xf4 followed by a byte >= 0x90 looks valid to
66-
// tryGetUtf8CharLength, but is actually outside the range of valid
67-
// code points.
68-
(inputBuffer[inputIndex] == '\xf4' &&
69-
(inputBuffer[inputIndex + 1] & 0xf0) != 0x80) ||
70-
// The bytes 0xf5-0xff, 0xc0, and 0xc1 look like the start of
71-
// multi-byte code points to tryGetUtf8CharLength, but are not part of
72-
// any valid code point.
73-
(unsigned char)inputBuffer[inputIndex] > 0xf4 ||
74-
inputBuffer[inputIndex] == '\xc0' || inputBuffer[inputIndex] == '\xc1';
75-
}
76-
7755
/// Find an extract the value for the parameter with key `param` from the query
7856
/// portion of a URI `query`. `query` should already be decoded if necessary.
7957
template <typename TString>

velox/functions/prestosql/URLFunctions.h

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,20 +23,15 @@
2323
namespace facebook::velox::functions {
2424

2525
namespace detail {
26+
27+
/// Encoded replacement character strings.
2628
constexpr std::array<std::string_view, 6> kEncodedReplacementCharacterStrings =
2729
{"%EF%BF%BD",
2830
"%EF%BF%BD%EF%BF%BD",
2931
"%EF%BF%BD%EF%BF%BD%EF%BF%BD",
3032
"%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD",
3133
"%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD",
3234
"%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD"};
33-
constexpr std::array<std::string_view, 6> kDecodedReplacementCharacterStrings{
34-
"\xef\xbf\xbd",
35-
"\xef\xbf\xbd\xef\xbf\xbd",
36-
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
37-
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
38-
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
39-
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd"};
4035

4136
FOLLY_ALWAYS_INLINE unsigned char toHex(unsigned char c) {
4237
return c < 10 ? (c + '0') : (c + 'A' - 10);
@@ -178,7 +173,7 @@ FOLLY_ALWAYS_INLINE void urlUnescape(
178173
} else if (charLength < 0) {
179174
// This isn't the start of a valid UTF-8 character, write out the
180175
// replacement character.
181-
const auto& replacementString = kDecodedReplacementCharacterStrings[0];
176+
const auto& replacementString = kReplacementCharacterStrings[0];
182177
std::memcpy(
183178
outputBuffer, replacementString.data(), replacementString.length());
184179
outputBuffer += replacementString.length();
@@ -216,8 +211,8 @@ FOLLY_ALWAYS_INLINE void urlUnescape(
216211
size_t charLength = outputBuffer - charStart;
217212
size_t replaceCharactersToWriteOut =
218213
isMultipleInvalidSequences(charStart, 0) ? charLength : 1;
219-
const auto& replacementString = kDecodedReplacementCharacterStrings
220-
[replaceCharactersToWriteOut - 1];
214+
const auto& replacementString =
215+
kReplacementCharacterStrings[replaceCharactersToWriteOut - 1];
221216

222217
outputBuffer = charStart;
223218
std::memcpy(

velox/functions/prestosql/tests/JsonFunctionsTest.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,9 @@ TEST_F(JsonFunctionsTest, jsonParse) {
235235
R"("Items for D \ud835\udc52\ud835\udcc1 ")",
236236
R"("Items for D \uD835\uDC52\uD835\uDCC1 ")");
237237

238+
// Test bad unicode characters
239+
testJsonParse("\"Hello \xc0\xaf World\"", "\"Hello �� World\"");
240+
238241
VELOX_ASSERT_THROW(
239242
jsonParse(R"({"k1":})"), "The JSON document has an improper structure");
240243
VELOX_ASSERT_THROW(

0 commit comments

Comments
 (0)