|
23 | 23 |
|
24 | 24 | namespace facebook::velox::functions { |
25 | 25 |
|
/// Evaluates to true when the most significant bit of x is clear, i.e. x is a
/// 7-bit ASCII byte. The whole replacement list is parenthesized so that the
/// macro always behaves as a single expression at the point of use.
#define IS_ASCII(x) (!((x) & 0x80))
| 27 | + |
26 | 28 | /// This function is not part of the original utf8proc. |
27 | 29 | /// Tries to get the length of UTF-8 encoded code point. A |
28 | 30 | /// positive return value means the UTF-8 sequence is valid, and |
@@ -86,4 +88,75 @@ FOLLY_ALWAYS_INLINE int validateAndGetNextUtf8Length( |
86 | 88 | /// -1 for invalid UTF-8 first byte. |
87 | 89 | int firstByteCharLength(const char* u_input); |
88 | 90 |
|
/// Table of pre-built replacement strings. Entry i holds i + 1 consecutive
/// copies of U+FFFD REPLACEMENT CHARACTER encoded as UTF-8 (bytes EF BF BD),
/// so entry 0 is 3 bytes long and entry 5 is 18 bytes long.
/// Declared `inline` so that all translation units that see this definition
/// share a single object instead of each getting its own internal copy.
inline constexpr std::array<std::string_view, 6> kReplacementCharacterStrings{
    "\xef\xbf\xbd",
    "\xef\xbf\xbd\xef\xbf\xbd",
    "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
    "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
    "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
    "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd"};
| 99 | + |
/// Returns true if the byte at inputBuffer[inputIndex] starts one of the
/// ill-formed UTF-8 patterns listed in the body ("multiple invalid
/// sequences"). Bytes are compared as unsigned values, so the result does not
/// depend on the signedness of the buffer's element type.
///
/// The byte at inputIndex + 1 is read whenever the lead byte is 0xe0, 0xf0 or
/// 0xf4, so the caller must make sure that position is readable.
/// @param inputBuffer Buffer of bytes indexable with operator[]
/// @param inputIndex Position of the lead byte to examine
/// @return true if one of the patterns below matches, false otherwise
template <typename T>
FOLLY_ALWAYS_INLINE bool isMultipleInvalidSequences(
    const T& inputBuffer,
    size_t inputIndex) {
  const auto lead = static_cast<unsigned char>(inputBuffer[inputIndex]);
  // Evaluated lazily so the following byte is only touched when it matters.
  const auto next = [&]() {
    return static_cast<unsigned char>(inputBuffer[inputIndex + 1]);
  };

  switch (lead) {
    case 0xe0:
      // 0xe0 followed by a continuation byte less than 0xa0 is considered an
      // overlong encoding.
      return (next() & 0xe0) == 0x80;
    case 0xf0:
      // 0xf0 followed by a continuation byte less than 0x90 is considered an
      // overlong encoding.
      return (next() & 0xf0) == 0x80;
    case 0xf4:
      // 0xf4 followed by a byte >= 0x90 looks valid to
      // tryGetUtf8CharLength, but is actually outside the range of valid
      // code points. Any second byte outside 0x80-0x8f matches here.
      return (next() & 0xf0) != 0x80;
    case 0xc0:
    case 0xc1:
      // 0xc0 and 0xc1 look like the start of multi-byte code points to
      // tryGetUtf8CharLength, but are not part of any valid code point.
      return true;
    default:
      // The same holds for the bytes 0xf5-0xff.
      return lead > 0xf4;
  }
}
| 123 | + |
| 124 | +/// Returns true only if invalid UTF-8 is present in the input string, which is passed as the pointer `input` together with its length `len`. |
| 125 | +bool hasInvalidUTF8(const char* input, int32_t len); |
| 126 | + |
| 127 | +/// Replaces invalid UTF-8 characters with replacement characters similar to |
| 128 | +/// those produced by Presto Java. The function requires that `output` have |
| 129 | +/// sufficient capacity for the output string. |
| 130 | +/// @param output Pointer to output string |
| 131 | +/// @param input Pointer to input string |
| 132 | +/// @param len Length of input string |
| 133 | +/// @return number of bytes written |
| 134 | +size_t |
| 135 | +replaceInvalidUTF8Characters(char* output, const char* input, int32_t len); |
| 136 | + |
| 137 | +/// Replaces invalid UTF-8 characters with replacement characters similar to |
| 138 | +/// that produced by Presto java. The function will allocate 1 byte for each |
| 139 | +/// orininal character plus extra 2 bytes for each maximal subpart of an |
| 140 | +/// ill-formed subsequence for an upper bound of 3x size of the input string. |
| 141 | +/// @param out Reference to output string |
| 142 | +/// @param input Pointer to input string |
| 143 | +/// @param len Length of input string |
| 144 | +template <typename TOutString> |
| 145 | +void replaceInvalidUTF8Characters( |
| 146 | + TOutString& out, |
| 147 | + const char* input, |
| 148 | + int32_t len) { |
| 149 | + auto maxLen = len * kReplacementCharacterStrings[0].size(); |
| 150 | + out.reserve(maxLen); |
| 151 | + auto outputBuffer = out.data(); |
| 152 | + auto outputIndex = replaceInvalidUTF8Characters(outputBuffer, input, len); |
| 153 | + out.resize(outputIndex); |
| 154 | +} |
| 155 | + |
| 156 | +template <> |
| 157 | +void replaceInvalidUTF8Characters( |
| 158 | + std::string& out, |
| 159 | + const char* input, |
| 160 | + int32_t len); |
| 161 | + |
89 | 162 | } // namespace facebook::velox::functions |
0 commit comments