|
1 | 1 | module validate |
2 | 2 |
|
3 | | -struct Utf8State { |
4 | | -mut: |
5 | | - index int |
6 | | - subindex int |
7 | | - failed bool |
8 | | -} |
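| 3 | +// utf8d is the lookup table of Bjoern Hoehrmann's DFA-based UTF-8 decoder, see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ : entries 0..255 map each input byte to a character class, the entries from index 256 on hold the state transitions (0 = accept, 12 = reject) |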
| 4 | + |
| 5 | +// vfmt off |
| 6 | +const utf8d = [ |
| 7 | + u8(0), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 8 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 9 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 10 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 11 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 12 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 13 | + 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, |
| 14 | + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, |
| 15 | + 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 16 | + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10, 3, 3, 3, 3, |
| 17 | + 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, |
| 18 | + 8, 8, 8, 8, 0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, |
| 19 | + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0, 12, 12, 12, 12, 12, 0, 12, |
| 20 | + 0, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, 12, 12, 12, |
| 21 | + 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, |
| 22 | + 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, |
| 23 | + 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 12, 12, |
| 24 | + 12, 12, 12 ]! |
| 25 | +// vfmt on |
9 | 26 |
|
10 | 27 | // utf8_string returns true if the given string `s` consists only of valid UTF-8 runes. |
11 | 28 | pub fn utf8_string(s string) bool { |
12 | 29 | return utf8_data(s.str, s.len) |
13 | 30 | } |
14 | 31 |
|
15 | 32 | // utf8_data returns true if the given `data` block of `len` bytes consists only of valid UTF-8 runes. |
| 33 | +@[direct_array_access] |
16 | 34 | pub fn utf8_data(data &u8, len int) bool { |
17 | | - mut state := Utf8State{} |
| 35 | + mut state := 0 |
| 36 | + |
18 | 37 | for i := 0; i < len; i++ { |
19 | | - s := unsafe { data[i] } |
20 | | - if s == 0 { |
21 | | - break |
22 | | - } |
23 | | - state.next_state(s) |
24 | | - if state.failed { |
| 38 | + b := unsafe { data[i] } |
| 39 | + state = utf8d[256 + state + utf8d[b]] |
| 40 | + if state == 12 { |
25 | 41 | return false |
26 | 42 | } |
27 | 43 | } |
28 | | - return !state.failed && state.subindex <= 0 |
29 | | -} |
30 | | - |
31 | | -fn (mut s Utf8State) seq(r0 bool, r1 bool, is_tail bool) bool { |
32 | | - if s.subindex == 0 || (s.index > 1 && s.subindex == 1) || (s.index >= 6 && s.subindex == 2) { |
33 | | - if (s.subindex == 0 && r0) || (s.subindex == 1 && r1) || (s.subindex == 2 && is_tail) { |
34 | | - s.subindex++ |
35 | | - return true |
36 | | - } |
37 | | - } else { |
38 | | - s.failed = true |
39 | | - if is_tail { |
40 | | - s.index = 0 |
41 | | - s.subindex = 0 |
42 | | - s.failed = false |
43 | | - } |
44 | | - return true |
45 | | - } |
46 | | - s.index++ |
47 | | - s.subindex = 0 |
48 | | - return false |
49 | | -} |
50 | | - |
51 | | -/* Check UTF-8 Byte sequences according to Unicode Standard |
52 | | - * https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/ |
53 | | - * Code Points 1st 2s 3s 4s |
54 | | - * U+0000..U+007F 00..7F |
55 | | - * U+0080..U+07FF C2..DF 80..BF |
56 | | - * U+0800..U+0FFF E0 A0..BF 80..BF |
57 | | - * U+1000..U+CFFF E1..EC 80..BF 80..BF |
58 | | - * U+D000..U+D7FF ED 80..9F 80..BF |
59 | | - * U+E000..U+FFFF EE..EF 80..BF 80..BF |
60 | | - * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF |
61 | | - * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF |
62 | | - * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF |
63 | | - */ |
64 | | -fn (mut s Utf8State) next_state(c u8) { |
65 | | - // sequence 1 |
66 | | - if s.index == 0 { |
67 | | - if (c >= 0x00 + 1 && c <= 0x7F) || c == 0x00 { |
68 | | - return |
69 | | - } |
70 | | - s.index++ |
71 | | - s.subindex = 0 |
72 | | - } |
73 | | - is_tail := c >= 0x80 && c <= 0xBF |
74 | | - // sequence 2 |
75 | | - if s.index == 1 && s.seq(c >= 0xC2 && c <= 0xDF, false, is_tail) { |
76 | | - return |
77 | | - } |
78 | | - // sequence 3 |
79 | | - if s.index == 2 && s.seq(c == 0xE0, c >= 0xA0 && c <= 0xBF, is_tail) { |
80 | | - return |
81 | | - } |
82 | | - if s.index == 3 && s.seq(c >= 0xE1 && c <= 0xEC, c >= 0x80 && c <= 0xBF, is_tail) { |
83 | | - return |
84 | | - } |
85 | | - if s.index == 4 && s.seq(c == 0xED, c >= 0x80 && c <= 0x9F, is_tail) { |
86 | | - return |
87 | | - } |
88 | | - if s.index == 5 && s.seq(c >= 0xEE && c <= 0xEF, c >= 0x80 && c <= 0xBF, is_tail) { |
89 | | - return |
90 | | - } |
91 | | - // sequence 4 |
92 | | - if s.index == 6 && s.seq(c == 0xF0, c >= 0x90 && c <= 0xBF, is_tail) { |
93 | | - return |
94 | | - } |
95 | | - if s.index == 7 && s.seq(c >= 0xF1 && c <= 0xF3, c >= 0x80 && c <= 0xBF, is_tail) { |
96 | | - return |
97 | | - } |
98 | | - if s.index == 8 && s.seq(c == 0xF4, c >= 0x80 && c <= 0x8F, is_tail) { |
99 | | - return |
100 | | - } |
101 | | - // we should never reach here |
102 | | - s.failed = true |
| 44 | + return state == 0 |
103 | 45 | } |
0 commit comments