Skip to content

Commit bc333ab

Browse files
authored
encoding.utf8.validate: fix validation, add test (#25748)
1 parent 2f96bff commit bc333ab

File tree

2 files changed

+31
-88
lines changed

2 files changed

+31
-88
lines changed

vlib/encoding/utf8/validate/encoding_utf8_test.v

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,4 +84,5 @@ fn test_validate_invalid_str() {
8484
assert validate.utf8_string('\xF1\xBF\xBF\xC0') == false
8585
assert validate.utf8_string('\xF1\xBF\xC0\x80') == false
8686
assert validate.utf8_string('\xF1\xC0\x80\x80') == false
87+
assert validate.utf8_string('\xED\xEF\xBF\x89') == false
8788
}
Lines changed: 30 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -1,103 +1,45 @@
11
module validate
22

3-
struct Utf8State {
4-
mut:
5-
index int
6-
subindex int
7-
failed bool
8-
}
3+
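// utf8d is the DFA table from Bjoern Hoehrmann's "Flexible and Economical UTF-8 Decoder" (http://bjoern.hoehrmann.de/utf-8/decoder/dfa/): the first 256 entries map each byte to a character class, the remaining entries are the state transitions.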
4+
5+
// vfmt off
6+
const utf8d = [
7+
u8(0), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
10+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
11+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
12+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
13+
1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
14+
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
15+
7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
16+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10, 3, 3, 3, 3,
17+
3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8,
18+
8, 8, 8, 8, 0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12,
19+
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0, 12, 12, 12, 12, 12, 0, 12,
20+
0, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, 12, 12, 12,
21+
12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12,
22+
24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12,
23+
12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 12, 12,
24+
12, 12, 12 ]!
25+
// vfmt on
926

1027
// utf8_string returns true if the given string `s` consists only of valid UTF-8 runes.
1128
pub fn utf8_string(s string) bool {
1229
return utf8_data(s.str, s.len)
1330
}
1431

1532
// utf8_data returns true if the given `data` block of `len` bytes consists only of valid UTF-8 runes.
33+
@[direct_array_access]
1634
pub fn utf8_data(data &u8, len int) bool {
17-
mut state := Utf8State{}
35+
mut state := 0
36+
1837
for i := 0; i < len; i++ {
19-
s := unsafe { data[i] }
20-
if s == 0 {
21-
break
22-
}
23-
state.next_state(s)
24-
if state.failed {
38+
b := unsafe { data[i] }
39+
state = utf8d[256 + state + utf8d[b]]
40+
if state == 12 {
2541
return false
2642
}
2743
}
28-
return !state.failed && state.subindex <= 0
29-
}
30-
31-
fn (mut s Utf8State) seq(r0 bool, r1 bool, is_tail bool) bool {
32-
if s.subindex == 0 || (s.index > 1 && s.subindex == 1) || (s.index >= 6 && s.subindex == 2) {
33-
if (s.subindex == 0 && r0) || (s.subindex == 1 && r1) || (s.subindex == 2 && is_tail) {
34-
s.subindex++
35-
return true
36-
}
37-
} else {
38-
s.failed = true
39-
if is_tail {
40-
s.index = 0
41-
s.subindex = 0
42-
s.failed = false
43-
}
44-
return true
45-
}
46-
s.index++
47-
s.subindex = 0
48-
return false
49-
}
50-
51-
/* Check UTF-8 Byte sequences according to Unicode Standard
52-
* https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/
53-
* Code Points 1st 2s 3s 4s
54-
* U+0000..U+007F 00..7F
55-
* U+0080..U+07FF C2..DF 80..BF
56-
* U+0800..U+0FFF E0 A0..BF 80..BF
57-
* U+1000..U+CFFF E1..EC 80..BF 80..BF
58-
* U+D000..U+D7FF ED 80..9F 80..BF
59-
* U+E000..U+FFFF EE..EF 80..BF 80..BF
60-
* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
61-
* U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
62-
* U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
63-
*/
64-
fn (mut s Utf8State) next_state(c u8) {
65-
// sequence 1
66-
if s.index == 0 {
67-
if (c >= 0x00 + 1 && c <= 0x7F) || c == 0x00 {
68-
return
69-
}
70-
s.index++
71-
s.subindex = 0
72-
}
73-
is_tail := c >= 0x80 && c <= 0xBF
74-
// sequence 2
75-
if s.index == 1 && s.seq(c >= 0xC2 && c <= 0xDF, false, is_tail) {
76-
return
77-
}
78-
// sequence 3
79-
if s.index == 2 && s.seq(c == 0xE0, c >= 0xA0 && c <= 0xBF, is_tail) {
80-
return
81-
}
82-
if s.index == 3 && s.seq(c >= 0xE1 && c <= 0xEC, c >= 0x80 && c <= 0xBF, is_tail) {
83-
return
84-
}
85-
if s.index == 4 && s.seq(c == 0xED, c >= 0x80 && c <= 0x9F, is_tail) {
86-
return
87-
}
88-
if s.index == 5 && s.seq(c >= 0xEE && c <= 0xEF, c >= 0x80 && c <= 0xBF, is_tail) {
89-
return
90-
}
91-
// sequence 4
92-
if s.index == 6 && s.seq(c == 0xF0, c >= 0x90 && c <= 0xBF, is_tail) {
93-
return
94-
}
95-
if s.index == 7 && s.seq(c >= 0xF1 && c <= 0xF3, c >= 0x80 && c <= 0xBF, is_tail) {
96-
return
97-
}
98-
if s.index == 8 && s.seq(c == 0xF4, c >= 0x80 && c <= 0x8F, is_tail) {
99-
return
100-
}
101-
// we should never reach here
102-
s.failed = true
44+
return state == 0
10345
}

0 commit comments

Comments
 (0)