forked from Wiladams/LAPHLibs
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutf.lua
More file actions
136 lines (105 loc) · 3.33 KB
/
utf.lua
File metadata and controls
136 lines (105 loc) · 3.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
local ffi = require "ffi"
local bit = require "bit"
local band = bit.band
local bor = bit.bor
local rshift = bit.rshift
local lshift = bit.lshift
--[[
References:
http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
Also a mention to Rich Felker
For UTF-16
http://en.wikipedia.org/wiki/UTF-16#Example_UTF-16_encoding_procedure
--]]
-- Decoder states. A state value is also the offset of that state's row in
-- the transition part of utf8d, which is why the values are multiples of 12
-- (one row holds one entry per character class, and there are 12 classes).
-- UTF8_ACCEPT: the bytes consumed so far form complete code point(s).
local UTF8_ACCEPT = 0
-- UTF8_REJECT: an invalid byte was seen. Every entry of this state's row is
-- 12 again, so the automaton never leaves it.
local UTF8_REJECT = 12
-- Lookup table driving the decoder: 256 byte-class entries followed by
-- 108 transition entries (9 states x 12 classes) = 364 bytes in total.
local utf8d = ffi.new("const uint8_t[364]", {
-- The first part of the table maps bytes to character classes, to
-- reduce the size of the transition table and create bitmasks.
-- Each line below covers 32 consecutive byte values, starting at 0x00.
-- For a byte decoded in the UTF8_ACCEPT state, the class is also used as a
-- shift count (0xff >> class) to mask off the length-marker bits.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
-- The second part is a transition table that maps a combination
-- of a state of the automaton and a character class to a state.
-- It is indexed as utf8d[256 + state + class]; each line below holds the
-- rows of two states (12 entries each), the last line holds one.
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12,
});
-- Advance the UTF-8 decoding automaton by one input byte.
--
--   state  current automaton state (UTF8_ACCEPT when starting a code point)
--   codep  code point bits accumulated so far
--   byte   the next input byte (0..255)
--
-- Returns the new state followed by the updated code point accumulator.
-- The accumulator holds a complete code point only when the returned state
-- is UTF8_ACCEPT.
local function decode_utf8_byte(state, codep, byte)
    local class = utf8d[byte]
    local accumulated

    if state == UTF8_ACCEPT then
        -- First byte of a sequence: the class doubles as a shift count that
        -- strips the length-marker bits and keeps only the payload bits.
        accumulated = band(rshift(0xff, class), byte)
    else
        -- Byte inside a sequence: append its low six bits to the accumulator.
        accumulated = bor(band(byte, 0x3f), lshift(codep, 6))
    end

    -- The transition table starts at offset 256, right after the class map.
    return utf8d[256 + state + class], accumulated
end
--[[
    Given a UTF-8 string, this routine feeds out Unicode code points
    as an iterator.

    Parameters:
        utf8string  the UTF-8 encoded data; a Lua string, or a value that
                    ffi.cast can convert to a byte pointer
        len         number of bytes to decode; defaults to #utf8string, so
                    it should be supplied when utf8string is not a Lua string

    Returns an iterator function. Each call returns either
        codepoint          the next decoded code point, or
        nil, state         when no further code point can be produced:
                             state == UTF8_ACCEPT  input fully consumed
                             state == UTF8_REJECT  an invalid byte was met
                             any other state       input ended in the
                                                   middle of a sequence

    Usage:
        for codepoint in utf8_string_iterator(utf8string) do
            print(codepoint)
        end

    A generic for loop ends at the first nil, so the trailing state is only
    observable when the iterator is called by hand:
        local next_codepoint = utf8_string_iterator(utf8string)
        local codepoint, state = next_codepoint()
--]]
local function utf8_string_iterator(utf8string, len)
    len = len or #utf8string

    local state = UTF8_ACCEPT
    local codep = 0
    local offset = 0
    -- The data is only read, so a const pointer is sufficient.
    local ptr = ffi.cast("const uint8_t *", utf8string)

    return function()
        -- The pointer obtained from ffi.cast does not keep a Lua string
        -- alive. Referencing the source here makes it an upvalue of this
        -- closure, so it cannot be collected while the iterator is in use.
        local _ = utf8string

        while offset < len do
            state, codep = decode_utf8_byte(state, codep, ptr[offset])
            offset = offset + 1

            if state == UTF8_ACCEPT then
                return codep
            elseif state == UTF8_REJECT then
                return nil, state
            end
        end

        -- Out of input: report the state so the caller can tell a clean
        -- end (UTF8_ACCEPT) from a truncated or rejected sequence.
        return nil, state
    end
end
--[[
    Length of a UTF-8 string measured in code points rather than bytes
    (the counterpart of strlen for ASCII).

    Parameters:
        utf8string  the UTF-8 encoded data
        len         optional byte length, passed through to the iterator

    Returns the number of code points produced by utf8_string_iterator
    before it first yields nil; decoding problems are not reported.
--]]
local function utf8_string_length(utf8string, len)
    local next_codepoint = utf8_string_iterator(utf8string, len)
    local total = 0

    -- Only the number of successful steps matters, not the values.
    while next_codepoint() do
        total = total + 1
    end

    return total
end
-- Encode a code point as UTF-16 code unit(s).
--
-- Returns a single value (the code point itself) when it is <= 0xFFFF,
-- otherwise two values: the high (lead) unit followed by the low (trail)
-- unit of a surrogate pair. No range validation is performed.
local function codepoint_to_utf16(codepoint)
    if codepoint <= 0xffff then
        return codepoint
    end

    -- 0xD7C0 is 0xD800 - (0x10000 >> 10): adding the shifted code point to
    -- it folds the "subtract 0x10000" step into the lead-unit base. The low
    -- ten bits are unaffected by that subtraction, so they are used as is.
    return 0xD7C0 + rshift(codepoint, 10), 0xDC00 + band(codepoint, 0x3ff)
end
-- Decode UTF-16 code unit(s) into a code point.
--
--   w1  the first code unit
--   w2  the following code unit; optional
--
-- When w1 is a high surrogate (0xD800..0xDBFF) and w2 is given, the pair is
-- combined into a supplementary code point. In every other case w1 already
-- is the result and is returned unchanged; this is what makes single-unit
-- values returned by codepoint_to_utf16 round-trip correctly. w2 is not
-- validated as a low surrogate.
local function utf16_to_codepoint(w1, w2)
    if w2 == nil or w1 < 0xD800 or w1 > 0xDBFF then
        return w1
    end

    -- 0xD7C0 is 0xD800 - (0x10000 >> 10), so shifting the difference
    -- restores the 0x10000 offset together with the upper ten bits.
    return lshift(w1 - 0xD7C0, 10) + (w2 - 0xDC00)
end
-- Public interface of the module. decode_utf8_byte and the decoder table
-- are not exported; they are reachable only through the functions below.
return {
-- UTF-8: iterate over / count the code points of an encoded buffer.
utf8_iterator = utf8_string_iterator;
utf8_strlen = utf8_string_length;
-- UTF-16: convert between code points and code units.
codepoint_to_utf16 = codepoint_to_utf16;
utf16_to_codepoint = utf16_to_codepoint;
}