-
Notifications
You must be signed in to change notification settings - Fork 21
/
utf8ReadMonkeypatch.ts
67 lines (62 loc) · 2.3 KB
/
utf8ReadMonkeypatch.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
/**
The hand-rolled UTF-8 decoder in protobufjs can produce incorrect output for
some non-ASCII (heavily Unicode) input. Here we monkey patch protobufjs's
standard utf8.read function with our own fixed version.
*/
// eslint-disable-next-line max-len
/* eslint-disable no-bitwise, no-plusplus, import/no-extraneous-dependencies, no-eval, @typescript-eslint/ban-ts-comment */
import * as utf8 from '@protobufjs/utf8';
/**
 * Decodes the UTF-8 bytes in `buffer[start, end)` into a JavaScript (UTF-16)
 * string.
 *
 * Ideally we'd use the environment's built-in TextDecoder, but it has
 * unreliable behavior around BOM chars in some environments, so the decoding
 * is done by hand. A BOM (EF BB BF) is decoded to U+FEFF like any other code
 * point; it is never stripped.
 *
 * Well-formed input decodes exactly as it always did. Malformed input is
 * handled defensively:
 * - No byte at or beyond `end` (or past the end of `buffer`) is ever read, so
 *   a sequence truncated at the range boundary cannot swallow bytes that
 *   belong to whatever follows in a shared buffer.
 * - A stray continuation byte, an invalid lead byte (0xF8-0xFF), a sequence
 *   with a missing/invalid continuation byte, or a 4-byte form outside
 *   U+10000..U+10FFFF yields a single U+FFFD replacement character. The byte
 *   that broke the sequence is not consumed; it is decoded on its own.
 * - Overlong 2/3-byte forms and 3-byte-encoded surrogates are still decoded
 *   leniently rather than rejected.
 *
 * @param buffer - Bytes to decode.
 * @param start - Index of the first byte to decode (inclusive).
 * @param end - Index one past the last byte to decode (exclusive).
 * @returns The decoded string; `''` when the range is empty.
 */
function utf8ReadFixed(buffer: Uint8Array, start: number, end: number): string {
  if (end - start < 1) {
    return '';
  }
  const replacement = '\uFFFD';
  // Never index outside the buffer, even if the caller's range overshoots it.
  const limit = Math.min(end, buffer.length);
  let str = '';
  let i = Math.max(start, 0);
  while (i < limit) {
    const lead = buffer[i++];
    if (lead <= 0x7F) {
      // regular ol ascii, easy peasy
      // 0aaaaaaa
      str += String.fromCharCode(lead);
    } else {
      // Work out how many continuation bytes the lead byte announces and
      // seed the code point with the lead byte's payload bits.
      let trailing = 0;
      let codePoint = 0;
      if (lead >= 0xC0 && lead < 0xE0) {
        // 110aaaaa 10bbbbbb
        //   -> 00000aaaaabbbbbb
        trailing = 1;
        codePoint = lead & 0x1F;
      } else if (lead >= 0xE0 && lead < 0xF0) {
        // 1110aaaa 10bbbbbb 10cccccc
        //   -> aaaabbbbbbcccccc
        trailing = 2;
        codePoint = lead & 0xF;
      } else if (lead >= 0xF0 && lead < 0xF8) {
        // 11110aaa 10bbbbbb 10cccccc 10dddddd
        //   -> code point aaabbbbbbccccccdddddd (becomes a surrogate pair)
        trailing = 3;
        codePoint = lead & 7;
      }

      if (trailing === 0) {
        // Stray continuation byte (0x80-0xBF) or invalid lead (0xF8-0xFF).
        str += replacement;
      } else {
        // Pull in continuation bytes, stopping at the range boundary or at
        // the first byte that is not of the form 10xxxxxx. The offending
        // byte is left in place so the next iteration decodes it normally.
        let consumed = 0;
        while (
          consumed < trailing &&
          i < limit &&
          (buffer[i] & 0xC0) === 0x80
        ) {
          codePoint = (codePoint << 6) | (buffer[i++] & 0x3F);
          consumed += 1;
        }

        if (consumed < trailing) {
          // Truncated or interrupted sequence.
          str += replacement;
        } else if (trailing < 3) {
          // 2- and 3-byte forms always fit in a single utf16 code unit.
          str += String.fromCharCode(codePoint);
        } else if (codePoint < 0x10000 || codePoint > 0x10FFFF) {
          // A 4-byte form must encode a supplementary-plane code point;
          // anything else cannot be expressed as a surrogate pair.
          str += replacement;
        } else {
          // subtract 0x10000, then split into high/low surrogates:
          //   -> 110110aabbbbbbcc 110111ccccdddddd
          const offset = codePoint - 0x10000;
          str += String.fromCharCode(
            0xD800 + (offset >> 10),
            0xDC00 + (offset & 0x3FF),
          );
        }
      }
    }
  }
  return str;
}
// Install the replacement decoder: overwrite the `read` member of the imported
// `@protobufjs/utf8` module object so that `utf8.read` resolves to
// `utf8ReadFixed` instead of the library's own implementation. This runs as a
// side effect of loading this module.
// NOTE(review): presumably this relies on protobufjs sharing this same module
// object at runtime — confirm against the project's module/bundler settings.
// @ts-ignore we're monkey patching!
utf8.read = utf8ReadFixed;