Skip to content

Commit

Permalink
fix utf8 -> utf16 decoding bug on surrogate pairs
Browse files Browse the repository at this point in the history
This fixes protobufjs#1473

The custom utf8 -> utf16 decoder appears to be subtly flawed. From my reading it appears the chunking mechanism doesn't account for surrogate pairs at the end of a chunk causing variable size chunks. A larger chunk followed by a smaller chunk leaves behind garbage that'll be included in the latter chunk.

It looks like the chunking mechanism was added to prevent stack overflows when calling `formCharCode` with too many args. From some benchmarking it appears putting utf16 code units in an array and spreading that into `fromCharCode` wasn't helping performance much anyway. I simplified it significantly.

Here's a repro of the existing encoding bug in a fuzzing suite
https://repl.it/@turbio/oh-no-our-strings#decoder.js
  • Loading branch information
turbio committed Sep 10, 2020
1 parent 9893e35 commit e798c43
Showing 1 changed file with 29 additions and 31 deletions.
60 changes: 29 additions & 31 deletions lib/utf8/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,37 +38,35 @@ utf8.length = function utf8_length(string) {
* @returns {string} String read
*/
utf8.read = function utf8_read(buffer, start, end) {
var len = end - start;
if (len < 1)
return "";
var parts = null,
chunk = [],
i = 0, // char offset
t; // temporary
while (start < end) {
t = buffer[start++];
if (t < 128)
chunk[i++] = t;
else if (t > 191 && t < 224)
chunk[i++] = (t & 31) << 6 | buffer[start++] & 63;
else if (t > 239 && t < 365) {
t = ((t & 7) << 18 | (buffer[start++] & 63) << 12 | (buffer[start++] & 63) << 6 | buffer[start++] & 63) - 0x10000;
chunk[i++] = 0xD800 + (t >> 10);
chunk[i++] = 0xDC00 + (t & 1023);
} else
chunk[i++] = (t & 15) << 12 | (buffer[start++] & 63) << 6 | buffer[start++] & 63;
if (i > 8191) {
(parts || (parts = [])).push(String.fromCharCode.apply(String, chunk));
i = 0;
}
}
if (parts) {
if (i)
parts.push(String.fromCharCode.apply(String, chunk.slice(0, i)));
return parts.join("");
}
return String.fromCharCode.apply(String, chunk.slice(0, i));
};
if (end - start < 1) {
return '';
}

var str = '';
for (var i = start; i < end;) {
var t = buffer[i++];
if (t <= 0x7F) {
str += String.fromCharCode(t);
} else if (t >= 0xC0 && t < 0xE0) {
str += String.fromCharCode(((t & 0x1F) << 6) | (buffer[i++] & 0x3F));
} else if (t >= 0xE0 && t < 0xF0) {
str += String.fromCharCode(
((t & 0xF) << 12) |
((buffer[i++] & 0x3F) << 6) |
(buffer[i++] & 0x3F
));
} else if (t >= 0xF0) {
var t2 = (((t & 7) << 18) |
((buffer[i++] & 0x3F) << 12) |
((buffer[i++] & 0x3F) << 6) |
(buffer[i++] & 0x3F)) - 0x10000;
str += String.fromCharCode(0xD800 + (t2 >> 10));
str += String.fromCharCode(0xDC00 + (t2 & 0x3FF));
}
}

return str;
}

/**
* Writes a string as UTF8 bytes.
Expand Down

0 comments on commit e798c43

Please sign in to comment.