fix utf8 -> utf16 decoding bug on surrogate pairs

This fixes protobufjs#1473 The custom utf8 -> utf16 decoder appears to be subtly flawed. From my reading it appears the chunking mechanism doesn't account for surrogate pairs at the end of a chunk causing variable size chunks. A larger chunk followed by a smaller chunk leaves behind garbage that'll be included in the latter chunk. It looks like the chunking mechanism was added to prevent stack overflows when calling `formCharCode` with too many args. From some benchmarking it appears putting utf16 code units in an array and spreading that into `fromCharCode` wasn't helping performance much anyway. I simplified it significantly. Here's a repro of the existing encoding bug in a fuzzing suite https://repl.it/@turbio/oh-no-our-strings#decoder.js
turbio · Sep 10, 2020 · e798c43 · e798c43
1 parent 9893e35
commit e798c43
Showing 1 changed file with 29 additions and 31 deletions.
diff --git a/lib/utf8/index.js b/lib/utf8/index.js
@@ -38,37 +38,35 @@ utf8.length = function utf8_length(string) {
  * @returns {string} String read
  */
 utf8.read = function utf8_read(buffer, start, end) {
-    var len = end - start;
-    if (len < 1)
-        return "";
-    var parts = null,
-        chunk = [],
-        i = 0, // char offset
-        t;     // temporary
-    while (start < end) {
-        t = buffer[start++];
-        if (t < 128)
-            chunk[i++] = t;
-        else if (t > 191 && t < 224)
-            chunk[i++] = (t & 31) << 6 | buffer[start++] & 63;
-        else if (t > 239 && t < 365) {
-            t = ((t & 7) << 18 | (buffer[start++] & 63) << 12 | (buffer[start++] & 63) << 6 | buffer[start++] & 63) - 0x10000;
-            chunk[i++] = 0xD800 + (t >> 10);
-            chunk[i++] = 0xDC00 + (t & 1023);
-        } else
-            chunk[i++] = (t & 15) << 12 | (buffer[start++] & 63) << 6 | buffer[start++] & 63;
-        if (i > 8191) {
-            (parts || (parts = [])).push(String.fromCharCode.apply(String, chunk));
-            i = 0;
-        }
-    }
-    if (parts) {
-        if (i)
-            parts.push(String.fromCharCode.apply(String, chunk.slice(0, i)));
-        return parts.join("");
-    }
-    return String.fromCharCode.apply(String, chunk.slice(0, i));
-};
+	if (end - start < 1) {
+		return '';
+	}
+
+	var str = '';
+	for (var i = start; i < end;) {
+		var t = buffer[i++];
+		if (t <= 0x7F) {
+			str += String.fromCharCode(t);
+		} else if (t >= 0xC0 && t < 0xE0) {
+			str += String.fromCharCode(((t & 0x1F) << 6) | (buffer[i++] & 0x3F));
+		} else if (t >= 0xE0 && t < 0xF0) {
+			str += String.fromCharCode(
+				((t & 0xF) << 12) |
+				((buffer[i++] & 0x3F) << 6) |
+				(buffer[i++] & 0x3F
+			));
+		} else if (t >= 0xF0) {
+			var t2 = (((t & 7) << 18) |
+				((buffer[i++] & 0x3F) << 12) |
+				((buffer[i++] & 0x3F) << 6) |
+				(buffer[i++] & 0x3F)) - 0x10000;
+			str += String.fromCharCode(0xD800 + (t2 >> 10));
+			str += String.fromCharCode(0xDC00 + (t2 & 0x3FF));
+		}
+	}
+
+	return str;
+}
 
 /**
  * Writes a string as UTF8 bytes.