improve unicode handling (#3648)
author Alex Lam S.L <alexlamsl@gmail.com>
Sat, 28 Dec 2019 18:06:51 +0000 (18:06 +0000)
committer GitHub <noreply@github.com>
Sat, 28 Dec 2019 18:06:51 +0000 (18:06 +0000)
lib/output.js
lib/parse.js
test/compress/unicode.js

index cc39c2d..38b4f8b 100644 (file)
@@ -119,15 +119,20 @@ function OutputStream(options) {
         });
     } : function(str) {
         var s = "";
-        for (var i = 0; i < str.length; i++) {
-            if (is_surrogate_pair_head(str[i]) && !is_surrogate_pair_tail(str[i + 1])
-                || is_surrogate_pair_tail(str[i]) && !is_surrogate_pair_head(str[i - 1])) {
-                s += "\\u" + str.charCodeAt(i).toString(16);
-            } else {
-                s += str[i];
+        for (var i = 0, j = 0; i < str.length; i++) {
+            var code = str.charCodeAt(i);
+            if (is_surrogate_pair_head(code)) {
+                if (is_surrogate_pair_tail(str.charCodeAt(i + 1))) {
+                    i++;
+                    continue;
+                }
+            } else if (!is_surrogate_pair_tail(code)) {
+                continue;
             }
+            s += str.slice(j, i) + "\\u" + code.toString(16);
+            j = i + 1;
         }
-        return s;
+        return j == 0 ? str : s + str.slice(j);
     };
 
     function make_string(str, quote) {
index 740ef5f..270af9b 100644 (file)
@@ -133,14 +133,10 @@ function is_letter(code) {
 }
 
 function is_surrogate_pair_head(code) {
-    if (typeof code == "string")
-        code = code.charCodeAt(0);
     return code >= 0xd800 && code <= 0xdbff;
 }
 
 function is_surrogate_pair_tail(code) {
-    if (typeof code == "string")
-        code = code.charCodeAt(0);
     return code >= 0xdc00 && code <= 0xdfff;
 }
 
index 3ef664c..a94dc73 100644 (file)
@@ -16,6 +16,81 @@ unicode_parse_variables: {
     }
 }
 
+unicode_escaped_identifier: {
+    input: {
+        var \u0061 = "\ud800\udc00";
+        console.log(a);
+    }
+    expect_exact: 'var a="\ud800\udc00";console.log(a);'
+    expect_stdout: "\ud800\udc00"
+}
+
+unicode_identifier_ascii_only: {
+    beautify = {
+        ascii_only: true,
+    }
+    input: {
+        var \u0061 = "testing \udbc4\udd11";
+        var bar = "h\u0065llo";
+        console.log(a, \u0062\u0061r);
+    }
+    expect_exact: 'var a="testing \\udbc4\\udd11";var bar="hello";console.log(a,bar);'
+    expect_stdout: "testing \udbc4\udd11 hello"
+}
+
+unicode_string_literals: {
+    beautify = {
+        ascii_only: true,
+    }
+    input: {
+        var a = "6 length unicode character: \udbc4\udd11";
+        console.log(\u0061);
+    }
+    expect_exact: 'var a="6 length unicode character: \\udbc4\\udd11";console.log(a);'
+    expect_stdout: "6 length unicode character: \udbc4\udd11"
+}
+
+check_escape_style: {
+    beautify = {
+        ascii_only: true,
+    }
+    input: {
+        var a = "\x01";
+        var \ua0081 = "\x10"; // \u0081 only in ID_Continue
+        var \u0100 = "\u0100";
+        var \u1000 = "\u1000";
+        var \u1000 = "\ud800\udc00";
+        var \u3f80 = "\udbc0\udc00";
+        console.log(\u0061, \ua0081, \u0100, \u1000, \u3f80);
+    }
+    expect_exact: 'var a="\\x01";var \\ua0081="\\x10";var \\u0100="\\u0100";var \\u1000="\\u1000";var \\u1000="\\ud800\\udc00";var \\u3f80="\\udbc0\\udc00";console.log(a,\\ua0081,\\u0100,\\u1000,\\u3f80);'
+    expect_stdout: "\u0001 \u0010 \u0100 \ud800\udc00 \udbc0\udc00"
+}
+
+escape_non_escaped_identifier: {
+    beautify = {
+        ascii_only: true,
+    }
+    input: {
+        var µþ = "µþ";
+        console.log(\u00b5þ);
+    }
+    expect_exact: 'var \\u00b5\\u00fe="\\xb5\\xfe";console.log(\\u00b5\\u00fe);'
+    expect_stdout: "µþ"
+}
+
+non_escape_2_non_escape: {
+    beautify = {
+        ascii_only: false,
+    }
+    input: {
+        var µþ = "µþ";
+        console.log(\u00b5þ);
+    }
+    expect_exact: 'var µþ="µþ";console.log(µþ);'
+    expect_stdout: "µþ"
+}
+
 issue_2242_1: {
     beautify = {
         ascii_only: false,
@@ -24,6 +99,7 @@ issue_2242_1: {
         console.log("\ud83d", "\ude00", "\ud83d\ude00", "\ud83d@\ude00");
     }
     expect_exact: 'console.log("\\ud83d","\\ude00","\ud83d\ude00","\\ud83d@\\ude00");'
+    expect_stdout: "\ud83d \ude00 \ud83d\ude00 \ud83d@\ude00"
 }
 
 issue_2242_2: {
@@ -34,6 +110,7 @@ issue_2242_2: {
         console.log("\ud83d", "\ude00", "\ud83d\ude00", "\ud83d@\ude00");
     }
     expect_exact: 'console.log("\\ud83d","\\ude00","\\ud83d\\ude00","\\ud83d@\\ude00");'
+    expect_stdout: "\ud83d \ude00 \ud83d\ude00 \ud83d@\ude00"
 }
 
 issue_2242_3: {
@@ -44,6 +121,7 @@ issue_2242_3: {
         console.log("\ud83d" + "\ude00", "\ud83d" + "@" + "\ude00");
     }
     expect_exact: 'console.log("\\ud83d"+"\\ude00","\\ud83d"+"@"+"\\ude00");'
+    expect_stdout: "\ud83d\ude00 \ud83d@\ude00"
 }
 
 issue_2242_4: {
@@ -54,6 +132,7 @@ issue_2242_4: {
         console.log("\ud83d" + "\ude00", "\ud83d" + "@" + "\ude00");
     }
     expect_exact: 'console.log("\ud83d\ude00","\\ud83d@\\ude00");'
+    expect_stdout: "\ud83d\ude00 \ud83d@\ude00"
 }
 
 issue_2569: {