From 22b47cdd639263313317b77a3166afad767a7ef6 Mon Sep 17 00:00:00 2001 From: "Alex Lam S.L" Date: Sat, 28 Dec 2019 18:06:51 +0000 Subject: [PATCH] improve unicode handling (#3648) --- lib/output.js | 19 ++++++---- lib/parse.js | 4 -- test/compress/unicode.js | 79 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 11 deletions(-) diff --git a/lib/output.js b/lib/output.js index cc39c2df..38b4f8bc 100644 --- a/lib/output.js +++ b/lib/output.js @@ -119,15 +119,20 @@ function OutputStream(options) { }); } : function(str) { var s = ""; - for (var i = 0; i < str.length; i++) { - if (is_surrogate_pair_head(str[i]) && !is_surrogate_pair_tail(str[i + 1]) - || is_surrogate_pair_tail(str[i]) && !is_surrogate_pair_head(str[i - 1])) { - s += "\\u" + str.charCodeAt(i).toString(16); - } else { - s += str[i]; + for (var i = 0, j = 0; i < str.length; i++) { + var code = str.charCodeAt(i); + if (is_surrogate_pair_head(code)) { + if (is_surrogate_pair_tail(str.charCodeAt(i + 1))) { + i++; + continue; + } + } else if (!is_surrogate_pair_tail(code)) { + continue; } + s += str.slice(j, i) + "\\u" + code.toString(16); + j = i + 1; } - return s; + return j == 0 ? str : s + str.slice(j); }; function make_string(str, quote) { diff --git a/lib/parse.js b/lib/parse.js index 740ef5f2..270af9b4 100644 --- a/lib/parse.js +++ b/lib/parse.js @@ -133,14 +133,10 @@ function is_letter(code) { } function is_surrogate_pair_head(code) { - if (typeof code == "string") - code = code.charCodeAt(0); return code >= 0xd800 && code <= 0xdbff; } function is_surrogate_pair_tail(code) { - if (typeof code == "string") - code = code.charCodeAt(0); return code >= 0xdc00 && code <= 0xdfff; } diff --git a/test/compress/unicode.js b/test/compress/unicode.js index 3ef664c2..a94dc739 100644 --- a/test/compress/unicode.js +++ b/test/compress/unicode.js @@ -16,6 +16,81 @@ unicode_parse_variables: { } } +unicode_escaped_identifier: { + input: { + var \u0061 = "\ud800\udc00"; + console.log(a); + } + expect_exact: 'var a="\ud800\udc00";console.log(a);' + expect_stdout: "\ud800\udc00" +} + +unicode_identifier_ascii_only: { + beautify = { + ascii_only: true, + } + input: { + var \u0061 = "testing \udbc4\udd11"; + var bar = "h\u0065llo"; + console.log(a, \u0062\u0061r); + } + expect_exact: 'var a="testing \\udbc4\\udd11";var bar="hello";console.log(a,bar);' + expect_stdout: "testing \udbc4\udd11 hello" +} + +unicode_string_literals: { + beautify = { + ascii_only: true, + } + input: { + var a = "6 length unicode character: \udbc4\udd11"; + console.log(\u0061); + } + expect_exact: 'var a="6 length unicode character: \\udbc4\\udd11";console.log(a);' + expect_stdout: "6 length unicode character: \udbc4\udd11" +} + +check_escape_style: { + beautify = { + ascii_only: true, + } + input: { + var a = "\x01"; + var \ua0081 = "\x10"; // \u0081 only in ID_Continue + var \u0100 = "\u0100"; + var \u1000 = "\u1000"; + var \u1000 = "\ud800\udc00"; + var \u3f80 = "\udbc0\udc00"; + console.log(\u0061, \ua0081, \u0100, \u1000, \u3f80); + } + expect_exact: 'var a="\\x01";var \\ua0081="\\x10";var \\u0100="\\u0100";var \\u1000="\\u1000";var \\u1000="\\ud800\\udc00";var \\u3f80="\\udbc0\\udc00";console.log(a,\\ua0081,\\u0100,\\u1000,\\u3f80);' + expect_stdout: "\u0001 \u0010 \u0100 \ud800\udc00 \udbc0\udc00" +} + +escape_non_escaped_identifier: { + beautify = { + ascii_only: true, + } + input: { + var µþ = "µþ"; + console.log(\u00b5þ); + } + expect_exact: 'var \\u00b5\\u00fe="\\xb5\\xfe";console.log(\\u00b5\\u00fe);' + expect_stdout: "µþ" +} + +non_escape_2_non_escape: { + beautify = { + ascii_only: false, + } + input: { + var µþ = "µþ"; + console.log(\u00b5þ); + } + expect_exact: 'var µþ="µþ";console.log(µþ);' + expect_stdout: "µþ" +} + issue_2242_1: { beautify = { ascii_only: false, @@ -24,6 +99,7 @@ issue_2242_1: { console.log("\ud83d", "\ude00", "\ud83d\ude00", "\ud83d@\ude00"); } expect_exact: 'console.log("\\ud83d","\\ude00","\ud83d\ude00","\\ud83d@\\ude00");' + expect_stdout: "\ud83d \ude00 \ud83d\ude00 \ud83d@\ude00" } issue_2242_2: { @@ -34,6 +110,7 @@ issue_2242_2: { console.log("\ud83d", "\ude00", "\ud83d\ude00", "\ud83d@\ude00"); } expect_exact: 'console.log("\\ud83d","\\ude00","\\ud83d\\ude00","\\ud83d@\\ude00");' + expect_stdout: "\ud83d \ude00 \ud83d\ude00 \ud83d@\ude00" } issue_2242_3: { @@ -44,6 +121,7 @@ issue_2242_3: { console.log("\ud83d" + "\ude00", "\ud83d" + "@" + "\ude00"); } expect_exact: 'console.log("\\ud83d"+"\\ude00","\\ud83d"+"@"+"\\ude00");' + expect_stdout: "\ud83d\ude00 \ud83d@\ude00" } issue_2242_4: { @@ -54,6 +132,7 @@ issue_2242_4: { console.log("\ud83d" + "\ude00", "\ud83d" + "@" + "\ude00"); } expect_exact: 'console.log("\ud83d\ude00","\\ud83d@\\ude00");' + expect_stdout: "\ud83d\ude00 \ud83d@\ude00" } issue_2569: { -- 2.34.1