From 57b0fcabce916102ca4c079c308b704a0b3b4511 Mon Sep 17 00:00:00 2001
From: GoalSmashers <jakub@goalsmashers.com>
Date: Sat, 16 Mar 2013 16:39:19 +0100
Subject: [PATCH] Fixes #46 - special characters in URLs and attributes.

* Simplified preserving 'content' attribute's content.
---
 History.md        |   6 ++
 lib/clean.js      | 218 ++++++++++++++++++++++++----------------------
 test/unit-test.js |  10 ++-
 3 files changed, 128 insertions(+), 106 deletions(-)

diff --git a/History.md b/History.md
index 1b32cdee..faefb1c6 100644
--- a/History.md
+++ b/History.md
@@ -1,3 +1,9 @@
+1.0 / 2013-xx-xx
+==================
+
+* Fixed issue [#46](https://github.com/GoalSmashers/clean-css/issues/46) - preserving special characters in URLs
+  and attributes.
+
 0.10.1 / 2013-02-14
 ==================
 
diff --git a/lib/clean.js b/lib/clean.js
index 25119390..51215fe4 100644
--- a/lib/clean.js
+++ b/lib/clean.js
@@ -31,7 +31,8 @@ var CleanCSS = {
   process: function(data, options) {
     var context = {
       specialComments: [],
-      contentBlocks: []
+      freeTextBlocks: [],
+      urlBlocks: []
     };
     var replace = function() {
       if (typeof arguments[0] == 'function')
@@ -74,12 +75,7 @@ var CleanCSS = {
       data = CleanCSS._stripComments(context, data);
     });
 
-    // replace content: with a placeholder
-    replace(function stripContent() {
-      data = CleanCSS._stripContent(context, data);
-    });
-
-    // strip url's parentheses if possible (no spaces inside)
+    // strip quotation marks in urls if possible (no whitespace inside)
     replace(/url\(['"]([^\)]+)['"]\)/g, function(urlMatch) {
       if (urlMatch.match(/\s/g) !== null)
         return urlMatch;
@@ -87,6 +83,47 @@ var CleanCSS = {
         return urlMatch.replace(/\(['"]/, '(').replace(/['"]\)$/, ')');
     });
 
+    // strip quotation marks in animation & font names
+    replace(/(animation|animation\-name|font|font\-family):([^;}]+)/g, function(match, propertyName, fontDef) {
+      return propertyName + ':' + fontDef.replace(/['"]([\w\-]+)['"]/g, '$1');
+    });
+
+    // strip quotation marks in @keyframes names (only if the name has no spaces)
+    replace(/@(\-moz\-|\-o\-|\-webkit\-)?keyframes ([^{]+)/g, function(match, prefix, name) {
+      prefix = prefix || '';
+      return '@' + prefix + 'keyframes ' + (name.indexOf(' ') > -1 ? name : name.replace(/['"]/g, ''));
+    });
+
+    // IE shorter filters, but only if single (IE 7 issue)
+    replace(/progid:DXImageTransform\.Microsoft\.(Alpha|Chroma)(\([^\)]+\))([;}'"])/g, function(match, filter, args, suffix) {
+      return filter.toLowerCase() + args + suffix;
+    });
+
+    // strip quotation marks in attribute values
+    replace(/\[([^\]]+)\]/g, function(match, content) {
+      var eqIndex = content.indexOf('=');
+      if (eqIndex < 0 && content.indexOf('\'') < 0 && content.indexOf('"') < 0)
+        return match;
+
+      var key = content.substring(0, eqIndex);
+      var value = content.substring(eqIndex + 1, content.length);
+
+      if (/^['"](?:[a-zA-Z][a-zA-Z\d\-]+)['"]$/.test(value))
+        return '[' + key + '=' + value.substring(1, value.length - 1) + ']';
+      else
+        return match;
+    });
+
+    // replace all free text content with a placeholder
+    replace(function stripFreeText() {
+      data = CleanCSS._stripFreeText(context, data);
+    });
+
+    // replace url(...) with a placeholder
+    replace(function stripUrls() {
+      data = CleanCSS._stripUrls(context, data);
+    });
+
     // line breaks
     if (!options.keepBreaks)
       replace(/[\r]?\n/g, ' ');
@@ -119,32 +156,6 @@ var CleanCSS = {
     // trailing semicolons
     replace(/;\}/g, '}');
 
-    // strip quotation in animation & font names
-    replace(/(animation|animation\-name|font|font\-family):([^;}]+)/g, function(match, propertyName, fontDef) {
-      return propertyName + ':' + fontDef.replace(/['"]([\w\-]+)['"]/g, '$1');
-    });
-
-    // strip quotation in @keyframes
-    replace(/@(\-moz\-|\-o\-|\-webkit\-)?keyframes ([^{]+)/g, function(match, prefix, name) {
-      prefix = prefix || '';
-      return '@' + prefix + 'keyframes ' + (name.indexOf(' ') > -1 ? name : name.replace(/['"]/g, ''));
-    });
-
-    // strip quotation in attribute values
-    replace(/\[([^\]]+)\]/g, function(match, content) {
-      var eqIndex = content.indexOf('=');
-      if (eqIndex < 0 && content.indexOf('\'') < 0 && content.indexOf('"') < 0)
-        return match;
-
-      var key = content.substring(0, eqIndex);
-      var value = content.substring(eqIndex + 1, content.length);
-
-      if (/^['"](?:[a-zA-Z][a-zA-Z\d\-]+)['"]$/.test(value))
-        return '[' + key + '=' + value.substring(1, value.length - 1) + ']';
-      else
-        return match;
-    });
-
     // rgb to hex colors
     replace(/rgb\s*\(([^\)]+)\)/g, function(match, color) {
       var parts = color.split(',');
@@ -184,11 +195,6 @@ var CleanCSS = {
         return match;
     });
 
-    // IE shorter filters, but only if single (IE 7 issue)
-    replace(/progid:DXImageTransform\.Microsoft\.(Alpha|Chroma)(\([^\)]+\))([;}'"])/g, function(match, filter, args, suffix) {
-      return filter.toLowerCase() + args + suffix;
-    });
-
     // zero + unit to zero
     replace(/(\s|:|,)0(?:px|em|ex|cm|mm|in|pt|pc|%)/g, '$1' + '0');
     replace(/rect\(0(?:px|em|ex|cm|mm|in|pt|pc|%)/g, 'rect(0');
@@ -284,12 +290,21 @@ var CleanCSS = {
     // remove universal selector when not needed (*#id, *.class etc)
     replace(/\*([\.#:\[])/g, '$1');
 
-    // Restore special comments, content content, and spaces inside calc back
-    var specialCommentsCount = context.specialComments.length;
-
+    // Restore spaces inside calc back
     replace(/calc\([^\}]+\}/g, function(match) {
       return match.replace(/\+/g, ' + ');
     });
+
+    // Restore urls, free text content, and special comments (in that order)
+    replace(/__URL__/g, function() {
+      return context.urlBlocks.shift();
+    });
+
+    replace(/__CSSFREETEXT__/g, function() {
+      return context.freeTextBlocks.shift();
+    });
+
+    var specialCommentsCount = context.specialComments.length;
     replace(/__CSSCOMMENT__/g, function() {
       switch (options.keepSpecialComments) {
         case '*':
@@ -302,9 +317,6 @@ var CleanCSS = {
           return '';
       }
     });
-    replace(/__CSSCONTENT__/g, function() {
-      return context.contentBlocks.shift();
-    });
 
     // trim spaces at beginning and end
     return data.trim();
@@ -314,10 +326,10 @@ var CleanCSS = {
   // for further restoring. Plain comments are removed. It's done by scanning datq using
   // String#indexOf scanning instead of regexps to speed up the process.
   _stripComments: function(context, data) {
-    var tempData = [],
-      nextStart = 0,
-      nextEnd = 0,
-      cursor = 0;
+    var tempData = [];
+    var nextStart = 0;
+    var nextEnd = 0;
+    var cursor = 0;
 
     for (; nextEnd < data.length; ) {
       nextStart = data.indexOf('/*', nextEnd);
@@ -339,75 +351,73 @@ var CleanCSS = {
       data;
   },
 
-  // Strip content tags by replacing them by the __CSSCONTENT__
+  // Strip quoted free text by replacing it with the __CSSFREETEXT__
   // marker for further restoring. It's done via string scanning
   // instead of regexps to speed up the process.
-  _stripContent: function(context, data) {
-    var tempData = [],
-      nextStart = 0,
-      nextEnd = 0,
-      cursor = 0,
-      matchedParenthesis = null;
-    var allowedPrefixes = [' ', '{', ';', this.lineBreak];
-    var skipBy = 'content'.length;
-
-    // Find either first (matchedParenthesis == null) or second matching
-    // parenthesis so that we can determine boundaries of content block.
-    var nextParenthesis = function(pos) {
-      var min,
-        max = data.length;
-
-      if (matchedParenthesis) {
-        min = data.indexOf(matchedParenthesis, pos);
-        if (min == -1)
-          min = max;
-      } else {
-        var next1 = data.indexOf("'", pos);
-        var next2 = data.indexOf('"', pos);
-        if (next1 == -1)
-          next1 = max;
-        if (next2 == -1)
-          next2 = max;
-
-        min = next1 > next2 ? next2 : next1;
-      }
+  _stripFreeText: function(context, data) {
+    var tempData = [];
+    var nextStart = 0;
+    var nextEnd = 0;
+    var cursor = 0;
+    var matchedParenthesis = null;
+    var singleParenthesis = "'";
+    var doubleParenthesis = '"';
+    var dataLength = data.length;
 
-      if (min == max)
-        return -1;
+    for (; nextEnd < data.length; ) {
+      var nextStartSingle = data.indexOf(singleParenthesis, nextEnd + 1);
+      var nextStartDouble = data.indexOf(doubleParenthesis, nextEnd + 1);
 
-      if (matchedParenthesis) {
-        matchedParenthesis = null;
-        return min;
-      } else {
-        // check if there's anything else between pos and min
-        // that doesn't match ':' or whitespace
-        if (/[^:\s]/.test(data.substring(pos, min)))
-          return -1;
+      if (nextStartSingle == -1)
+        nextStartSingle = dataLength;
+      if (nextStartDouble == -1)
+        nextStartDouble = dataLength;
 
-        matchedParenthesis = data.charAt(min);
-        return min + 1;
+      if (nextStartSingle < nextStartDouble) {
+        nextStart = nextStartSingle;
+        matchedParenthesis = singleParenthesis;
+      } else {
+        nextStart = nextStartDouble;
+        matchedParenthesis = doubleParenthesis;
       }
-    };
 
-    for (; nextEnd < data.length; ) {
-      nextStart = data.indexOf('content', nextEnd);
       if (nextStart == -1)
         break;
 
-      // skip by `skipBy` bytes if matched declaration is not a property but ID, class name or a some substring
-      if (allowedPrefixes.indexOf(data[nextStart - 1]) == -1) {
-        nextEnd += skipBy;
-        continue;
-      }
-
-      nextStart = nextParenthesis(nextStart + skipBy);
-      nextEnd = nextParenthesis(nextStart);
+      nextEnd = data.indexOf(matchedParenthesis, nextStart + 1);
       if (nextStart == -1 || nextEnd == -1)
         break;
 
-      tempData.push(data.substring(cursor, nextStart - 1));
-      tempData.push('__CSSCONTENT__');
-      context.contentBlocks.push(data.substring(nextStart - 1, nextEnd + 1));
+      tempData.push(data.substring(cursor, nextStart));
+      tempData.push('__CSSFREETEXT__');
+      context.freeTextBlocks.push(data.substring(nextStart, nextEnd + 1));
+      cursor = nextEnd + 1;
+    }
+
+    return tempData.length > 0 ?
+      tempData.join('') + data.substring(cursor, data.length) :
+      data;
+  },
+
+  // Strip urls by replacing them with the __URL__
+  // marker for further restoring. It's done via string scanning
+  // instead of regexps to speed up the process.
+  _stripUrls: function(context, data) {
+    var nextStart = 0;
+    var nextEnd = 0;
+    var cursor = 0;
+    var tempData = [];
+
+    for (; nextEnd < data.length; ) {
+      nextStart = data.indexOf('url(', nextEnd);
+      if (nextStart == -1)
+        break;
+
+      nextEnd = data.indexOf(')', nextStart);
+
+      tempData.push(data.substring(cursor, nextStart));
+      tempData.push('__URL__');
+      context.urlBlocks.push(data.substring(nextStart, nextEnd + 1));
       cursor = nextEnd + 1;
     }
 
diff --git a/test/unit-test.js b/test/unit-test.js
index 995eaa39..92f54530 100644
--- a/test/unit-test.js
+++ b/test/unit-test.js
@@ -528,7 +528,10 @@ vows.describe('clean-units').addBatch({
     'not add a space before url\'s hash': [
       "url(\"../fonts/d90b3358-e1e2-4abb-ba96-356983a54c22.svg#d90b3358-e1e2-4abb-ba96-356983a54c22\")",
       "url(../fonts/d90b3358-e1e2-4abb-ba96-356983a54c22.svg#d90b3358-e1e2-4abb-ba96-356983a54c22)"
-    ]
+    ],
+    'keep urls from being stripped down #1': 'a{background:url(/image-1.0.png)}',
+    'keep urls from being stripped down #2': "a{background:url(/image-white.png)}",
+    'keep __URL__ in comments (so order is important)': '/*! __URL__ */a{}'
   }),
   'fonts': cssContext({
     'keep format quotation': "@font-face{font-family:PublicVintage;src:url(./PublicVintage.otf) format('opentype')}",
@@ -603,7 +606,10 @@ vows.describe('clean-units').addBatch({
     'should strip quotations if is less specific selectors': [
       'a[data-href*=\'object1\']{border-color:red}a[data-href|=\'object2\']{border-color:#0f0}',
       'a[data-href*=object1]{border-color:red}a[data-href|=object2]{border-color:#0f0}'
-    ]
+    ],
+    'should keep special characters inside attributes #1': "a[data-css='color:white']",
+    'should keep special characters inside attributes #2': "a[data-text='a\nb\nc']",
+    'should keep special characters inside attributes #3': 'a[href="/version-0.01.html"]'
   }),
   'ie filters': cssContext({
     'short alpha': [
-- 
2.34.1