From: GoalSmashers Date: Sun, 17 Nov 2013 09:31:04 +0000 (+0100) Subject: Fixes #161 - adds ~30x faster tokenizer. X-Git-Url: https://git.ndcode.org/public/gitweb.cgi?a=commitdiff_plain;h=81fa8edf72ba40c11a628a0e0d7279523b855574;p=clean-css.git Fixes #161 - adds ~30x faster tokenizer. * Splits data into 128 bytes long chunks (rounded to nearest closing brace). * Won't seek through the whole document all the time. --- diff --git a/History.md b/History.md index c11f3331..3edec0cb 100644 --- a/History.md +++ b/History.md @@ -1,6 +1,7 @@ [2.1.0 / 2013-xx-xx (UNRELEASED)](https://github.com/GoalSmashers/clean-css/compare/v2.0.0...HEAD) ================== +* Fixed issue [#161](https://github.com/GoalSmashers/clean-css/issues/161) - improves tokenizer performance. * Fixed issue [#163](https://github.com/GoalSmashers/clean-css/issues/163) - round pixels to 2nd decimal place. * Fixed issue [#165](https://github.com/GoalSmashers/clean-css/issues/165) - extra space after trailing parenthesis. diff --git a/lib/selectors/tokenizer.js b/lib/selectors/tokenizer.js index 6a7e7896..8edaf756 100644 --- a/lib/selectors/tokenizer.js +++ b/lib/selectors/tokenizer.js @@ -1,20 +1,33 @@ +/* jshint latedef: false */ + module.exports = function Tokenizer(data) { + var chunker = new Chunker(data, 128); + var chunk = chunker.next(); + var whatsNext = function(context) { var cursor = context.cursor; var mode = context.mode; var closest; + if (chunk.length == context.cursor) { + if (chunker.isEmpty()) + return null; + + chunk = chunker.next(); + context.cursor = 0; + } + if (mode == 'body') { - closest = data.indexOf('}', cursor); + closest = chunk.indexOf('}', cursor); return closest > -1 ? [closest, 'bodyEnd'] : null; } - var nextSpecial = data.indexOf('@', cursor); - var nextEscape = mode == 'top' ? 
data.indexOf('__ESCAPED_COMMENT_CLEAN_CSS', cursor) : -1; - var nextBodyStart = data.indexOf('{', cursor); - var nextBodyEnd = data.indexOf('}', cursor); + var nextSpecial = chunk.indexOf('@', context.cursor); + var nextEscape = mode == 'top' ? chunk.indexOf('__ESCAPED_COMMENT_CLEAN_CSS', context.cursor) : -1; + var nextBodyStart = chunk.indexOf('{', context.cursor); + var nextBodyEnd = chunk.indexOf('}', context.cursor); closest = nextSpecial; if (closest == -1 || (nextEscape > -1 && nextEscape < closest)) @@ -44,7 +57,7 @@ module.exports = function Tokenizer(data) { while (true) { var next = whatsNext(context); if (!next) { - var whatsLeft = data.substring(context.cursor); + var whatsLeft = chunk.substring(context.cursor); if (whatsLeft.length > 0) { tokenized.push(whatsLeft); context.cursor += whatsLeft.length; @@ -58,16 +71,16 @@ module.exports = function Tokenizer(data) { var oldMode; if (what == 'special') { - var fragment = data.substring(nextSpecial, context.cursor + '@font-face'.length + 1); + var fragment = chunk.substring(nextSpecial, context.cursor + '@font-face'.length + 1); var isSingle = fragment.indexOf('@import') === 0 || fragment.indexOf('@charset') === 0; if (isSingle) { - nextEnd = data.indexOf(';', nextSpecial + 1); - tokenized.push(data.substring(context.cursor, nextEnd + 1)); + nextEnd = chunk.indexOf(';', nextSpecial + 1); + tokenized.push(chunk.substring(context.cursor, nextEnd + 1)); context.cursor = nextEnd + 1; } else { - nextEnd = data.indexOf('{', nextSpecial + 1); - var block = data.substring(context.cursor, nextEnd).trim(); + nextEnd = chunk.indexOf('{', nextSpecial + 1); + var block = chunk.substring(context.cursor, nextEnd).trim(); var isFlat = fragment.indexOf('@font-face') === 0; oldMode = context.mode; @@ -79,13 +92,13 @@ module.exports = function Tokenizer(data) { tokenized.push({ block: block, body: specialBody }); } } else if (what == 'escape') { - nextEnd = data.indexOf('__', nextSpecial + 1); - var escaped = 
data.substring(context.cursor, nextEnd + 2); + nextEnd = chunk.indexOf('__', nextSpecial + 1); + var escaped = chunk.substring(context.cursor, nextEnd + 2); tokenized.push(escaped); context.cursor = nextEnd + 2; } else if (what == 'bodyStart') { - var selector = data.substring(context.cursor, nextSpecial).trim(); + var selector = chunk.substring(context.cursor, nextSpecial).trim(); oldMode = context.mode; context.cursor = nextSpecial + 1; @@ -102,7 +115,7 @@ module.exports = function Tokenizer(data) { } if (context.mode != 'block') - tokenized = data.substring(context.cursor, nextSpecial); + tokenized = chunk.substring(context.cursor, nextSpecial); context.cursor = nextSpecial + 1; @@ -119,3 +132,31 @@ module.exports = function Tokenizer(data) { } }; }; + +// Divides `data` into chunks of `chunkSize` for faster processing +var Chunker = function(data, chunkSize) { + var chunks = []; + for (var cursor = 0, dataSize = data.length; cursor < dataSize;) { + var nextCursor = cursor + chunkSize > dataSize ? + dataSize - 1 : + cursor + chunkSize; + + if (data[nextCursor] != '}') + nextCursor = data.indexOf('}', nextCursor); + if (nextCursor == -1) + nextCursor = data.length - 1; + + chunks.push(data.substring(cursor, nextCursor + 1)); + cursor = nextCursor + 1; + } + + return { + isEmpty: function() { + return chunks.length === 0; + }, + + next: function() { + return chunks.shift() || ''; + } + }; +};