Convert tabs to spaces in htmlparser.js.
authorJuriy Zaytsev <kangax@gmail.com>
Mon, 22 Feb 2010 03:15:30 +0000 (22:15 -0500)
committerJuriy Zaytsev <kangax@gmail.com>
Mon, 22 Feb 2010 03:15:30 +0000 (22:15 -0500)
index.html
src/htmlparser.js

index b930529..a322615 100644 (file)
           <ul>
             <li>
               <input type="checkbox" id="remove-comments" checked>
-              <label for="remove-comments">Remove comments (</label>
-              <input type="checkbox" id="remove-comments-from-cdata" checked>
-              <label for="remove-comments-from-cdata">also from scripts and styles )</label>
+              <label for="remove-comments">Remove comments</label>
               <span class="quiet short" style="margin-left:1.5em">
                 Conditional comments are left intact.
               </span>
             </li>
+            <li>
+              <input type="checkbox" id="remove-comments-from-cdata" checked>
+              <label for="remove-comments-from-cdata">Also remove comments from scripts and styles</label>
+            </li>
             <li>
               <input type="checkbox" id="remove-cdata-sections-from-cdata" checked>
               <label for="remove-cdata-sections-from-cdata">Remove CDATA sections from scripts and styles</label>
index 75537c5..31dcfd0 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * HTML Parser By John Resig (ejohn.org)
+ * Modified by Juriy "kangax" Zaytsev
  * Original code by Erik Arvidsson, Mozilla Public License
  * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
  *
 
 (function(){
 
-       // Regular Expressions for parsing tags and attributes
-       var startTag = /^<(\w+)((?:\s+[\w:-]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/,
-                 endTag = /^<\/(\w+)[^>]*>/,
-                 attr = /([\w:-]+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g,
-                 doctype = /^<!DOCTYPE [^>]+>/i;
-               
-       // Empty Elements - HTML 4.01
-       var empty = makeMap("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");
+  // Regular Expressions for parsing tags and attributes
+  var startTag = /^<(\w+)((?:\s+[\w:-]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/,
+      endTag = /^<\/(\w+)[^>]*>/,
+      attr = /([\w:-]+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g,
+      doctype = /^<!DOCTYPE [^>]+>/i;
+    
+  // Empty Elements - HTML 4.01
+  var empty = makeMap("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");
 
-       // Block Elements - HTML 4.01
-       var block = makeMap("address,applet,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");
+  // Block Elements - HTML 4.01
+  var block = makeMap("address,applet,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");
 
-       // Inline Elements - HTML 4.01
-       var inline = makeMap("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");
+  // Inline Elements - HTML 4.01
+  var inline = makeMap("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");
 
-       // Elements that you can, intentionally, leave open
-       // (and which close themselves)
-       var closeSelf = makeMap("colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr");
+  // Elements that you can, intentionally, leave open
+  // (and which close themselves)
+  var closeSelf = makeMap("colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr");
 
-       // Attributes that have their values filled in disabled="disabled"
-       var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");
+  // Attributes that have their values filled in disabled="disabled"
+  var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");
 
-       // Special Elements (can contain anything)
-       var special = makeMap("script,style");
-       
-       var reCache = { }, stackedTag, re;
+  // Special Elements (can contain anything)
+  var special = makeMap("script,style");
+  
+  var reCache = { }, stackedTag, re;
 
-       var HTMLParser = this.HTMLParser = function( html, handler ) {
-               var index, chars, match, stack = [], last = html;
-               stack.last = function(){
-                       return this[ this.length - 1 ];
-               };
+  var HTMLParser = this.HTMLParser = function( html, handler ) {
+    var index, chars, match, stack = [], last = html;
+    stack.last = function(){
+      return this[ this.length - 1 ];
+    };
 
-               while ( html ) {
-                       chars = true;
+    while ( html ) {
+      chars = true;
 
-                       // Make sure we're not in a script or style element
-                       if ( !stack.last() || !special[ stack.last() ] ) {
+      // Make sure we're not in a script or style element
+      if ( !stack.last() || !special[ stack.last() ] ) {
 
-                               // Comment
-                               if ( html.indexOf("<!--") === 0 ) {
-                                       index = html.indexOf("-->");
-       
-                                       if ( index >= 0 ) {
-                                               if ( handler.comment )
-                                                       handler.comment( html.substring( 4, index ) );
-                                               html = html.substring( index + 3 );
-                                               chars = false;
-                                       }
-                               }
-             else if ( match = doctype.exec( html )) {
-               if ( handler.doctype )
-                                               handler.doctype( match[0] );
-                                       html = html.substring( match[0].length );
-                                       chars = false;
-                               
-                               // end tag
-                               } else if ( html.indexOf("</") === 0 ) {
-                                       match = html.match( endTag );
-       
-                                       if ( match ) {
-                                               html = html.substring( match[0].length );
-                                               match[0].replace( endTag, parseEndTag );
-                                               chars = false;
-                                       }
-       
-                               // start tag
-                               } else if ( html.indexOf("<") === 0 ) {
-                                       match = html.match( startTag );
-       
-                                       if ( match ) {
-                                               html = html.substring( match[0].length );
-                                               match[0].replace( startTag, parseStartTag );
-                                               chars = false;
-                                       }
-                               }
+        // Comment
+        if ( html.indexOf("<!--") === 0 ) {
+          index = html.indexOf("-->");
+  
+          if ( index >= 0 ) {
+            if ( handler.comment )
+              handler.comment( html.substring( 4, index ) );
+            html = html.substring( index + 3 );
+            chars = false;
+          }
+        }
+        else if ( match = doctype.exec( html )) {
+          if ( handler.doctype )
+            handler.doctype( match[0] );
+          html = html.substring( match[0].length );
+          chars = false;
+        
+        // end tag
+        } else if ( html.indexOf("</") === 0 ) {
+          match = html.match( endTag );
+  
+          if ( match ) {
+            html = html.substring( match[0].length );
+            match[0].replace( endTag, parseEndTag );
+            chars = false;
+          }
+  
+        // start tag
+        } else if ( html.indexOf("<") === 0 ) {
+          match = html.match( startTag );
+  
+          if ( match ) {
+            html = html.substring( match[0].length );
+            match[0].replace( startTag, parseStartTag );
+            chars = false;
+          }
+        }
 
-                               if ( chars ) {
-                                       index = html.indexOf("<");
-                                       
-                                       var text = index < 0 ? html : html.substring( 0, index );
-                                       html = index < 0 ? "" : html.substring( index );
-                                       
-                                       if ( handler.chars )
-                                               handler.chars( text );
-                               }
+        if ( chars ) {
+          index = html.indexOf("<");
+          
+          var text = index < 0 ? html : html.substring( 0, index );
+          html = index < 0 ? "" : html.substring( index );
+          
+          if ( handler.chars )
+            handler.chars( text );
+        }
 
-                       } else {
-                         
-                         stackedTag = stack.last().toLowerCase();
-                         reStackedTag = reCache[stackedTag] || (reCache[stackedTag] = new RegExp("([\\s\\S]*?)<\/" + stackedTag + "[^>]*>", "i"));
-                         
-                               html = html.replace(reStackedTag, function(all, text) {
-                                 if (stackedTag !== 'script' && stackedTag !== 'style') {
-                                   text = text
-                                         .replace(/<!--([\s\S]*?)-->/g, "$1")
-                                               .replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, "$1");
-                                 }
+      } else {
+        
+        stackedTag = stack.last().toLowerCase();
+        reStackedTag = reCache[stackedTag] || (reCache[stackedTag] = new RegExp("([\\s\\S]*?)<\/" + stackedTag + "[^>]*>", "i"));
+        
+        html = html.replace(reStackedTag, function(all, text) {
+          if (stackedTag !== 'script' && stackedTag !== 'style') {
+            text = text
+              .replace(/<!--([\s\S]*?)-->/g, "$1")
+              .replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, "$1");
+          }
 
-                                       if ( handler.chars )
-                                               handler.chars( text );
+          if ( handler.chars )
+            handler.chars( text );
 
-                                       return "";
-                               });
+          return "";
+        });
 
-                               parseEndTag( "", stackedTag );
-                       }
+        parseEndTag( "", stackedTag );
+      }
 
-                       if ( html == last )
-                               throw "Parse Error: " + html;
-                       last = html;
-               }
-               
-               // Clean up any remaining tags
-               parseEndTag();
+      if ( html == last )
+        throw "Parse Error: " + html;
+      last = html;
+    }
+    
+    // Clean up any remaining tags
+    parseEndTag();
 
-               function parseStartTag( tag, tagName, rest, unary ) {
-                       if ( block[ tagName ] ) {
-                               while ( stack.last() && inline[ stack.last() ] ) {
-                                       parseEndTag( "", stack.last() );
-                               }
-                       }
+    function parseStartTag( tag, tagName, rest, unary ) {
+      if ( block[ tagName ] ) {
+        while ( stack.last() && inline[ stack.last() ] ) {
+          parseEndTag( "", stack.last() );
+        }
+      }
 
-                       if ( closeSelf[ tagName ] && stack.last() == tagName ) {
-                               parseEndTag( "", tagName );
-                       }
+      if ( closeSelf[ tagName ] && stack.last() == tagName ) {
+        parseEndTag( "", tagName );
+      }
 
-                       unary = empty[ tagName ] || !!unary;
+      unary = empty[ tagName ] || !!unary;
 
-                       if ( !unary )
-                               stack.push( tagName );
-                       
-                       if ( handler.start ) {
-                               var attrs = [];
-       
-                               rest.replace(attr, function(match, name) {
-                                       var value = arguments[2] ? arguments[2] :
-                                               arguments[3] ? arguments[3] :
-                                               arguments[4] ? arguments[4] :
-                                               fillAttrs[name] ? name : "";
-                                       
-                                       attrs.push({
-                                               name: name,
-                                               value: value,
-                                               escaped: value.replace(/(^|[^\\])"/g, '$1\\\"') //"
-                                       });
-                               });
-       
-                               if ( handler.start )
-                                       handler.start( tagName, attrs, unary );
-                       }
-               }
+      if ( !unary )
+        stack.push( tagName );
+      
+      if ( handler.start ) {
+        var attrs = [];
+  
+        rest.replace(attr, function(match, name) {
+          var value = arguments[2] ? arguments[2] :
+            arguments[3] ? arguments[3] :
+            arguments[4] ? arguments[4] :
+            fillAttrs[name] ? name : "";
+          
+          attrs.push({
+            name: name,
+            value: value,
+            escaped: value.replace(/(^|[^\\])"/g, '$1\\\"') //"
+          });
+        });
+  
+        if ( handler.start )
+          handler.start( tagName, attrs, unary );
+      }
+    }
 
-               function parseEndTag( tag, tagName ) {
-                       // If no tag name is provided, clean shop
-                       if ( !tagName )
-                               var pos = 0;
-                               
-                       // Find the closest opened tag of the same type
-                       else
-                               for ( var pos = stack.length - 1; pos >= 0; pos-- )
-                                       if ( stack[ pos ] == tagName )
-                                               break;
-                       
-                       if ( pos >= 0 ) {
-                               // Close all the open elements, up the stack
-                               for ( var i = stack.length - 1; i >= pos; i-- )
-                                       if ( handler.end )
-                                               handler.end( stack[ i ] );
-                               
-                               // Remove the open elements from the stack
-                               stack.length = pos;
-                       }
-               }
-       };
-       
-       this.HTMLtoXML = function( html ) {
-               var results = "";
-               
-               HTMLParser(html, {
-                       start: function( tag, attrs, unary ) {
-                               results += "<" + tag;
-               
-                               for ( var i = 0; i < attrs.length; i++ )
-                                       results += " " + attrs[i].name + '="' + attrs[i].escaped + '"';
-               
-                               results += (unary ? "/" : "") + ">";
-                       },
-                       end: function( tag ) {
-                               results += "</" + tag + ">";
-                       },
-                       chars: function( text ) {
-                               results += text;
-                       },
-                       comment: function( text ) {
-                               results += "<!--" + text + "-->";
-                       }
-               });
-               
-               return results;
-       };
-       
-       this.HTMLtoDOM = function( html, doc ) {
-               // There can be only one of these elements
-               var one = makeMap("html,head,body,title");
-               
-               // Enforce a structure for the document
-               var structure = {
-                       link: "head",
-                       base: "head"
-               };
-       
-               if ( !doc ) {
-                       if ( typeof DOMDocument != "undefined" )
-                               doc = new DOMDocument();
-                       else if ( typeof document != "undefined" && document.implementation && document.implementation.createDocument )
-                               doc = document.implementation.createDocument("", "", null);
-                       else if ( typeof ActiveX != "undefined" )
-                               doc = new ActiveXObject("Msxml.DOMDocument");
-                       
-               } else
-                       doc = doc.ownerDocument ||
-                               doc.getOwnerDocument && doc.getOwnerDocument() ||
-                               doc;
-               
-               var elems = [],
-                       documentElement = doc.documentElement ||
-                               doc.getDocumentElement && doc.getDocumentElement();
-                               
-               // If we're dealing with an empty document then we
-               // need to pre-populate it with the HTML document structure
-               if ( !documentElement && doc.createElement ) (function(){
-                       var html = doc.createElement("html");
-                       var head = doc.createElement("head");
-                       head.appendChild( doc.createElement("title") );
-                       html.appendChild( head );
-                       html.appendChild( doc.createElement("body") );
-                       doc.appendChild( html );
-               })();
-               
-               // Find all the unique elements
-               if ( doc.getElementsByTagName )
-                       for ( var i in one )
-                               one[ i ] = doc.getElementsByTagName( i )[0];
-               
-               // If we're working with a document, inject contents into
-               // the body element
-               var curParentNode = one.body;
-               
-               HTMLParser( html, {
-                       start: function( tagName, attrs, unary ) {
-                               // If it's a pre-built element, then we can ignore
-                               // its construction
-                               if ( one[ tagName ] ) {
-                                       curParentNode = one[ tagName ];
-                                       return;
-                               }
-                       
-                               var elem = doc.createElement( tagName );
-                               
-                               for ( var attr in attrs )
-                                       elem.setAttribute( attrs[ attr ].name, attrs[ attr ].value );
-                               
-                               if ( structure[ tagName ] && typeof one[ structure[ tagName ] ] != "boolean" )
-                                       one[ structure[ tagName ] ].appendChild( elem );
-                               
-                               else if ( curParentNode && curParentNode.appendChild )
-                                       curParentNode.appendChild( elem );
-                                       
-                               if ( !unary ) {
-                                       elems.push( elem );
-                                       curParentNode = elem;
-                               }
-                       },
-                       end: function( tag ) {
-                               elems.length -= 1;
-                               
-                               // Init the new parentNode
-                               curParentNode = elems[ elems.length - 1 ];
-                       },
-                       chars: function( text ) {
-                               curParentNode.appendChild( doc.createTextNode( text ) );
-                       },
-                       comment: function( text ) {
-                               // create comment node
-                       }
-               });
-               
-               return doc;
-       };
+    function parseEndTag( tag, tagName ) {
+      // If no tag name is provided, clean shop
+      if ( !tagName )
+        var pos = 0;
+        
+      // Find the closest opened tag of the same type
+      else
+        for ( var pos = stack.length - 1; pos >= 0; pos-- )
+          if ( stack[ pos ] == tagName )
+            break;
+      
+      if ( pos >= 0 ) {
+        // Close all the open elements, up the stack
+        for ( var i = stack.length - 1; i >= pos; i-- )
+          if ( handler.end )
+            handler.end( stack[ i ] );
+        
+        // Remove the open elements from the stack
+        stack.length = pos;
+      }
+    }
+  };
+  
+  this.HTMLtoXML = function( html ) {
+    var results = "";
+    
+    HTMLParser(html, {
+      start: function( tag, attrs, unary ) {
+        results += "<" + tag;
+    
+        for ( var i = 0; i < attrs.length; i++ )
+          results += " " + attrs[i].name + '="' + attrs[i].escaped + '"';
+    
+        results += (unary ? "/" : "") + ">";
+      },
+      end: function( tag ) {
+        results += "</" + tag + ">";
+      },
+      chars: function( text ) {
+        results += text;
+      },
+      comment: function( text ) {
+        results += "<!--" + text + "-->";
+      }
+    });
+    
+    return results;
+  };
+  
+  this.HTMLtoDOM = function( html, doc ) {
+    // There can be only one of these elements
+    var one = makeMap("html,head,body,title");
+    
+    // Enforce a structure for the document
+    var structure = {
+      link: "head",
+      base: "head"
+    };
+  
+    if ( !doc ) {
+      if ( typeof DOMDocument != "undefined" )
+        doc = new DOMDocument();
+      else if ( typeof document != "undefined" && document.implementation && document.implementation.createDocument )
+        doc = document.implementation.createDocument("", "", null);
+      else if ( typeof ActiveX != "undefined" )
+        doc = new ActiveXObject("Msxml.DOMDocument");
+      
+    } else
+      doc = doc.ownerDocument ||
+        doc.getOwnerDocument && doc.getOwnerDocument() ||
+        doc;
+    
+    var elems = [],
+      documentElement = doc.documentElement ||
+        doc.getDocumentElement && doc.getDocumentElement();
+        
+    // If we're dealing with an empty document then we
+    // need to pre-populate it with the HTML document structure
+    if ( !documentElement && doc.createElement ) (function(){
+      var html = doc.createElement("html");
+      var head = doc.createElement("head");
+      head.appendChild( doc.createElement("title") );
+      html.appendChild( head );
+      html.appendChild( doc.createElement("body") );
+      doc.appendChild( html );
+    })();
+    
+    // Find all the unique elements
+    if ( doc.getElementsByTagName )
+      for ( var i in one )
+        one[ i ] = doc.getElementsByTagName( i )[0];
+    
+    // If we're working with a document, inject contents into
+    // the body element
+    var curParentNode = one.body;
+    
+    HTMLParser( html, {
+      start: function( tagName, attrs, unary ) {
+        // If it's a pre-built element, then we can ignore
+        // its construction
+        if ( one[ tagName ] ) {
+          curParentNode = one[ tagName ];
+          return;
+        }
+      
+        var elem = doc.createElement( tagName );
+        
+        for ( var attr in attrs )
+          elem.setAttribute( attrs[ attr ].name, attrs[ attr ].value );
+        
+        if ( structure[ tagName ] && typeof one[ structure[ tagName ] ] != "boolean" )
+          one[ structure[ tagName ] ].appendChild( elem );
+        
+        else if ( curParentNode && curParentNode.appendChild )
+          curParentNode.appendChild( elem );
+          
+        if ( !unary ) {
+          elems.push( elem );
+          curParentNode = elem;
+        }
+      },
+      end: function( tag ) {
+        elems.length -= 1;
+        
+        // Init the new parentNode
+        curParentNode = elems[ elems.length - 1 ];
+      },
+      chars: function( text ) {
+        curParentNode.appendChild( doc.createTextNode( text ) );
+      },
+      comment: function( text ) {
+        // create comment node
+      }
+    });
+    
+    return doc;
+  };
 
-       function makeMap(str){
-               var obj = {}, items = str.split(",");
-               for ( var i = 0; i < items.length; i++ ) {
-                 obj[ items[i] ] = true;
-                       obj[ items[i].toUpperCase() ] = true;
-               }
-               return obj;
-       }
+  function makeMap(str){
+    var obj = {}, items = str.split(",");
+    for ( var i = 0; i < items.length; i++ ) {
+      obj[ items[i] ] = true;
+      obj[ items[i].toUpperCase() ] = true;
+    }
+    return obj;
+  }
 })();
\ No newline at end of file