Last active
June 6, 2023 10:24
-
-
Save Munawwar/8c46433e89f08850944d547acef367ca to your computer and use it in GitHub Desktop.
Tiny HTML5 SAX Parser for browser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * The smallest HTML SAX parser - 0.5kb gzipped
 *
 * Usage: see the JSDoc on the default export below.
 */
// Regular Expressions for parsing tags and attributes | |
let startTagRegex = /(?:<([a-zA-Z][^\s\/>]*)(?:\s+[^\s\/>"'=]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*\s*(\/?)\s*>)|(?:<\/\s*([a-zA-Z][^\s\/>]*)>)|(?:<!--(.+?)-->)|(?:<!\[CDATA\[([^>]+)\]\]>)/ig, | |
// Void Tags - HTML 5 | |
voidTags = new Set('area,base,br,col,embed,hr,img,input,keygen,link,meta,param,source,track,wbr'.split(',')), | |
// Raw Text Tags - HTML 5 (except <template> tag) | |
rawTextTags = new Set('script,style,textarea,title,template'.split(',')), | |
rawTextParser = /^([\s\S]*?)(<\/(?:script|style|textarea|title|template)[^>]*>)/i; | |
/** | |
* @param {string} html Assumes balanced, valid HTML as input | |
* @param {( | |
* type: 'tag'|'tagEnd'|'text'|'comment'|'cdata'|'content', | |
* matchedSubstring: string, | |
* contextualInfo: string, | |
* selfClosed: boolean, | |
* ) => undefined} callback contextualInfo is based on the `type` | |
* if type is `tag` or `tagEnd`, its the tag name | |
* if type is `text`, `comment` or `cdata`, its the nodeValue | |
* if type is `content`, its the textContent inside the script/style/textarea/title/template tag | |
* | |
* selfClosed is true or false for type = 'tag' type | |
*/ | |
export default function parseHtml(html, callback) { | |
let lastIndex = 0, match, text; | |
startTagRegex.lastIndex = 0; | |
while ((match = startTagRegex.exec(html))) { | |
let [matchedSubString, tagStartName, unary, tagEndName, comment, cdata] = match; | |
if (lastIndex === startTagRegex.lastIndex) { | |
throw new Error('Parser error'); | |
} | |
text = html.slice(lastIndex, startTagRegex.lastIndex - matchedSubString.length); | |
if (text) callback('text', text); | |
lastIndex = startTagRegex.lastIndex; | |
if (tagStartName) { | |
let tagLowercase = tagStartName.toLowerCase(); | |
// Handle script, style and other text-only tags | |
if (rawTextTags.has(tagLowercase)) { | |
callback('tag', matchedSubString, tagStartName, false); | |
let [substr, content, endTagSubstr] = html.slice(lastIndex).match(rawTextParser); | |
callback('content', content); | |
callback('tagEnd', endTagSubstr, tagStartName); | |
lastIndex += substr.length; | |
startTagRegex.lastIndex = lastIndex; | |
} else { | |
callback('tag', matchedSubString, tagStartName, !!unary || voidTags.has(tagLowercase)); | |
} | |
} else if (tagEndName) { | |
callback('tagEnd', matchedSubString, tagEndName); | |
// Comment | |
} else if (comment) { | |
callback('comment', matchedSubString, comment); | |
//CDATA | |
} else if (cdata) { | |
callback('cdata', matchedSubString, cdata); | |
} | |
} | |
text = html.slice(lastIndex); | |
if (text) callback('text', text); | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment