/**
 * @namespace cf_HTMLTableParser
 * @description A module for parsing HTML tables into an array of objects.
 */
var cf_HTMLTableParser = (function () {

    // Shared logger for the whole module; setLogLevel() adjusts its verbosity.
    var logger = new GSLog('', "cf_HTMLTableParser");

    // Level names accepted by setLogLevel(), mapped to the value handed to logger.setLevel().
    // NOTE(review): the values are passed through unchanged - confirm the logger accepts these names.
    var VALID_LEVELS = {
        "error": "error",
        "warn": "warn",
        "info": "info",
        "debug": "debug"
    };

    // Named character references decoded by _htmlToStr().
    var NAMED_ENTITIES = {
        'nbsp': String.fromCharCode(160),
        'amp': '&',
        'quot': '"',
        'apos': "'",
        'lt': '<',
        'gt': '>'
    };

    // Public API. Kept in a variable so setLogLevel() can return it for chaining
    // regardless of how the function was invoked.
    var api = {
        parseHTMLTables: parseHTMLTables,
        setLogLevel: setLogLevel
    };

    /**
     * Parses HTML tables into an array of arrays of row objects.
     *
     * Example structure for an HTML document with two tables:
     *
     *   Tables Array
     *   |__Table1 Array
     *   |    |__Object1
     *   |    |__Object2
     *   |    |__Object3
     *   |__Table2 Array
     *        |__Object1
     *        |__Object2
     *
     * The first row of every table is treated as the header row; each following row
     * becomes an object keyed by the header texts. Tables without rows or without
     * header cells are skipped.
     *
     * @param {string} strHtml - The HTML string containing the tables. null/undefined
     *     yields an empty result; other values are converted with String().
     * @param {string|RegExp} [strIdentifier] - Optional filter. Only tables whose header
     *     row HTML matches are parsed. A string is used as a regular expression; if it is
     *     not a valid pattern it is matched literally instead.
     * @param {string} [logLevel] - Log level (error, warn, info, debug). When given it is
     *     applied through setLogLevel() and stays in effect for later calls.
     * @returns {Array} One entry per parsed table, each an array of row objects.
     *     Empty when no table was found.
     * @example
     * // Parse all tables in the HTML string
     * var html = '<table>...</table><table>...</table>';
     * var parsedTables = cf_HTMLTableParser.parseHTMLTables(html);
     *
     * // Parse tables whose header row contains "Example", with debug level logging
     * var parsedTables = cf_HTMLTableParser.parseHTMLTables(html, 'Example', "debug");
     */
    function parseHTMLTables(strHtml, strIdentifier, logLevel) {
        var results = [];

        if (logLevel) {
            setLogLevel(logLevel);
        }
        if (strHtml === null || strHtml === undefined) {
            return results;
        }

        var identRegExp = _buildIdentRegExp(strIdentifier);

        // String.match() returns null when nothing matches, hence the fallback.
        var tableMatches = String(strHtml).match(/<table\b[\s\S]*?<\/table>/gi) || [];
        tableMatches.forEach(function (table) {
            var tableObj = _processTable(table, identRegExp);
            if (tableObj) {
                results.push(tableObj);
            }
        });

        return results;
    }

    /**
     * Sets the log level for the module logger.
     *
     * Unknown or missing levels are ignored (the current level is kept) so that a
     * chained call never fails.
     *
     * @param {string} strLevel - The log level (error, warn, info, debug), case-insensitive.
     * @returns {Object} The cf_HTMLTableParser API object, for chaining.
     * @example
     * // Set the log level to "debug" and parse
     * var html = '<table>...</table><table>...</table>';
     * var parsedTables = cf_HTMLTableParser.setLogLevel('debug').parseHTMLTables(html);
     */
    function setLogLevel(strLevel) {
        // String() also copes with null/undefined/non-string input; those simply
        // fail the lookup below.
        var level = String(strLevel).toLowerCase();
        if (Object.prototype.hasOwnProperty.call(VALID_LEVELS, level)) {
            logger.setLevel(VALID_LEVELS[level]);
        }
        return api;
    }

    /**
     * Builds the regular expression used to select tables by their header row.
     *
     * The expression is never global: a global RegExp keeps state in lastIndex
     * between test() calls, which would make consecutive matching tables be skipped.
     *
     * @param {string|RegExp} [identifier] - The caller supplied filter.
     * @returns {RegExp|null} The filter expression, or null when every table should match.
     * @private
     */
    function _buildIdentRegExp(identifier) {
        if (identifier instanceof RegExp) {
            return new RegExp(identifier.source, identifier.ignoreCase ? "i" : "");
        }
        if (!identifier) {
            return null;
        }

        var pattern = String(identifier);
        try {
            return new RegExp(pattern);
        } catch (e) {
            // Not a valid pattern (e.g. an unbalanced parenthesis): match the text literally.
            logger.logWarning("Identifier is not a valid regular expression, matching literally: " + pattern);
            return new RegExp(pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
        }
    }

    /**
     * Returns the HTML of every header/data cell (th or td element) of a row.
     *
     * @param {string} rowHtml - The HTML string representing one table row.
     * @returns {Array} The matched cell HTML strings; empty when the row has no cells.
     * @private
     */
    function _matchCells(rowHtml) {
        // \b keeps "<th" from matching "<thead".
        return rowHtml.match(/<t[hd]\b[\s\S]*?<\/t[hd]>/gi) || [];
    }

    /**
     * Processes a single table and returns an array of objects representing the table rows.
     *
     * @param {string} table - The HTML string representing the table.
     * @param {RegExp} [identRegExp] - Expression the header row HTML must match for the
     *     table to be processed. When omitted every table is processed.
     * @returns {Array|null} The row objects, or null when the table has no rows, has no
     *     header cells, or does not match identRegExp.
     * @private
     * @example
     * // Process a table and get the parsed rows
     * var table = '<table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Value 1</td><td>Value 2</td></tr></table>';
     * var identRegExp = /Header 1/;
     * var parsedRows = _processTable(table, identRegExp);
     */
    function _processTable(table, identRegExp) {
        logger.logDebug("Table: " + table);

        var arrRows = table.match(/<tr\b[\s\S]*?<\/tr>/gi); // all rows (tr elements) of the table
        if (!arrRows) { // fail early if no rows found
            logger.logDebug("No Rows Found");
            return null;
        }

        var arrHeaders = _getHeaders(arrRows[0]); // row 0 is assumed to be the header row
        if (arrHeaders.length === 0) {
            logger.logError("No Headers Found");
            return null;
        }
        logger.logDebug("Headers:" + arrHeaders);

        if (identRegExp) {
            identRegExp.lastIndex = 0; // defensive: a global expression would otherwise carry state
            if (!identRegExp.test(arrRows[0])) {
                // Expected outcome when filtering, so not reported as an error.
                logger.logDebug("Ident not found");
                return null;
            }
        }

        var arrTable = [];
        for (var i = 1; i < arrRows.length; i++) { // start from 1 to skip the header row
            var arrRowData = _matchCells(arrRows[i]);
            if (arrRowData.length === 0) {
                continue; // row without cells, nothing to map
            }
            var objRow = _processRow(arrRowData, arrHeaders);
            if (objRow) {
                arrTable.push(objRow);
            }
        }

        return arrTable;
    }

    /**
     * Extracts the headers from the header row of a table.
     *
     * Both th and td cells are accepted so that tables whose first row is built from
     * td elements still yield one header per column.
     *
     * @param {string} headerRow - The HTML string representing the header row.
     * @returns {Array} The trimmed header texts with markup removed; empty when the row
     *     contains no cells.
     * @private
     * @example
     * // Extract headers from a header row
     * var headerRow = '<tr><th>Header 1</th><th>Header 2</th></tr>';
     * var headers = _getHeaders(headerRow);
     */
    function _getHeaders(headerRow) {
        return _matchCells(headerRow).map(function (header) {
            return header.replace(/<\/?[^>]+>/g, '').trim(); // strip HTML to get clean header text
        });
    }

    /**
     * Processes a single row of a table and returns an object representing the row data.
     *
     * @param {Array} arrRowData - An array of HTML strings representing the table cells.
     * @param {Array} arrHeaders - An array of header strings.
     * @returns {Object|null} The row data keyed by header text, or null when no cell
     *     could be mapped. Cells beyond the number of headers are dropped.
     * @private
     * @example
     * // Process a row and get the row data
     * var arrRowData = ['<td>Value 1</td>', '<td>Value 2</td>'];
     * var arrHeaders = ['Header 1', 'Header 2'];
     * var rowData = _processRow(arrRowData, arrHeaders);
     */
    function _processRow(arrRowData, arrHeaders) {
        var objRow = {};

        (arrRowData || []).forEach(function (strRowItem, idx) {
            var strText = _htmlToStr(strRowItem); // strip HTML and get text
            logger.logDebug("Row Item: " + strText);
            if (idx < arrHeaders.length) {
                objRow[arrHeaders[idx]] = strText; // map text to corresponding header as key
            }
        });

        logger.logDebug(JSON.stringify(objRow));
        return Object.keys(objRow).length === 0 ? null : objRow;
    }

    /**
     * Decodes a run of percent escapes (e.g. "%C3%A9"); returns the run unchanged
     * when it is not valid UTF-8, so ordinary text containing "%" never throws.
     *
     * @param {string} run - One or more consecutive %XX escapes.
     * @returns {string} The decoded text, or the input when it cannot be decoded.
     * @private
     */
    function _decodePercentRun(run) {
        try {
            return decodeURIComponent(run);
        } catch (e) {
            return run;
        }
    }

    /**
     * Replacement callback that decodes one character reference.
     *
     * @param {string} match - The complete reference as found in the text.
     * @param {string} legacyName - nbsp/amp/quot/lt/gt (trailing ";" optional), if matched.
     * @param {string} strictName - "apos", "#NNN" or "#xHHH" (";" required), if matched.
     * @returns {string} The decoded character, or the original text when the reference
     *     is unknown or out of range.
     * @private
     */
    function _decodeEntity(match, legacyName, strictName) {
        var name = legacyName || strictName;
        if (name.charAt(0) !== '#') {
            return Object.prototype.hasOwnProperty.call(NAMED_ENTITIES, name) ? NAMED_ENTITIES[name] : match;
        }

        var isHex = name.charAt(1) === 'x' || name.charAt(1) === 'X';
        var codePoint = isHex ? parseInt(name.slice(2), 16) : parseInt(name.slice(1), 10);
        if (!(codePoint > 0 && codePoint <= 0x10FFFF)) {
            return match;
        }
        if (codePoint <= 0xFFFF) {
            return String.fromCharCode(codePoint);
        }
        // Characters outside the BMP need a surrogate pair.
        var offset = codePoint - 0x10000;
        return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
    }

    /**
     * Converts an HTML string to plain text by removing HTML tags and decoding
     * percent escapes and HTML character references.
     *
     * Steps, in order: br elements become newlines; a newline is inserted after every
     * closing tr/div/p that is followed by more content; tags are removed; valid
     * percent-escape runs are decoded once; character references (nbsp, amp, quot, lt,
     * gt, apos and numeric forms) are decoded. Text that merely contains a "%" is left
     * untouched. The result is not trimmed.
     *
     * @param {string} html - The HTML string to convert.
     * @returns {string} The plain text string.
     * @private
     * @example
     * // Convert an HTML string to plain text
     * var htmlString = '<p>This is <b>bold</b> text.</p>';
     * var plainText = _htmlToStr(htmlString);
     */
    function _htmlToStr(html) {
        return String(html)
            .replace(/<br\s*\/?>/gi, "\n")
            .replace(/(<\/(?:tr|div|p)>)\s*([^\n])/gi, "$1\n$2")
            .replace(/(<\/td>)\s*\n/gi, "$1")
            .replace(/<[^>]+>/g, '')
            .replace(/(?:%[0-9a-fA-F]{2})+/g, _decodePercentRun)
            // Entities are decoded last so that "&lt;b&gt;" is not mistaken for a tag.
            .replace(/&(?:(nbsp|amp|quot|lt|gt);?|(apos|#\d+|#[xX][0-9a-fA-F]+);)/g, _decodeEntity);
    }

    return api;

})();