Last active
January 20, 2023 15:18
-
-
Save yuhui/609466e58a15c447005b30c6b80de3b6 to your computer and use it in GitHub Desktop.
Crawl a page's content DOM tree and set names in each section using a "data-" attribute
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Add a "data-content-section-name" attribute to all page content sections.
 *
 * Page content sections are determined in this order:
 * 1. Wherever there is a DIV with an "id" attribute in the container DIV.
 * 2. DIVs that are immediate children of the container DIV.
 * ... more methods can be added as required.
 *
 * In all cases, when the identified content section's DIV has a "data-content-section-name" attribute already,
 * then that attribute is left alone.
 *
 * If the page has no "body > div.container" element, nothing is done.
 */
const contentSectionNameAttribute = 'data-content-section-name';

// this list of class names is used to identify sections that should not be named
const unimportantSectionPrimaryClasses = [
  // add class names one by one
];

/**
 * Clean up a raw heading/class string so that it can be used as a section name.
 *
 * Keeps only English alphanumeric characters, hyphens, underscores and whitespace,
 * then collapses every run of whitespace (including linebreaks) into a single space
 * and trims leading/trailing whitespace.
 *
 * @param {?string} rawName - the text to clean; null/undefined are treated as ''.
 * @returns {string} the cleaned name, which may be an empty string.
 */
function normalizeContentSectionName(rawName) {
  const text = rawName == null ? '' : String(rawName);
  return text
    .replace(/[^0-9a-z\-_\s]/gi, '')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Pass 1: name every not-yet-named DIV inside the container after its own ID,
 * provided that ID is non-empty and used by exactly one DIV in the whole page.
 *
 * @param {Element} container - the container DIV whose descendants are named.
 * @param {Document} rootDocument - the document used for the page-wide uniqueness check.
 */
function nameSectionsById(container, rootDocument) {
  // Count the IDs once by comparing strings, instead of building a `div#<id>` selector
  // per section: an ID is not necessarily a valid CSS identifier (empty, leading digit,
  // spaces, dots, ...), and such a selector would throw or match the wrong elements.
  const divCountsById = new Map();
  rootDocument.querySelectorAll('div[id]').forEach((div) => {
    divCountsById.set(div.id, (divCountsById.get(div.id) || 0) + 1);
  });

  const sectionsWithIds = container.querySelectorAll(`div[id]:not([${contentSectionNameAttribute}])`);
  sectionsWithIds.forEach((sectionWithId) => {
    const sectionId = sectionWithId.id;
    // some CMSes allow more than one DIV element to have the same ID; only unique IDs are used
    if (sectionId && divCountsById.get(sectionId) === 1) {
      sectionWithId.setAttribute(contentSectionNameAttribute, sectionId);
    }
  });
}

/**
 * Pass 2: name the still-unnamed DIVs (with a "class" attribute) that are immediate
 * children of the container, using their first H1, else first H2, else first class name.
 *
 * A section is skipped when it already contains a named descendant DIV, when its first
 * class is listed in `unimportantSectionPrimaryClasses`, when no candidate yields a
 * non-empty cleaned name, or when the chosen name is already used by another DIV.
 *
 * @param {Element} container - the container DIV whose immediate children are named.
 * @param {Document} rootDocument - the document used for the page-wide duplicate-name check.
 */
function nameSectionsByHeadingOrClass(container, rootDocument) {
  // names already present anywhere in the page; extended below as new names are assigned
  const usedSectionNames = new Set();
  rootDocument.querySelectorAll(`div[${contentSectionNameAttribute}]`).forEach((namedDiv) => {
    usedSectionNames.add(namedDiv.getAttribute(contentSectionNameAttribute));
  });

  const unnamedSections = container.querySelectorAll(`:scope > div[class]:not([${contentSectionNameAttribute}])`);
  unnamedSections.forEach((unnamedSection) => {
    // don't continue if there are already descendant DIVs with the "data-content-section-name" attribute
    if (unnamedSection.querySelector(`div[${contentSectionNameAttribute}]`) !== null) {
      return;
    }

    // get the section's first class, assuming that it is unique enough to be the section's name
    const primaryClass = unnamedSection.classList[0];

    // don't set a name if this section does not need to be named
    if (unimportantSectionPrimaryClasses.includes(primaryClass)) {
      return;
    }

    const h1 = unnamedSection.querySelector('h1');
    const h2 = unnamedSection.querySelector('h2');

    // IMPORTANT! Note the order of determining where to get the section name from.
    // Each candidate is cleaned *before* it is chosen, so a heading that cleans down to
    // nothing (e.g. non-English text) falls through to the next candidate instead of
    // producing an empty section name.
    const candidateNames = [
      h1 ? h1.textContent : '',
      h2 ? h2.textContent : '',
      primaryClass,
    ];
    const contentSectionName = candidateNames
      .map(normalizeContentSectionName)
      .find((candidateName) => candidateName !== '');

    if (!contentSectionName) {
      return;
    }

    // don't continue if there are *any* DIVs with the same section name
    if (usedSectionNames.has(contentSectionName)) {
      return;
    }

    unnamedSection.setAttribute(contentSectionNameAttribute, contentSectionName);
    usedSectionNames.add(contentSectionName);
  });
}

/**
 * Run both naming passes against the "body > div.container" element of the given document.
 *
 * @param {Document} rootDocument - the document whose content sections are to be named.
 */
function setContentSectionNames(rootDocument) {
  const containerDiv = rootDocument.querySelector('body > div.container');
  if (!containerDiv) {
    // no container on this page: nothing to name
    return;
  }

  nameSectionsById(containerDiv, rootDocument);
  nameSectionsByHeadingOrClass(containerDiv, rootDocument);
}

// run immediately when a global document is available
if (typeof document !== 'undefined') {
  setContentSectionNames(document);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment