Created
January 18, 2022 00:31
-
-
Save fredriccliver/4af6070bf66abdfa870b0adb7c408bd5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* | |
* Usage | |
* | |
* node getRemote.js [target url] | |
* | |
*/ | |
// import { readSync } from "to-vfile"; | |
import { toString } from "nlcst-to-string"; | |
import { retext } from "retext"; | |
import retextPos from "retext-pos"; | |
import retextKeywords from "retext-keywords"; | |
import keyword_extractor from "keyword-extractor"; | |
// import fetch from "node-fetch"; | |
import { JSDOM } from "jsdom"; | |
import readability from "node-readability"; | |
import fs from "fs"; | |
// Page to analyse: the first CLI argument, or a sample article when none is given.
// `||` (not `??`) is deliberate so that an empty-string argument also falls back.
const DEFAULT_TARGET_URL = "https://www.creatrip.com/en/blog/1491";
const targetUrl = process.argv[2] || DEFAULT_TARGET_URL;
/* | |
Result example of the default URL | |
---- Keywords ---- | |
exchange (SCORE:1 WEIGHT:undefined) | |
rates (SCORE:0.75 WEIGHT:undefined) | |
currency (SCORE:0.29 WEIGHT:undefined) | |
Myeongdong (SCORE:0.29 WEIGHT:undefined) | |
bank (SCORE:0.25 WEIGHT:undefined) | |
money (SCORE:0.25 WEIGHT:undefined) | |
counters (SCORE:0.25 WEIGHT:undefined) | |
---- Key-phrases ---- | |
exchange counters (SCORE:1 WEIGHT:29) | |
exchange rate (SCORE:0.62 WEIGHT:41) | |
commission rate (SCORE:0.33 WEIGHT:22) | |
rates (SCORE:0.26 WEIGHT:17) | |
bank (SCORE:0.17 WEIGHT:5) | |
*/ | |
// SIMPLE FETCH VERSION CODE | |
// fetch(targetUrl) | |
// .then((response) => response.text()) | |
// .then((text) => { | |
// const dom = new JSDOM(text); | |
// const textContent = Array.from(dom.window.document.querySelectorAll("p")) | |
// .map((e) => getInnerText(e)) | |
// .join(" "); | |
// // print paragraphs | |
// // console.log(textContent); | |
// extractKeywords(textContent); | |
// }); | |
console.log(targetUrl);

// Fetch the target page and reduce its raw HTML to a simplified, article-only
// HTML document, then extract keywords from the paragraphs of that document.
readability(targetUrl, (err, article) => {
  if (err != null || article == null) {
    // Always report a real error object, never a bare `null`.
    console.error(
      err != null
        ? err
        : new Error(`No article could be extracted from ${targetUrl}`)
    );
    return;
  }

  try {
    // NOTE(review): node-readability documents `content` as `false` when
    // extraction fails — confirm against the installed version.
    if (!article.content) {
      console.error(new Error(`No readable content found at ${targetUrl}`));
      return;
    }

    // Flatten the simplified HTML to plain text, one chunk per <p> element.
    const dom = new JSDOM(article.content);
    const textContent = Array.from(
      dom.window.document.querySelectorAll("p"),
      (paragraph) => paragraph.textContent
    ).join(" ");

    saveLatestDocument(article, textContent);

    // Extract keywords from the converted plain text.
    extractKeywords(textContent, "retext");
  } finally {
    // Close the article to clean up jsdom and prevent leaks, even when one of
    // the steps above throws.
    article.close();
  }
});
/**
 * Extracts keywords and key-phrases from a long text and prints them to stdout.
 *
 * Two algorithms are available:
 * - "retext": runs the retext pipeline (retext-pos + retext-keywords) and
 *   prints one line per keyword and per key-phrase with its score and weight.
 * - "ke": runs keyword-extractor and prints the resulting array.
 *
 * @param {string} p - The phrase / long text to analyse.
 * @param {string} mode - Either "retext" or "ke"; any other value only logs a warning.
 * @returns {Promise<void>} Settles once the output has been printed. Failures of
 *   the retext pipeline are logged with `console.error`, not rethrown.
 */
function extractKeywords(p, mode) {
  // Truncate (not round) a score to two decimals for display.
  const formatScore = (score) => Math.floor(score * 100) / 100;

  if (mode === "retext") {
    return retext()
      .use(retextPos) // Make sure to use `retext-pos` before `retext-keywords`.
      .use(retextKeywords, { maximum: 5 })
      .process(p)
      .then((file) => {
        console.log("---- Keywords ----");
        for (const keyword of file.data.keywords) {
          // The sample output in this file shows WEIGHT:undefined for keywords;
          // the field is still printed so the output format stays unchanged.
          console.log(
            `${toString(keyword.matches[0].node)} (SCORE:${formatScore(
              keyword.score
            )} WEIGHT:${keyword.weight})`
          );
        }

        console.log("---- Key-phrases ----");
        for (const phrase of file.data.keyphrases) {
          // A phrase match spans several nodes; join them back into its text.
          const phraseText = phrase.matches[0].nodes
            .map((node) => toString(node))
            .join("");
          console.log(
            `${phraseText} (SCORE:${formatScore(phrase.score)} WEIGHT:${
              phrase.weight
            })`
          );
        }
      })
      .catch((err) => {
        // Without this handler a pipeline failure is an unhandled rejection.
        console.error(err);
      });
  }

  if (mode === "ke") {
    const extractionResult = keyword_extractor.extract(p, {
      language: "english",
      remove_digits: true,
      return_changed_case: true,
      remove_duplicates: false,
      return_max_ngrams: 5,
    });
    console.log(extractionResult);
    return Promise.resolve();
  }

  console.warn(`Unknown keyword extraction mode: ${mode}`);
  return Promise.resolve();
}
/**
 * Saves the most recently processed page under `./latestPage/` for logging and
 * debugging purposes:
 * - `content.html` receives `article.content` (the simplified article HTML);
 * - `textContent.html` receives the extracted plain text.
 *
 * The directory is created when missing. Errors are logged with
 * `console.error` and never thrown, so a failed save does not interrupt the caller.
 *
 * @param {{ content: string }} article - Object whose `content` is written to disk.
 * @param {string} textContent - Plain text extracted from the article.
 * @returns {Promise<void>} Settles once both writes have finished (or failed).
 */
function saveLatestDocument(article, textContent) {
  const outputDir = "./latestPage";

  // Read `article.content` eagerly: the caller closes the article right after
  // this call returns, while the writes below complete later.
  const outputs = [
    [`${outputDir}/content.html`, article.content],
    [`${outputDir}/textContent.html`, textContent],
  ];

  // Each write reports its own failure so one bad file does not hide the other.
  const writeOne = ([filePath, data]) =>
    fs.promises.writeFile(filePath, data).catch((err) => {
      console.error(err);
    });

  return fs.promises
    .mkdir(outputDir, { recursive: true })
    .then(() => Promise.all(outputs.map(writeOne)))
    .then(
      () => undefined,
      (err) => {
        console.error(err);
      }
    );
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment