Skip to content

Instantly share code, notes, and snippets.

@paulgrammer
Last active November 21, 2023 09:18
Show Gist options
  • Save paulgrammer/dd63c4e2f7a9933be7d2a85a2ccf2154 to your computer and use it in GitHub Desktop.
Save paulgrammer/dd63c4e2f7a9933be7d2a85a2ccf2154 to your computer and use it in GitHub Desktop.
const axios = require("axios");
const cheerio = require("cheerio");
const fs = require("fs");
/**
 * Fetch a list of phrases from the Glosbe word-list endpoint.
 *
 * Sends `q` together with the fixed parameters `l1: "lg"`, `l2: "en"`,
 * `after: 30`, `before: 0` and `env: "en"`.
 *
 * @param {string} q - Query string the word list is requested for.
 * @returns {Promise<string[]>} The `phrase` field of every entry in the
 *   response's `after` array. Resolves to an empty array when the request
 *   fails or the payload does not have that shape; never rejects.
 */
async function getWords(q) {
  console.log("Requesting words");
  try {
    const response = await axios.get("https://iapi.glosbe.com/iapi3/wordlist", {
      params: {
        l1: "lg",
        l2: "en",
        q,
        after: 30,
        before: 0,
        env: "en",
      },
    });
    return response.data.after.map(({ phrase }) => phrase);
  } catch (err) {
    // Stay best-effort (callers rely on always getting an array), but report
    // the failure instead of hiding it, the same way getCorpus does.
    console.log(err.message);
    return [];
  }
}
/**
 * Download one page of example sentences for a phrase and extract both sides.
 *
 * Requests `https://glosbe.com/lg/en/${q}/fragment/tmem` with the given page
 * number, then reads every `.tmem__item` element: the inner HTML of its
 * `div[lang=lg]` child goes into `translations`, the inner HTML of its
 * `.relative` child goes into `text`.
 *
 * @param {string} q - Phrase placed verbatim into the URL path; the caller is
 *   responsible for any escaping.
 * @param {number} page - Page number sent as the `page` query parameter.
 * @returns {Promise<{text: string[], translations: string[]} | "no-more" | "errors">}
 *   The cleaned examples of the page (both arrays have the same length and
 *   are index-aligned), `"no-more"` when the body contains
 *   "No examples found", or `"errors"` when the request or the parsing
 *   throws (the message is logged). Never rejects.
 */
async function getCorpus(q, page) {
  // Strips the markup and markers that are not part of the sentence itself.
  // NOTE(review): `.html()` output is presumably entity-encoded, in which case
  // "&amp;" would become "andamp;" here - confirm against real responses.
  const format = (html) =>
    html
      .replace(/&/g, "and")
      .replace(/<strong class="keyword">/gi, "")
      .replace(/<\/strong>/gi, "")
      .replace(/<div>/gi, "")
      .replace(/<\/div>/gi, "")
      .replace(/\+/g, "")
      .replace(/\*/g, "")
      .replace(/• /gi, "")
      .trim();

  try {
    const response = await axios.get(
      `https://glosbe.com/lg/en/${q}/fragment/tmem`,
      {
        params: {
          page,
          mode: "MUST",
          stem: false,
          includedAuthors: "",
          excludedAuthors: "",
        },
      }
    );
    console.log(response.request.res.responseUrl);
    if (response.data.includes("No examples found")) {
      return "no-more";
    }

    const $ = cheerio.load(response.data);
    const text = [];
    const translations = [];
    $(".tmem__item").each((_, node) => {
      const lg = $(node).find("div[lang=lg]").html();
      const en = $(node).find(".relative").html();
      // An item missing either side cannot form a pair. Skipping it keeps the
      // two lists aligned and avoids throwing away the whole page (and every
      // following page) because of a single incomplete entry.
      if (lg == null || en == null) {
        return;
      }
      text.push(format(en));
      translations.push(format(lg));
    });
    return { text, translations };
  } catch (err) {
    console.log(err.message);
    return "errors";
  }
}
/**
 * Collect the examples of every result page for a phrase.
 *
 * Calls `getCorpus(phrase, page)` for page 1, 2, 3, ... one request at a time
 * and concatenates the results in page order.
 *
 * Paging stops when `getCorpus` returns one of its sentinels (`"no-more"` or
 * `"errors"`) or when a page contains no examples at all; whatever was
 * gathered up to that point is returned.
 *
 * @param {string} phrase - Phrase forwarded unchanged to `getCorpus`.
 * @returns {Promise<{text: string[], translations: string[]}>} All collected
 *   examples. Rejects if `getCorpus` itself rejects or returns a value of an
 *   unexpected shape, instead of staying pending forever.
 */
async function fetchCorpus(phrase) {
  const text = [];
  const translations = [];

  for (let page = 1; ; page++) {
    const result = await getCorpus(phrase, page);
    if (result === "no-more" || result === "errors") {
      break;
    }
    // A page without any example means there is nothing further to read;
    // without this guard such a response would be requested page after page
    // without end.
    if (result.text.length === 0 && result.translations.length === 0) {
      break;
    }
    text.push(...result.text);
    translations.push(...result.translations);
  }

  return { text, translations };
}
/**
 * Build the word list by querying `getWords` once for each letter a-z.
 *
 * Requests are issued one after another, in alphabetical order.
 *
 * @returns {Promise<string[]>} Every distinct word returned by `getWords`, in
 *   order of first appearance. Rejects if `getWords` rejects, instead of
 *   staying pending forever.
 */
async function fetchWords() {
  const unique = new Set();
  for (const letter of "abcdefghijklmnopqrstuvwxyz") {
    const words = await getWords(letter);
    for (const word of words) {
      unique.add(word);
    }
  }
  return [...unique];
}
/**
 * Read the entries recorded in `./done.txt` (one entry per line).
 *
 * @returns {string[]} The non-empty lines of the file, or an empty array when
 *   the file does not exist yet (for example on the very first run).
 */
function getDone() {
  if (!fs.existsSync("./done.txt")) {
    return [];
  }
  return fs
    .readFileSync("./done.txt", "utf8")
    .split(/\r?\n/) // tolerate CRLF so entries never carry a trailing "\r"
    .filter((line) => line !== "");
}
/**
 * Record an entry in `./done.txt`.
 *
 * The entry is appended as a new line rather than re-reading and rewriting
 * the whole file on every call. The file is created when it does not exist.
 * For a file that already has content the resulting bytes are the existing
 * content followed by "\n" and the entry.
 *
 * @param {string} word - Entry to record.
 * @returns {void}
 */
function setDone(word) {
  const file = "./done.txt";
  // Only separate with a newline when there is something to separate from,
  // so a fresh file does not start with a blank line.
  const hasEntries = fs.existsSync(file) && fs.statSync(file).size > 0;
  fs.appendFileSync(file, hasEntries ? `\n${word}` : word);
}
/**
 * Build the parallel corpus files for a list of words.
 *
 * The words come from `./${input}.txt` (one per line) when that file exists,
 * otherwise from `fetchWords()`. Words are processed one at a time:
 *
 * 1. The word's id is the word with every whitespace character replaced by
 *    "_". Words whose id is already returned by `getDone()` are skipped.
 * 2. The examples are fetched with `fetchCorpus`, passing the word
 *    percent-encoded because it ends up inside a URL path.
 * 3. When both sides are non-empty they are written, one example per line, to
 *    `./${input}/data.corpus.www.glosbe.com.jw2019.${id}.en.txt` and
 *    `./${input}/data.corpus.www.glosbe.com.jw2019.${id}.lg.txt`.
 * 4. The id is recorded with `setDone`, whether or not files were written.
 *
 * @param {string} [input="words"] - Base name of the word-list file and name
 *   of the output directory.
 * @returns {Promise<void>} Resolves once every word has been processed and
 *   rejects on the first failure.
 */
async function main(input = "words") {
  let words;
  if (fs.existsSync(`./${input}.txt`)) {
    words = fs.readFileSync(`./${input}.txt`, "utf8").split(/\r?\n/);
  } else {
    words = await fetchWords();
  }

  // Read the progress file once; ids finished during this run are tracked in
  // memory next to being persisted through setDone.
  const finished = new Set(getDone());

  for (const entry of words) {
    const word = entry.trim();
    // A blank line is not a word; skip it rather than ending the whole run.
    if (!word) {
      continue;
    }

    const id = word.replace(/\s/g, "_");
    if (finished.has(id)) {
      continue;
    }

    // encodeURIComponent covers "?" (as %3F) and every other character that
    // would otherwise change the meaning of the request path.
    const { text, translations } = await fetchCorpus(encodeURIComponent(word));
    if (text.length && translations.length) {
      if (!fs.existsSync(`./${input}`)) {
        fs.mkdirSync(`./${input}`);
      }
      fs.writeFileSync(
        `./${input}/data.corpus.www.glosbe.com.jw2019.${id}.en.txt`,
        text.join("\n")
      );
      fs.writeFileSync(
        `./${input}/data.corpus.www.glosbe.com.jw2019.${id}.lg.txt`,
        translations.join("\n")
      );
    }

    setDone(id);
    finished.add(id);
  }
}
// Entry point: run the scraper for the "continents" input. A failure is
// reported and turned into a non-zero exit code instead of being left as an
// unhandled promise rejection.
main("continents").catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment