Skip to content

Instantly share code, notes, and snippets.

@huttj
Last active July 28, 2019 16:21
Show Gist options
  • Save huttj/db959201697a7e34e745efbcc8c7348c to your computer and use it in GitHub Desktop.
Scrape a site and capture all of the content (and images)
const puppeteer = require('puppeteer'); // v 1.1.0
const { URL } = require('url');
const fse = require('fs-extra'); // v 5.0.0
const path = require('path');
// Substrings of known/benign errors that are silently ignored when saving responses.
const NO_RESPONSE_ERROR_MESSAGE = 'No data found for resource with given identifier';
const NAME_TOO_LONG_ERROR_MESSAGE = 'ENAMETOOLONG';
// Used both as the navigation timeout and as the idle grace period between pages.
const THIRTY_SECONDS_MS = 1000 * 30;
// URLs that have already been navigated to.
const visited = new Set();
// Every URL ever enqueued — dedupe guard for `queue`.
const queued = new Set();
// FIFO of URLs still to crawl.
const queue = [];
// Hostnames we are allowed to crawl (seed host plus www/non-www variant).
const whitelist = new Set();
// URL currently being loaded; compared in goToNextPage to detect a stalled crawl.
let currentUrl = null;
// In-flight navigation promise; awaited so a loading page is never cut off.
let currentTask = Promise.resolve();
/**
 * Entry point: crawl `urlToFetch` and mirror every response under ./output.
 * Side effects: deletes ./output, launches a headless browser, and writes
 * ./data.json progress snapshots as pages load.
 */
async function start(urlToFetch) {
  // Restrict the crawl to the seed URL's host (and its www/non-www twin).
  addToWhitelist(urlToFetch);
  // Start every run from a clean mirror directory.
  await fse.remove('./output');

  /* 1 */
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  /* 2 */
  // Persist each network response to disk as it arrives.
  page.on('response', async (response) => {
    logFile(response);
    const filePath = getFilePath(response);
    try {
      const data = await getData(response);
      await fse.outputFile(filePath, data);
    } catch (e) {
      // Expected failures: over-long file names, and response bodies the
      // browser has already evicted. Anything else is worth a warning.
      if (e.message.includes(NAME_TOO_LONG_ERROR_MESSAGE) || e.message.includes(NO_RESPONSE_ERROR_MESSAGE)) {
        // Do nothing
      } else {
        console.warn(e.message);
      }
    }
  });

  page.on('load', async () => {
    // Don't cut off a page that's loading
    await currentTask;
    console.log(`Loaded page: ${page.url()}`);
    try {
      // Snapshot crawl progress so a run can be inspected after the fact.
      // FIX: this write was previously a floating promise — failures were
      // silently dropped.
      await fse.outputFile('./data.json', JSON.stringify({
        visited: Array.from(visited),
        queue,
      }, null, 2));
    } catch (e) {
      console.warn(`Failed to write data.json: ${e.message}`);
    }
    await enqueueLinks(page);
    // FIX: surface rejections instead of leaving the promise floating.
    goToNextPage(page).catch((e) => console.warn(e.message));
  });

  /* 3 */
  await page.goto(urlToFetch, {
    waitUntil: 'networkidle2'
  });

  // /* 4 */
  // setTimeout(async () => browser.close(), 60000 * 4);
}
/**
 * Whitelist the URL's hostname plus its www/non-www twin, so internal links
 * written either way are still crawled.
 */
function addToWhitelist(url) {
  const { hostname } = new URL(url);
  whitelist.add(hostname);
  if (hostname.startsWith('www.')) {
    // FIX: seeding with a www host previously never whitelisted the bare
    // host, so links to the non-www form were silently skipped.
    whitelist.add(hostname.slice(4));
  } else {
    // (startsWith('www.') also avoids the old /^www/ false match on
    // hostnames like "wwwfoo.com".)
    whitelist.add('www.' + hostname);
  }
}
/**
 * Debug helper: describe the resource being captured. The log line itself is
 * intentionally disabled; uncomment to trace every response.
 */
function logFile(response) {
  // FIX: use the public puppeteer API instead of the private
  // `response._request._resourceType` fields.
  const type = response.request().resourceType();
  const url = response.url();
  // console.log(`Loading ${type}: ${url}`);
}
/**
 * Map a response URL onto a path under ./output that mirrors its pathname.
 * Extension-less paths (pages/directories) are stored as <path>/index.html
 * so the mirror is browsable from disk.
 * NOTE(review): the query string is ignored, so URLs differing only in
 * `?query` overwrite each other — confirm that's acceptable for this site.
 */
function getFilePath(response) {
  const { pathname } = new URL(response.url());
  const target = path.resolve(`./output${pathname}`);
  const hasExtension = path.extname(pathname).trim() !== '';
  return hasExtension ? target : `${target}/index.html`;
}
/**
 * Enqueue a URL for crawling if its host is whitelisted and it has not been
 * queued before. Returns true when the URL was actually added.
 * Throws (TypeError) if rawUrl is not a valid absolute URL — callers only
 * pass values already parsed or matched as http(s) URLs.
 */
function addToQueue(rawUrl) {
  const url = new URL(rawUrl);
  // FIX: strip the fragment before deduping/queueing — previously
  // `/page#a` and `/page#b` were treated as distinct pages and the same
  // document was crawled once per anchor.
  url.hash = '';
  const normalized = url.toString();
  if (queued.has(normalized) || !whitelist.has(url.hostname)) {
    return false;
  }
  queued.add(normalized);
  queue.push(normalized);
  return true;
}
/**
 * Produce the bytes to write to disk for a response. HTML documents get
 * absolute same-site URLs rewritten to root-relative ones (so the mirror
 * links within ./output) and have their links queued; every other resource
 * type is returned as its raw buffer.
 */
async function getData(response) {
  if (response._request._resourceType === 'document') {
    const data = await response.text();
    return data
      .replace(/href="(https?:\/\/[^"]+)"/g, (_, capture) => {
        const url = new URL(capture);
        if (whitelist.has(url.hostname)) {
          addToQueue(capture);
          return `href="${url.pathname}"`;
        }
        // Off-site link: leave it absolute.
        return `href="${capture}"`;
      })
      .replace(/src="(https?:\/\/[^"]+)"/g, (_, capture) => {
        const url = new URL(capture);
        return `src="${url.pathname}"`;
      })
      .replace(/image="(https?:\/\/[^"]+)"/g, (_, capture) => {
        const url = new URL(capture);
        return `image="${url.pathname}"`;
      })
      .replace(/poster="(https?:\/\/[^"]+)"/g, (_, capture) => {
        const url = new URL(capture);
        return `poster="${url.pathname}"`;
      })
      .replace(/content="(https?:\/\/[^"]+)"/g, (_, capture) => {
        const url = new URL(capture);
        return `content="${url.pathname}"`;
      })
      .replace(/"image":"(https?:\/\/[^"]+)"/g, (_, capture) => {
        const url = new URL(capture);
        // FIX: keep the JSON key quoted — the old replacement emitted
        // `image:"…"`, corrupting embedded JSON (e.g. JSON-LD blocks).
        return `"image":"${url.pathname}"`;
      })
      .replace(/"url":"(https?:\\\/\\\/[^"]+)"/g, (_, capture) => {
        // JSON-escaped URL (\/) — unescape, queue, then re-escape the path.
        const rawUrl = capture.replace(/\\\//g, '/');
        addToQueue(rawUrl);
        const url = new URL(rawUrl);
        // FIX: keep the JSON key quoted (was emitted as `url:"…"`).
        return `"url":"${url.pathname.replace(/\//g, '\\/')}"`;
      })
      .replace(/background-image:\s*url\(['"]([^'"]+)['"]\)/g, (match, capture) => {
        // FIX: this capture is not anchored to http(s) — a relative path
        // made `new URL` throw, which aborted the entire rewrite and the
        // document was never saved. Leave non-absolute URLs untouched.
        if (!/^https?:\/\//.test(capture)) return match;
        const url = new URL(capture);
        return `background-image: url('${url.pathname}')`;
      });
  }
  return response.buffer();
}
/**
 * Harvest every element's `href` from the current page and queue the
 * whitelisted ones. No-op for non-HTML documents (images, PDFs, ...).
 */
async function enqueueLinks(page) {
  // Only HTML can contain links worth following.
  const contentType = await page.evaluate(() => document.contentType);
  if (contentType !== 'text/html') return;

  // Runs in the page context: collect the href of anything that has one
  // (anchors, <link> tags, area maps, ...).
  const links = await page.evaluate(() => {
    const found = [];
    for (const el of document.querySelectorAll('*')) {
      if (el.href) found.push(el.href);
    }
    return found;
  });

  let count = 0;
  for (const link of links) {
    if (addToQueue(link)) count += 1;
  }
  console.log(`Added ${count} urls`);
}
/**
 * Pop URLs off the queue until one that hasn't been visited turns up.
 * Returns null (or undefined on an empty queue) when nothing is left.
 */
function getNextUnvisited() {
  let candidate;
  while (queue.length && (!candidate || visited.has(candidate))) {
    candidate = queue.shift();
  }
  if (visited.has(candidate)) return null;
  return candidate;
}
/** Promise-based delay: resolves (with undefined) after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Navigate with a hard cap: if the navigation hasn't settled within
 * THIRTY_SECONDS_MS we log a timeout and let the crawl move on.
 * NOTE(review): page.goto can resolve null (e.g. same-page navigation),
 * which this reports as a timeout — confirm that's acceptable.
 */
async function goToWithTimeout(page, next) {
  try {
    const winner = await Promise.race([
      page.goto(next, { waitUntil: 'networkidle0' }),
      sleep(THIRTY_SECONDS_MS),
    ]);
    const suffix = winner ? '' : ' - timed out';
    console.log(`Done with ${next}${suffix}`);
  } catch (err) {
    console.warn(`Failed ${next}: ${err.message}`);
  }
}
// Advance the crawl to the next unvisited queued URL. Because Puppeteer does
// not reliably fire 'load' when a page is hit a second time (see the author's
// note at the bottom of this gist), this also waits a fixed grace period and
// self-recurses if nothing else advanced the crawl in the meantime.
async function goToNextPage(page) {
const next = getNextUnvisited();
if (next) {
// Mark as visited before navigating so a failing page is never retried forever.
visited.add(next);
currentUrl = next;
currentTask = goToWithTimeout(page, next);
await currentTask;
// Nothing else got queued up
await sleep(THIRTY_SECONDS_MS);
// If currentUrl is still ours after the grace period, no 'load' handler
// took over — keep the crawl moving ourselves.
if (currentUrl === next) {
goToNextPage(page);
}
} else {
// Queue exhausted (or everything left was already visited).
console.log('Done...?');
}
}
@huttj
Copy link
Author

huttj commented Jul 28, 2019

Apparently, Puppeteer won't fire a load event if you hit a page a second time. So, rather than try to figure out the right way to do it, I just added a timeout to goToNextPage(). Not great, but it's working well.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment