Skip to content

Instantly share code, notes, and snippets.

@chrisheseltine
Created August 31, 2018 14:34
Show Gist options
  • Save chrisheseltine/cbcf80ef15d9e54107ba31c61ab8ca09 to your computer and use it in GitHub Desktop.
Save chrisheseltine/cbcf80ef15d9e54107ba31c61ab8ca09 to your computer and use it in GitHub Desktop.
const Apify = require('apify');
Apify.main(async () => {
// Open the request queue and seed it with one roster (listing) page per
// animal type. Both seed requests carry the "roster" label so the page
// handler knows to harvest links from them rather than extract data.
const requestQueue = await Apify.openRequestQueue();
const buildRosterUrl = (animalType) => `https://petharbor.com/results.asp?searchtype=ADOPT&start=1&miles=20&shelterlist=%27HAMP%27&zip=&where=type_${animalType}&nosuccess=1&nomax=1&rows=25&nobreedreq=1&nopod=1&nocustom=1&samaritans=1&view=sysadm.v_hamp&imgres=detail&stylesheet=https://cbbb1e2ef05c549bf4c2-7b792f487d9839572907a6863bac8ad2.ssl.cf5.rackcdn.com/petharbor.css&grid=1&NewOrderBy=Name&text=000000&link=007c0f&col_bg=ffffff`;
// Cats are queued before dogs, one request at a time.
for (const animalType of ['CAT', 'DOG']) {
  const rosterRequest = new Apify.Request({
    url: buildRosterUrl(animalType),
    userData: { label: "roster" },
  });
  await requestQueue.addRequest(rosterRequest);
}
/**
 * Collects the links found inside `.GridResultsContainer` on a roster page
 * and enqueues each one with the "data" label, so the page handler can
 * distinguish data pages from roster pages.
 *
 * @param {object} page - Page object exposing `$$eval` (the Puppeteer page
 *   handed to the crawler's page handler).
 * @returns {Promise<void>} Resolves once every link has been added to the queue.
 */
async function addDataLinksToQueue(page) {
  // $$eval hands the callback the whole array of matched elements, so each
  // href has to be mapped out individually. Anchors without an href yield an
  // empty string, which cannot be used as a request URL, so they are dropped.
  const links = await page.$$eval(
    '.GridResultsContainer a',
    (anchors) => anchors.map((anchor) => anchor.href).filter(Boolean),
  );
  console.log(`links: ${links}`);
  // Enqueue sequentially, one request per collected link.
  for (const url of links) {
    await requestQueue.addRequest(new Apify.Request({
      url,
      userData: {
        label: "data",
      },
    }));
    console.log('dataLinkQueued');
  }
}
/**
 * Reads the `innerText` of the element matched by the `font` selector on the
 * given page and wraps it in a result record.
 *
 * @param {object} page - Page object exposing `$eval`.
 * @returns {Promise<{id: *}>} Object whose `id` is the text that was read.
 */
async function extractData(page) {
  const readInnerText = (node) => node.innerText;
  return { id: await page.$eval('font', readInnerText) };
}
// Crawler that drains the request queue. Each request is routed by the label
// stored in its userData: "roster" pages feed more links into the queue,
// "data" pages are scraped and their result is pushed to the dataset.
// Requests carrying any other label are ignored.
const crawler = new Apify.PuppeteerCrawler({
  requestQueue,
  handlePageFunction: async ({ page, request }) => {
    const { label } = request.userData;
    if (label === "roster") {
      // add data links to the queue on roster pages
      await addDataLinksToQueue(page);
    } else if (label === "data") {
      // when reaching data pages extract the data
      const data = await extractData(page);
      // Serialize explicitly: interpolating an object directly would only
      // print "[object Object]".
      console.log(`got data: ${JSON.stringify(data)}`);
      await Apify.pushData(data);
    }
  },
});
// Await the run so the enclosing async function does not resolve before the
// crawl has finished, and so a crawl failure propagates to the caller.
await crawler.run();
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment