Skip to content

Instantly share code, notes, and snippets.

@meelash
Created November 13, 2019 13:15
Show Gist options
  • Save meelash/49b806605de637ad2e0bfecde4825a0d to your computer and use it in GitHub Desktop.
Save meelash/49b806605de637ad2e0bfecde4825a0d to your computer and use it in GitHub Desktop.
const Apify = require('apify');
(async () => {
// URLs the page handler has already processed; used to report repeat visits.
const catchDups = {};
// Site root: the crawl's starting URL and the base for resolving scraped links.
const rootAddress = new URL('https://stores.cosmoprofbeauty.com');
const requestQueue = await Apify.openRequestQueue();
// Seed the queue with the root page, tagged as the top-level list page.
const seedRequest = {
  url: rootAddress.href,
  userData: { requestType: 'listPage' },
};
await requestQueue.addRequest(seedRequest);
/**
 * Enqueues every link found inside a `.map-list-item` element of the current page.
 *
 * Each href is resolved against `rootAddress`, so relative links become absolute URLs.
 * Anchors without an `href` attribute are skipped: resolving a missing href would
 * otherwise enqueue the bogus URL `<root>/undefined`.
 *
 * @param {Function} $ - Cheerio handle for the loaded page.
 * @param {string} requestType - Stored as `userData.requestType` on each new request.
 * @returns {Promise<Array>} Resolves with the `addRequest` results, one per enqueued link.
 */
const enqueueListItems = ($, requestType) => {
  // `.get()` already returns a plain array, so it can be mapped/filtered directly.
  const hrefs = $('.map-list-item a')
    .get()
    .map((elem) => $(elem).attr('href'))
    .filter((href) => href !== undefined);

  return Promise.all(
    hrefs.map((href) =>
      requestQueue.addRequest({
        url: new URL(href, rootAddress).href,
        userData: { requestType },
      })
    )
  );
};
// Crawler that walks the page hierarchy by request type:
// listPage -> statePage -> cityPage -> detailsPage.
const crawler = new Apify.CheerioCrawler({
  requestQueue,
  /**
   * Handles one fetched page, dispatching on `request.userData.requestType`.
   *
   * - `listPage` / `statePage`: enqueue all list-item links as the next level down.
   * - `cityPage`: enqueue only the first `.map-list-item-header` link as a `detailsPage`.
   * - `detailsPage`: nothing further is enqueued.
   *
   * @param {object} context - Crawler-supplied page context.
   * @param {object} context.request - The request being handled.
   * @param {Function} context.$ - Cheerio handle for the loaded page.
   * @throws {Error} When the request carries an unknown `requestType`.
   */
  handlePageFunction: async ({ request, $ }) => {
    // Diagnostic only: report when the same URL reaches the handler more than once.
    if (catchDups[request.url]) {
      console.log('DUPLICATE!!!!!!!');
      console.log(`visited page: ${request.url} ${request.retryCount} times, because ${request.errorMessages}`);
    } else {
      catchDups[request.url] = true;
    }

    switch (request.userData.requestType) {
      case 'listPage':
        await enqueueListItems($, 'statePage');
        break;
      case 'statePage':
        await enqueueListItems($, 'cityPage');
        break;
      case 'cityPage': {
        const detailsHref = $('.map-list-item-header a').first().attr('href');
        // No header link (or a link without href): nothing to enqueue. Resolving an
        // undefined href would otherwise produce the bogus URL `<root>/undefined`.
        if (detailsHref === undefined) {
          break;
        }
        const { wasAlreadyPresent, request: queuedRequest } = await requestQueue.addRequest({
          url: new URL(detailsHref, rootAddress).href,
          userData: { requestType: 'detailsPage' },
        });
        if (wasAlreadyPresent) {
          console.log(`The request with id: ${queuedRequest.url} was already present`);
        }
        break;
      }
      case 'detailsPage':
        break;
      default:
        // Throw a real Error (not a bare string) so the failure carries a stack trace.
        throw new Error('ERROR: Unhandled Page!');
    }
  },
});
await crawler.run();
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment