@meelash
Created September 3, 2019 12:54
const Apify = require('apify');

(async () => {
  const rootAddress = new URL('https://stores.cosmoprofbeauty.com');

  // Seed the queue with the store-locator root page.
  const requestQueue = await Apify.openRequestQueue();
  await requestQueue.addRequest({
    url: rootAddress.href,
    userData: { requestType: 'listPage' },
  });

  // Enqueue every link in the map list on the current page, resolving relative
  // hrefs against the root address and tagging each request with the type of
  // page it leads to.
  const enqueueListItems = ($, requestType) => {
    return Promise.all($('.mapListItemWrap a').get().map((elem) => {
      return requestQueue.addRequest({
        url: new URL($(elem).attr('href'), rootAddress).href,
        userData: { requestType },
      });
    }));
  };
  const crawler = new Apify.CheerioCrawler({
    requestQueue,
    handlePageFunction: async ({ request, response, html, $ }) => {
      switch (request.userData.requestType) {
        case 'listPage':
          // Root page: enqueue one request per state.
          return enqueueListItems($, 'statePage');
        case 'statePage':
          // State page: enqueue one request per city.
          return enqueueListItems($, 'cityPage');
        case 'cityPage':
          // City page: enqueue the first store link as a details page.
          // console.log(`Pre-promise visited page: ${request.url} ${request.retryCount} times, because ${request.errorMessages}`);
          return Promise.all($('.rio-list-links a').first().get().map((elem) => {
            // console.log(`visited page: ${request.url} ${request.retryCount} times, because ${request.errorMessages}`);
            // console.log(`Request with id: ${(new URL($(elem).attr('href'), rootAddress)).href} added`);
            return requestQueue.addRequest({
              url: new URL($(elem).attr('href'), rootAddress).href,
              userData: { requestType: 'detailsPage' },
            }).then(({ wasAlreadyPresent, request }) => {
              if (wasAlreadyPresent) {
                // console.log(`The request with id: ${request.url} was already present`);
              }
            });
          }));
        case 'detailsPage': {
          // Store details page: extract the point of interest and push it to the dataset.
          const poi = {
            store_number: $('#rio-store-id').text(),
            debug: { url: request.url, id: request.id, uniqueKey: request.uniqueKey, all: JSON.stringify(request) },
          };
          await Apify.pushData([poi]);
          break;
        }
        default:
          throw new Error('Unhandled Page!');
      }
    },
  });
  await crawler.run();
})();