Apify Hello World
/*
 * Apify SDK — the scalable web crawling and scraping library for JavaScript/Node.js.
 * Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
 * git           ---> https://github.com/apifytech/apify-js
 * API reference ---> https://sdk.apify.com/
 * install       ---> `npm i --save apify`
 * run           ---> node index.js
 */
/*
 * By default Apify uses the project directory as data storage; this can be changed by setting
 * the env variable APIFY_LOCAL_STORAGE_DIR. Memory usage can be capped with APIFY_MEMORY_MBYTES,
 * e.g. `export APIFY_MEMORY_MBYTES=100000` on Linux.
 */
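/*
 * A minimal sketch of setting the same options from code rather than the shell
 * (an illustrative assumption, not part of the original gist — the values are
 * made up, and they must be set before the SDK first touches its storage):
 *
 *   process.env.APIFY_LOCAL_STORAGE_DIR = './apify_storage'
 *   process.env.APIFY_MEMORY_MBYTES = '4096'
 */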
/*
 * Apify has 3 types of crawlers: BasicCrawler, CheerioCrawler, and PuppeteerCrawler;
 * if you want page download and data extraction out of the box, use the Puppeteer or Cheerio crawlers.
 * This example uses the Puppeteer crawler; for more information on Puppeteer visit
 * https://github.com/GoogleChrome/puppeteer/blob/v1.19.0/docs/api.md
 * https://pptr.dev
 */
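/*
 * For comparison, a rough CheerioCrawler sketch (my sketch, not from the
 * original gist; it assumes a requestQueue like the one created below).
 * CheerioCrawler downloads pages over plain HTTP and hands the parsed HTML
 * to a cheerio `$`, so no browser is launched:
 *
 *   const cheerioCrawler = new Apify.CheerioCrawler({
 *       requestQueue,
 *       handlePageFunction: async ({ request, $ }) => {
 *           console.log(`Title of ${request.url}: ${$('title').text()}`)
 *       },
 *   })
 *   await cheerioCrawler.run()
 */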
/*
 * This example is based on https://sdk.apify.com/docs/examples/puppeteercrawler
 */
const Apify = require('apify')

Apify.main(async () => {
    // create the requestQueue; a crawler needs at least a requestQueue, a requestList, or both
    const requestQueue = await Apify.openRequestQueue()
    await requestQueue.addRequest({ url: 'https://www.aritzia.com/intl/en/clothing/skirts' })

    // we could also build a list of pseudo URLs to be enqueued later (see the matching sketch just below);
    // see the different ways to generate pseudo URLs: https://sdk.apify.com/docs/api/pseudourl
    // const pseudoUrls = [new Apify.PseudoUrl('https://www.aritzia.com/intl/en/[.*]')]
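    // a quick illustration of how a PseudoUrl matches (an illustrative sketch,
    // not from the original gist; the sample URLs are made up):
    // const pUrl = new Apify.PseudoUrl('https://www.aritzia.com/intl/en/[.*]')
    // pUrl.matches('https://www.aritzia.com/intl/en/clothing/dresses') // => true
    // pUrl.matches('https://www.example.com/') // => false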
    // instantiate the Puppeteer crawler
    const crawler = new Apify.PuppeteerCrawler({
        requestQueue, // pass the queue
        handlePageFunction: async ({ request, page }) => {
            // page is a Puppeteer Page instance that has already navigated to request.url
            const title = await page.title()
            console.log(`Title of ${request.url}: ${title}`)

            // note: pageFunction is executed in the browser context, so it
            // cannot close over Node.js variables from this scope
            const pageFunction = $skirts => {
                const data = []
                $skirts.forEach($skirt => {
                    data.push({
                        name: $skirt.querySelector('a').title,
                    })
                })
                return data
            }
            // page.$$eval passes an array of all elements matching the selector to pageFunction
            // see https://pptr.dev/#?product=Puppeteer&version=v1.19.0&show=api-pageevalselector-pagefunction-args
            const data = await page.$$eval('#search-result-items > li .product-image', pageFunction)
            await Apify.pushData(data)
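            // pushData appends each item to the default dataset; when running
            // locally the items land under the local storage dir (with default
            // settings that should be ./apify_storage/datasets/default — a
            // hedged note, not part of the original gist)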
            // here we could enqueue more links using Apify.utils.enqueueLinks
            // or Apify.utils.puppeteer.enqueueLinks; this is a TODO for this gist.
            // Find a link to the next page and enqueue it if it exists:
            /* const infos = await Apify.utils.enqueueLinks({
                page,
                requestQueue,
                selector: '.morelink',
            })
            if (infos.length === 0) console.log(`${request.url} is the last page!`)
            */
        },
        // this function is called if processing a page failed more than maxRequestRetries + 1 times
        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed too many times`)
            await Apify.pushData({
                '#debug': Apify.utils.createRequestDebugInfo(request),
            })
        },
        maxRequestsPerCrawl: 100, // stop after at most 100 requests
        maxConcurrency: 10, // process at most 10 pages in parallel
    })

    await crawler.run()
    console.log('Done Crawling')
})