Apify Hello World
/*
 * Apify SDK — the scalable web crawling and scraping library for JavaScript/Node.js.
 * Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
 * git           ---> https://github.com/apifytech/apify-js
 * API reference ---> https://sdk.apify.com/
 * install       ---> `npm i --save apify`
 * run           ---> node index.js
 */
/*
 * By default Apify uses the project directory as data storage; this can be changed by setting
 * the env variable APIFY_LOCAL_STORAGE_DIR. Memory usage can be capped with APIFY_MEMORY_MBYTES,
 * e.g. `export APIFY_MEMORY_MBYTES=100000` on Linux.
 */
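/*
 * A minimal sketch of setting the same options from code rather than the shell
 * (an illustrative assumption, not part of the original gist — the values are
 * made up, and they must be set before the SDK first touches its storage):
 *
 *   process.env.APIFY_LOCAL_STORAGE_DIR = './apify_storage'
 *   process.env.APIFY_MEMORY_MBYTES = '4096'
 */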
/*
 * Apify has 3 types of crawlers: BasicCrawler, CheerioCrawler, and PuppeteerCrawler;
 * if you want page download and data extraction out of the box, use the Puppeteer or Cheerio crawlers.
 * This example uses the Puppeteer crawler; for more information on Puppeteer visit
 * https://github.com/GoogleChrome/puppeteer/blob/v1.19.0/docs/api.md
 * https://pptr.dev
 */
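/*
 * For comparison, a rough CheerioCrawler sketch (my sketch, not from the
 * original gist; it assumes a requestQueue like the one created below).
 * CheerioCrawler downloads pages over plain HTTP and hands the parsed HTML
 * to a cheerio `$`, so no browser is launched:
 *
 *   const cheerioCrawler = new Apify.CheerioCrawler({
 *       requestQueue,
 *       handlePageFunction: async ({ request, $ }) => {
 *           console.log(`Title of ${request.url}: ${$('title').text()}`)
 *       },
 *   })
 *   await cheerioCrawler.run()
 */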
/*
 * This example is based on https://sdk.apify.com/docs/examples/puppeteercrawler
 */
const Apify = require('apify')

Apify.main(async () => {
    // create the requestQueue; a crawler needs at least a requestQueue, a requestList, or both
    const requestQueue = await Apify.openRequestQueue()
    await requestQueue.addRequest({ url: 'https://www.aritzia.com/intl/en/clothing/skirts' })

    // we could also build a list of pseudo URLs to be enqueued later (see the matching sketch just below);
    // see the different ways to generate pseudo URLs: https://sdk.apify.com/docs/api/pseudourl
    // const pseudoUrls = [new Apify.PseudoUrl('https://www.aritzia.com/intl/en/[.*]')]
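    // a quick illustration of how a PseudoUrl matches (an illustrative sketch,
    // not from the original gist; the sample URLs are made up):
    // const pUrl = new Apify.PseudoUrl('https://www.aritzia.com/intl/en/[.*]')
    // pUrl.matches('https://www.aritzia.com/intl/en/clothing/dresses') // => true
    // pUrl.matches('https://www.example.com/') // => false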
    // instantiate the Puppeteer crawler
    const crawler = new Apify.PuppeteerCrawler({
        requestQueue, // pass the queue
        handlePageFunction: async ({ request, page }) => {
            // page is a Puppeteer Page instance that has already navigated to request.url
            const title = await page.title()
            console.log(`Title of ${request.url}: ${title}`)

            // note: pageFunction is executed in the browser context, so it
            // cannot close over Node.js variables from this scope
            const pageFunction = $skirts => {
                const data = []
                $skirts.forEach($skirt => {
                    data.push({
                        name: $skirt.querySelector('a').title,
                    })
                })
                return data
            }
            // page.$$eval passes an array of all elements matching the selector to pageFunction
            // see https://pptr.dev/#?product=Puppeteer&version=v1.19.0&show=api-pageevalselector-pagefunction-args
            const data = await page.$$eval('#search-result-items > li .product-image', pageFunction)
            await Apify.pushData(data)
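            // pushData appends each item to the default dataset; when running
            // locally the items land under the local storage dir (with default
            // settings that should be ./apify_storage/datasets/default — a
            // hedged note, not part of the original gist)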
            // here we could enqueue more links using Apify.utils.enqueueLinks
            // or Apify.utils.puppeteer.enqueueLinks; this is a TODO for this gist.
            // Find a link to the next page and enqueue it if it exists:
            /* const infos = await Apify.utils.enqueueLinks({
                page,
                requestQueue,
                selector: '.morelink',
            })
            if (infos.length === 0) console.log(`${request.url} is the last page!`)
            */
        },
        // this function is called if processing a page failed more than maxRequestRetries + 1 times
        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed too many times`)
            await Apify.pushData({
                '#debug': Apify.utils.createRequestDebugInfo(request),
            })
        },
        maxRequestsPerCrawl: 100, // stop after at most 100 requests
        maxConcurrency: 10, // process at most 10 pages in parallel
    })

    await crawler.run()
    console.log('Done Crawling')
})