Created
March 2, 2020 21:49
-
-
Save tuaplicacionpropia/deb62e1d84d53ea2ce310fd16c5bad48 to your computer and use it in GitHub Desktop.
Forma sencilla de extraer un listado de datos paginados. Ver las funciones loadItems y singleReadItem.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This is the main Node.js source code file of your actor.
// It is referenced from the "scripts" section of the package.json file.
const Apify = require('apify'); | |
/**
 * Collects one record per ad found on the results page that is currently loaded.
 *
 * This function is handed to `page.evaluate`, so it executes inside the browser
 * page. It must therefore stay self-contained and may only rely on globals that
 * exist in the page (`$`, `window`, `URL`).
 *
 * @returns {Array<Object>} One object per `div[class='aditem']` element with the
 *   keys `title`, `url`, `category`, `location`, `description`, `price`, `year`
 *   and `kms`. `url` is an absolute URL, or `null` when the ad has no link.
 */
const readItem = () => {
    const pageUrl = window.location.href;
    return $("div[class='aditem']").map(function () {
        const item = $(this);
        const titleLink = item.find("a[class='aditem-detail-title']");
        const href = titleLink.attr("href");
        return {
            title: titleLink.text(),
            // Resolve the href against the page URL instead of gluing strings
            // together: this is correct for relative, root-relative and
            // absolute links alike.
            url: href ? new URL(href, pageUrl).href : null,
            category: item.find("div[class='display-desktop list-location-link']").text(),
            location: item.find("div[class='list-location-region']").text(),
            description: item.find("div[class='tx']").text(),
            price: item.find("div[class='aditem-price']").text(),
            year: item.find("div[class='ano tag-mobile']").text(),
            kms: item.find("div[class='kms tag-mobile']").text(),
        };
    }).get();
};
/**
 * Builds the record for a single ad element.
 *
 * Intended to run inside the browser page, so it only relies on page globals
 * (`$`, `window`, `URL`) and must stay self-contained.
 *
 * @param {Element} self - The ad container element to read from.
 * @returns {Object} Object with the keys `title`, `url`, `category`, `location`,
 *   `description`, `price`, `year` and `kms`. `url` is an absolute URL, or
 *   `null` when the ad has no link.
 */
const singleReadItem_FULL = (self) => {
    const item = $(self);
    const titleLink = item.find("a[class='aditem-detail-title']");
    const href = titleLink.attr("href");
    return {
        title: titleLink.text(),
        // Resolve the href against the page URL; plain concatenation breaks for
        // root-relative and absolute links.
        url: href ? new URL(href, window.location.href).href : null,
        category: item.find("div[class='display-desktop list-location-link']").text(),
        location: item.find("div[class='list-location-region']").text(),
        description: item.find("div[class='tx']").text(),
        price: item.find("div[class='aditem-price']").text(),
        year: item.find("div[class='ano tag-mobile']").text(),
        kms: item.find("div[class='kms tag-mobile']").text(),
    };
};
/**
 * Placeholder item reader that ignores its argument and always yields the same
 * fixed record.
 *
 * @param {*} self - Unused; kept so the signature matches the other readers.
 * @returns {{title: string}} A constant record.
 */
function singleReadItem_FUNCIONA (self) {
    const fixedRecord = { title: "vengita ya77" };
    return fixedRecord;
}
/**
 * Placeholder item reader (arrow-function variant) that ignores its argument
 * and always yields the same fixed record.
 *
 * @param {*} self - Unused; kept so the signature matches the other readers.
 * @returns {{title: string}} A constant record.
 */
const singleReadItem_FUNCIONA2 = (self) => ({ title: "vengita ya7788" });
/**
 * Builds the record for a single ad element.
 *
 * `loadItems` receives this function, converts it to source text with
 * `toString()` and rebuilds it inside the browser page. The body must therefore
 * be self-contained: it may only use page globals (`$`, `window`, `URL`) and
 * must not reference anything else defined in this file.
 *
 * @param {Element} self - The ad container element to read from.
 * @returns {Object} Object with the keys `title`, `url`, `category`, `location`,
 *   `description`, `price`, `year` and `kms`. `url` is an absolute URL, or
 *   `null` when the ad has no link.
 */
const singleReadItem = (self) => {
    const item = $(self);
    const titleLink = item.find("a[class='aditem-detail-title']");
    const href = titleLink.attr("href");
    return {
        // The title is the link text only; the element's string form
        // ("[object HTML...Element]") is no longer appended to it.
        title: titleLink.text(),
        // Resolve the href against the page URL; plain concatenation breaks for
        // root-relative and absolute links.
        url: href ? new URL(href, window.location.href).href : null,
        category: item.find("div[class='display-desktop list-location-link']").text(),
        location: item.find("div[class='list-location-region']").text(),
        description: item.find("div[class='tx']").text(),
        price: item.find("div[class='aditem-price']").text(),
        year: item.find("div[class='ano tag-mobile']").text(),
        kms: item.find("div[class='kms tag-mobile']").text(),
    };
};
/**
 * Reads the ads of consecutive result pages using `readItem` and the built-in
 * "next page" selector.
 *
 * At least one page is always read. Paging stops when the page cap is reached
 * or when no "next page" element is present. The "next" control is only clicked
 * when another page is actually going to be read.
 *
 * @param {Object} page - Puppeteer page already showing the first results page.
 * @param {Object} [options] - Optional tuning knobs.
 * @param {number} [options.maxPages=2] - Maximum number of pages to read.
 * @param {number} [options.waitMs=10000] - Milliseconds to wait after clicking
 *   "next" before reading the following page.
 * @returns {Promise<Array>} The records of all pages that were read, in order.
 */
const loadItems_FUNCIONA = async ( page, options = {} ) => {
    const { maxPages = 2, waitMs = 10000 } = options;
    const goOnSelector = "div[onclick*='pSiguiente']";
    const result = [];
    for (let pageIndex = 1; ; pageIndex += 1) {
        console.log(`reading page ${pageIndex}`);
        const pageItems = await page.evaluate(readItem);
        if (Array.isArray(pageItems)) {
            result.push(...pageItems);
        }
        // Check the cap before navigating so the last page does not trigger a
        // pointless click and wait.
        if (pageIndex >= maxPages) {
            break;
        }
        const hasNext = await page.evaluate((selector) => $(selector).length > 0, goOnSelector);
        if (!hasNext) {
            break;
        }
        await page.click(goOnSelector);
        await Apify.utils.sleep(waitMs);
        console.log("GO ON NEXT PAGE");
    }
    return result;
};
/**
 * Reads the ads of consecutive result pages using `readItem`, with a
 * caller-supplied "next page" selector.
 *
 * At least one page is always read. Paging stops when the page cap is reached
 * or when no element matches `goOnSelector`. The "next" control is only clicked
 * when another page is actually going to be read.
 *
 * @param {Object} page - Puppeteer page already showing the first results page.
 * @param {string} goOnSelector - Selector of the "next page" control.
 * @param {Object} [options] - Optional tuning knobs.
 * @param {number} [options.maxPages=2] - Maximum number of pages to read.
 * @param {number} [options.waitMs=10000] - Milliseconds to wait after clicking
 *   "next" before reading the following page.
 * @returns {Promise<Array>} The records of all pages that were read, in order.
 */
const loadItems_FUNCIONA2 = async ( page, goOnSelector, options = {} ) => {
    const { maxPages = 2, waitMs = 10000 } = options;
    const result = [];
    for (let pageIndex = 1; ; pageIndex += 1) {
        console.log(`reading page ${pageIndex}`);
        const pageItems = await page.evaluate(readItem);
        if (Array.isArray(pageItems)) {
            result.push(...pageItems);
        }
        // Check the cap before navigating so the last page does not trigger a
        // pointless click and wait.
        if (pageIndex >= maxPages) {
            break;
        }
        const hasNext = await page.evaluate((selector) => $(selector).length > 0, goOnSelector);
        if (!hasNext) {
            break;
        }
        await page.click(goOnSelector);
        await Apify.utils.sleep(waitMs);
        console.log("GO ON NEXT PAGE");
    }
    return result;
};
/**
 * Reads the ads of consecutive result pages with a caller-supplied page reader
 * and "next page" selector.
 *
 * `fnReadItem` is passed straight to `page.evaluate`, so it runs inside the
 * browser page and is expected to return the array of records for that page.
 *
 * At least one page is always read. Paging stops when the page cap is reached
 * or when no element matches `goOnSelector`. The "next" control is only clicked
 * when another page is actually going to be read.
 *
 * @param {Object} page - Puppeteer page already showing the first results page.
 * @param {string} goOnSelector - Selector of the "next page" control.
 * @param {Function} fnReadItem - Page-side function returning the page's records.
 * @param {Object} [options] - Optional tuning knobs.
 * @param {number} [options.maxPages=2] - Maximum number of pages to read.
 * @param {number} [options.waitMs=10000] - Milliseconds to wait after clicking
 *   "next" before reading the following page.
 * @returns {Promise<Array>} The records of all pages that were read, in order.
 */
const loadItems_FUNCIONA3 = async ( page, goOnSelector, fnReadItem, options = {} ) => {
    const { maxPages = 2, waitMs = 10000 } = options;
    const result = [];
    for (let pageIndex = 1; ; pageIndex += 1) {
        console.log(`reading page ${pageIndex}`);
        const pageItems = await page.evaluate(fnReadItem);
        if (Array.isArray(pageItems)) {
            result.push(...pageItems);
        }
        // Check the cap before navigating so the last page does not trigger a
        // pointless click and wait.
        if (pageIndex >= maxPages) {
            break;
        }
        const hasNext = await page.evaluate((selector) => $(selector).length > 0, goOnSelector);
        if (!hasNext) {
            break;
        }
        await page.click(goOnSelector);
        await Apify.utils.sleep(waitMs);
        console.log("GO ON NEXT PAGE");
    }
    return result;
};
/**
 * Extracts a paginated listing: reads every element matching `listSelector` on
 * the current page with `fnReadItem`, then follows the "next page" control.
 *
 * Functions cannot be passed to `page.evaluate` directly, so `fnReadItem` is
 * sent to the page as source text and rebuilt there. It must therefore be
 * self-contained (no references to Node.js-side variables) and its source must
 * be a valid expression, e.g. an arrow function or a function expression.
 *
 * At least one page is always read. Paging stops when the page cap is reached
 * or when no element matches `goOnSelector`. The "next" control is only clicked
 * when another page is actually going to be read.
 *
 * @param {Object} page - Puppeteer page already showing the first results page.
 * @param {string} goOnSelector - Selector of the "next page" control.
 * @param {string} listSelector - Selector matching one element per list item.
 * @param {Function} fnReadItem - Called in the page with one matched element;
 *   returns the record for that element.
 * @param {Object} [options] - Optional tuning knobs.
 * @param {number} [options.maxPages=2] - Maximum number of pages to read.
 * @param {number} [options.waitMs=10000] - Milliseconds to wait after clicking
 *   "next" before reading the following page.
 * @returns {Promise<Array>} The records of all pages that were read, in order.
 */
const loadItems = async ( page, goOnSelector, listSelector, fnReadItem, options = {} ) => {
    const { maxPages = 2, waitMs = 10000 } = options;
    // Serialise once; the text is identical for every page.
    const readerSource = fnReadItem.toString();
    const result = [];
    for (let pageIndex = 1; ; pageIndex += 1) {
        console.log(`reading page ${pageIndex}`);
        // https://stackoverflow.com/questions/46088351/puppeteer-pass-variable-in-evaluate
        const pageItems = await page.evaluate((selector, source) => {
            // NOTE(security): this evaluates `source` as code inside the page.
            // Only ever pass functions written by the actor's author here,
            // never text that comes from input or from the scraped site.
            const reader = new Function("return " + source)();
            return $(selector).map(function () {
                return reader(this);
            }).get();
        }, listSelector, readerSource);
        if (Array.isArray(pageItems)) {
            result.push(...pageItems);
        }
        // Check the cap before navigating so the last page does not trigger a
        // pointless click and wait.
        if (pageIndex >= maxPages) {
            break;
        }
        const hasNext = await page.evaluate((selector) => $(selector).length > 0, goOnSelector);
        if (!hasNext) {
            break;
        }
        await page.click(goOnSelector);
        await Apify.utils.sleep(waitMs);
        console.log("GO ON NEXT PAGE");
    }
    return result;
};
// Actor entry point: opens the listing in Puppeteer, extracts the paginated
// ads with loadItems/singleReadItem, logs their titles and stores the result
// under the key 'OUTPUT3'.
Apify.main(async () => {
    // Get input of the actor.
    // If you'd like to have your input checked and generate a user
    // interface for it, add INPUT_SCHEMA.json file to your actor.
    // For more information, see https://docs.apify.com/actor/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // The start URL is fixed; the actor input is only echoed back in the output.
    const startUrl = "https://www.milanuncios.com/autocaravanas-de-segunda-mano/?fromSearch=1&demanda=n";

    console.log('Launching Puppeteer...');
    const browser = await Apify.launchPuppeteer();
    let titles;
    try {
        const page = await browser.newPage();
        console.log(`Opening URL: ${startUrl}`);
        await page.goto(startUrl);
        console.log("titles");
        titles = await loadItems(page, "div[onclick*='pSiguiente']", "div[class='aditem']", singleReadItem);
    } finally {
        // Always release the browser, even when navigation or scraping fails.
        await browser.close();
    }

    for (const item of titles) {
        console.log(item.title);
    }

    // Save output
    const output = {
        receivedInput: input,
        titles: titles,
        message: 'Hello sir!',
    };
    console.log('Output:');
    console.dir(output);
    await Apify.setValue('OUTPUT3', output);
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment