Skip to content

Instantly share code, notes, and snippets.

@tuaplicacionpropia
Created March 2, 2020 21:49
Show Gist options
  • Save tuaplicacionpropia/deb62e1d84d53ea2ce310fd16c5bad48 to your computer and use it in GitHub Desktop.
Save tuaplicacionpropia/deb62e1d84d53ea2ce310fd16c5bad48 to your computer and use it in GitHub Desktop.
Forma sencilla de extraer un listado de datos paginados. Ver funciones loadItems y singleReadItem
// This is the main Node.js source code file of your actor.
// It is referenced from the "scripts" section of the package.json file.
const Apify = require('apify');
/**
 * Collects one record for every ad card (`div[class='aditem']`) on the page
 * that is currently loaded.
 *
 * This function is handed to `page.evaluate`, so it executes in the browser and
 * must stay self-contained: it relies only on the page's `$` (jQuery-style API)
 * and on `window`.
 *
 * @returns {Array<Object>} One object per card with the keys `title`, `url`,
 *   `category`, `location`, `description`, `price`, `year` and `kms`. `url` is
 *   the card link resolved against the current page address, or `null` when the
 *   card has no link.
 */
const readItem = () => {
  return $("div[class='aditem']").map(function() {
    const card = $(this);
    const link = card.find("a[class='aditem-detail-title']");
    const href = link.attr("href");

    // Resolve the href against the current address instead of gluing strings
    // together: root-relative and absolute hrefs would otherwise yield broken links.
    let url = null;
    if (href) {
      try {
        url = new URL(href, window.location.href).href;
      } catch (err) {
        // Unparseable href: keep the raw value rather than aborting the whole page.
        url = href;
      }
    }

    return {
      title: link.text(),
      url: url,
      category: card.find("div[class='display-desktop list-location-link']").text(),
      location: card.find("div[class='list-location-region']").text(),
      description: card.find("div[class='tx']").text(),
      price: card.find("div[class='aditem-price']").text(),
      year: card.find("div[class='ano tag-mobile']").text(),
      kms: card.find("div[class='kms tag-mobile']").text(),
    };
  }).get();
};
/**
 * Builds the record for a single ad card.
 *
 * Uses the page's `$` (jQuery-style API) and `window`, so it only works in the
 * browser context and must stay self-contained.
 *
 * @param {*} self The card element (or anything `$()` accepts) to read from.
 * @returns {Object} Object with the keys `title`, `url`, `category`, `location`,
 *   `description`, `price`, `year` and `kms`. `url` is the card link resolved
 *   against the current page address, or `null` when the card has no link.
 */
const singleReadItem_FULL = (self) => {
  const card = $(self);
  const link = card.find("a[class='aditem-detail-title']");
  const href = link.attr("href");

  // Resolve the href against the current address instead of concatenating
  // strings: root-relative and absolute hrefs would otherwise yield broken links.
  let url = null;
  if (href) {
    try {
      url = new URL(href, window.location.href).href;
    } catch (err) {
      // Unparseable href: keep the raw value rather than failing the card.
      url = href;
    }
  }

  return {
    title: link.text(),
    url: url,
    category: card.find("div[class='display-desktop list-location-link']").text(),
    location: card.find("div[class='list-location-region']").text(),
    description: card.find("div[class='tx']").text(),
    price: card.find("div[class='aditem-price']").text(),
    year: card.find("div[class='ano tag-mobile']").text(),
    kms: card.find("div[class='kms tag-mobile']").text(),
  };
};
/**
 * Placeholder item reader: ignores its argument and always produces the same
 * fixed record.
 *
 * @param {*} self Unused.
 * @returns {{title: string}} A fresh object whose `title` is a fixed test string.
 */
function singleReadItem_FUNCIONA (self) {
  const placeholder = {};
  placeholder.title = "vengita ya77";
  return placeholder;
}
/**
 * Arrow-function placeholder item reader: ignores its argument and always
 * produces the same fixed record.
 *
 * @param {*} self Unused.
 * @returns {{title: string}} A fresh object whose `title` is a fixed test string.
 */
const singleReadItem_FUNCIONA2 = (self) => ({ title: "vengita ya7788" });
/**
 * Builds the record for a single ad card.
 *
 * `loadItems` sends this function to the browser as source text
 * (`fnReadItem.toString()`) and rebuilds it there, so it must remain ONE
 * self-contained function expression: no helpers or variables from this file,
 * only the page's `$` (jQuery-style API) and `window`.
 *
 * @param {*} self The card element to read from.
 * @returns {Object} Object with the keys `title`, `url`, `category`, `location`,
 *   `description`, `price`, `year` and `kms`. `url` is the card link resolved
 *   against the current page address, or `null` when the card has no link.
 */
const singleReadItem = (self) => {
  const card = $(self);
  const link = card.find("a[class='aditem-detail-title']");
  const href = link.attr("href");

  // Resolve the href against the current address instead of concatenating
  // strings: root-relative and absolute hrefs would otherwise yield broken links.
  let url = null;
  if (href) {
    try {
      url = new URL(href, window.location.href).href;
    } catch (err) {
      // Unparseable href: keep the raw value rather than failing the card.
      url = href;
    }
  }

  return {
    // The title is the link text only; no element string form is appended.
    title: link.text(),
    url: url,
    category: card.find("div[class='display-desktop list-location-link']").text(),
    location: card.find("div[class='list-location-region']").text(),
    description: card.find("div[class='tx']").text(),
    price: card.find("div[class='aditem-price']").text(),
    year: card.find("div[class='ano tag-mobile']").text(),
    kms: card.find("div[class='kms tag-mobile']").text(),
  };
};
/**
 * First working version of the paginated loader, with the "next page" selector
 * hard-wired and `readItem` as the page reader.
 *
 * Reads the current page, and while a "next" control is present clicks it and
 * waits 10 seconds before reading again. Stops after at most two pages.
 *
 * @param {Object} page Puppeteer-style page exposing `evaluate` and `click`.
 * @returns {Promise<Array>} The records of all pages read, concatenated in order.
 */
const loadItems_FUNCIONA = async ( page ) => {
  const nextButton = "div[onclick*='pSiguiente']";
  let collected = [];
  let pageNumber = 1;
  let nextCount;
  do {
    console.log(`reading page ${pageNumber}`);
    const batch = await page.evaluate(readItem);
    collected = collected.concat(batch);
    // Number of "next" controls on the page; zero means this was the last page.
    nextCount = await page.evaluate((selector) => $(selector).length, nextButton);
    if (nextCount) {
      await page.click(nextButton);
      await Apify.utils.sleep(10000);
      console.log("GO ON NEXT PAGE");
    }
    pageNumber += 1;
  } while (nextCount && pageNumber <= 2);
  return collected;
};
/**
 * Second working version of the paginated loader: the "next page" selector is
 * supplied by the caller, the page reader is still `readItem`.
 *
 * Reads at most two pages; after each page on which the "next" control exists
 * it clicks the control and waits 10 seconds.
 *
 * @param {Object} page Puppeteer-style page exposing `evaluate` and `click`.
 * @param {string} goOnSelector Selector of the control that advances to the next page.
 * @returns {Promise<Array>} The records of all pages read, concatenated in order.
 */
const loadItems_FUNCIONA2 = async ( page, goOnSelector ) => {
  let collected = [];
  for (let pageNumber = 1; pageNumber <= 2; pageNumber += 1) {
    console.log(`reading page ${pageNumber}`);
    const batch = await page.evaluate(readItem);
    collected = collected.concat(batch);

    const nextCount = await page.evaluate((selector) => $(selector).length, goOnSelector);
    if (!nextCount) {
      // No "next" control: this was the last page.
      break;
    }
    await page.click(goOnSelector);
    await Apify.utils.sleep(10000);
    console.log("GO ON NEXT PAGE");
  }
  return collected;
};
/**
 * Third working version of the paginated loader: both the "next page" selector
 * and the whole-page reader are supplied by the caller.
 *
 * Reads at most two pages; after each page on which the "next" control exists
 * it clicks the control and waits 10 seconds.
 *
 * @param {Object} page Puppeteer-style page exposing `evaluate` and `click`.
 * @param {string} goOnSelector Selector of the control that advances to the next page.
 * @param {Function} fnReadItem Function passed to `page.evaluate` to read one page.
 * @returns {Promise<Array>} The records of all pages read, concatenated in order.
 */
const loadItems_FUNCIONA3 = async ( page, goOnSelector, fnReadItem ) => {
  let collected = [];
  let pageNumber = 1;
  while (true) {
    console.log(`reading page ${pageNumber}`);
    const batch = await page.evaluate(fnReadItem);
    collected = collected.concat(batch);

    // Number of "next" controls on the page; zero means this was the last page.
    const remaining = await page.evaluate((selector) => $(selector).length, goOnSelector);
    if (remaining) {
      await page.click(goOnSelector);
      await Apify.utils.sleep(10000);
      console.log("GO ON NEXT PAGE");
    }

    pageNumber += 1;
    if (!remaining || pageNumber > 2) {
      break;
    }
  }
  return collected;
};
/**
 * Generic paginated loader: reads every element matching `listSelector` on the
 * current page with `fnReadItem`, then follows the "next page" control and
 * repeats until the control disappears or the page limit is reached.
 *
 * @param {Object} page Puppeteer-style page exposing `evaluate` and `click`.
 * @param {string} goOnSelector Selector of the control that advances to the next page.
 * @param {string} listSelector Selector matching one element per item to read.
 * @param {Function} fnReadItem Reader called in the browser once per matched
 *   element (the element is its only argument). It is shipped as source text, so
 *   it must be a self-contained function expression that uses only page globals.
 * @param {Object} [options] Optional tuning; omitting it keeps the historical behaviour.
 * @param {number} [options.maxPages=2] Maximum number of pages to read; values
 *   below 1 read nothing, `Infinity` reads until the "next" control disappears.
 * @param {number} [options.waitMillis=10000] Pause after clicking "next", in milliseconds.
 * @returns {Promise<Array>} The records of all pages read, concatenated in order.
 */
const loadItems = async ( page, goOnSelector, listSelector, fnReadItem, { maxPages = 2, waitMillis = 10000 } = {} ) => {
  // page.evaluate only accepts serializable arguments, so the reader travels as
  // source text and is rebuilt inside the page.
  // https://stackoverflow.com/questions/46088351/puppeteer-pass-variable-in-evaluate
  const readerSource = fnReadItem.toString();
  let result = [];

  for (let pageIndex = 1; pageIndex <= maxPages; pageIndex += 1) {
    console.log("reading page " + pageIndex);
    const pageItems = await page.evaluate((selector, source) => {
      // NOTE(security): this evaluates source text in the page; only pass
      // reader functions defined in trusted code, never external input.
      const reader = new Function("return " + source)();
      return $(selector).map(function() {
        return reader(this);
      }).get();
    }, listSelector, readerSource);
    result = result.concat(pageItems);

    // Do not navigate (and wait) past the last page that will be read.
    if (pageIndex >= maxPages) {
      break;
    }

    const nextCount = await page.evaluate((selector) => $(selector).length, goOnSelector);
    if (!nextCount) {
      // No "next" control: this was the last page.
      break;
    }
    await page.click(goOnSelector);
    await Apify.utils.sleep(waitMillis);
    console.log("GO ON NEXT PAGE");
  }
  return result;
};
// Actor entry point: opens the listing page in Puppeteer, scrapes the paginated
// ad list with loadItems/singleReadItem, logs the titles and stores the result
// under the key 'OUTPUT3'.
Apify.main(async () => {
    // Get input of the actor.
    // If you'd like to have your input checked and generate a user
    // interface for it, add INPUT_SCHEMA.json file to your actor.
    // For more information, see https://docs.apify.com/actor/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // The start URL is hard-coded for now; the actor input is only echoed back
    // in the output.
    const url = "https://www.milanuncios.com/autocaravanas-de-segunda-mano/?fromSearch=1&demanda=n";
    console.log('Launching Puppeteer...');
    const browser = await Apify.launchPuppeteer();
    try {
        const page = await browser.newPage();
        const inputUrl = url;
        console.log(`Opening URL: ${inputUrl}`);
        await page.goto(inputUrl);

        console.log("titles");
        const titles = await loadItems(page, "div[onclick*='pSiguiente']", "div[class='aditem']", singleReadItem);
        for (const item of titles) {
            console.log(item.title);
        }

        // Save output
        const output = {
            receivedInput: input,
            titles: titles,
            message: 'Hello sir!',
        };
        console.log('Output:');
        console.dir(output);
        await Apify.setValue('OUTPUT3', output);
    } finally {
        // Always release the browser, even when navigation or scraping fails.
        await browser.close();
    }
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment