Skip to content

Instantly share code, notes, and snippets.

@shukla2112
Created February 5, 2019 10:17
Show Gist options
  • Save shukla2112/c3bda3c41a93662d582158dfd03f34e4 to your computer and use it in GitHub Desktop.
Puppeteer driven through the resident Google Chrome — a working setup for sites that aggressively block bots.
// preload.js — injected via page.evaluateOnNewDocument() so these overrides
// run before any page script, masking common headless-browser fingerprints.

// Overwrite the `languages` property to use a custom getter, so the page sees
// a realistic language list instead of an empty one.
Object.defineProperty(navigator, "languages", {
  get: function() {
    return ["en-US", "en"];
  }, // fixed: stray `;` after the getter was a syntax error inside the object literal
});

// Overwrite the `plugins` property to use a custom getter.
Object.defineProperty(navigator, 'plugins', {
  get: function() {
    // this just needs to have `length > 0`, but we could mock the plugins too
    return [1, 2, 3, 4, 5];
  },
});
// const puppeteer = require('puppeteer');
// puppeteer-extra wraps puppeteer and lets plugins (e.g. stealth) hook in.
const puppeteer = require('puppeteer-extra')
const pluginStealth = require("puppeteer-extra-plugin-stealth")
// Stealth plugin patches many of the signals anti-bot checks look for.
puppeteer.use(pluginStealth())
// Enable stealth plugin
// puppeteer.use(require('puppeteer-extra-plugin-stealth')())
//
// Read the file line by line - Each line would be url
// Visit the url and get the response and store it in desired format
// Capture the screenshot
// Save the stats to local mongodb
// Send the stats at the end of file
//
// Uses the below techniques to avoid bot detection
// Using the stealth mode
// load the plugins
// Use the resident browser
// Not use the default settings by puppeteer
// use headless : false
// Enable javascript
// Add headers
var mongoose = require("mongoose");
mongoose.Promise = global.Promise;
// NOTE(review): assumes a mongod is running locally on the default port — confirm before use.
mongoose.connect("mongodb://localhost:27017/simple-crawler",{ useNewUrlParser: true });
// One document per crawled URL: raw HTML, screenshot bytes, HTTP status,
// and the input file the URL came from.
var resSchema = new mongoose.Schema({
url: String,
html: String,
img: String,
statusCode : String,
fileName : String
});
var Result = mongoose.model("Result", resSchema);
/**
 * Launch the resident Chrome, visit `url`, and capture its HTML, HTTP status,
 * and a screenshot saved as `images/<fileline>.png`.
 *
 * @param {string} fileline - unique id used to name the screenshot file
 * @param {string} url - page to visit
 * @returns {Promise<object>} `{ url, html?, img?, statusCode? }` — the optional
 *   fields are absent when navigation fails (the error is logged, not thrown).
 */
async function crawlAndScreenshot(fileline, url) {
  // Flags that make the resident Chrome profile look like a normal session.
  // Fixed typo: "certifcate" -> "certificate" (misspelled switches are silently
  // ignored by Chrome, so the originals had no effect).
  const args = [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-infobars',
    '--window-position=0,0',
    '--ignore-certificate-errors',
    '--ignore-certificate-errors-spki-list',
    '--user-data-dir="/Users/nikunjshukla/Library/Application Support/Google/Chrome/"',
  ];
  // User agents that have worked in the past (vitacost worked with the Opera one):
  // '--user-agent="Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14"'
  // '--user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3312.0 Safari/537.36"'
  const options = {
    args,
    headless: false,      // headful mode sidesteps many headless-detection checks
    ignoreHTTPSErrors: true,
    executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    handleSIGINT: true,
    ignoreDefaultArgs: true,  // avoid puppeteer's detectable default switches
  };
  const browser = await puppeteer.launch(options);
  const obj = { url: url };
  try {
    const context = await browser.createIncognitoBrowserContext();
    const fs = require('fs');
    const preloadFile = fs.readFileSync('./preload.js', 'utf8');
    const page = await context.newPage();
    await page.setJavaScriptEnabled(true);
    // Inject the fingerprint-masking preload before any page script runs.
    await page.evaluateOnNewDocument(preloadFile);
    await page.setExtraHTTPHeaders({
      'Accept': '*/*',
      'Accept-Language': 'en-US',
    });
    // await page.setUserAgent('Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)');
    try {
      const response = await page.goto(url, { waitUntil: 'networkidle2' });
      await page.waitFor(1000); // let late JS settle after network idle
      const userAgent = await page.evaluate('navigator.userAgent');
      console.log('Useragent used - ', userAgent);
      const html = await page.content();
      const statusCode = response.status();
      const imagePath = 'images/' + fileline + '.png'; // was an implicit global
      const imgData = await page.screenshot({ path: imagePath });
      const img = Buffer.from(imgData, 'base64');
      obj["html"] = html;
      obj["img"] = img;
      obj["statusCode"] = statusCode;
      if (statusCode === 403) {
        await sleep(60000); // back off 60s when blocked (comment previously said 30s)
      }
    } catch (err) {
      // Best-effort: log and return the partial obj; the caller records the miss.
      console.log('Error loading page:', err);
    }
    await context.close();
  } finally {
    // fixed: browser.close() was a floating promise, and the browser leaked
    // if anything before the inner try threw; finally guarantees cleanup.
    await browser.close();
  }
  return obj;
};
/**
 * Persist one crawl result document to MongoDB.
 *
 * Fixed: the save was a floating promise chain, so `await saveData(...)` in the
 * caller resolved before the write finished — the later `process.exit(0)` could
 * drop pending writes. Now the save is awaited to completion.
 *
 * @param {object} data - fields matching resSchema (url, html, img, statusCode, fileName)
 */
async function saveData(data) {
  const myData = new Result(data);
  try {
    await myData.save();
    console.log("data saved to database");
  } catch (err) {
    // Best-effort persistence: log and continue with the next URL.
    console.log("Unable to save to database");
  }
}
/**
 * Entry point: read newline-separated URLs from the file named on the command
 * line, crawl each one (throttled), save results to MongoDB, and print a
 * summary of status-code counts before exiting.
 */
async function processFile() {
  const args = process.argv;
  if (args.length !== 3) {
    console.log("usage - `node scrape.js <filepath>`");
    process.exit(1);
  }
  const inputFile = args[2]; // was an implicit global
  const fs = require('fs'); // removed unused `readline` require
  const fileContents = fs.readFileSync(inputFile, { encoding: 'utf-8' });
  const lines = fileContents.split("\n");
  lines.pop(); // drop the empty string after the trailing newline
  const baseName = inputFile.split("/").pop(); // file name without directories
  const summary = {}; // statusCode -> occurrence count
  let c = 0;
  for (const line of lines) {
    await sleep(3000); // throttle: 3s between requests (comment previously said 30s)
    console.log(line);
    const fileline = baseName + c; // unique screenshot id per input line
    // fixed: no longer mixes `await` with `.then()` on the same call
    const obj = await crawlAndScreenshot(fileline, line);
    console.log("Finished retrieving page : " + obj.url);
    const retVal = {
      url: obj.url,
      html: obj.html,
      img: obj.img,
      statusCode: obj.statusCode,
      fileName: inputFile,
    };
    console.log("Status = " + obj.statusCode);
    c++;
    await saveData(retVal);
    if (retVal.statusCode in summary) {
      summary[retVal.statusCode] = summary[retVal.statusCode] + 1;
    } else {
      summary[retVal.statusCode] = 1;
    }
  }
  console.log(summary);
  process.exit(0);
}
// fixed: the top-level call was a floating promise — surface unexpected
// failures instead of letting the process hang or die silently.
processFile().catch((err) => {
  console.error(err);
  process.exit(1);
});
// Awaitable delay helper: resolves (with no value) after `ms` milliseconds.
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment