Created
September 5, 2014 17:59
-
-
Save maxpaynestory/02563716bef271316c3d to your computer and use it in GitHub Desktop.
Scrape all tweets from a twitter account using casperjs and phantomjs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// String helpers installed on String.prototype for the rest of the script.
//
// trim     - strips whitespace from both ends. Only installed when the
//            runtime does not already provide it, so a native
//            implementation is never overwritten.
// ltrim    - strips leading whitespace only.
// rtrim    - strips trailing whitespace only.
// fulltrim - strips whitespace at the start and end of the string, removes
//            whitespace that follows or precedes a newline together with
//            that newline, then collapses every remaining whitespace run
//            into a single space.
if (typeof String.prototype.trim !== 'function') {
    String.prototype.trim = function(){ return this.replace(/^\s+|\s+$/g, ''); };
}
String.prototype.ltrim = function(){ return this.replace(/^\s+/, ''); };
String.prototype.rtrim = function(){ return this.replace(/\s+$/, ''); };
String.prototype.fulltrim = function(){
    return this.replace(/(?:(?:^|\n)\s+|\s+(?:$|\n))/g, '').replace(/\s+/g, ' ');
};
/**
 * Formats this date as "MM/DD/YYYY" using its UTC calendar fields.
 * Month and day are left-padded with a zero to two digits; the year is
 * emitted as returned by getUTCFullYear().
 *
 * @returns {string} The formatted date string.
 */
Date.prototype.MMDDYYYY = function() {
    // Left-pad single-digit values with "0".
    function pad2(value) {
        return (value < 10 ? "0" : "") + value;
    }
    // getUTCMonth() is zero-based, hence the +1.
    return pad2(this.getUTCMonth() + 1) + "/" + pad2(this.getUTCDate()) + "/" + this.getUTCFullYear();
};
/**
 * Turns an arbitrary value into a single double-quoted CSV field.
 *
 * null/undefined become an empty field. Any other value is converted to a
 * string, every double quote is removed (not escaped), every run of
 * whitespace (including newlines) is collapsed to one space, and leading and
 * trailing whitespace is stripped. The result is wrapped in double quotes.
 *
 * @param {*} str Value to sanitize; usually the text of a scraped element.
 * @returns {string} The sanitized text surrounded by double quotes.
 */
function SanitizeString(str){
    // String() keeps non-string input (e.g. numbers) from crashing on .replace.
    var text = (str == null) ? '' : String(str);
    var cleaned = text
        .replace(/"/g, '')          // drop quotes so the field cannot be broken open
        .replace(/\s+/g, ' ')       // newlines/tabs/runs of spaces -> single space
        .replace(/^\s+|\s+$/g, ''); // strip both ends
    return '"' + cleaned + '"';
}
var utils = require('utils');
var fs = require('fs');

// CasperJS instance used for the whole scrape. Images and plugins are not
// loaded, and a desktop Chrome user-agent string is sent.
var casper = require('casper').create({
    verbose: true,
    logLevel: 'error',
    pageSettings: {
        loadImages: false,
        loadPlugins: false,
        userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
    }
});

// The account to scrape is a required command-line option.
var tweet_account_name;
if(!casper.cli.has("account_name")){
    casper.echo("account name is missing");
    casper.echo("Use like this: casperjs.exe tweet_scrap.js --account_name=Karachi_Update");
    // NOTE(review): CasperJS documents exit() as not guaranteed to stop the
    // script immediately, so the statements below may still run - confirm.
    casper.exit(1);
}
tweet_account_name = casper.cli.get("account_name");

// Output CSV: start from a fresh file on every run and write the header row.
var outputfilename = "scraped_tweets.csv";
var header = "Tweet,Timestamp";
if(fs.exists(outputfilename)){
    fs.remove(outputfilename);
}
var stream = fs.open(outputfilename,"w");
stream.writeLine(header);
/**
 * Scrapes one page of tweets and then follows the page's "more" link,
 * scheduling itself again for the next page until no such link is found or
 * the link points at a URL that was already scheduled.
 *
 * For every tweet element on the page one CSV line
 * `<sanitized tweet>,<sanitized timestamp>` is written to `stream`.
 *
 * @param {Object} thecasper Casper instance used to schedule the navigation.
 * @param {string} newurl    URL of the page to open and scrape.
 * @param {Object} stream    Open output stream; must provide writeLine().
 * @param {Object} [visited] Optional map of URLs already scheduled. Callers
 *                           normally omit it; it is threaded through the
 *                           recursive calls so the same URL is never opened
 *                           twice.
 */
function RecursiveTriverse(thecasper,newurl,stream,visited)
{
    var TWEET_SELECTOR = 'div.tweet-text div.dir-ltr';
    var TIMESTAMP_SELECTOR = 'td.timestamp a';
    var MORE_LINK_SELECTOR = 'div.w-button-more a';

    // Stop when a "more" link leads back to a page that was already queued;
    // otherwise the scrape would never terminate.
    visited = visited || {};
    if (Object.prototype.hasOwnProperty.call(visited, newurl)) {
        return;
    }
    visited[newurl] = true;

    // Pause for a second between page loads.
    thecasper.wait(1000);
    thecasper.thenOpen(newurl,function(){
        // Check exists() first: getElementsInfo() is expected to throw when
        // the selector matches nothing (e.g. a page without tweets).
        var timestamps = this.exists(TIMESTAMP_SELECTOR) ? this.getElementsInfo(TIMESTAMP_SELECTOR) : [];
        var tweets = this.exists(TWEET_SELECTOR) ? this.getElementsInfo(TWEET_SELECTOR) : [];
        this.echo("Scraping tweets from " + this.getCurrentUrl());

        for (var i = 0; i < tweets.length; i++) {
            // Tweets and timestamps are paired by position; a tweet without
            // a matching timestamp element gets an empty timestamp field.
            var time_stamp = (i < timestamps.length) ? timestamps[i].text : '';
            stream.writeLine(SanitizeString(tweets[i].text) + "," + SanitizeString(time_stamp));
        }

        // Test for the anchor itself so getElementInfo() cannot fail on a
        // "more" container that has no link inside.
        if (this.exists(MORE_LINK_SELECTOR)) {
            var moreLink = this.getElementInfo(MORE_LINK_SELECTOR);
            RecursiveTriverse(this, moreLink.attributes.href, stream, visited);
        }
    });
}
// Entry point: open the account's mobile timeline and start the page-by-page
// scrape from whatever URL the browser ended up on.
casper.start('https://mobile.twitter.com/' + tweet_account_name,function(){
    RecursiveTriverse(this,this.getCurrentUrl(),stream);
});
// Final step: push buffered output to disk, then release the file.
// flush() must come before close(); flushing an already closed stream
// cannot write anything.
// NOTE(review): this relies on the steps queued by RecursiveTriverse running
// before this step - confirm against CasperJS step ordering.
casper.then(function(){
    stream.flush();
    stream.close();
});
casper.run();
Something is not right, I get this message
PhantomJS has crashed. Please read the crash reporting guide at https://github.com/ariya/phantomjs/wiki/Crash-Reporting and file a bug report at https://github.com/ariya/phantomjs/issues/new with the crash dump file attached: /tmp/500da006-756f-95e2-76aadad4-5f7b9274.dmp
Segmentation fault
@CAMIZOCA, I have been having that issue too. It has to do with many intricacies that I am not able to explain in full detail, but I made a workaround.
I switched to v2.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello, I altered your scrape_tweets.js and it's at https://gist.github.com/nwaomachux/35d1c424966fccd16ae1. The one you provided entered an infinite loop.