Created
February 21, 2013 07:05
-
-
Save privong/5002852 to your computer and use it in GitHub Desktop.
Simple analysis of the interval between tweets and of tweet frequency as a function of time of day. Uses an XML format from the (now non-functioning) twitterbackup tool (http://johannburkard.de/blog/programming/java/backup-twitter-tweets-with-twitterbackup.html). Fits a power law to the time between tweets.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
#
# Usage:
# twitter_t_histogram.py file1.xml [file2.xml]
#
# first file required, second file optional (will overplot and give legends)
# Output files are file1-min.png, file1-sec.png and file1-HR_hist.png
from xml.dom import minidom | |
from datetime import datetime | |
import sys | |
import numpy | |
import math | |
import matplotlib.pyplot as plt | |
import matplotlib.mlab as mlab | |
# Month abbreviation -> month number, as it appears in created_at timestamps.
months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
          'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}


def tweet_timestamps(nodes):
    """Return the text of every even-indexed <created_at> element in nodes.

    Odd-indexed elements are skipped.
    NOTE(review): presumably each tweet carries a second created_at that
    belongs to its embedded user record -- confirm against the XML format.
    """
    return [node.firstChild.data
            for index, node in enumerate(nodes) if index % 2 == 0]


def load_tweets(path):
    """Parse the XML file at path and return its tweet timestamp strings."""
    print("Loading and parsing tweets from %s." % (path))
    nodes = minidom.parse(path).getElementsByTagName('created_at')
    print("Found %i tweets" % (len(nodes) // 2))
    return tweet_timestamps(nodes)


def interval_stats(timestamps):
    """Compute inter-tweet intervals and posting hours for one file.

    timestamps is a sequence of strings with space-separated fields
    "<weekday> <Mon> <day> <HH:MM:SS> <utc offset> <year>".  Entries whose
    offset is not '+0000' are reported and skipped.

    Returns (tint, tint2, tint3, hlist):
      tint  -- every non-zero interval (previous minus current), in minutes
      tint2 -- the intervals from tint below 1500 minutes, in minutes
      tint3 -- the intervals from tint below 300 minutes, in seconds
      hlist -- fractional hour of day of every accepted tweet

    State is local to each call, so a second file is never compared against
    the last tweet of the first one.
    """
    tint = []
    tint2 = []
    tint3 = []
    hlist = []
    prevt = None
    for stamp in timestamps:
        fields = stamp.split(' ')
        clock = fields[3].split(':')
        # make sure we're still on UT time
        if fields[4] != '+0000':
            print("Not UT, ignoring...")
            continue
        nowt = datetime(int(fields[5]), months[fields[1]], int(fields[2]),
                        int(clock[0]), int(clock[1]), int(clock[2]))
        # Record the hour for every tweet, including the first one of a file.
        hlist.append(nowt.hour + nowt.minute / 60.)
        if prevt is None:
            print("First timestamp")
        else:
            dtint = (prevt - nowt).total_seconds() / 60.
            # NOTE(review): the thresholds below are compared in minutes,
            # although earlier comments described them as 1500 s and 300 s.
            # Confirm which unit was intended.
            if dtint != 0:
                tint.append(dtint)
                if dtint < 1500.:
                    tint2.append(dtint)
                    if dtint < 300.:
                        tint3.append(dtint * 60)
        prevt = nowt
    return tint, tint2, tint3, hlist


def fit_powerlaw(counts, edges):
    """Fit a straight line to log10(count) versus log10(bin midpoint).

    Empty bins are left out of the fit.  Returns numpy.polyfit's
    [slope, intercept] result, or None when fewer than two bins are populated.
    """
    log_mid = []
    log_n = []
    for lower, upper, n in zip(edges[:-1], edges[1:], counts):
        if n != 0:
            log_mid.append(math.log10(lower + (upper - lower) / 2.))
            log_n.append(math.log10(n))
    if len(log_mid) < 2:
        return None
    return numpy.polyfit(log_mid, log_n, 1)


def _range_max(primary, fallback):
    """Return max(primary), or max(fallback) if primary is empty, else None."""
    for values in (primary, fallback):
        if values:
            return max(values)
    return None


def _plot_hist(ax, values, upper, label=None):
    """Draw a 20-bin, log-y, normalised histogram of values over (0, upper).

    Returns (counts, edges), or (None, None) when there is nothing to plot.
    """
    if not values:
        return None, None
    # `density` is the current spelling of the former `normed` keyword.
    n, bins, _ = ax.hist(values, bins=20, range=(0, upper), log=True,
                         density=True, label=label)
    return n, bins


def main(argv):
    """Build the interval and hour-of-day plots for one or two tweet files."""
    if len(argv) < 2:
        sys.exit("Usage: %s file1.xml [file2.xml]" % argv[0])
    second = len(argv) > 2

    # Output files and legends are named after the text before the first '.'.
    name = argv[1].split('.')[0]
    twitimes = load_tweets(argv[1])
    name2 = None
    twitimes2 = []
    if second:
        name2 = argv[2].split('.')[0]
        twitimes2 = load_tweets(argv[2])

    tint, tint2, tint3, hlist = interval_stats(twitimes)
    tint21, tint22, tint23, hlist2 = [], [], [], []
    # see if a second file has been provided, analyze in the same way as above
    if second:
        tint21, tint22, tint23, hlist2 = interval_stats(twitimes2)

    if not tint:
        sys.exit("No usable tweet intervals found in %s" % argv[1])

    # 2 panel plot with the whole interval on top and the shorter intervals
    # on the bottom; both files share the range so their bins line up
    fig = plt.figure()
    ax = fig.add_subplot(211)
    _plot_hist(ax, tint, max(tint), label=name)
    if second:
        _plot_hist(ax, tint21, max(tint), label=name2)
        plt.suptitle(name + ', ' + name2 + ' (Minutes)')
    else:
        plt.suptitle(name + ' (Minutes)')
    ax.set_xlabel('Time Between Tweets (minutes)')
    ax.set_ylabel('Fraction of Tweets')
    plt.legend()
    ax = fig.add_subplot(212)
    upper2 = _range_max(tint2, tint22)
    _plot_hist(ax, tint2, upper2)
    if second:
        _plot_hist(ax, tint22, upper2)
    ax.set_xlabel('Time Between Tweets (minutes)')
    ax.set_ylabel('Fraction of Tweets')
    plt.savefig(name + '-min.png', format='png')

    # make a second plot for the shortest intervals, in seconds
    fig = plt.figure()
    ax = fig.add_subplot(111)
    upper3 = _range_max(tint3, tint23)
    n3, bins3 = _plot_hist(ax, tint3, upper3, label=name)
    # fit the powerlaw (skipped when there are too few populated bins)
    p = fit_powerlaw(n3, bins3) if n3 is not None else None
    if p is not None:
        midpt = [bins3[i] + (bins3[i + 1] - bins3[i]) / 2.
                 for i in range(len(bins3) - 1)]
        y = [(10 ** p[1]) * (m ** p[0]) for m in midpt]
        plt.plot(midpt, y, label=r"Power law fit, $\gamma$=" + str(p[0]))
    if second:
        _plot_hist(ax, tint23, upper3, label=name2)
        plt.suptitle(name + ', ' + name2 + ' (Seconds)')
    else:
        plt.suptitle(name + ' (Seconds)')
    ax.set_xlabel('Time Between Tweets (seconds)')
    plt.legend()
    ax.set_ylabel('Fraction of Tweets')
    plt.savefig(name + '-sec.png', format='png')

    # plot the histogram of UT times posted
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(hlist, bins=24, range=(0, 24), log=False, density=True, label=name)
    if second and hlist2:
        ax.hist(hlist2, bins=24, range=(0, 24), log=False, density=True,
                label=name2)
    ax.set_xlabel('Hour of the Day (UTC)')
    ax.set_ylabel('Fraction of Tweets')
    plt.legend()
    plt.savefig(name + '-HR_hist.png', format='png')


if __name__ == '__main__':
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment