shreyaskarnik · July 31, 2011 00:49
diff --git a/USA.gov_bitly_hackathon.R b/USA.gov_bitly_hackathon.R
 #This is my own interpretation of the USA.gov PubSub feed, with some tips and code from HarlanH on Twitter.
 #I am interested in finding out which agencies' links are shared from which parts of the US.
 library(stringr)
 library(plyr)
 library(ggplot2)
 library(scrapeR)
 library(RJSONIO)
 library(colorspace)
 library(RColorBrewer)
 library(maps)
 data(us.cities)
 ###getting the data
 #cbgColourPalette <- scale_colour_manual(values=c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7","#CC24A7","#C679A7"))

 #

 # Set the stringsAsFactors option to FALSE for the data frames built below.
 options(stringsAsFactors = FALSE)

 # Fetch the archive's directory listing page.
 index <- getURL('http://bitly.measuredvoice.com/bitly_archive/?C=M;O=D')

 # Turn the listing into a vector of file names, one step at a time:
 # 1. grab every href="..." attribute,
 # 2. drop the leading 6 characters (href="),
 # 3. drop the first remaining double quote,
 # 4. keep only the names that mention 'bitly'.
 files <- str_extract_all(index, 'href="(.+?)"')[[1]]
 files <- str_sub(files, start = 7)
 files <- str_replace(files, '"', '')
 files <- files[str_detect(files, 'bitly')]

 # naifnull(a, b): yields NA when `a` is NULL, otherwise yields `b`.
 # `b` is only evaluated when `a` is not NULL (R evaluates arguments lazily),
 # so callers may pass an expression for `b` that indexes into `a`.
 naifnull <- function(a, b) {
   if (!is.null(a)) {
     return(b)
   }
   NA
 }

 # Download up to n.files of the listed archive files (in random order) and
 # merge their records into a single data frame, one row per JSON record.
 n.files <- 10   # maximum number of archive files to download
 n.top <- 10     # number of most frequent agencies kept later on

 # head() instead of files[1:n.files]: when the listing holds fewer than
 # n.files entries, 1:n.files indexing pads the vector with NA, and those NAs
 # would be pasted into bogus ".../NA" download URLs.
 dat.samp <- ldply(sample(head(files, n.files)), function(ff) {
   # Each archive file is fetched as text and split into lines.
   dat.txt <- str_split(getURL(paste('http://bitly.measuredvoice.com/bitly_archive/', ff, sep='')), '\n')[[1]]
   ldply(dat.txt, function(jj) {
     # Only lines starting with '{' are parsed as JSON; anything else is skipped.
     if (str_sub(jj, 1, 1) != '{') {
       return(NULL)
     }
     ll <- fromJSON(jj)
     # Parsed records with at most one element are skipped.
     if (length(ll) <= 1) {
       return(NULL)
     }
     data.frame(known_user = ll$nk,
                # Country, city and coordinates become NA when the field is absent.
                country = naifnull(ll$c, ll$c),
                geo_city_name = naifnull(ll$cy, ll$cy),
                lat = naifnull(ll$ll, ll$ll[[1]]),
                lon = naifnull(ll$ll, ll$ll[[2]]),
                # t and hc are converted as seconds since 1970-01-01 GMT.
                timestamp = as.POSIXct(ll$t, origin = "1970-01-01", tz = "GMT"),
                hash_timestamp = as.POSIXct(ll$hc, origin = "1970-01-01", tz = "GMT"),
                long_url = ll$u,
                referring_url = ll$r)
   })
 }, .progress='text')
 # Restrict the sample to US records, tag each with its .gov agency domain,
 # drop rows lacking an agency or a city, and find the n.top agencies.
 idx_us <- which(dat.samp$country == "US")
 dat.samp <- dat.samp[idx_us, ]

 # "\\." matches a literal dot; the previous unescaped "." matched any
 # character, so strings such as "blogov" could be reported as an agency.
 dat.samp$agency <- str_extract(dat.samp$long_url, '[[:alpha:]]+\\.gov')

 na_agency_index <- which(is.na(dat.samp$agency))
 na_city_index <- which(is.na(dat.samp$geo_city_name))
 na_full <- union(na_agency_index, na_city_index)

 # Select the rows to keep positively. dat.samp[-na_full, ] returns ZERO rows
 # when na_full is empty, because a zero-length negative index selects nothing.
 keep_index <- setdiff(seq_len(nrow(dat.samp)), na_full)
 dat.samp_clean <- dat.samp[keep_index, ]

 # NOTE(review): the ranking is computed on dat.samp (rows without a city
 # included), not on dat.samp_clean -- kept as in the original; confirm intent.
 common.agencies <- names(head(sort(table(dat.samp$agency), decreasing = TRUE), n.top))
 dat.common.agency <- subset(dat.samp_clean, subset = agency %in% common.agencies)
 # Same value as before; it was previously recomputed with an identical expression.
 top_n_agencies <- common.agencies
 ####Some Expts
 #colours<-c("#2f4c3d","#d741bb","#0c96c8","#a982ff","#585bed","#7b135e","#8d0a30","#d38205","#d1003d","#ac132e")
 # Plot the records of the most common agencies on a map of US state borders,
 # coloured by agency.
 colours <- brewer.pal(n.top, "Paired")
 cbgColourPalette <- scale_color_manual(values = colours)

 # theme()/element_rect()/element_text() replace opts()/theme_rect()/
 # theme_text(), which were deprecated in ggplot2 0.9.1 and are not available
 # in current releases.
 th <- theme_bw() + theme(panel.background = element_rect(fill = "gray", colour = NA))
 theme_set(th)

 # The points get dat.common.agency as their own layer data and refer to its
 # columns by name. Mapping dat.common.agency$... vectors against
 # data = us.cities fails whenever the two row counts differ.
 g <- ggplot() +
   geom_point(data = dat.common.agency,
              aes(x = lon, y = lat, colour = agency),
              size = 3) +
   borders("state", size = 0.5) +
   # breaks = NULL (formerly NA) suppresses axis ticks and grid lines.
   scale_x_continuous(limits = c(-125, -66), breaks = NULL) +
   scale_y_continuous(limits = c(25, 50), breaks = NULL) +
   cbgColourPalette +
   labs(x = NULL, y = NULL) +
   # The title follows n.top instead of hard-coding "10".
   ggtitle(paste("Top", n.top, "Agencies by Location")) +
   theme(plot.title = element_text(colour = 'black', size = 12, hjust = 0.5, vjust = 0.5, face = 'bold'),
         legend.key = element_rect(colour = 'gray', fill = 'black', size = 0.1))
 print(g)
	#This is my own interpretation of the USA.gov PubSub feed, with some tips and code from HarlanH on Twitter.
	#I am interested in finding out which agencies' links are shared from which parts of the US.
	library(stringr)
	library(plyr)
	library(ggplot2)
	library(scrapeR)
	library(RJSONIO)
	library(colorspace)
	library(RColorBrewer)
	library(maps)
	data(us.cities)
	###getting the data
	#cbgColourPalette <- scale_colour_manual(values=c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7","#CC24A7","#C679A7"))

	#

	# Set the stringsAsFactors option to FALSE for the data frames built below.
	options(stringsAsFactors = FALSE)

	# Fetch the archive's directory listing page.
	index <- getURL('http://bitly.measuredvoice.com/bitly_archive/?C=M;O=D')

	# Turn the listing into a vector of file names, one step at a time:
	# 1. grab every href="..." attribute,
	# 2. drop the leading 6 characters (href="),
	# 3. drop the first remaining double quote,
	# 4. keep only the names that mention 'bitly'.
	files <- str_extract_all(index, 'href="(.+?)"')[[1]]
	files <- str_sub(files, start = 7)
	files <- str_replace(files, '"', '')
	files <- files[str_detect(files, 'bitly')]

	# naifnull(a, b): yields NA when `a` is NULL, otherwise yields `b`.
	# `b` is only evaluated when `a` is not NULL (R evaluates arguments lazily),
	# so callers may pass an expression for `b` that indexes into `a`.
	naifnull <- function(a, b) {
	  if (!is.null(a)) {
	    return(b)
	  }
	  NA
	}

	# Download up to n.files of the listed archive files (in random order) and
	# merge their records into a single data frame, one row per JSON record.
	n.files <- 10   # maximum number of archive files to download
	n.top <- 10     # number of most frequent agencies kept later on

	# head() instead of files[1:n.files]: when the listing holds fewer than
	# n.files entries, 1:n.files indexing pads the vector with NA, and those NAs
	# would be pasted into bogus ".../NA" download URLs.
	dat.samp <- ldply(sample(head(files, n.files)), function(ff) {
	  # Each archive file is fetched as text and split into lines.
	  dat.txt <- str_split(getURL(paste('http://bitly.measuredvoice.com/bitly_archive/', ff, sep='')), '\n')[[1]]
	  ldply(dat.txt, function(jj) {
	    # Only lines starting with '{' are parsed as JSON; anything else is skipped.
	    if (str_sub(jj, 1, 1) != '{') {
	      return(NULL)
	    }
	    ll <- fromJSON(jj)
	    # Parsed records with at most one element are skipped.
	    if (length(ll) <= 1) {
	      return(NULL)
	    }
	    data.frame(known_user = ll$nk,
	               # Country, city and coordinates become NA when the field is absent.
	               country = naifnull(ll$c, ll$c),
	               geo_city_name = naifnull(ll$cy, ll$cy),
	               lat = naifnull(ll$ll, ll$ll[[1]]),
	               lon = naifnull(ll$ll, ll$ll[[2]]),
	               # t and hc are converted as seconds since 1970-01-01 GMT.
	               timestamp = as.POSIXct(ll$t, origin = "1970-01-01", tz = "GMT"),
	               hash_timestamp = as.POSIXct(ll$hc, origin = "1970-01-01", tz = "GMT"),
	               long_url = ll$u,
	               referring_url = ll$r)
	  })
	}, .progress='text')
	# Restrict the sample to US records, tag each with its .gov agency domain,
	# drop rows lacking an agency or a city, and find the n.top agencies.
	idx_us <- which(dat.samp$country == "US")
	dat.samp <- dat.samp[idx_us, ]

	# "\\." matches a literal dot; the previous unescaped "." matched any
	# character, so strings such as "blogov" could be reported as an agency.
	dat.samp$agency <- str_extract(dat.samp$long_url, '[[:alpha:]]+\\.gov')

	na_agency_index <- which(is.na(dat.samp$agency))
	na_city_index <- which(is.na(dat.samp$geo_city_name))
	na_full <- union(na_agency_index, na_city_index)

	# Select the rows to keep positively. dat.samp[-na_full, ] returns ZERO rows
	# when na_full is empty, because a zero-length negative index selects nothing.
	keep_index <- setdiff(seq_len(nrow(dat.samp)), na_full)
	dat.samp_clean <- dat.samp[keep_index, ]

	# NOTE(review): the ranking is computed on dat.samp (rows without a city
	# included), not on dat.samp_clean -- kept as in the original; confirm intent.
	common.agencies <- names(head(sort(table(dat.samp$agency), decreasing = TRUE), n.top))
	dat.common.agency <- subset(dat.samp_clean, subset = agency %in% common.agencies)
	# Same value as before; it was previously recomputed with an identical expression.
	top_n_agencies <- common.agencies
	####Some Expts
	#colours<-c("#2f4c3d","#d741bb","#0c96c8","#a982ff","#585bed","#7b135e","#8d0a30","#d38205","#d1003d","#ac132e")
	# Plot the records of the most common agencies on a map of US state borders,
	# coloured by agency.
	colours <- brewer.pal(n.top, "Paired")
	cbgColourPalette <- scale_color_manual(values = colours)

	# theme()/element_rect()/element_text() replace opts()/theme_rect()/
	# theme_text(), which were deprecated in ggplot2 0.9.1 and are not available
	# in current releases.
	th <- theme_bw() + theme(panel.background = element_rect(fill = "gray", colour = NA))
	theme_set(th)

	# The points get dat.common.agency as their own layer data and refer to its
	# columns by name. Mapping dat.common.agency$... vectors against
	# data = us.cities fails whenever the two row counts differ.
	g <- ggplot() +
	  geom_point(data = dat.common.agency,
	             aes(x = lon, y = lat, colour = agency),
	             size = 3) +
	  borders("state", size = 0.5) +
	  # breaks = NULL (formerly NA) suppresses axis ticks and grid lines.
	  scale_x_continuous(limits = c(-125, -66), breaks = NULL) +
	  scale_y_continuous(limits = c(25, 50), breaks = NULL) +
	  cbgColourPalette +
	  labs(x = NULL, y = NULL) +
	  # The title follows n.top instead of hard-coding "10".
	  ggtitle(paste("Top", n.top, "Agencies by Location")) +
	  theme(plot.title = element_text(colour = 'black', size = 12, hjust = 0.5, vjust = 0.5, face = 'bold'),
	        legend.key = element_rect(colour = 'gray', fill = 'black', size = 0.1))
	print(g)