hillarysanders · January 7, 2015 02:17
diff --git a/gistfile1.txt b/gistfile1.txt
 ##########################################################################################
 ##########################################################################################
 # PREMISE
 # hills
 ##########################################################################################
 # Working directory for the analysis. Nothing visible in this script reads a
 # relative path, but setwd() still fails if the directory does not exist.
 wd <- "~/Desktop/PREMISE/"
 setwd(wd)
 # NOTE(review): fix.factors(), mse(), get.colors() and draw.shape() are used
 # below but are not defined in this file; presumably they come from the helper
 # script that is commented out here -- confirm before running.
 # source("Hillary_Premise/utils/env.R")
 ##########################################################################################
 ##########################################################################################


 # Read the raw results and report their size and column names.
 x <- read.csv('~/Downloads/anand-google-electricity-results.csv')
 dim(x)
 print(names(x))

 # Keep only the columns this analysis refers to.
 wanted_columns <- c(
   'b_type', 'b_size', 'b_wall', 'b_roof', 'wired', 'image', 'o_uuid',
   'timestamp', 'g_ring_dis', 'g_ring_id', 'g_ring_name', 'e_sub', 'e_obj',
   'e_rel', 'e_has_rel', 'user_name', 'id'
 )
 x <- fix.factors(x[, wanted_columns])
 # x$g_ring_id[x$g_ring_id=='-'] = '00000'

 # Keep rows that have a known outcome and a usable ring id; a ring id of '-'
 # or NA is discarded. Then sort so that each ring's rows sit together.
 has.outcome <- !is.na(x$wired)
 has.ring <- !is.na(x$g_ring_id) & x$g_ring_id != '-'
 x <- x[has.outcome & has.ring, ]
 x <- x[order(x$g_ring_id), ]

 # info keeps every wanted column (it is used later to look rows up by ring),
 # y is the outcome, and x is narrowed to the raw predictor columns.
 info <- x
 y <- x$wired
 feature.columns <- c('g_ring_dis', 'b_type', 'b_size', 'b_wall', 'b_roof')
 x <- x[, feature.columns]

 # 65.7
 # For each distinct wall type, print the type and the correlation between
 # "building has this wall type" (TRUE/FALSE) and the outcome y.
 wall.types <- sort(unique(x$b_wall))
 for (w in wall.types) {
   print(w)
   is.this.wall <- x$b_wall == w
   print(cor(is.this.wall, y))
 }

 ####################################################################################
 # CLEANING:
 # Recode the categorical building attributes as numbers. Any value that is not
 # explicitly recoded below turns into NA when the column goes through
 # as.numeric().

 # residence: 0 for 't_business', 1 for every other building type.
 x$residence <- as.numeric(x$b_type != 't_business')
 x$b_type <- NULL

 # b_size: small / medium / large become 1 / 2 / 3; blank becomes NA.
 size.codes <- c(s_small = 1, s_medium = 2, s_large = 3)
 for (size.level in names(size.codes)) {
   x$b_size[x$b_size %in% size.level] <- size.codes[[size.level]]
 }
 x$b_size[x$b_size %in% ''] <- NA
 x$b_size <- as.numeric(x$b_size)

 # b_wall: 1 for brick or cement, 0 for mud, wood, sheet or blank.
 x$b_wall[x$b_wall %in% c('w_mud', 'w_wood', 'w_sheet', '')] <- 0
 x$b_wall[x$b_wall %in% c('w_brick', 'w_cement')] <- 1
 x$b_wall <- as.numeric(x$b_wall)

 # b_roof: tile = 2, sheet = 1; blank and 'r_other' become NA.
 x$b_roof[x$b_roof %in% 'r_tile'] <- 2
 x$b_roof[x$b_roof %in% 'r_sheet'] <- 1
 x$b_roof[x$b_roof %in% c('', 'r_other')] <- NA
 x$b_roof <- as.numeric(x$b_roof)
 table(x$b_roof)

 # Drop every observation that has an NA in any predictor column, applying the
 # same row mask to y and info so all three stay aligned row-for-row.
 complete.rows <- complete.cases(x)
 y <- y[complete.rows]
 info <- info[complete.rows, ]
 x <- x[complete.rows, ]
 ####################################################################################



 ####################################################################################
 # do it:
 # Leave-one-ring-out evaluation: for every ring, fit a logistic regression on
 # the rows of all other rings, predict the held-out ring, and store that
 # ring's metrics in `acc`. Predictions and outcomes are also pooled, in ring
 # order, into all.predictions / all.y.
 rings <- unique(info$g_ring_id)
 acc <- data.frame(ring=rings, accuracy=NA, MSE=NA, abs.error=NA, n=NA, cor=NA,
                   precision.wired=NA, recall.wired=NA, precision.unwired=NA, recall.unwired=NA,
                   stringsAsFactors = FALSE)

 all.predictions <- NULL
 all.y <- NULL
 for(ring in rings){

   cat(paste('\n Ring:', ring))

   test.idx <- info$g_ring_id == ring
   train.idx <- info$g_ring_id != ring
   x.train <- x[train.idx, ]
   x.test <- x[test.idx, ]
   y.train <- y[train.idx]
   y.test <- y[test.idx]

   fitted <- glm(formula = y.train ~ g_ring_dis+b_size+b_wall+b_roof+residence,
                 data = x.train, family = binomial(link = "logit"))
 #   fitted = glm(formula = y.train ~ b_size + b_wall,
 #              data = x.train, family = binomial(link = "logit"))
   # predict.glm() defaults to type = "link", so `predictions` are log-odds and
   # a cutoff of 0 corresponds to a predicted probability of 0.5.
   predictions <- predict.glm(fitted, newdata = x.test)
   binary <- predictions > 0

   # Fix both factor level sets so the confusion table is always 2x2
   # (rows: predicted FALSE/TRUE, columns: observed 0/1). Without this, a ring
   # where only one class is predicted or observed yields a smaller table and
   # positional indexing silently picks the wrong cells.
   tab <- table(factor(binary, levels = c(FALSE, TRUE)),
                factor(as.integer(y.test), levels = c(0L, 1L)))
   true.neg <- tab[1, 1]
   false.pos <- tab[2, 1]
   false.neg <- tab[1, 2]
   true.pos <- tab[2, 2]

   n <- length(y.test)
   cat(paste0('\nTrained ', length(y.train), ' obs from other rings to test ', n, ' obs from ring ', ring))
   n.correct <- true.neg + true.pos
   cat(paste0('\nUsing log-odds=0 (prob=0.5) as a cutoff, success rate was ', n.correct, '/', n, ': ', round(100*n.correct / n, 1), '%'))

   ring.idx <- which(acc$ring==ring)
   acc$accuracy[ring.idx] <- n.correct / n
   # NOTE(review): mse() is not defined in this file. `predictions` are on the
   # log-odds scale while y.test looks like a 0/1 outcome, so these two error
   # figures compare different scales -- confirm that is intended.
   acc$MSE[ring.idx] <- mse(x = predictions, y = y.test, normalize = FALSE, exp = 2)
   acc$abs.error[ring.idx] <- mse(x = predictions, y = y.test, normalize = FALSE, exp = 1)
   acc$n[ring.idx] <- n
   acc$cor[ring.idx] <- cor(predictions, y.test)

   # A zero denominator gives NaN here; the na.rm weighted means computed from
   # `acc` later skip such rings.
   acc$precision.wired[ring.idx] <- true.pos / (true.pos + false.pos)
   acc$recall.wired[ring.idx] <- true.pos / (true.pos + false.neg)

   acc$precision.unwired[ring.idx] <- true.neg / (true.neg + false.neg)
   acc$recall.unwired[ring.idx] <- true.neg / (true.neg + false.pos)

   all.predictions <- c(all.predictions, predictions)
   all.y <- c(all.y, y.test)

 }


 # Pool the per-ring metrics in `acc` into overall figures, weighting each ring
 # by its number of test observations; rings whose metric is NA/NaN are skipped.
 ave.accuracy <- weighted.mean(x = acc$accuracy, w = acc$n, na.rm=TRUE)
 ave.cor <- weighted.mean(x = acc$cor, w=acc$n, na.rm=TRUE)
 ave.wired.precision <- weighted.mean(x = acc$precision.wired, w=acc$n, na.rm=TRUE)
 ave.unwired.precision <- weighted.mean(x = acc$precision.unwired, w=acc$n, na.rm=TRUE)
 ave.wired.recall <- weighted.mean(x = acc$recall.wired, w=acc$n, na.rm=TRUE)
 ave.unwired.recall <- weighted.mean(x = acc$recall.unwired, w=acc$n, na.rm=TRUE)
 # mean() averages a single vector; a second positional argument is taken as
 # `trim`, so the two class-wise values must be combined with c() to be averaged.
 ave.recall <- mean(c(ave.unwired.recall, ave.wired.recall))
 ave.precision <- mean(c(ave.unwired.precision, ave.wired.precision))
 cat(paste0('\n\n\n Average Accuracy (out of sample) = ', round(ave.accuracy*100, 1), '%'))
 cat(paste0('\nCorrelation between predictions and truth: ', round(ave.cor, 2)))
 cat(paste0('\nAve Precision: ', round(ave.precision, 2)))
 cat(paste0('\nAve Recall: ', round(ave.recall, 2)))



 # Plot every pooled out-of-sample prediction in ring order: one shaded band
 # and label per ring, points coloured by the true outcome, and a dashed line
 # at the 0 cutoff.
 cols <- get.colors(length(rings))
 n.pred <- length(all.predictions)
 y.limits <- range(all.predictions, na.rm = TRUE) + c(-.3, .1)
 # NOTE(review): base R's pretty() computes axis break points rather than a
 # formatted number; unless a project helper masks it, this title may come out
 # as several strings -- confirm.
 plot(c(1, n.pred), y.limits, xaxt = 'n', xlab = '',
      ylab = 'Does the Model think a building has Electricity?',
      main = paste0(pretty(n.pred), ' out-of-sample Predictions'), cex = 0)

 # Ring bands use row positions in `info`, which is sorted by ring id.
 label.y <- min(all.predictions, na.rm = TRUE) - .6
 for (i in seq_along(rings)) {
   ring.rows <- which(info$g_ring_id == rings[i])
   draw.shape(range(ring.rows), y1 = -2, y2 = 4, col = cols[i], border = NA)
   text(mean(ring.rows), label.y, labels = rings[i], cex = .65, xpd = TRUE, srt = 45)
 }

 # Yellow = outcome 1 (wired), black = outcome 0 (not wired).
 point.cols <- c('black', 'yellow')[1 + all.y]
 points(all.predictions, col = point.cols, pch = 19, cex = .4)
 lines(c(1, length(all.y)), c(0, 0), lwd = 2, lty = 2)
 legend('bottomright', legend = c('Wired', 'Not Wired'), col = c('yellow', 'black'),
        pch = 19, pt.cex = 1.5, bty = 'n', box.lwd = 0)
 legend('bottomleft', legend = paste0(round(100 * ave.accuracy, 1), '% Accuracy'), bty = 'n')



 ####################################################################################
 ####################################################################################
 ####################################################################################
 ####################################################################################


 # - Turning the values into ranked numbers helped a tiny bit; the only thing that really
 #   matters is wall type. 66% accuracy.

 # So: 65% is not great, but it's okay. Basically, b_wall is the only thing that makes the
 # prediction good. Type of roof is interesting, but doesn't seem to actually be that helpful
 # (likely due to scarcity and errors).
 # cor(y, x$b_roof)
 # Correlation of every cleaned predictor column with the outcome, to two decimals.
 predictor.cor <- cor(x, y)
 round(predictor.cor, 2)
 # I would have guessed that adding e.g. b_size to b_wall in the predictors would help, but in
 # this particular test it didn't.

 ####################################################################################
 ####################################################################################
 ####################################################################################
 ####################################################################################

 # # ---> add geo clusters and then add that as a feature.
 # Use KNN to extract clusters and then predict how wired houses are. 
 # - first you need to know where each transformer is
 # 
 # or use streets on a grid to do this. 
 # (everything is an obs so a few rows = the transformers)
 # 
 # end of week = position to
 # 1) give them clusters and scores
 # 2) tell them we're adding more signal so this is preliminary
 # 3) bs some 'good candidate / bad candidate' thing

 # also make bin/run work and play with a CSV in python!! run through the clean and index code
	##########################################################################################
	##########################################################################################
	# PREMISE
	# hills
	##########################################################################################
	# Working directory for the analysis. Nothing visible in this script reads a
	# relative path, but setwd() still fails if the directory does not exist.
	wd <- "~/Desktop/PREMISE/"
	setwd(wd)
	# NOTE(review): fix.factors(), mse(), get.colors() and draw.shape() are used
	# below but are not defined in this file; presumably they come from the helper
	# script that is commented out here -- confirm before running.
	# source("Hillary_Premise/utils/env.R")
	##########################################################################################
	##########################################################################################


	# Read the raw results and report their size and column names.
	x <- read.csv('~/Downloads/anand-google-electricity-results.csv')
	dim(x)
	print(names(x))

	# Keep only the columns this analysis refers to.
	wanted_columns <- c(
	  'b_type', 'b_size', 'b_wall', 'b_roof', 'wired', 'image', 'o_uuid',
	  'timestamp', 'g_ring_dis', 'g_ring_id', 'g_ring_name', 'e_sub', 'e_obj',
	  'e_rel', 'e_has_rel', 'user_name', 'id'
	)
	x <- fix.factors(x[, wanted_columns])
	# x$g_ring_id[x$g_ring_id=='-'] = '00000'

	# Keep rows that have a known outcome and a usable ring id; a ring id of '-'
	# or NA is discarded. Then sort so that each ring's rows sit together.
	has.outcome <- !is.na(x$wired)
	has.ring <- !is.na(x$g_ring_id) & x$g_ring_id != '-'
	x <- x[has.outcome & has.ring, ]
	x <- x[order(x$g_ring_id), ]

	# info keeps every wanted column (it is used later to look rows up by ring),
	# y is the outcome, and x is narrowed to the raw predictor columns.
	info <- x
	y <- x$wired
	feature.columns <- c('g_ring_dis', 'b_type', 'b_size', 'b_wall', 'b_roof')
	x <- x[, feature.columns]

	# 65.7
	# For each distinct wall type, print the type and the correlation between
	# "building has this wall type" (TRUE/FALSE) and the outcome y.
	wall.types <- sort(unique(x$b_wall))
	for (w in wall.types) {
	  print(w)
	  is.this.wall <- x$b_wall == w
	  print(cor(is.this.wall, y))
	}

	####################################################################################
	# CLEANING:
	# Recode the categorical building attributes as numbers. Any value that is not
	# explicitly recoded below turns into NA when the column goes through
	# as.numeric().

	# residence: 0 for 't_business', 1 for every other building type.
	x$residence <- as.numeric(x$b_type != 't_business')
	x$b_type <- NULL

	# b_size: small / medium / large become 1 / 2 / 3; blank becomes NA.
	size.codes <- c(s_small = 1, s_medium = 2, s_large = 3)
	for (size.level in names(size.codes)) {
	  x$b_size[x$b_size %in% size.level] <- size.codes[[size.level]]
	}
	x$b_size[x$b_size %in% ''] <- NA
	x$b_size <- as.numeric(x$b_size)

	# b_wall: 1 for brick or cement, 0 for mud, wood, sheet or blank.
	x$b_wall[x$b_wall %in% c('w_mud', 'w_wood', 'w_sheet', '')] <- 0
	x$b_wall[x$b_wall %in% c('w_brick', 'w_cement')] <- 1
	x$b_wall <- as.numeric(x$b_wall)

	# b_roof: tile = 2, sheet = 1; blank and 'r_other' become NA.
	x$b_roof[x$b_roof %in% 'r_tile'] <- 2
	x$b_roof[x$b_roof %in% 'r_sheet'] <- 1
	x$b_roof[x$b_roof %in% c('', 'r_other')] <- NA
	x$b_roof <- as.numeric(x$b_roof)
	table(x$b_roof)

	# Drop every observation that has an NA in any predictor column, applying the
	# same row mask to y and info so all three stay aligned row-for-row.
	complete.rows <- complete.cases(x)
	y <- y[complete.rows]
	info <- info[complete.rows, ]
	x <- x[complete.rows, ]
	####################################################################################



	####################################################################################
	# do it:
	# Leave-one-ring-out evaluation: for every ring, fit a logistic regression on
	# the rows of all other rings, predict the held-out ring, and store that
	# ring's metrics in `acc`. Predictions and outcomes are also pooled, in ring
	# order, into all.predictions / all.y.
	rings <- unique(info$g_ring_id)
	acc <- data.frame(ring=rings, accuracy=NA, MSE=NA, abs.error=NA, n=NA, cor=NA,
	                  precision.wired=NA, recall.wired=NA, precision.unwired=NA, recall.unwired=NA,
	                  stringsAsFactors = FALSE)

	all.predictions <- NULL
	all.y <- NULL
	for(ring in rings){

	  cat(paste('\n Ring:', ring))

	  test.idx <- info$g_ring_id == ring
	  train.idx <- info$g_ring_id != ring
	  x.train <- x[train.idx, ]
	  x.test <- x[test.idx, ]
	  y.train <- y[train.idx]
	  y.test <- y[test.idx]

	  fitted <- glm(formula = y.train ~ g_ring_dis+b_size+b_wall+b_roof+residence,
	                data = x.train, family = binomial(link = "logit"))
	  # fitted = glm(formula = y.train ~ b_size + b_wall,
	  # data = x.train, family = binomial(link = "logit"))
	  # predict.glm() defaults to type = "link", so `predictions` are log-odds and
	  # a cutoff of 0 corresponds to a predicted probability of 0.5.
	  predictions <- predict.glm(fitted, newdata = x.test)
	  binary <- predictions > 0

	  # Fix both factor level sets so the confusion table is always 2x2
	  # (rows: predicted FALSE/TRUE, columns: observed 0/1). Without this, a ring
	  # where only one class is predicted or observed yields a smaller table and
	  # positional indexing silently picks the wrong cells.
	  tab <- table(factor(binary, levels = c(FALSE, TRUE)),
	               factor(as.integer(y.test), levels = c(0L, 1L)))
	  true.neg <- tab[1, 1]
	  false.pos <- tab[2, 1]
	  false.neg <- tab[1, 2]
	  true.pos <- tab[2, 2]

	  n <- length(y.test)
	  cat(paste0('\nTrained ', length(y.train), ' obs from other rings to test ', n, ' obs from ring ', ring))
	  n.correct <- true.neg + true.pos
	  cat(paste0('\nUsing log-odds=0 (prob=0.5) as a cutoff, success rate was ', n.correct, '/', n, ': ', round(100*n.correct / n, 1), '%'))

	  ring.idx <- which(acc$ring==ring)
	  acc$accuracy[ring.idx] <- n.correct / n
	  # NOTE(review): mse() is not defined in this file. `predictions` are on the
	  # log-odds scale while y.test looks like a 0/1 outcome, so these two error
	  # figures compare different scales -- confirm that is intended.
	  acc$MSE[ring.idx] <- mse(x = predictions, y = y.test, normalize = FALSE, exp = 2)
	  acc$abs.error[ring.idx] <- mse(x = predictions, y = y.test, normalize = FALSE, exp = 1)
	  acc$n[ring.idx] <- n
	  acc$cor[ring.idx] <- cor(predictions, y.test)

	  # A zero denominator gives NaN here; the na.rm weighted means computed from
	  # `acc` later skip such rings.
	  acc$precision.wired[ring.idx] <- true.pos / (true.pos + false.pos)
	  acc$recall.wired[ring.idx] <- true.pos / (true.pos + false.neg)

	  acc$precision.unwired[ring.idx] <- true.neg / (true.neg + false.neg)
	  acc$recall.unwired[ring.idx] <- true.neg / (true.neg + false.pos)

	  all.predictions <- c(all.predictions, predictions)
	  all.y <- c(all.y, y.test)

	}


	# Pool the per-ring metrics in `acc` into overall figures, weighting each ring
	# by its number of test observations; rings whose metric is NA/NaN are skipped.
	ave.accuracy <- weighted.mean(x = acc$accuracy, w = acc$n, na.rm=TRUE)
	ave.cor <- weighted.mean(x = acc$cor, w=acc$n, na.rm=TRUE)
	ave.wired.precision <- weighted.mean(x = acc$precision.wired, w=acc$n, na.rm=TRUE)
	ave.unwired.precision <- weighted.mean(x = acc$precision.unwired, w=acc$n, na.rm=TRUE)
	ave.wired.recall <- weighted.mean(x = acc$recall.wired, w=acc$n, na.rm=TRUE)
	ave.unwired.recall <- weighted.mean(x = acc$recall.unwired, w=acc$n, na.rm=TRUE)
	# mean() averages a single vector; a second positional argument is taken as
	# `trim`, so the two class-wise values must be combined with c() to be averaged.
	ave.recall <- mean(c(ave.unwired.recall, ave.wired.recall))
	ave.precision <- mean(c(ave.unwired.precision, ave.wired.precision))
	cat(paste0('\n\n\n Average Accuracy (out of sample) = ', round(ave.accuracy*100, 1), '%'))
	cat(paste0('\nCorrelation between predictions and truth: ', round(ave.cor, 2)))
	cat(paste0('\nAve Precision: ', round(ave.precision, 2)))
	cat(paste0('\nAve Recall: ', round(ave.recall, 2)))



	# Plot every pooled out-of-sample prediction in ring order: one shaded band
	# and label per ring, points coloured by the true outcome, and a dashed line
	# at the 0 cutoff.
	cols <- get.colors(length(rings))
	n.pred <- length(all.predictions)
	y.limits <- range(all.predictions, na.rm = TRUE) + c(-.3, .1)
	# NOTE(review): base R's pretty() computes axis break points rather than a
	# formatted number; unless a project helper masks it, this title may come out
	# as several strings -- confirm.
	plot(c(1, n.pred), y.limits, xaxt = 'n', xlab = '',
	     ylab = 'Does the Model think a building has Electricity?',
	     main = paste0(pretty(n.pred), ' out-of-sample Predictions'), cex = 0)

	# Ring bands use row positions in `info`, which is sorted by ring id.
	label.y <- min(all.predictions, na.rm = TRUE) - .6
	for (i in seq_along(rings)) {
	  ring.rows <- which(info$g_ring_id == rings[i])
	  draw.shape(range(ring.rows), y1 = -2, y2 = 4, col = cols[i], border = NA)
	  text(mean(ring.rows), label.y, labels = rings[i], cex = .65, xpd = TRUE, srt = 45)
	}

	# Yellow = outcome 1 (wired), black = outcome 0 (not wired).
	point.cols <- c('black', 'yellow')[1 + all.y]
	points(all.predictions, col = point.cols, pch = 19, cex = .4)
	lines(c(1, length(all.y)), c(0, 0), lwd = 2, lty = 2)
	legend('bottomright', legend = c('Wired', 'Not Wired'), col = c('yellow', 'black'),
	       pch = 19, pt.cex = 1.5, bty = 'n', box.lwd = 0)
	legend('bottomleft', legend = paste0(round(100 * ave.accuracy, 1), '% Accuracy'), bty = 'n')



	####################################################################################
	####################################################################################
	####################################################################################
	####################################################################################


	# - Turning the values into ranked numbers helped a tiny bit; the only thing that really
	#   matters is wall type. 66% accuracy.

	# So: 65% is not great, but it's okay. Basically, b_wall is the only thing that makes the
	# prediction good. Type of roof is interesting, but doesn't seem to actually be that helpful
	# (likely due to scarcity and errors).
	# cor(y, x$b_roof)
	# Correlation of every cleaned predictor column with the outcome, to two decimals.
	predictor.cor <- cor(x, y)
	round(predictor.cor, 2)
	# I would have guessed that adding e.g. b_size to b_wall in the predictors would help, but in
	# this particular test it didn't.

	####################################################################################
	####################################################################################
	####################################################################################
	####################################################################################

	# # ---> add geo clusters and then add that as a feature.
	# Use KNN to extract clusters and then predict how wired houses are.
	# - first you need to know where each transformer is
	#
	# or use streets on a grid to do this.
	# (everything is an obs so a few rows = the transformers)
	#
	# end of week = position to
	# 1) give them clusters and scores
	# 2) tell them we're adding more signal so this is preliminary
	# 3) bs some 'good candidate / bad candidate' thing

	# also make bin/run work and play with a CSV in python!! run through the clean and index code