Skip to content

Instantly share code, notes, and snippets.

@hillarysanders
Created January 7, 2015 02:17
Show Gist options
  • Save hillarysanders/22bc484b0d24f0600aaf to your computer and use it in GitHub Desktop.
Save hillarysanders/22bc484b0d24f0600aaf to your computer and use it in GitHub Desktop.
##########################################################################################
##########################################################################################
# PREMISE
# hills
##########################################################################################
# Run the whole analysis from the PREMISE project folder.
wd <- "~/Desktop/PREMISE/"
setwd(wd)
# NOTE(review): fix.factors(), mse(), get.colors() and draw.shape() are called
# further down but are not defined anywhere in this file; presumably they come
# from the env.R helper script below, which is commented out -- confirm and
# re-enable (or load the helpers some other way) before running.
# source("Hillary_Premise/utils/env.R")
##########################################################################################
##########################################################################################
# Load the raw survey export and show its size and column names.
x <- read.csv('~/Downloads/anand-google-electricity-results.csv')
dim(x)
print(colnames(x))

# Keep only the columns used below, then normalise them with the project helper.
wanted_columns <- c(
  'b_type', 'b_size', 'b_wall', 'b_roof', 'wired', 'image', 'o_uuid', 'timestamp',
  'g_ring_dis', 'g_ring_id', 'g_ring_name', 'e_sub', 'e_obj', 'e_rel', 'e_has_rel',
  'user_name', 'id'
)
x <- fix.factors(x[, wanted_columns])

# Keep rows that have a wired label and a real ring id ('-' marks "no ring").
# The mask is NA-free, so indexing can never inject all-NA rows.
# (Alternative that was tried: x$g_ring_id[x$g_ring_id=='-'] = '00000')
has.label <- !is.na(x$wired)
has.ring <- !is.na(x$g_ring_id) & x$g_ring_id != '-'
x <- x[has.label & has.ring, ]

# Sort by ring so that each ring's rows are contiguous.
x <- x[order(x$g_ring_id), ]

# info: full metadata per row; y: the label; x: the model features only.
info <- x
y <- info$wired
x <- info[, c('g_ring_dis', 'b_type', 'b_size', 'b_wall', 'b_roof')]
# 65.7
# Quick look: for every wall material, print the material followed by the
# correlation between "this building has that wall type" and the wired label.
wall.types <- sort(unique(x$b_wall))
for (wall in wall.types) {
  print(wall)
  is.this.wall <- x$b_wall == wall
  print(cor(is.this.wall, y))
}
####################################################################################
# CLEANING:
# residence: 0 for businesses ('t_business'), 1 for every other building type.
x$residence <- as.numeric(x$b_type != 't_business')
x$b_type <- NULL

# b_size: ordinal rank small < medium < large; blank becomes missing.
size.raw <- x$b_size
size.rank <- c(s_small = 1, s_medium = 2, s_large = 3)
for (label in names(size.rank)) {
  x$b_size[size.raw %in% label] <- size.rank[[label]]
}
x$b_size[size.raw %in% ''] <- NA
x$b_size <- as.numeric(x$b_size)

# b_wall: 1 for brick/cement walls, 0 for mud/wood/sheet or blank.
wall.raw <- x$b_wall
x$b_wall[wall.raw %in% c('w_mud', 'w_wood', 'w_sheet', '')] <- 0
x$b_wall[wall.raw %in% c('w_brick', 'w_cement')] <- 1
x$b_wall <- as.numeric(x$b_wall)

# b_roof: tile = 2, sheet = 1; blank and 'r_other' become missing.
roof.raw <- x$b_roof
x$b_roof[roof.raw %in% 'r_tile'] <- 2
x$b_roof[roof.raw %in% 'r_sheet'] <- 1
x$b_roof[roof.raw %in% c('', 'r_other')] <- NA
x$b_roof <- as.numeric(x$b_roof)
table(x$b_roof)

# Drop every row with a missing feature, keeping y and info aligned with x.
complete <- complete.cases(x)
y <- y[complete]
info <- info[complete, ]
x <- x[complete, ]
####################################################################################
####################################################################################
# do it:
# Leave-one-ring-out cross-validation. For each ring, a logistic regression is
# fitted on the buildings from all other rings and scored on the held-out ring.
# Per-ring metrics are stored in `acc`; held-out predictions and labels are
# appended, in ring order, to `all.predictions` and `all.y`.
rings <- unique(info$g_ring_id)
acc <- data.frame(ring = rings, accuracy = NA, MSE = NA, abs.error = NA, n = NA, cor = NA,
                  precision.wired = NA, recall.wired = NA,
                  precision.unwired = NA, recall.unwired = NA,
                  stringsAsFactors = FALSE)
all.predictions <- NULL
all.y <- NULL
for (ring in rings) {
  cat(paste('\n Ring:', ring))
  test.idx <- info$g_ring_id == ring
  train.idx <- info$g_ring_id != ring
  x.train <- x[train.idx, ]
  x.test <- x[test.idx, ]
  y.train <- y[train.idx]
  y.test <- y[test.idx]

  fitted <- glm(formula = y.train ~ g_ring_dis + b_size + b_wall + b_roof + residence,
                data = x.train, family = binomial(link = "logit"))
  # Alternative model that was tried:
  # fitted = glm(formula = y.train ~ b_size + b_wall,
  #              data = x.train, family = binomial(link = "logit"))

  # Predictions are on the link (log-odds) scale, so "> 0" means "prob > 0.5".
  predictions <- predict(fitted, newdata = x.test, type = "link")
  binary <- predictions > 0
  actual <- y.test == 1

  # Count the confusion-matrix cells directly. Indexing table(binary, y.test)
  # by position breaks whenever a ring has only one predicted class or only
  # one true class, because the table is then no longer 2x2.
  true.pos <- sum(binary & actual)
  false.pos <- sum(binary & !actual)
  false.neg <- sum(!binary & actual)
  true.neg <- sum(!binary & !actual)

  n <- length(y.test)
  cat(paste0('\nTrained ', length(y.train), ' obs from other rings to test ', n, ' obs from ring ', ring))
  n.correct <- true.pos + true.neg
  cat(paste0('\nUsing log-odds=0 (prob=0.5) as a cutoff, success rate was ', n.correct, '/', n, ': ',
             round(100 * n.correct / n, 1), '%'))

  ring.idx <- which(acc$ring == ring)
  acc$accuracy[ring.idx] <- n.correct / n
  # NOTE(review): these compare log-odds predictions with the 0/1 labels;
  # if mse() is meant to score probabilities, pass plogis(predictions) -- confirm.
  acc$MSE[ring.idx] <- mse(x = predictions, y = y.test, normalize = F, exp = 2)
  acc$abs.error[ring.idx] <- mse(x = predictions, y = y.test, normalize = F, exp = 1)
  acc$n[ring.idx] <- n
  acc$cor[ring.idx] <- cor(predictions, y.test)
  # A zero denominator yields NaN, which the na.rm = TRUE averages later skip.
  acc$precision.wired[ring.idx] <- true.pos / (true.pos + false.pos)
  acc$recall.wired[ring.idx] <- true.pos / (true.pos + false.neg)
  acc$precision.unwired[ring.idx] <- true.neg / (true.neg + false.neg)
  acc$recall.unwired[ring.idx] <- true.neg / (true.neg + false.pos)

  all.predictions <- c(all.predictions, predictions)
  all.y <- c(all.y, y.test)
}
# Overall out-of-sample metrics: per-ring values from `acc`, weighted by the
# number of test observations in each ring (rings with NA/NaN metrics skipped).
ave.accuracy <- weighted.mean(x = acc$accuracy, w = acc$n, na.rm = TRUE)
ave.cor <- weighted.mean(x = acc$cor, w = acc$n, na.rm = TRUE)
ave.wired.precision <- weighted.mean(x = acc$precision.wired, w = acc$n, na.rm = TRUE)
ave.unwired.precision <- weighted.mean(x = acc$precision.unwired, w = acc$n, na.rm = TRUE)
ave.wired.recall <- weighted.mean(x = acc$recall.wired, w = acc$n, na.rm = TRUE)
ave.unwired.recall <- weighted.mean(x = acc$recall.unwired, w = acc$n, na.rm = TRUE)

# Macro-average over the two classes. The values must be wrapped in c():
# mean(a, b) treats b as the `trim` argument and simply returns a.
ave.recall <- mean(c(ave.unwired.recall, ave.wired.recall))
ave.precision <- mean(c(ave.unwired.precision, ave.wired.precision))

cat(paste0('\n\n\n Average Accuracy (out of sample) = ', round(ave.accuracy * 100, 1), '%'))
cat(paste0('\nCorrelation between predictions and truth: ', round(ave.cor, 2)))
cat(paste0('\nAve Precision: ', round(ave.precision, 2)))
cat(paste0('\nAve Recall: ', round(ave.recall, 2)))
# Plot every out-of-sample prediction (log-odds) in row order, with one shaded,
# labelled band per ring and points coloured by the true wired label.
cols <- get.colors(length(rings))
n.predictions <- length(all.predictions)
# Empty canvas (cex = 0). The title shows the exact prediction count;
# pretty() would return rounded axis breakpoints instead of the number itself.
plot(c(1, n.predictions), range(all.predictions, na.rm = TRUE) + c(-.3, .1), xaxt = 'n', xlab = '',
     ylab = 'Does the Model think a building has Electricity?',
     main = paste0(format(n.predictions, big.mark = ','), ' out-of-sample Predictions'), cex = 0)
for (i in seq_along(rings)) {
  # Row positions of this ring; assumes the rows of `info` line up with
  # all.predictions (info sorted by ring, predictions appended ring by ring).
  ring.rows <- which(info$g_ring_id == rings[i])
  draw.shape(range(ring.rows), y1 = -2, y2 = 4, col = cols[i], border = NA)
  text(mean(ring.rows), min(all.predictions, na.rm = TRUE) - .6, labels = rings[i],
       cex = .65, xpd = TRUE, srt = 45)
}
points(all.predictions, col = c('black', 'yellow')[1 + all.y], pch = 19, cex = .4)
# Dashed decision boundary at log-odds 0.
lines(c(1, length(all.y)), c(0, 0), lwd = 2, lty = 2)
legend('bottomright', legend = c('Wired', 'Not Wired'), col = c('yellow', 'black'),
       pch = 19, pt.cex = 1.5, bty = 'n', box.lwd = 0)
legend('bottomleft', legend = paste0(round(100 * ave.accuracy, 1), '% Accuracy'), bty = 'n')
####################################################################################
####################################################################################
####################################################################################
####################################################################################
# - Recoding the categorical values as ranked numbers helped a tiny bit; the only thing that
# really matters is wall type. 66% accuracy.
# So: ~65% is not that great, but it's okay. Basically, b_wall is the only feature that makes the prediction good. Type of roof
# is interesting, but doesn't seem to actually be that helpful (likely due to scarcity and errors in the data).
# cor(y, x$b_roof)
# Correlation of every cleaned feature column with the wired label.
round(cor(x, y), digits = 2)
# I would have guessed that adding in e.g. b_size to b_wall in the predictors would help, but in this particular
# test it didn't.
####################################################################################
####################################################################################
####################################################################################
####################################################################################
# # ---> add geo clusters and then add that as a feature.
# Use KNN to extract clusters and then predict how wired houses are.
# - first you need to know where each transformer is
#
# or use streets on a grid to do this.
# (everything is an obs so a few rows = the transformers)
#
# By the end of the week, be in a position to:
# 1) give them clusters and scores
# 2) tell them we're adding more signal, so this is preliminary
# 3) bs some 'good candidate / bad candidate' thing
# Also make bin/run work and play with a CSV in Python!! Run through the clean and index code.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment