# hills
# Project directory; not referenced again in the visible part of this script.
wd <- "~/Desktop/PREMISE/"
# source("Hillary_Premise/utils/env.R")

# Load the raw survey export and keep only the columns used in this analysis.
x <- read.csv('~/Downloads/anand-google-electricity-results.csv')
wanted_columns <- c(
  'b_type', 'b_size', 'b_wall', 'b_roof', 'wired', 'image', 'o_uuid',
  'timestamp', 'g_ring_dis', 'g_ring_id', 'g_ring_name', 'e_sub', 'e_obj',
  'e_rel', 'e_has_rel', 'user_name', 'id'
)
x <- x[, wanted_columns]
# fix.factors() is defined outside this file.
# NOTE(review): presumably it turns factor columns into plain character
# columns, which the string recoding further down relies on -- confirm.
x <- fix.factors(x)

# x$g_ring_id[x$g_ring_id=='-'] = '00000'
# Keep only rows that have a wired label and a real ring id ('-' marks "no
# ring"). Missing ring ids are excluded explicitly: a bare
# `x$g_ring_id != '-'` evaluates to NA for them, and an NA row index yields
# an all-NA row instead of dropping the observation.
has.label <- !is.na(x$wired)
has.ring <- !is.na(x$g_ring_id) & x$g_ring_id != '-'
x <- x[has.label & has.ring, ]

# Sort by ring id so that the observations of each ring are contiguous.
x <- x[order(x$g_ring_id), ]

# info keeps every column (ring id etc.), y is the response, and x is
# reduced to the raw predictor columns.
info <- x
y <- x$wired
x <- x[, c('g_ring_dis', 'b_type', 'b_size', 'b_wall', 'b_roof')]
# 65.7
# For every wall type present in the data, print the correlation between
# "this building has that wall type" and the wired label.
for (w in sort(unique(x$b_wall))) {
  print(cor(x$b_wall == w, y))
}
# Recode the categorical building attributes as numbers for the regression.

# residence: 1 for anything that is not a business, 0 for 't_business'
# (NA stays NA). The raw type column is dropped afterwards.
x$residence <- as.numeric(x$b_type != 't_business')
x$b_type <- NULL

# b_size: ordinal 1 (small) to 3 (large); an empty string means unknown.
x$b_size[x$b_size == 's_large'] <- 3
x$b_size[x$b_size == 's_medium'] <- 2
x$b_size[x$b_size == 's_small'] <- 1
x$b_size[x$b_size == ''] <- NA
x$b_size <- as.numeric(x$b_size)

# b_wall: 1 for brick/cement, 0 for mud/wood/sheet and for an empty value.
x$b_wall[x$b_wall %in% c('w_mud', 'w_wood', 'w_sheet', '')] <- 0
x$b_wall[x$b_wall %in% c('w_brick', 'w_cement')] <- 1
x$b_wall <- as.numeric(x$b_wall)

# b_roof: tile = 2, sheet = 1; empty or 'r_other' is treated as unknown.
x$b_roof[x$b_roof %in% 'r_tile'] <- 2
x$b_roof[x$b_roof %in% 'r_sheet'] <- 1
x$b_roof[x$b_roof %in% c('', 'r_other')] <- NA
x$b_roof <- as.numeric(x$b_roof)
# Drop every observation that has a missing value in any predictor column.
# The same row mask is applied to y and info so that they stay aligned with
# x row by row.
complete.rows <- complete.cases(x)
y <- y[complete.rows]
info <- info[complete.rows, ]
x <- x[complete.rows, ]
# do it:
# Leave-one-ring-out cross-validation: for each ring, fit a logistic
# regression on the observations of all other rings, predict the held-out
# ring, and store per-ring quality metrics in `acc`. Predictions and labels
# are also accumulated (in ring order) for the summary and plot below.
rings <- unique(info$g_ring_id)
acc <- data.frame(
  ring = rings, accuracy = NA, MSE = NA, abs.error = NA, n = NA, cor = NA,
  precision.wired = NA, recall.wired = NA,
  precision.unwired = NA, recall.unwired = NA,
  stringsAsFactors = FALSE
)
all.predictions <- NULL
all.y <- NULL
for (ring in rings) {
  cat(paste('\n Ring:', ring))
  test.idx <- info$g_ring_id == ring
  train.idx <- info$g_ring_id != ring
  x.train <- x[train.idx, ]
  x.test <- x[test.idx, ]
  y.train <- y[train.idx]
  y.test <- y[test.idx]
  fitted <- glm(
    formula = y.train ~ g_ring_dis + b_size + b_wall + b_roof + residence,
    data = x.train, family = binomial(link = "logit")
  )
  # fitted = glm(formula = y.train ~ b_size + b_wall,
  # data = x.train, family = binomial(link = "logit"))

  # predict.glm() defaults to type = "link", so predictions are log-odds and
  # the cutoff 0 corresponds to a predicted probability of 0.5.
  predictions <- predict.glm(fitted, newdata = x.test)
  binary <- predictions > 0

  # Pin both factor level sets so the table is always 2 x 2, even when a
  # ring contains only one predicted or one actual class; otherwise the
  # positional cell lookups below would read the wrong cells or NA.
  # Assumes y is coded 0/1 (or logical) -- TODO confirm.
  tab <- table(
    binary = factor(binary, levels = c(FALSE, TRUE)),
    y.test = factor(as.integer(y.test), levels = 0:1)
  )
  true.neg <- tab[1]   # predicted unwired, actually unwired
  false.pos <- tab[2]  # predicted wired,   actually unwired
  false.neg <- tab[3]  # predicted unwired, actually wired
  true.pos <- tab[4]   # predicted wired,   actually wired

  n <- length(y.test)
  cat(paste0('\nTrained ', length(y.train), ' obs from other rings to test ', n, ' obs from ring ', ring))
  n.correct <- true.neg + true.pos
  cat(paste0('\nUsing prob=0 as a cutoff, success rate was ', n.correct, '/', n, ': ', round(100*n.correct / n, 1), '%'))

  ring.idx <- which(acc$ring == ring)
  acc$accuracy[ring.idx] <- n.correct / n
  # mse() is a helper defined outside this file; it is called on the
  # link-scale predictions with exponent 2 and exponent 1 respectively.
  acc$MSE[ring.idx] <- mse(x = predictions, y = y.test, normalize = F, exp = 2)
  acc$abs.error[ring.idx] <- mse(x = predictions, y = y.test, normalize = F, exp = 1)
  acc$n[ring.idx] <- n
  acc$cor[ring.idx] <- cor(predictions, y.test)
  # A 0/0 ratio gives NaN here; the weighted means computed later drop such
  # rings through na.rm.
  acc$precision.wired[ring.idx] <- true.pos / (true.pos + false.pos)
  acc$recall.wired[ring.idx] <- true.pos / (true.pos + false.neg)
  acc$precision.unwired[ring.idx] <- true.neg / (true.neg + false.neg)
  acc$recall.unwired[ring.idx] <- true.neg / (true.neg + false.pos)

  all.predictions <- c(all.predictions, predictions)
  all.y <- c(all.y, y.test)
}
# Aggregate the per-ring metrics into overall figures, weighting each ring by
# its number of test observations; rings whose metric is NA/NaN are skipped.
ave.accuracy <- weighted.mean(x = acc$accuracy, w = acc$n, na.rm = TRUE)
ave.cor <- weighted.mean(x = acc$cor, w = acc$n, na.rm = TRUE)
ave.wired.precision <- weighted.mean(x = acc$precision.wired, w = acc$n, na.rm = TRUE)
ave.unwired.precision <- weighted.mean(x = acc$precision.unwired, w = acc$n, na.rm = TRUE)
ave.wired.recall <- weighted.mean(x = acc$recall.wired, w = acc$n, na.rm = TRUE)
ave.unwired.recall <- weighted.mean(x = acc$recall.unwired, w = acc$n, na.rm = TRUE)

# Average the two classes. The values must be combined with c(): in
# mean(a, b) the second argument is `trim`, so only `a` would be averaged.
ave.recall <- mean(c(ave.unwired.recall, ave.wired.recall))
ave.precision <- mean(c(ave.unwired.precision, ave.wired.precision))

cat(paste0('\n\n\n Average Accuracy (out of sample) = ', round(ave.accuracy*100, 1), '%'))
cat(paste0('\nCorrelation between predictions and truth: ', round(ave.cor, 2)))
cat(paste0('\nAve Precision: ', round(ave.precision, 2)))
cat(paste0('\nAve Recall: ', round(ave.recall, 2)))
# Plot every out-of-sample prediction (link scale) in ring order, coloured by
# the true label, on top of one background band per ring.
# get.colors() and draw.shape() are helpers defined outside this file.
cols <- get.colors(length(rings))

# Empty canvas (cex = 0) spanning all predictions, with a little vertical
# padding.
# NOTE(review): base pretty() returns a vector of axis breakpoints rather
# than a formatted number; presumably a project helper masks it -- confirm.
plot(c(1, (length(all.predictions))), range(all.predictions, na.rm = TRUE) + c(-.3, .1), xaxt = 'n', xlab = '',
     ylab = 'Does the Model think a building has Electricity?',
     main = paste0(pretty(length(all.predictions)), ' out-of-sample Predictions'), cex = 0)

# info is sorted by ring and predictions were appended ring by ring, so the
# row positions of a ring in info are also its x positions in the plot.
for (i in seq_along(rings)) {
  ring.rows <- which(info$g_ring_id == rings[i])
  draw.shape(range(ring.rows), y1 = -2, y2 = 4, col = cols[i], border = NA)
  text(mean(ring.rows), min(all.predictions, na.rm = TRUE) - .6, labels = rings[i], cex = .65, xpd = TRUE, srt = 45)
}

# Yellow = wired, black = not wired; the dashed line marks the cutoff at 0.
points(all.predictions, col = c('black', 'yellow')[1 + all.y], pch = 19, cex = .4)
lines(c(1, length(all.y)), c(0, 0), lwd = 2, lty = 2)
legend('bottomright', col = c('yellow', 'black'),
       c('Wired', 'Not Wired'), pch = 19, pt.cex = 1.5, bty = 'n', box.lwd = 0)
legend('bottomleft', bty = 'n', paste0(round(100*ave.accuracy, 1), '% Accuracy'))
# - making vals into ranked numbers helped a tiny bit, only thing that really matters is
# wall type. 66% accuracy
# So. 65% is not that great, it's okay. Basically, b_wall is the only thing that makes the prediction good. Type of roof
# is interesting, but doesn't seem to actually be that helpful (likely due to scarcity and errors)
# cor(y, x$b_roof)
# Correlation of each encoded predictor column with the wired label, rounded
# to two decimals.
round(cor(x = x, y = y), digits = 2)
# I would have guessed that adding in e.g. b_size to b_wall in the predictors would help, but in this particular
# test it didn't.
# # ---> add geo clusters and then add that as a feature.
# Use KNN to extract clusters and then predict how wired houses are.
# - first you need to know where each transformer is
# or use streets on a grid to do this.
# (everything is an obs so a few rows = the transformers)
# end of week = position to
# 1) give them clusters and scores
# 2) tell them we're adding more signal so this is preliminary
# 3) bs some 'good candidate bad candidate thing'
# also make bin/run work and play with a CSV in python!! run through the clean and index code
