Skip to content

Instantly share code, notes, and snippets.

@hillarysanders
Last active December 28, 2015 03:59
Show Gist options
  • Save hillarysanders/7439581 to your computer and use it in GitHub Desktop.
Save hillarysanders/7439581 to your computer and use it in GitHub Desktop.
hackbright example functions
######################
# outliers:
# supah simple outlier killer. Also kills right tails.
##' @param trim trim what % of the data off each tail before calcuating the mean and sd.
##' @param z how many standard deviations will you allow observations to be away from the
##' mean before you consider it an outlier?
##' @param verbose do you like talkative functions?
##' @param flag.only logical (TRUE or FALSE).
##' Only flag the outliers as outliers, don't remove them.
trimmed.normal <- function(spec.df, trim=.05, z=4, verbose=TRUE,
flag.only=F){
price <- spec.df$price
# get quantiles:
q <- quantile(price, probs=c(trim, 1-trim))
# get trimmed mean and standard deviation:
m <- mean(subset(price, subset=(price > q[1] & price < q[2])))
std.dev <- sd(subset(price, subset=(price >= q[1] & price <= q[2])))
if(std.dev>0){
remove.idx <- price > m+z*std.dev | price < m-z*std.dev
if(verbose==TRUE){
print(paste("removed", sum(remove.idx), "out of", nrow(spec.df), "observations",
"from", spec.df$spec_name[nrow(spec.df)], ",", toupper(spec.df$country[1])))
}
if(flag.only==F){
spec.df <- spec.df[!remove.idx, ]
} else {
spec.df$outlier <- remove.idx
}
}
return(spec.df)
}
library(stringr) # could not use this if neccessary, but it is handy
##' unit.standardizer
##' not be obvious as to why unless the size units were being prominently shown.
##' @param spec.df price observation data.frame
##' @param verbose feeling talkative?
##' @param safety.first Only modify prices by size if the change brings the price
##' closer to the mean price?
##' @param coerce.unit.only
##' @param prep.only prep the size metadata, but do nothing else? Logical.
##' @param require.all.three only try to coerce to something if the most common types
##' of size, size_unit and quantity are non-NA. logical
##' @param require.all.for.conversion for each observation, only coerce if size,
##' size_unit, and quantity are non-NA.
##' @param country Only matters for China; they treat define certain units differently.
##' If china, let country = 'cn'.
unit.standardizer <- function(spec.df, verbose=FALSE, safety.first=FALSE,
coerce.unit.only=F, allow.crappy.common.units=TRUE,
require.all.three=F, require.all.for.conversion=FALSE,
only.return.most.common.unit=F, prep.only=FALSE,
country="notchina"){
# so that stuff can be easily modified:
size <- as.character(spec.df$size)
unit <- as.character(spec.df$size_unit)
quantity <- as.character(spec.df$quantity)
price <- spec.df$price
spec.df$price.raw <- spec.df$price
# translate chinese
unit = sub("克", "g", unit)
unit = sub("公斤", "kg", unit)
unit = sub("毫升", "ml", unit)
unit = sub("升", "ltr", unit)
unit = sub("盎司", "oz", unit)
unit = sub("磅", "lb", unit)
unit = sub("斤", "catty", unit)
unit = sub("两", "tael", unit)
unit = sub("盒", "box", unit)
unit = sub("个", "pcs.", unit)
unit = sub("袋", "pack", unit)
size[size=="0"] <- NA
unit[unit=="0"] <- NA
quantity[quantity=="0"] <- NA
size[size==""] <- NA
unit[unit==""] <- NA
quantity[quantity==""] <- NA
size[size=="NULL"] <- NA
unit[unit=="NULL"] <- NA
quantity[quantity=="NULL"] <- NA
unit[grepl(unit, pattern="[/?]+")] <- NA
quantity[grepl(quantity, pattern="[/?]+")] <- NA
size[grepl(size, pattern="[/?]+")] <- NA
# throw away any character fluff (may want to conserve and compare instead in the future)
size <- gsub("[a-z]+", "", x=size)
size <- gsub(" ", "", x=size)
size <- gsub(",", ".", x=size)
size <- as.numeric(size)
size[size==0] <- NA
quantity <- gsub("[a-z]+", "", x=quantity)
quantity <- gsub(" ", "", x=quantity)
quantity <- as.numeric(quantity)
quantity[quantity==0] <- NA
# now that they are cleaner, replace spec.df size and quantity w/ these
spec.df$size <- size
spec.df$quantity <- quantity
spec.df$size_unit <- unit
if(prep.only==TRUE) return(spec.df)
if(allow.crappy.common.units==F){
unit.idx <- ! unit %in% c("box", "pcs.", "pack")
} else { unit.idx <- 1:length(unit) }
most.common <- names(sort(table((paste(quantity, size, unit[unit.idx])), useNA='ifany'), decreasing=TRUE))[1]
most.common. <- str_split(most.common, " ")[[1]]
most.common.[most.common.=="NA"] <- NA
quantity.mode <- as.numeric(most.common.[1])
size.mode <- as.numeric(most.common.[2])
unit.mode <- most.common.[3]
# only work with workable data, yo.
if(require.all.three==TRUE){
if(any(is.na(c(size.mode, unit.mode, quantity.mode)))) return(spec.df)
}
# NOTE THAT THESE MOST COMMON VALUES ARE POTENTIALLY UNSTABLE.
# FIXED IN PRODUCTIONIZED VERSION.
names <- c("catty", "g", "kg", "lb", "oz", "ltr", "ml", "l", "tael")
# if COUNTRY = CHINA, then catty is 500 g, not 600, and a tael is 10 catties, not 16.
if(country=="cn"){
conversion_matrix <- matrix(
#"catty" "g" "kg" "lb" "oz" "ltr" "ml" "l" "tael"
c(1, 500, .5, 1.1025, 17.64, .5 , 500, .5, 10, #catty
1/500, 1, .001, 0.002205, 0.03527, .001, 1, .001, 10/500, #g
1/.5, 1000, 1, 2.205, 35.27, 1, 1000, 1, 10/.5, #kg
1/1.1025, 0, 1/2.205, 1, 16, .4536, 453.6, .4536, 10/1.1025, #lb
1/17.64, 1/0.03527, 1/35.27, 1/16, 1, 0.02957, 29.57, .02957, 10/17.64, #oz
1/.5, 1000, 1, 1/.4536, 1/0.02957, 1, 1000, 1, 10/.5, #ltr
1/500, 1, .001, 1/453.6, 1/29.57, .001, 1, .001, 10/500, #ml
1/.5, 1000, 1, 1/.4536, 1/0.02957, 1, 1000, 1, 10/.5, #l
1/10, 500/10, .5/10, 1.1025/10, 17.64/10, .5/10, 500/10, .5/10, 1), # tael
ncol=9, dimnames=list(names, names))
} else {
conversion_matrix <- matrix(c(
#"catty" "g" "kg" "lb" "oz" "ltr" "ml" "l" "tael"
1, 600, .6, 1.323, 21.168, .6 , 600, .6, 16, #catty
1/600, 1, .001, 0.002205, 0.03527, .001, 1, .001, 16/600, #g
1/.6, 1000, 1, 2.205, 35.27, 1, 1000, 1, 16/.6, #kg
1/1.323, 0, 1/2.205, 1, 16, .4536, 453.6, .4536, 16/1.323, #lb
1/21.168, 1/0.03527, 1/35.27, 1/16, 1, 0.02957, 29.57, .02957, 16/21.168, #oz
1/.6, 1000, 1, 1/.4536, 1/0.02957, 1, 1000, 1, 16/.6, #ltr
1/600, 1, .001, 1/453.6, 1/29.57, .001, 1, .001, 16/600, #ml
1/.6, 1000, 1, 1/.4536, 1/0.02957, 1, 1000, 1, 16/.6, #l
1/16, 600/16, .6/16, 1.323/16, 21.168/16, .6/16, 600/16, .6/16, 1), # tael
ncol=9, dimnames=list(names, names))
}
quant.change <- quantity.mode / quantity
size.change <- size.mode / size
# unit change is a bit more complicated:
col <- which(names == unit.mode)
rows <- sapply(unit, FUN=function(x) which(names == x))
unit.change <- sapply(rows, FUN=function(x){
change <- conversion_matrix[x, col]
if(length(change)==1) return(change) else return(NA)
})
if(coerce.unit.only){
proposed.price <- price
} else {
if(require.all.for.conversion==TRUE){
proposed.price <- price*quant.change*size.change*unit.change
} else {
quant.change[is.na(quant.change)] <- 1
unit.change[is.na(unit.change)] <- 1
size.change[is.na(size.change)] <- 1
proposed.price <- price*quant.change*size.change*unit.change
}
}
# good idx = those observations for which all units were transformable:
good.idx <- which(!is.na(proposed.price))
if(safety.first==TRUE){
# only accept proposed changes if it brings you closer to the most.common mean:
mu <- mean(price[paste(quantity, size, unit)==most.common])
# sd <- sd(price[paste(quantity, size, unit)==most.common])
makes.it.better <- abs(mu-price) > abs(mu-proposed.price)
makes.it.better <- which(makes.it.better)
} else {
makes.it.better <- 1:length(price)
}
makes.it.better <- intersect(good.idx, makes.it.better)
if(verbose==TRUE){
print(paste("Altering", sum(price!=proposed.price, na.rm=T),
"out of", length(price), "sizes and respective prices."))
}
price[makes.it.better] <- proposed.price[makes.it.better]
unit.idx <- makes.it.better[which((unit.change!=1)[makes.it.better])]
unit[unit.idx] <- unit.mode
if(coerce.unit.only){
# units have been changed so size should chnage to reflect this
size[unit.idx] <- size[unit.idx]/unit.change[unit.idx]
} else {
size[makes.it.better] <- size.mode
quantity[makes.it.better] <- quantity.mode
unit[makes.it.better] <- unit.mode
}
spec.df$price <- price
spec.df$size <- size
spec.df$quantity <- quantity
spec.df$size_unit <- unit
if(only.return.most.common.unit==TRUE){
if(!is.na(unit.mode)){
idx.good.size_unit <- spec.df$size_unit == unit.mode
idx.good.size_unit[is.na(idx.good.size_unit)] <- F
} else {
# idx.good.size_unit <- is.na(spec.df$size_unit)
idx.good.size_unit <- rep(TRUE, nrow(spec.df))
}
if(verbose==T) print(paste("Removing", sum(!idx.good.size_unit), "obs whose units cannot be coerced."))
spec.df <- spec.df[idx.good.size_unit, ]
}
# hmm. One to one increases in price due to size changes doesn't represent how the real
# world sells stuff...
return(spec.df)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment