Skip to content

Instantly share code, notes, and snippets.

@amchercashin
Last active November 11, 2016 06:48
Show Gist options
  • Save amchercashin/70db9cfd90eb714a13ad4027365dbae7 to your computer and use it in GitHub Desktop.
Save amchercashin/70db9cfd90eb714a13ad4027365dbae7 to your computer and use it in GitHub Desktop.
SDSJ contest, C ex., example
library(readr)
library(stringr)
library(dplyr)
library(ggplot2)
library(lubridate)
library(glmnet)
library(purrr)
library(tidyr)
train_gender <- read_csv("./data/customers_gender_train.csv", col_types = "ci")
train <- read_csv("./data/transactions.csv", col_types = "ccccnc")
train$day_of_year <- as.integer(str_split(train$tr_datetime, " ", simplify = TRUE)[,1])
unique_customers <- unique(train$customer_id); unique_mcc_codes <- unique(train$mcc_code)
### CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
all_data <- train[!train$customer_id %in% train_gender$customer_id, ]
all_data$amount[all_data$amount > 0] <- 0
all_data <- all_data %>% select(customer_id, day = day_of_year, mcc_code, amount)
all_data <- all_data %>% mutate(date = dmy("01.08.2014") + day, customer_id = as.character(customer_id), mcc_code = as.character(mcc_code),
month = as.integer(month(date)), year = as.integer(year(date)-2013),
month_num = as.integer(12 * year - 12 + month - 7))
all_data <- all_data %>% group_by(customer_id, mcc_code, month, year, month_num) %>% summarise(volume = sum(amount))
all_data <- all_data %>% ungroup() %>% complete(customer_id, mcc_code, nesting(month, year, month_num), fill = list(volume = 0))
unique_customers <- unique(all_data$customer_id)
pred_data <- crossing(customer_id = unique_customers, mcc_code = unique_mcc_codes, month = as.integer(11), year = as.integer(2),
month_num = as.integer(16), volume = 0)
all_data <- bind_rows(all_data, pred_data)
rm(pred_data)
all_data <- all_data %>% arrange(customer_id, mcc_code, year, month)
all_data <- all_data %>% mutate(mcc_code = factor(mcc_code)) %>% group_by(customer_id, mcc_code) %>% nest()
i <- 0
all_data <- all_data %>% mutate(data = map(data, function(d){
print(i<<-i+1)
bind_cols(d, tibble(volume_lag_1 = c(0,d$volume[1:15]),
volume_lag_2 = c(0,0,d$volume[1:14]),
volume_lag_3 = c(0,0,0,d$volume[1:13])))
}
)
)
all_data <- all_data %>% unnest()
data <- all_data %>% filter(month_num != 16) %>% group_by(customer_id, mcc_code) %>% nest(.key = data)
train_data <- all_data %>% filter(month_num < 15) %>% group_by(customer_id, mcc_code) %>% nest(.key = train_data)
test_data <- all_data %>% filter(month_num == 15) %>% group_by(customer_id, mcc_code) %>% nest(.key = test_data)
pred_data <- all_data %>% filter(month_num == 16) %>% group_by(customer_id, mcc_code) %>% nest(.key = pred_data)
all_data <- data; rm(data)
all_data$train_data <- train_data$train_data; rm(train_data)
all_data$test_data <- test_data$test_data; rm(test_data)
all_data$pred_data <- pred_data$pred_data; rm(pred_data)
#### MODEL CHOOSING ####
train_data <- all_data[sample(552000,200000),]
i = 0
fit_formula = log(-volume+1) ~ month * year + month_num + I(volume_lag_1==0) + I(volume_lag_2==0) + I(volume_lag_3==0)
# fit_formula_l = I(volume != 0) ~ month * year + month_num + I(volume_lag_1==0) + I(volume_lag_2==0) + I(volume_lag_3==0)
fit_model_glmnet <- function(data) {
print(i<<-i+1)
X <- model.matrix(as.formula(fit_formula), data = data)
y <- log(-data$volume + 1)
return(glmnet(x = X, y = y, lambda = 3.5, standardize = TRUE, alpha = 0.5))
}
fit_model_glmnet_l <- function(data) {
# print(i<<-i+1)
X <- model.matrix(as.formula(fit_formula_l), data = data)
y <- data$volume != 0
return(glmnet(x = X, y = y, lambda = 3.5, standardize = FALSE, alpha = 0.5, family="binomial"))
}
fit_model_cv.glmnet <- function(data) {
print(i<<-i+1)
X <- model.matrix(as.formula(fit_formula), data = data)
y <- log(-data$volume + 1)
return(cv.glmnet(x = X, y = y, nfolds = 10))
}
library(kernlab)
fit_model_rvm <- function(data) {
print(i<<-i+1)
X <- model.matrix(as.formula(fit_formula), data = data)
y <- log(-data$volume + 1)
return(rvm(x = X, y = y, fit = FALSE))
}
library(e1071)
fit_model_svm <- function(data) {
print(i<<-i+1)
X <- Matrix::sparse.model.matrix(as.formula(fit_formula), data = data)
y <- log(-data$volume + 1)
return(svm(x = X, y = y, fitted = FALSE, kernel = "polynomial"))
}
library(xgboost)
fit_model_xgb <- function(data) {
print(i<<-i+1)
dtrain <- xgb.DMatrix(data = model.matrix(as.formula(fit_formula), data = data),
label = log(-data$volume + 1))
return(xgb.train(data = dtrain, nthread = 4, nround = 500, booster = "gblinear", alpha = 10))
}
library(biglm)
fit_model_blm <- function(data) {
print(i<<-i+1)
return(biglm(as.formula(fit_formula), data = data))
}
system.time({
train_data <- train_data %>% mutate(#train_lm = map(train_data3, fit_model_lm),
fit = map(train_data, function(t) {
if (sum(t$volume) == 0 ) {0}
else {fit_model_glmnet(t)}
}))
})
train_data <- train_data %>% mutate(#test_pred_lm = map2(test_data, train_lm, ~predict(.y, newdata = .x)),
test_predictions = map2(test_data, fit, function(x,y){
if(y[[1]]==0) {0}
else {predict(y, model.matrix(as.formula(fit_formula), data=x))}
}))
eval <- train_data %>% select(customer_id, mcc_code, test_data, test_predictions) %>% unnest() %>% mutate(volume = log(-volume + 1))
RMSE <- sqrt(mean((eval$volume - eval$test_predictions)^2))
RMSE
#### SUBMISSION ####
system.time({
all_data <- all_data %>% mutate(fit = map(data, function(t) {
if (sum(t$volume[10:15]) == 0 ) {0}
else {fit_model_glmnet(t)}
}))
})
all_data <- all_data %>% mutate(sub_predictions = map2(pred_data, fit, function(x,y){
if(y[[1]]==0) {0}
else {predict(y, model.matrix(as.formula(fit_formula), data=x))}
}))
submission <- all_data %>% select(customer_id, mcc_code, pred_data, sub_predictions) %>% unnest() %>%
select(customer_id, mcc_code, sub_predictions) %>% rename(volume = sub_predictions) %>%
mutate(volume = exp(volume)-1)
write_csv(submission, "./submissions/C_lm_lasso+1_ver2.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment