Last active
November 11, 2016 06:48
-
-
Save amchercashin/70db9cfd90eb714a13ad4027365dbae7 to your computer and use it in GitHub Desktop.
SDSJ contest, C ex., example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(readr) | |
library(stringr) | |
library(dplyr) | |
library(ggplot2) | |
library(lubridate) | |
library(glmnet) | |
library(purrr) | |
library(tidyr) | |
train_gender <- read_csv("./data/customers_gender_train.csv", col_types = "ci") | |
train <- read_csv("./data/transactions.csv", col_types = "ccccnc") | |
train$day_of_year <- as.integer(str_split(train$tr_datetime, " ", simplify = TRUE)[,1]) | |
unique_customers <- unique(train$customer_id); unique_mcc_codes <- unique(train$mcc_code) | |
### CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC | |
all_data <- train[!train$customer_id %in% train_gender$customer_id, ] | |
all_data$amount[all_data$amount > 0] <- 0 | |
all_data <- all_data %>% select(customer_id, day = day_of_year, mcc_code, amount) | |
all_data <- all_data %>% mutate(date = dmy("01.08.2014") + day, customer_id = as.character(customer_id), mcc_code = as.character(mcc_code), | |
month = as.integer(month(date)), year = as.integer(year(date)-2013), | |
month_num = as.integer(12 * year - 12 + month - 7)) | |
all_data <- all_data %>% group_by(customer_id, mcc_code, month, year, month_num) %>% summarise(volume = sum(amount)) | |
all_data <- all_data %>% ungroup() %>% complete(customer_id, mcc_code, nesting(month, year, month_num), fill = list(volume = 0)) | |
unique_customers <- unique(all_data$customer_id) | |
pred_data <- crossing(customer_id = unique_customers, mcc_code = unique_mcc_codes, month = as.integer(11), year = as.integer(2), | |
month_num = as.integer(16), volume = 0) | |
all_data <- bind_rows(all_data, pred_data) | |
rm(pred_data) | |
all_data <- all_data %>% arrange(customer_id, mcc_code, year, month) | |
all_data <- all_data %>% mutate(mcc_code = factor(mcc_code)) %>% group_by(customer_id, mcc_code) %>% nest() | |
i <- 0 | |
all_data <- all_data %>% mutate(data = map(data, function(d){ | |
print(i<<-i+1) | |
bind_cols(d, tibble(volume_lag_1 = c(0,d$volume[1:15]), | |
volume_lag_2 = c(0,0,d$volume[1:14]), | |
volume_lag_3 = c(0,0,0,d$volume[1:13]))) | |
} | |
) | |
) | |
all_data <- all_data %>% unnest() | |
data <- all_data %>% filter(month_num != 16) %>% group_by(customer_id, mcc_code) %>% nest(.key = data) | |
train_data <- all_data %>% filter(month_num < 15) %>% group_by(customer_id, mcc_code) %>% nest(.key = train_data) | |
test_data <- all_data %>% filter(month_num == 15) %>% group_by(customer_id, mcc_code) %>% nest(.key = test_data) | |
pred_data <- all_data %>% filter(month_num == 16) %>% group_by(customer_id, mcc_code) %>% nest(.key = pred_data) | |
all_data <- data; rm(data) | |
all_data$train_data <- train_data$train_data; rm(train_data) | |
all_data$test_data <- test_data$test_data; rm(test_data) | |
all_data$pred_data <- pred_data$pred_data; rm(pred_data) | |
#### MODEL CHOOSING #### | |
train_data <- all_data[sample(552000,200000),] | |
i = 0 | |
fit_formula = log(-volume+1) ~ month * year + month_num + I(volume_lag_1==0) + I(volume_lag_2==0) + I(volume_lag_3==0) | |
# fit_formula_l = I(volume != 0) ~ month * year + month_num + I(volume_lag_1==0) + I(volume_lag_2==0) + I(volume_lag_3==0) | |
fit_model_glmnet <- function(data) { | |
print(i<<-i+1) | |
X <- model.matrix(as.formula(fit_formula), data = data) | |
y <- log(-data$volume + 1) | |
return(glmnet(x = X, y = y, lambda = 3.5, standardize = TRUE, alpha = 0.5)) | |
} | |
fit_model_glmnet_l <- function(data) { | |
# print(i<<-i+1) | |
X <- model.matrix(as.formula(fit_formula_l), data = data) | |
y <- data$volume != 0 | |
return(glmnet(x = X, y = y, lambda = 3.5, standardize = FALSE, alpha = 0.5, family="binomial")) | |
} | |
fit_model_cv.glmnet <- function(data) { | |
print(i<<-i+1) | |
X <- model.matrix(as.formula(fit_formula), data = data) | |
y <- log(-data$volume + 1) | |
return(cv.glmnet(x = X, y = y, nfolds = 10)) | |
} | |
library(kernlab) | |
fit_model_rvm <- function(data) { | |
print(i<<-i+1) | |
X <- model.matrix(as.formula(fit_formula), data = data) | |
y <- log(-data$volume + 1) | |
return(rvm(x = X, y = y, fit = FALSE)) | |
} | |
library(e1071) | |
fit_model_svm <- function(data) { | |
print(i<<-i+1) | |
X <- Matrix::sparse.model.matrix(as.formula(fit_formula), data = data) | |
y <- log(-data$volume + 1) | |
return(svm(x = X, y = y, fitted = FALSE, kernel = "polynomial")) | |
} | |
library(xgboost) | |
fit_model_xgb <- function(data) { | |
print(i<<-i+1) | |
dtrain <- xgb.DMatrix(data = model.matrix(as.formula(fit_formula), data = data), | |
label = log(-data$volume + 1)) | |
return(xgb.train(data = dtrain, nthread = 4, nround = 500, booster = "gblinear", alpha = 10)) | |
} | |
library(biglm) | |
fit_model_blm <- function(data) { | |
print(i<<-i+1) | |
return(biglm(as.formula(fit_formula), data = data)) | |
} | |
system.time({ | |
train_data <- train_data %>% mutate(#train_lm = map(train_data3, fit_model_lm), | |
fit = map(train_data, function(t) { | |
if (sum(t$volume) == 0 ) {0} | |
else {fit_model_glmnet(t)} | |
})) | |
}) | |
train_data <- train_data %>% mutate(#test_pred_lm = map2(test_data, train_lm, ~predict(.y, newdata = .x)), | |
test_predictions = map2(test_data, fit, function(x,y){ | |
if(y[[1]]==0) {0} | |
else {predict(y, model.matrix(as.formula(fit_formula), data=x))} | |
})) | |
eval <- train_data %>% select(customer_id, mcc_code, test_data, test_predictions) %>% unnest() %>% mutate(volume = log(-volume + 1)) | |
RMSE <- sqrt(mean((eval$volume - eval$test_predictions)^2)) | |
RMSE | |
#### SUBMISSION #### | |
system.time({ | |
all_data <- all_data %>% mutate(fit = map(data, function(t) { | |
if (sum(t$volume[10:15]) == 0 ) {0} | |
else {fit_model_glmnet(t)} | |
})) | |
}) | |
all_data <- all_data %>% mutate(sub_predictions = map2(pred_data, fit, function(x,y){ | |
if(y[[1]]==0) {0} | |
else {predict(y, model.matrix(as.formula(fit_formula), data=x))} | |
})) | |
submission <- all_data %>% select(customer_id, mcc_code, pred_data, sub_predictions) %>% unnest() %>% | |
select(customer_id, mcc_code, sub_predictions) %>% rename(volume = sub_predictions) %>% | |
mutate(volume = exp(volume)-1) | |
write_csv(submission, "./submissions/C_lm_lasso+1_ver2.csv") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment