Last active
March 2, 2018 07:59
-
-
Save Benedikt1992/19c48e3fddd8cb8c979cd167f47ee907 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pandas as pd | |
import pickle | |
from keras.layers import BatchNormalization, Dropout, regularizers | |
from keras.layers import Dense | |
from keras.layers import Input | |
from keras.layers.merge import concatenate | |
from keras.models import Model | |
from keras.models import Sequential | |
from keras.utils import plot_model | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import make_scorer | |
from sklearn.model_selection import GridSearchCV | |
from sklearn.model_selection import KFold | |
from keras.wrappers.scikit_learn import KerasRegressor | |
from keras.optimizers import Adam | |
#import matplotlib.pyplot as plt | |
from data.selected_features_pearson import selected_features_pearson | |
from data.selected_features_boosting import selected_features_boosting | |
# The five Big-Five personality traits predicted by the models below.
# FIX: set literal instead of set([...]) -- same value, idiomatic and cheaper.
TRAITS = {'ope', 'con', 'ext', 'agr', 'neu'}
class ModelBuilder:
    """Factory for compiled Keras feed-forward regression models.

    Holds the (fixed) number of input features so that `build_keras_model`
    can serve as the `build_fn` of a KerasRegressor while still knowing the
    input width.
    """

    def __init__(self, number_of_features):
        # Width of the continuous input vector fed to the network.
        self.number_of_features = number_of_features

    # https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/
    # http://nbviewer.jupyter.org/github/bockjo/deeplearning-and-related/blob/master/Entity_Embedding_Model.ipynb
    # TODO: also expose/tune the learning-rate schedule; tune regularization/
    # dropout further; the activation etc. could be made tunable as well.
    # Do NOT grid-search the batch size -- choose it as large as possible
    # (RAM consumption grows with it).
    def build_keras_model(self, activ="relu", init="uniform", loss='mean_squared_error',
                          learning_rate=0.01, beta_1=0.9, beta_2=0.999, p=0.2,
                          reg_lambda=0., **kwargs):
        """Build and compile a fully connected regression network.

        Parameters
        ----------
        activ : activation used in every Dense layer (including the output).
        init : kernel initializer for the hidden layers.
        loss : Keras loss identifier used at compile time.
        learning_rate, beta_1, beta_2 : Adam optimizer hyper-parameters.
        p : dropout rate after the last hidden layer (p/10 after the input layer).
        reg_lambda : L2 weight-decay factor applied to all hidden layers.
        **kwargs : ignored; allows GridSearchCV to pass fit-only parameters
            (epochs, batch_size) through without errors.

        Returns
        -------
        A compiled keras Model with a single scalar output.
        """
        list_of_inputs = []
        inps = Input(shape=(self.number_of_features,), name="contin")
        dim = self.number_of_features * 10
        # FIX: dropped the redundant `input_dim` argument -- in the functional
        # API the input shape is already fixed by `inps`, so it was ignored.
        x = Dense(dim, activation=activ, kernel_initializer=init,
                  kernel_regularizer=regularizers.l2(reg_lambda),
                  name="contin_d")(inps)
        x = BatchNormalization()(x)
        list_of_inputs.append(inps)
        # Dense tower on top of the (single) feature encoder.
        x = Dropout(p / 10)(x)
        x = Dense(2 ** 10, activation=activ, kernel_initializer=init, kernel_regularizer=regularizers.l2(reg_lambda))(x)
        x = Dense(2 ** 9, activation=activ, kernel_initializer=init, kernel_regularizer=regularizers.l2(reg_lambda))(x)
        x = Dense(2 ** 8, activation=activ, kernel_initializer=init, kernel_regularizer=regularizers.l2(reg_lambda))(x)
        x = Dense(2 ** 7, activation=activ, kernel_initializer=init, kernel_regularizer=regularizers.l2(reg_lambda))(x)
        x = Dropout(p)(x)
        # NOTE(review): a relu output clamps predictions at 0 -- fine only if
        # the trait scores are non-negative; confirm against the score range.
        out = Dense(1, activation=activ)(x)
        current_model = Model(list_of_inputs, out)
        optimizer = Adam(lr=learning_rate, beta_1=beta_1, beta_2=beta_2)
        current_model.compile(loss=loss, optimizer=optimizer)
        return current_model
def save(model, grid, history, model_name, model_png=True):
    """Persist a trained model plus its grid-search results and history.

    Writes into data/NN/: an optional architecture PNG, the architecture as
    JSON, the weights (HDF5), the pickled [cv_results_, best_score_,
    best_params_] list, the full model, and the pickled fit history.

    Parameters
    ----------
    model : trained Keras model.
    grid : fitted sklearn GridSearchCV instance.
    history : Keras History object returned by model.fit.
    model_name : base name for all files written.
    model_png : also render the architecture diagram (requires pydot/graphviz).
    """
    if not os.path.exists('data/NN'):
        os.makedirs('data/NN')
    # Save a picture of the model architecture.
    if model_png:
        plot_model(model, to_file='data/NN/' + model_name + '.png', show_shapes=True)
    # Save the model structure as JSON.
    with open('data/NN/' + model_name + ".json", "w") as json_file:
        json_file.write(model.to_json())
    # Save the model weights.
    model.save_weights('data/NN/' + model_name + "_weights.h5")
    # FIX: the pickle targets were opened inline and never closed (leaked file
    # handles, data possibly unflushed on error); use context managers instead.
    grid_results = [grid.cv_results_, grid.best_score_, grid.best_params_]
    with open("data/NN/{}_grid_results.pickle".format(model_name), "wb") as fh:
        pickle.dump(grid_results, fh)
    # https://keras.io/getting-started/faq/#how-can-i-save-a-keras-model
    model.save("data/NN/{}_grid.keras".format(model_name))
    with open("data/NN/{}_history.pickle".format(model_name), "wb") as fh:
        pickle.dump(history.history, fh)
    print(model_name + " has been saved to disk.")
    return
# RMSE metric
def rmse(y_actual, y_predicted):
    """Return the root-mean-squared error between two equal-length sequences.

    FIX: computed directly with numpy instead of importing
    sklearn.metrics.mean_squared_error on every scorer call -- numerically
    identical for the 1-D targets used here (sklearn's default is the plain
    mean of squared errors), and keeps the scorer self-contained.
    """
    import numpy as np  # local import, matching the file's original style
    a = np.asarray(y_actual, dtype=float)
    b = np.asarray(y_predicted, dtype=float)
    return float(np.sqrt(np.mean((a - b) ** 2)))
# r2 metric
def r2(y_actual, y_predicted):
    """Coefficient of determination; thin wrapper around sklearn's r2_score."""
    from sklearn.metrics import r2_score
    score = r2_score(y_actual, y_predicted)
    return score
def train(trait, number_of_features, x, y, x_test, y_test):
    """Grid-search, refit and persist one feed-forward regression model.

    Parameters
    ----------
    trait : base name used for the saved artifacts (e.g. "neu_common_pearson").
    number_of_features : input width handed to ModelBuilder.
    x, y : training features and 1-D target scores.
    x_test, y_test : held-out test split, scored after the search.
    """
    scoring = {
        'rmse': make_scorer(rmse, greater_is_better=False),
        'r2': 'r2'  # make_scorer(r2)
    }
    grid_params = [
        {
            'epochs': [100, 200, 500],
            'batch_size': [25],
            'activ': ['relu'],
            'loss': ['mean_squared_error'],
            'learning_rate': [0.001, 0.0001],
            'beta_1': [0.9],
            'beta_2': [0.999],
            'p': [0., 0.2, 0.5],
            'reg_lambda': [0., 0.01, 0.001]
        }
    ]
    builder = ModelBuilder(number_of_features)
    sklearn_mock = KerasRegressor(build_fn=builder.build_keras_model, verbose=0)
    # FIX: random_state only takes effect when shuffle=True; without it the
    # seed was silently ignored (and newer scikit-learn raises a ValueError).
    kfold = KFold(n_splits=3, shuffle=True, random_state=7)
    grid = GridSearchCV(sklearn_mock,
                        cv=kfold,
                        n_jobs=-1,
                        param_grid=grid_params,
                        scoring=scoring,
                        refit='rmse',  # refit best scoring model on whole training data with rmse metric
                        return_train_score=False,  # for better performance
                        verbose=2  # print progress
                        )
    grid.fit(x, y)
    # Re-train a fresh model with the best parameters to obtain a Keras
    # History object -- GridSearchCV's refit does not expose one.
    model = builder.build_keras_model(**grid.best_params_)
    history = model.fit(x, y, verbose=0, validation_split=0.2,
                        epochs=grid.best_params_["epochs"],
                        batch_size=grid.best_params_["batch_size"])
    save(model, grid, history, trait)
    print("\n## Best score:", grid.best_score_)
    print("## Best parameters:", grid.best_params_)
    print("\n## Scoring on test set:")
    s = grid.best_estimator_.score(x_test, y_test)
    print(" score=", s)
def prepare_data(training_data, relative_test_size):
    """Split the raw frame into feature/score train and test sets.

    Drops 'userid' and the trait columns from the features, keeps the trait
    columns as the score frame, then splits both with a fixed random seed so
    the split is reproducible across runs.
    """
    scores = training_data[list(TRAITS)]
    feature_columns = training_data.drop(['userid'] + list(TRAITS), axis=1)
    # Reproducible train/test split shared by every model trained below.
    split = train_test_split(feature_columns, scores,
                             test_size=relative_test_size, random_state=7)
    features_train, features_test, scores_train, scores_test = split
    return features_train, features_test, scores_train, scores_test
if __name__ == "__main__":
    data = pd.read_csv("data/training_data.csv", index_col=0)
    relative_test_size = 0.2
    features_train, features_test, scores_train, scores_test = prepare_data(data, relative_test_size)
    print("Number of training samples", len(features_train))
    print("Number of test samples", len(features_test))
    for trait in TRAITS:
        for feature_set in [trait, 'common', 'union']:
            if feature_set == 'union':
                # 'union' is derived per trait: trait-specific plus common features.
                selected_features_boosting['union'] = set(selected_features_boosting[trait]) | set(selected_features_boosting['common'])
                selected_features_pearson['union'] = set(selected_features_pearson[trait]) | set(selected_features_pearson['common'])
            # FIX: the pearson and boosting branches were copy-pasted duplicates;
            # loop over the two selection methods instead (pearson first, then
            # boosting, preserving the original order).
            for selection, method in ((selected_features_pearson, "pearson"),
                                      (selected_features_boosting, "boosting")):
                feature_names = selection[feature_set]
                name = trait + "_" + feature_set + "_" + method
                # Skip combinations whose model has already been saved.
                if os.path.exists("data/NN/{}_grid.keras".format(name)):
                    continue
                print("######################")
                print("starting next training with {}:".format(method), trait, feature_set)
                print("######################")
                # select features
                X_train = features_train[[*feature_names]]
                X_test = features_test[[*feature_names]]
                y_train = scores_train[trait].values.flatten()
                y_test = scores_test[trait].values.flatten()
                # actual training
                train(name, len(feature_names), X_train, y_train, X_test, y_test)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment