Last active
March 2, 2018 07:56
-
-
Save Benedikt1992/c6f7ef175e957b011d1d13bf176818f8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pickle | |
import pandas as pd | |
from sklearn.metrics import make_scorer | |
from sklearn.model_selection import GridSearchCV | |
from sklearn.model_selection import KFold | |
from sklearn.model_selection import train_test_split | |
from xgboost import XGBRegressor | |
from data.selected_features_boosting import selected_features_boosting | |
from data.selected_features_pearson import selected_features_pearson | |
# Traits for which models are trained by the script entry point, in this order.
TRAITS_TO_TRAIN = ['ope', 'con', 'ext', 'agr', 'neu']

# Fraction of the samples passed to train_test_split as the test-set size.
TEST_SIZE = 0.2

# Names of all trait (target) columns; prepare_data() selects them as scores
# and removes them from the feature matrix.
TRAITS = {'ope', 'con', 'ext', 'agr', 'neu'}

# NOTE(review): this read happens at import time and the same file is read
# again under the ``__main__`` guard -- confirm whether any importer relies
# on the module-level ``data`` before removing it.
data = pd.read_csv("data/training_data.csv", index_col=0)

# NOTE(review): not referenced anywhere in the visible part of this file.
selected_features = {}
# RMSE metric
def rmse(y_actual, y_predicted):
    """Return the root mean squared error of the predictions.

    Both arguments are forwarded unchanged to
    ``sklearn.metrics.mean_squared_error``; the square root of that value
    is returned.
    """
    # Imports stay function-local, as in the original implementation.
    import math
    from sklearn.metrics import mean_squared_error

    mean_sq_error = mean_squared_error(y_actual, y_predicted)
    return math.sqrt(mean_sq_error)
# r2 metric
def r2(y_actual, y_predicted):
    """Return ``sklearn.metrics.r2_score`` for the given targets and predictions.

    Thin wrapper kept for use with ``make_scorer``; the grid search in this
    file currently passes the built-in ``'r2'`` scorer name instead.
    """
    # Import stays function-local, as in the original implementation.
    from sklearn import metrics

    return metrics.r2_score(y_actual, y_predicted)
def boostwith(X_train, X_test, y_train, y_test, trait, load_from_disk=False):
    """Grid-search an XGBoost regressor, or reload a previously saved search.

    When ``load_from_disk`` is false, a ``GridSearchCV`` over ``max_depth``,
    ``n_estimators`` and ``learning_rate`` is fitted on the training data
    with 3-fold cross-validation, the best model is scored on the test data,
    and the fitted search object is pickled to
    ``data/xgboost-models/<trait>_boosting.model``.

    When ``load_from_disk`` is true, nothing is trained; the pickled search
    object for ``trait`` is read back from that same path.

    Args:
        X_train: Training features, passed to ``GridSearchCV.fit``.
        X_test: Test features, used only for the final score printout.
        y_train: Training targets.
        y_test: Test targets.
        trait: Name used to build the model file name.
        load_from_disk: Load the saved search instead of training.

    Returns:
        The fitted (or unpickled) ``GridSearchCV`` object. The original
        implementation returned ``None`` implicitly, which made the
        ``load_from_disk`` branch useless; callers that ignore the return
        value are unaffected.
    """
    model_dir = 'data/xgboost-models'
    model_path = "data/xgboost-models/{}_boosting.model".format(trait)

    if load_from_disk:
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only load model files that this script itself produced.
        with open(model_path, "rb") as model_file:
            return pickle.load(model_file)

    scoring = {
        # greater_is_better=False makes the scorer return the negated RMSE,
        # so that "higher is better" holds during model selection.
        'rmse': make_scorer(rmse, greater_is_better=False),
        'r2': 'r2'  # make_scorer(r2)
    }

    # No random_state here: without shuffle=True the seed has no effect, and
    # recent scikit-learn releases raise ValueError for that combination.
    # The folds are therefore the same contiguous, unshuffled folds as before.
    kfold = KFold(n_splits=3)

    # LEARN
    param_grid = {
        'max_depth': [2, 3, 4],
        'n_estimators': [300, 500],
        'learning_rate': [0.01, 0.1, 0.001],
    }
    clf = GridSearchCV(
        XGBRegressor(),
        param_grid,
        n_jobs=-1,
        cv=kfold,
        scoring=scoring,
        refit='rmse',  # refit best scoring model on whole training data with rmse metric
        return_train_score=False,  # for better performance
        verbose=2,  # print progress
    )
    clf.fit(X_train, y_train)

    # best_score_ belongs to the refit metric, i.e. it is the negated RMSE.
    print("Best score: {}".format(clf.best_score_))
    print("Best Params: {}".format(clf.best_params_))

    print("\n## Scoring on test set:")
    # Uses the estimator's own default score(), not the 'rmse' scorer above.
    s = clf.best_estimator_.score(X_test, y_test)
    print(" score=", s)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(model_dir, exist_ok=True)
    # Context manager guarantees the file is flushed and closed.
    with open(model_path, "wb") as model_file:
        pickle.dump(clf, model_file)

    return clf
def prepare_data(training_data, relative_test_size):
    """Split the training frame into train/test features and trait scores.

    The columns named in ``TRAITS`` form the score (target) frame; the
    feature frame is ``training_data`` without ``'userid'`` and without the
    trait columns. Both are split together with a fixed seed.

    Args:
        training_data: DataFrame containing ``'userid'``, the trait columns
            and the feature columns.
        relative_test_size: Passed to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(features_train, features_test, scores_train, scores_test)``.
    """
    trait_columns = list(TRAITS)
    targets = training_data[trait_columns]
    predictors = training_data.drop(['userid'] + trait_columns, axis=1)

    # split to train-validation and test sets (seeded for repeatable splits)
    parts = train_test_split(
        predictors,
        targets,
        test_size=relative_test_size,
        random_state=7,
    )
    return tuple(parts)
if __name__ == "__main__":
    # Train one grid-searched model per (trait, feature set, selection method).
    data = pd.read_csv("data/training_data.csv", index_col=0)
    features_train, features_test, scores_train, scores_test = prepare_data(data, TEST_SIZE)
    print("Number of training samples", len(features_train))
    print("Number of test samples", len(features_test))

    # Feature-selection methods, in the order they are trained for each
    # feature set: label used in model names / log output -> feature mapping.
    selection_methods = (
        ("pearson", selected_features_pearson),
        ("boosting", selected_features_boosting),
    )

    for trait in TRAITS_TO_TRAIN:
        # Targets depend only on the trait, so compute them once per trait.
        y_train = scores_train[trait].values.flatten()
        y_test = scores_test[trait].values.flatten()

        for feature_set in [trait, 'common', 'union']:
            for method, selected in selection_methods:
                if feature_set == 'union':
                    # Union of the trait-specific and the common features.
                    # Sorted because set iteration order of strings can vary
                    # between interpreter runs, which would make the column
                    # order of the training matrix unrepeatable. Computed
                    # locally instead of writing a 'union' key into the
                    # imported mapping.
                    feature_names = sorted(set(selected[trait]) | set(selected['common']))
                else:
                    feature_names = list(selected[feature_set])

                name = trait + "_" + feature_set + "_" + method
                print("######################")
                print("starting next training with {}:".format(method), trait, feature_set)
                print("######################")

                # select features
                X_train = features_train[feature_names]
                X_test = features_test[feature_names]

                # actual training
                boostwith(X_train, X_test, y_train, y_test, name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment