A numpy/pandas implementation of XGBoost
import numpy as np
import pandas as pd
from math import e
class Node:
    '''
    A node object that is recursively called within itself to construct a regression tree, based on Tianqi Chen's XGBoost.
    The internal gain used to find the optimal split value uses both the gradient and the hessian. The weighted quantile
    sketch and the optimal leaf values also follow Chen's description in "XGBoost: A Scalable Tree Boosting System"; the
    only thing not implemented in this version is sparsity-aware split finding, i.e. the ability to handle NA values with
    a default direction.

    Inputs
    ------------------------------------------------------------------------------------------------------------------
    x: pandas dataframe of the training data
    gradient: first order derivative (gradient) of the loss function with respect to the current prediction
    hessian: second order derivative of the loss function
    idxs: used to keep track of samples within the tree structure
    subsample_cols: layer-wise column subsampling, randomizing the structure of the trees (complexity parameter)
    min_leaf: minimum number of samples required on each side of a split (complexity parameter)
    min_child_weight: minimum sum of the hessian required inside a child node, used as a measure of purity
    (complexity parameter)
    depth: limits the number of layers in the tree
    lambda_: L2 regularization term on weights. Increasing this value makes the model more conservative.
    gamma: also prevents over-fitting and appears in the calculation of the gain (structure score).
    As it is subtracted from the gain it effectively sets a minimum gain required to split a node.
    eps: used in the weighted quantile sketch ('approx' tree method); roughly translates to (1 / eps) candidate bins

    Outputs
    --------------------------------------------------------------------------------------------------------------------
    A single tree object that will be used for gradient boosting.
    '''
    def __init__(self, x, gradient, hessian, idxs, subsample_cols = 0.8, min_leaf = 5, min_child_weight = 1, depth = 10, lambda_ = 1, gamma = 1, eps = 0.1):
        self.x, self.gradient, self.hessian = x, gradient, hessian
        self.idxs = idxs
        self.depth = depth
        self.min_leaf = min_leaf
        self.lambda_ = lambda_
        self.gamma = gamma
        self.min_child_weight = min_child_weight
        self.row_count = len(idxs)
        self.col_count = x.shape[1]
        self.subsample_cols = subsample_cols
        self.eps = eps

        self.column_subsample = np.random.permutation(self.col_count)[:round(self.subsample_cols * self.col_count)]

        self.val = self.compute_gamma(self.gradient[self.idxs], self.hessian[self.idxs])
        self.score = float('-inf')
        self.find_varsplit()
    def compute_gamma(self, gradient, hessian):
        '''
        Calculates the optimal leaf value, equation (5) in "XGBoost: A Scalable Tree Boosting System"
        '''
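        # equation (5): optimal leaf weight w* = -G / (H + lambda), where G and H are the
        # sums of gradients and hessians over the samples routed to this leaf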
        return(-np.sum(gradient) / (np.sum(hessian) + self.lambda_))
    def find_varsplit(self):
        '''
        Scans through every column and calculates the best split point.
        The node is then split at this point and two new nodes are created.
        Depth is the only parameter that changes, as we have added a new layer to the tree structure.
        If no split is better than the score initialised at the beginning then no further splits are made.
        '''
        for c in self.column_subsample: self.find_greedy_split(c)
        if self.is_leaf: return
        x = self.split_col
        lhs = np.nonzero(x <= self.split)[0]
        rhs = np.nonzero(x > self.split)[0]
        self.lhs = Node(x = self.x, gradient = self.gradient, hessian = self.hessian, idxs = self.idxs[lhs], min_leaf = self.min_leaf, depth = self.depth-1, lambda_ = self.lambda_, gamma = self.gamma, min_child_weight = self.min_child_weight, eps = self.eps, subsample_cols = self.subsample_cols)
        self.rhs = Node(x = self.x, gradient = self.gradient, hessian = self.hessian, idxs = self.idxs[rhs], min_leaf = self.min_leaf, depth = self.depth-1, lambda_ = self.lambda_, gamma = self.gamma, min_child_weight = self.min_child_weight, eps = self.eps, subsample_cols = self.subsample_cols)
    def find_greedy_split(self, var_idx):
        '''
        For a given feature greedily calculates the gain at each split point.
        Globally updates the best score and split point if a better split point is found.
        '''
        x = self.x.values[self.idxs, var_idx]

        for r in range(self.row_count):
            lhs = x <= x[r]
            rhs = x > x[r]

            lhs_indices = np.nonzero(x <= x[r])[0]
            rhs_indices = np.nonzero(x > x[r])[0]
            # hessians are looked up through self.idxs so the purity check uses this node's samples,
            # not global row positions
            if(rhs.sum() < self.min_leaf or lhs.sum() < self.min_leaf
               or self.hessian[self.idxs[lhs_indices]].sum() < self.min_child_weight
               or self.hessian[self.idxs[rhs_indices]].sum() < self.min_child_weight): continue

            curr_score = self.gain(lhs, rhs)
            if curr_score > self.score:
                self.var_idx = var_idx
                self.score = curr_score
                self.split = x[r]
    def weighted_quantile_sketch(self, var_idx):
        '''
        XGBOOST Mini-Version, Yiyang "Joe" Zeng.
        An approximation to the exact greedy approach, faster for bigger datasets where it is not feasible
        to calculate the gain at every split point. Uses equations (8) and (9) from "XGBoost: A Scalable Tree Boosting System".
        '''
        x = self.x.values[self.idxs, var_idx]
        hessian_ = self.hessian[self.idxs]
        df = pd.DataFrame({'feature': x, 'hess': hessian_})

        df.sort_values(by = ['feature'], ascending = True, inplace = True)
        hess_sum = df['hess'].sum()
        df['rank'] = df.apply(lambda row_: (1 / hess_sum) * sum(df[df['feature'] < row_['feature']]['hess']), axis = 1)
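        # equations (8) and (9): df['rank'] is r_k(z), the hessian-weighted fraction of samples with a
        # smaller feature value; the loop below keeps adjacent candidates whose rank gap stays below eps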
        for row in range(df.shape[0] - 1):
            # look at the current rank and the next rank
            rk_sk_j, rk_sk_j_1 = df['rank'].iloc[row:row + 2]
            diff = abs(rk_sk_j - rk_sk_j_1)
            if diff >= self.eps:
                continue

            # candidate split point: the midpoint of the two adjacent feature values, so it is comparable with x below
            split_value = (df['feature'].iloc[row + 1] + df['feature'].iloc[row]) / 2
            lhs = x <= split_value
            rhs = x > split_value

            lhs_indices = np.nonzero(x <= split_value)[0]
            rhs_indices = np.nonzero(x > split_value)[0]
            if(rhs.sum() < self.min_leaf or lhs.sum() < self.min_leaf
               or self.hessian[self.idxs[lhs_indices]].sum() < self.min_child_weight
               or self.hessian[self.idxs[rhs_indices]].sum() < self.min_child_weight): continue

            curr_score = self.gain(lhs, rhs)
            if curr_score > self.score:
                self.var_idx = var_idx
                self.score = curr_score
                self.split = split_value
    def gain(self, lhs, rhs):
        '''
        Calculates the gain at a particular split point based on equation (7) from
        "XGBoost: A Scalable Tree Boosting System"
        '''
        gradient = self.gradient[self.idxs]
        hessian = self.hessian[self.idxs]

        lhs_gradient = gradient[lhs].sum()
        lhs_hessian = hessian[lhs].sum()

        rhs_gradient = gradient[rhs].sum()
        rhs_hessian = hessian[rhs].sum()
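        # equation (7): gain = 1/2 * [ G_L^2 / (H_L + lambda) + G_R^2 / (H_R + lambda)
        #                              - (G_L + G_R)^2 / (H_L + H_R + lambda) ] - gamma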
        gain = 0.5 * ((lhs_gradient**2 / (lhs_hessian + self.lambda_)) + (rhs_gradient**2 / (rhs_hessian + self.lambda_)) - ((lhs_gradient + rhs_gradient)**2 / (lhs_hessian + rhs_hessian + self.lambda_))) - self.gamma
        return(gain)
    @property
    def split_col(self):
        '''
        returns the column used for the best split, restricted to this node's samples
        '''
        return self.x.values[self.idxs, self.var_idx]

    @property
    def is_leaf(self):
        '''
        checks if the node is a leaf: no split improved the score, or the maximum depth has been reached
        '''
        return self.score == float('-inf') or self.depth <= 0
    def predict(self, x):
        return np.array([self.predict_row(xi) for xi in x])

    def predict_row(self, xi):
        if self.is_leaf:
            return(self.val)

        node = self.lhs if xi[self.var_idx] <= self.split else self.rhs
        return node.predict_row(xi)
class XGBoostTree:
    '''
    Wrapper class that provides a scikit-learn-like interface to the recursive regression tree above.

    Inputs
    ------------------------------------------------------------------------------------------------------------------
    x: pandas dataframe of the training data
    gradient: first order derivative (gradient) of the loss function with respect to the current prediction
    hessian: second order derivative of the loss function
    idxs: generated internally by fit to keep track of samples within the tree structure
    subsample_cols: layer-wise column subsampling, randomizing the structure of the trees (complexity parameter)
    min_leaf: minimum number of samples required on each side of a split (complexity parameter)
    min_child_weight: minimum sum of the hessian required inside a child node, used as a measure of purity
    (complexity parameter)
    depth: limits the number of layers in the tree
    lambda_: L2 regularization term on weights. Increasing this value makes the model more conservative.
    gamma: also prevents over-fitting and appears in the calculation of the gain (structure score).
    As it is subtracted from the gain it effectively sets a minimum gain required to split a node.
    eps: used in the weighted quantile sketch ('approx' tree method); roughly translates to (1 / eps) candidate bins

    Outputs
    --------------------------------------------------------------------------------------------------------------------
    A single tree object that will be used for gradient boosting.
    '''
    def fit(self, x, gradient, hessian, subsample_cols = 0.8, min_leaf = 5, min_child_weight = 1, depth = 10, lambda_ = 1, gamma = 1, eps = 0.1):
        self.dtree = Node(x, gradient, hessian, np.array(np.arange(len(x))), subsample_cols, min_leaf, min_child_weight, depth, lambda_, gamma, eps)
        return self

    def predict(self, X):
        return self.dtree.predict(X.values)
class XGBoostClassifier:
    '''
    Full application of the XGBoost algorithm as described in "XGBoost: A Scalable Tree Boosting System" for
    binary classification. Gradients and hessians of the log loss are computed internally at each boosting round.

    Inputs
    ------------------------------------------------------------------------------------------------------------------
    X: pandas dataframe of the training data
    y: binary (0/1) training labels
    subsample_cols: layer-wise column subsampling, randomizing the structure of the trees (complexity parameter)
    min_leaf: minimum number of samples required on each side of a split (complexity parameter)
    min_child_weight: minimum sum of the hessian required inside a child node, used as a measure of purity
    (complexity parameter)
    depth: limits the number of layers in each tree
    learning_rate: shrinkage applied to each tree's contribution to the ensemble
    boosting_rounds: number of boosting rounds, i.e. the number of trees in the ensemble
    lambda_: L2 regularization term on weights. Increasing this value makes the model more conservative.
    gamma: also prevents over-fitting and appears in the calculation of the gain (structure score).
    As it is subtracted from the gain it effectively sets a minimum gain required to split a node.
    eps: used in the weighted quantile sketch ('approx' tree method); roughly translates to (1 / eps) candidate bins

    Outputs
    --------------------------------------------------------------------------------------------------------------------
    A fitted ensemble of boosted trees for binary classification.
    '''
    def __init__(self):
        self.estimators = []

    @staticmethod
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # first order gradient of the log loss
    def grad(self, preds, labels):
        preds = self.sigmoid(preds)
        return(preds - labels)

    # second order gradient (hessian) of the log loss
    def hess(self, preds, labels):
        preds = self.sigmoid(preds)
        return(preds * (1 - preds))
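    # derivation note: with raw score f and p = sigmoid(f), the log loss
    # L = -[y*log(p) + (1 - y)*log(1 - p)] has dL/df = p - y and d^2L/df^2 = p*(1 - p),
    # which is exactly what grad() and hess() above return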
    @staticmethod
    def log_odds(column):
        binary_yes = np.count_nonzero(column == 1)
        binary_no = np.count_nonzero(column == 0)
        return(np.log(binary_yes / binary_no))
    def fit(self, X, y, subsample_cols = 0.8, min_child_weight = 1, depth = 5, min_leaf = 5, learning_rate = 0.4, boosting_rounds = 5, lambda_ = 1.5, gamma = 1, eps = 0.1):
        self.X, self.y = X, y.values
        self.depth = depth
        self.subsample_cols = subsample_cols
        self.eps = eps
        self.min_child_weight = min_child_weight
        self.min_leaf = min_leaf
        self.learning_rate = learning_rate
        self.boosting_rounds = boosting_rounds
        self.lambda_ = lambda_
        self.gamma = gamma

        # the initial raw prediction is a constant score of 1.0 for every sample
        self.base_pred = np.full((X.shape[0], 1), 1).flatten().astype('float64')

        for booster in range(self.boosting_rounds):
            Grad = self.grad(self.base_pred, self.y)
            Hess = self.hess(self.base_pred, self.y)
            boosting_tree = XGBoostTree().fit(self.X, Grad, Hess, depth = self.depth, min_leaf = self.min_leaf, lambda_ = self.lambda_, gamma = self.gamma, eps = self.eps, min_child_weight = self.min_child_weight, subsample_cols = self.subsample_cols)
            self.base_pred += self.learning_rate * boosting_tree.predict(self.X)
            self.estimators.append(boosting_tree)
    def predict_proba(self, X):
        pred = np.zeros(X.shape[0])

        for estimator in self.estimators:
            pred += self.learning_rate * estimator.predict(X)

        return(self.sigmoid(np.full((X.shape[0], 1), 1).flatten().astype('float64') + pred))

    def predict(self, X):
        pred = np.zeros(X.shape[0])

        for estimator in self.estimators:
            pred += self.learning_rate * estimator.predict(X)

        predicted_probas = self.sigmoid(np.full((X.shape[0], 1), 1).flatten().astype('float64') + pred)
        # note: the classification threshold used here is the mean predicted probability rather than a fixed 0.5
        preds = np.where(predicted_probas > np.mean(predicted_probas), 1, 0)
        return(preds)
class XGBoostRegressor:
    '''
    Full application of the XGBoost algorithm as described in "XGBoost: A Scalable Tree Boosting System" for
    regression. Gradients and hessians of the squared error loss are computed internally at each boosting round.

    Inputs
    ------------------------------------------------------------------------------------------------------------------
    X: pandas dataframe of the training data
    y: continuous target values
    subsample_cols: layer-wise column subsampling, randomizing the structure of the trees (complexity parameter)
    min_leaf: minimum number of samples required on each side of a split (complexity parameter)
    min_child_weight: minimum sum of the hessian required inside a child node, used as a measure of purity
    (complexity parameter)
    depth: limits the number of layers in each tree
    learning_rate: shrinkage applied to each tree's contribution to the ensemble
    boosting_rounds: number of boosting rounds, i.e. the number of trees in the ensemble
    lambda_: L2 regularization term on weights. Increasing this value makes the model more conservative.
    gamma: also prevents over-fitting and appears in the calculation of the gain (structure score).
    As it is subtracted from the gain it effectively sets a minimum gain required to split a node.
    eps: used in the weighted quantile sketch ('approx' tree method); roughly translates to (1 / eps) candidate bins

    Outputs
    --------------------------------------------------------------------------------------------------------------------
    A fitted ensemble of boosted trees for regression.
    '''
    def __init__(self):
        self.estimators = []

    # first order gradient of the mean squared error
    @staticmethod
    def grad(preds, labels):
        return(2 * (preds - labels))

    # second order gradient of the mean squared error
    @staticmethod
    def hess(preds, labels):
        '''
        the hessian of the mean squared error is a constant value of two;
        returns an array of twos
        '''
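        # note: grad() and hess() use the un-halved squared error (preds - labels)^2,
        # whose derivatives are 2 * (preds - labels) and the constant 2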
        return(np.full((preds.shape[0], 1), 2).flatten().astype('float64'))
    def fit(self, X, y, subsample_cols = 0.8, min_child_weight = 1, depth = 5, min_leaf = 5, learning_rate = 0.4, boosting_rounds = 5, lambda_ = 1.5, gamma = 1, eps = 0.1):
        self.X, self.y = X, y.values
        self.depth = depth
        self.subsample_cols = subsample_cols
        self.eps = eps
        self.min_child_weight = min_child_weight
        self.min_leaf = min_leaf
        self.learning_rate = learning_rate
        self.boosting_rounds = boosting_rounds
        self.lambda_ = lambda_
        self.gamma = gamma

        # the initial prediction is the mean of the training targets
        self.base_pred = np.full((X.shape[0], 1), np.mean(y)).flatten().astype('float64')

        for booster in range(self.boosting_rounds):
            Grad = self.grad(self.base_pred, self.y)
            Hess = self.hess(self.base_pred, self.y)
            boosting_tree = XGBoostTree().fit(self.X, Grad, Hess, depth = self.depth, min_leaf = self.min_leaf, lambda_ = self.lambda_, gamma = self.gamma, eps = self.eps, min_child_weight = self.min_child_weight, subsample_cols = self.subsample_cols)
            self.base_pred += self.learning_rate * boosting_tree.predict(self.X)
            self.estimators.append(boosting_tree)
    def predict(self, X):
        pred = np.zeros(X.shape[0])

        for estimator in self.estimators:
            pred += self.learning_rate * estimator.predict(X)

        # use the stored training-target mean as the base prediction (y itself is not in scope here)
        return np.full((X.shape[0], 1), np.mean(self.y)).flatten().astype('float64') + pred
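
# --------------------------------------------------------------------------------------------------------------------
# Example usage: a minimal illustrative sketch of how the classes above could be fitted and queried on a small
# synthetic dataset. The feature names, sample size and hyperparameter values below are arbitrary assumptions,
# not recommendations, and the dataset is fabricated purely for demonstration.
# --------------------------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    rng = np.random.default_rng(0)

    # synthetic binary classification problem with two informative features
    X = pd.DataFrame({'f0': rng.normal(size = 100), 'f1': rng.normal(size = 100)})
    y = ((X['f0'] + X['f1']) > 0).astype(int)

    clf = XGBoostClassifier()
    clf.fit(X, y, depth = 3, boosting_rounds = 5, learning_rate = 0.3)
    print('classifier training accuracy:', np.mean(clf.predict(X) == y.values))

    # synthetic regression problem with a noisy linear target
    y_reg = 3 * X['f0'] - 2 * X['f1'] + rng.normal(scale = 0.1, size = 100)

    reg = XGBoostRegressor()
    reg.fit(X, y_reg, depth = 3, boosting_rounds = 5, learning_rate = 0.3)
    print('regressor training RMSE:', np.sqrt(np.mean((reg.predict(X) - y_reg.values) ** 2)))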