Last active
September 14, 2022 01:23
-
-
Save liangfu/28f814655abbc2cc89c575d69605141b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Example script for defining and using custom models in AutoGluon Tabular """ | |
from autogluon.core.utils import infer_problem_type | |
from autogluon.tabular import TabularDataset, TabularPredictor | |
from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config | |
from autogluon.core.data import LabelCleaner | |
from autogluon.core.models import AbstractModel | |
from skl2onnx import convert_sklearn, get_model_alias | |
from skl2onnx.common._registration import get_shape_calculator, get_converter | |
from onnxconverter_common.data_types import Int64TensorType, FloatTensorType | |
from sklearn.pipeline import Pipeline | |
from skl2onnx import update_registered_converter | |
from skl2onnx._parse import _parse_sklearn | |
from sklearn.base import is_classifier | |
from sklearn.neural_network import MLPClassifier | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.svm import SVC | |
from sklearn.gaussian_process import GaussianProcessClassifier | |
from sklearn.gaussian_process.kernels import RBF | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier | |
from sklearn.naive_bayes import GaussianNB | |
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis | |
import onnxruntime as rt | |
import numpy as np | |
import pandas as pd | |
import time | |
######################### | |
# Create a custom model # | |
######################### | |
# In this example, we create a custom Naive Bayes model for use in AutoGluon | |
class NaiveBayesModel(AbstractModel):
    """Gaussian Naive Bayes exposed as an AutoGluon custom model.

    This is the straightforward variant: unsupported column dtypes are
    filtered out on every `_preprocess` call.
    """

    def _preprocess(self, X, **kwargs):
        """Turn the input frame into the representation the model consumes.

        `_preprocess` is invoked by `preprocess`, so the same steps run at
        fit time and at inference time.
        """
        # GaussianNB cannot consume category/object columns, so discard them.
        unsupported = X.select_dtypes(['category', 'object']).columns
        supported_only = X.drop(columns=unsupported)
        prepared = super()._preprocess(supported_only, **kwargs)
        # Missing values are replaced with 0.
        return prepared.fillna(0)

    def _fit(self, X, y, **kwargs):
        """Train the underlying GaussianNB on the training data `X`, `y`."""
        from sklearn.naive_bayes import GaussianNB

        # Run `preprocess` here so training sees exactly what inference will see.
        features = self.preprocess(X)
        estimator = GaussianNB(**self.params)
        self.model = estimator
        estimator.fit(features, y)
# Example of a more optimized implementation that drops the invalid features earlier on to avoid having to make repeated checks. | |
class AdvancedNaiveBayesModel(AbstractModel):
    """Gaussian Naive Bayes custom model that declares its unsupported dtypes up front.

    Instead of filtering columns inside `_preprocess`, the dtypes to ignore
    are listed in the auxiliary params returned by
    `_get_default_auxiliary_params`.
    """

    def _preprocess(self, X, **kwargs):
        """Apply the base preprocessing, then replace missing values with 0."""
        prepared = super()._preprocess(X, **kwargs)
        return prepared.fillna(0)

    def _fit(self, X, y, **kwargs):
        """Train the underlying GaussianNB on the preprocessed training data."""
        from sklearn.naive_bayes import GaussianNB

        features = self.preprocess(X)
        estimator = GaussianNB(**self.params)
        self.model = estimator
        estimator.fit(features, y)

    def _get_default_auxiliary_params(self) -> dict:
        """Return the base auxiliary params extended with the dtypes to ignore.

        Auxiliary params hold model-agnostic settings such as the valid input
        column dtypes; most custom models only need to adjust the dtypes here.
        """
        aux_params = super()._get_default_auxiliary_params()
        # Drop category and object columns, since NaiveBayes can't handle them.
        aux_params['ignored_type_group_raw'] = ['category', 'object']
        return aux_params
# In this example, we create a generic model wrapper that trains one of several scikit-learn classifiers, selected by name, for use in AutoGluon
class GenericClassifierModel(AbstractModel):
    """AutoGluon custom model wrapping one of several scikit-learn classifiers.

    The classifier is chosen by display name (`classifier_name`), which must
    be one of the entries in `names`.

    :param classifier_name: display name of the classifier to train
    :param kwargs: forwarded unchanged to `AbstractModel.__init__`
    """

    # Display names accepted as `classifier_name`. This list is index-aligned
    # with `classifiers` below: names[i] selects classifiers[i].
    names = [
        "Nearest Neighbors",
        "Linear SVM",
        "RBF SVM",
        "Gaussian Process",
        "Decision Tree",
        "Random Forest",
        "Neural Net",
        "AdaBoost",
        "Naive Bayes",
        "QDA",
    ]
    classifiers = [
        KNeighborsClassifier,
        SVC,
        SVC,
        GaussianProcessClassifier,
        DecisionTreeClassifier,
        RandomForestClassifier,
        MLPClassifier,
        AdaBoostClassifier,
        GaussianNB,
        QuadraticDiscriminantAnalysis,
    ]

    def __init__(self, classifier_name, **kwargs):
        super().__init__(**kwargs)
        self.classifier_name = classifier_name

    def _preprocess(self, X, **kwargs):
        """Turn the input frame into the representation the classifier consumes.

        `_preprocess` is called by `preprocess` and is used during model fit
        and model inference.
        """
        # Drop category and object columns so only the remaining dtypes reach
        # the wrapped scikit-learn classifier.
        cat_columns = X.select_dtypes(['category', 'object']).columns
        X = X.drop(cat_columns, axis=1)
        # Add a fillna call to handle missing values.
        return super()._preprocess(X, **kwargs).fillna(0)

    def _fit(self, X, y, **kwargs):
        """Instantiate the selected classifier and train it on `X`, `y`.

        Per-classifier preset hyperparameters are merged into `self.params`
        (overriding same-named entries) before the estimator is built.

        :raises KeyError: if `self.classifier_name` is not listed in `names`
        """
        # It is important to call `preprocess(X)` in `_fit` to replicate what
        # will occur during inference.
        X = self.preprocess(X)

        classifier_map = dict(zip(self.names, self.classifiers))
        try:
            classifier = classifier_map[self.classifier_name]
        except KeyError:
            raise KeyError(
                f"Unknown classifier_name {self.classifier_name!r}; "
                f"expected one of {self.names}"
            ) from None

        # SVC only provides predict_proba when built with probability=True,
        # and the profiling code in this file calls predict_proba on every
        # model, so both SVM presets enable it.
        presets = {
            "Linear SVM": dict(kernel="linear", C=0.025, probability=True),
            "RBF SVM": dict(gamma=2, C=1, probability=True),
            "Neural Net": dict(hidden_layer_sizes=(200,)),
        }
        self.params.update(presets.get(self.classifier_name, {}))

        self.model = classifier(**self.params)
        self.model.fit(X, y)
def advanced_naive_bayes_shape_calculator(operator):
    """No-op shape calculator registered for the custom model classes.

    It performs no work on `operator` and returns None; it exists so a shape
    calculator can be supplied when the converter is registered.

    :param operator: the operator being converted (unused)
    """
    return None
def advanced_naive_bayes_converter(scope, operator, container):
    """Convert a fitted custom model wrapper by delegating to its inner estimator.

    The wrapper's fitted scikit-learn estimator (``operator.raw_operator.model``)
    is parsed with skl2onnx's own parser, and each resulting output is wired
    to the corresponding operator output through an ``Identity`` node.

    Side effects: overwrites the declared shape of the first operator input,
    prepends an int64 label variable to ``operator.outputs``, and casts the
    estimator's ``classes_`` to int64 in place.

    :param scope: name space, where to keep node names, get unused new names
    :param operator: operator to convert; its ``raw_operator`` must be a
        fitted wrapper exposing the scikit-learn estimator as ``.model``
    :param container: contains the ONNX graph
    :raises RuntimeError: if the number of parsed estimator outputs differs
        from the number of operator outputs
    """
    wrapper = operator.raw_operator  # custom model wrapper (must be fitted)
    model = wrapper.model            # the wrapped scikit-learn estimator
    inputs = operator.inputs

    # Re-declare the input width as the number of features the fitted
    # estimator saw, which may differ from the initial type passed in.
    operator.inputs[0].type.shape = (None, model.n_features_in_)

    # Add a label output ahead of the existing output(s).
    val_label = scope.declare_local_variable('val_label', Int64TensorType())
    operator.outputs.insert(0, val_label)
    # NOTE(review): presumably needed so the label output matches the
    # Int64TensorType declared above -- confirm.
    model.classes_ = model.classes_.astype(np.int64)

    if is_classifier(model):
        # Disable zipmap on both the scope and the container for this estimator.
        scope.add_options(id(model), options={'zipmap': False})
        container.add_options(id(model), options={'zipmap': False})
    outputs = _parse_sklearn(scope, model, inputs, custom_parsers=None)

    if len(outputs) != len(operator.outputs):
        raise RuntimeError(
            "Mismatch between pipeline output %d and "
            "last step outputs %d." % (
                len(outputs), len(operator.outputs)))
    for fr, to in zip(outputs, operator.outputs):
        container.add_node(
            'Identity', fr.full_name, to.full_name,
            name=scope.get_unique_operator_name("Id" + operator.onnx_name))
# Register the same shape calculator / converter pair for every custom model
# class defined above, each under an alias equal to its class name.
for _model_cls, _alias in (
    (NaiveBayesModel, 'NaiveBayesModel'),
    (AdvancedNaiveBayesModel, 'AdvancedNaiveBayesModel'),
    (GenericClassifierModel, 'GenericClassifierModel'),
):
    update_registered_converter(_model_cls, _alias,
                                advanced_naive_bayes_shape_calculator,
                                advanced_naive_bayes_converter)
def main():
    """Load the demo dataset and profile each custom model in turn.

    Runs `profile` for AdvancedNaiveBayesModel, NaiveBayesModel, and then one
    GenericClassifierModel per enabled classifier name.
    """

    def announce(title):
        # Print a three-line banner around `title`.
        rule = "======================================================="
        print(rule)
        print(title)
        print(rule)

    ################
    # Loading Data #
    ################
    label = 'class'  # specifies which column do we want to predict
    save_path = 'ag_models/'  # where to save trained models (not used further in this function)
    # Can be a local CSV file as well; returns a Pandas DataFrame.
    # Subsampled to the first 1000 rows for a faster demo.
    train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv').head(1000)

    #####################################################
    # Training custom model outside of TabularPredictor #
    #####################################################
    # Separate features and labels
    X = train_data.drop(columns=[label])
    y = train_data[label]

    # A LabelCleaner converts labels to float/integers for training/inference;
    # it can also inverse_transform back to the original labels.
    problem_type = infer_problem_type(y=y)  # Infer problem type (or else specify directly)
    label_cleaner = LabelCleaner.construct(problem_type=problem_type, y=y)
    y_clean = label_cleaner.transform(y)

    # Prepare test data
    test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')  # another Pandas DataFrame
    X_test = test_data.drop(columns=[label])
    y_test = test_data[label]
    y_test_clean = label_cleaner.transform(y_test)

    generic_names = [
        "Nearest Neighbors",  # Slow in onnxruntime (10x slower)
        "Decision Tree",
        "Random Forest",
        "Neural Net",
        "AdaBoost",
        "Naive Bayes",
        "RBF SVM",
        # "Linear SVM",  # Too slow, not responsive
        # "Gaussian Process",  # com.microsoft.Solve operator not supported in onnxruntime
        # "QDA",  # Unable to find a shape calculator
    ]

    # (banner title, zero-argument model factory) pairs, in execution order.
    # Each model is only constructed right before it is profiled.
    runs = [
        ("> Profiling AdvancedNaiveBayesModel", AdvancedNaiveBayesModel),
        ("> Profiling NaiveBayesModel", NaiveBayesModel),
    ]
    runs.extend(
        (f"> Profiling {name} Model", lambda name=name: GenericClassifierModel(name))
        for name in generic_names
    )

    for title, build_model in runs:
        announce(title)
        profile(X, y_clean, X_test, y_test_clean, build_model())
def profile(X, y_clean, X_test, y_test_clean, model):
    """Fit `model`, score it, then time scikit-learn vs. ONNX Runtime inference.

    Side effects: prints predictions, the test score and both timings to
    stdout, and writes a text dump of the converted ONNX model to
    ``onnx.txt`` in the current working directory.

    :param X: training features
    :param y_clean: training labels, already passed through the label cleaner
    :param X_test: test features
    :param y_test_clean: test labels, already passed through the label cleaner
    :param model: an unfitted custom model instance (one of the classes
        registered with skl2onnx in this file)
    """
    model.fit(X=X, y=y_clean)  # Fit custom model

    # To save to disk and load the model, do the following:
    #   load_path = model.path
    #   model.save()
    #   model = type(model).load(path=load_path)

    # Exercise the label-prediction path too; only the probabilities are printed.
    model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)
    print(np.array(y_pred_proba).astype(np.float32))
    score = model.score(X_test, y_test_clean)
    print(f'>>>>>>>>>>>>>>>>>>>>>> test score ({model.eval_metric.name}) = {score} <<<<<<<<<<<<<<<<<<<<<<')

    #####################################
    # Conversion to onnx using skl2onnx #
    #####################################
    pipe = Pipeline(steps=[('model', model)])

    print("skl predict_proba")
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    tic = time.perf_counter()
    skl_pred = pipe.predict_proba(X_test)
    toc = time.perf_counter()
    print(skl_pred)
    print(f">>>>>>>>>>>>>>>>>>>>>> skl elapsed: {(toc-tic)*1000.0:.3f} ms <<<<<<<<<<<<<<<<<<<<<<")

    # The declared input width is the raw test-frame width; the registered
    # converter re-declares it from the fitted estimator.
    initial_types = [('input', FloatTensorType((None, X_test.shape[1])))]
    model_onnx = convert_sklearn(pipe, initial_types=initial_types,
                                 target_opset=12, verbose=0)

    # The ONNX graph is fed data that has already been preprocessed, so
    # preprocessing happens outside the timed ONNX region below.
    # NOTE(review): the sklearn timing above presumably includes the
    # wrapper's preprocessing -- keep that in mind when comparing numbers.
    onnx_features = model.preprocess(X_test).to_numpy()

    # Dump the model as text for inspection; the `with` block flushes and closes.
    with open("onnx.txt", 'wt', encoding="utf-8") as fp:
        fp.write(str(model_onnx))

    print("onnx predict_proba")
    sess = rt.InferenceSession(model_onnx.SerializeToString())
    tic = time.perf_counter()
    onx_pred = sess.run(None, {'input': onnx_features.astype(np.float32)})[0]
    toc = time.perf_counter()
    print(onx_pred[:, 1])
    print(f">>>>>>>>>>>>>>>>>>>>>> onx elapsed: {(toc-tic)*1000.0:.3f} ms <<<<<<<<<<<<<<<<<<<<<<")
# Run the profiling demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment