Created
June 12, 2024 11:27
-
-
Save yanboyang713/cfce9908ea7acce6566598817dc56308 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def objective(trial):
    """
    Optuna objective for the multioutput SVM regressor.

    Suggests (C, epsilon, kernel), fits an SVR wrapped in a
    MultiOutputRegressor on the module-level ``data`` frame, logs the fitted
    model's in-memory size and per-target MAE to MLflow, and returns the
    summed MAE over both targets (minimized by the study).

    NOTE(review): X and y are built from the exact same two columns, so the
    model is trained to reproduce its own inputs — confirm the intended
    feature/target split against the upstream pipeline.
    """
    with mlflow.start_run(nested=True):
        print ("start trial")
        # Hyperparameter search space.
        c_value = trial.suggest_float('C', 0.1, 10.0)
        eps_value = trial.suggest_float('epsilon', 0.01, 1.0)
        kernel_name = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
        # Features and targets (currently identical column sets — see NOTE above).
        X = data[['HTTP_reply_code', 'Bytes']]
        y = data[['HTTP_reply_code', 'Bytes']]
        # Hold out the trailing rows as a test window (time-ordered split).
        holdout = 20
        X_train = X[:-holdout]
        X_test = X[-holdout:]
        y_train = y[:-holdout]
        y_test = y[-holdout:]
        print ("done Split the data into training and testing sets")
        # One SVR per target via MultiOutputRegressor.
        model = MultiOutputRegressor(SVR(C=c_value, epsilon=eps_value, kernel=kernel_name))
        fitted_model = model.fit(X_train, y_train)
        # Deep in-memory size of the fitted estimator, logged as a metric.
        fitted_model_size = asizeof.asizeof(fitted_model)
        print ("done Train the Multioutput SVM model with suggested hyperparameters")
        mlflow.log_metric("fitted_model_size", fitted_model_size)
        mlflow.sklearn.log_model(model, "multioutput_svm_model")
        # Per-target mean absolute error on the holdout window.
        y_pred = model.predict(X_test)
        mae_HTTP_reply_code = mean_absolute_error(y_test['HTTP_reply_code'], y_pred[:, 0])
        mae_Bytes = mean_absolute_error(y_test['Bytes'], y_pred[:, 1])
        mlflow.log_metric("mae_http_reply_code", mae_HTTP_reply_code)
        mlflow.log_metric("mae_bytes", mae_Bytes)
        # Objective value: sum of the two per-target MAEs.
        return mae_HTTP_reply_code + mae_Bytes
# Module-level holder for the best hyperparameters found by the most recent
# search run (assigned a dict of Optuna best_params after optimization).
best_param = ""
def champion_callback(study, frozen_trial):
    """
    Logging callback that will report when a new trial iteration improves upon
    existing best trial values.

    Parameters
    ----------
    study : optuna.study.Study
        The study being optimized; the best-so-far value is cached in its
        user attrs under the key "winner".
    frozen_trial : optuna.trial.FrozenTrial
        The trial that just finished (its number and value are printed).

    Note: This callback is not intended for use in distributed computing systems such as Spark
    or Ray due to the micro-batch iterative implementation for distributing trials to a cluster's
    workers or agents.
    The race conditions with file system state management for distributed trials will render
    inconsistent values with this callback.
    """
    winner = study.user_attrs.get("winner", None)
    # Fix: explicit None checks. The original truthiness tests treated a
    # legitimate best value of 0.0 (a perfect score for a minimized MAE) as
    # "no value yet", so a 0.0 best was never recorded and a 0.0 winner was
    # misreported as the initial trial.
    if study.best_value is not None and winner != study.best_value:
        study.set_user_attr("winner", study.best_value)
        if winner is not None:
            # Improvement expressed relative to the *new* best value.
            # NOTE(review): raises ZeroDivisionError if the new best is
            # exactly 0.0 — confirm whether a guard is wanted here.
            improvement_percent = (abs(winner - study.best_value) / study.best_value) * 100
            print(
                f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
                f"{improvement_percent: .4f}% improvement"
            )
        else:
            print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")
def run_Multioutput_Regression_SVM_Hyperparameters(data, model_family, dataset_name, experiment_id=None):
    """
    Run an Optuna hyperparameter search for the multioutput SVM model and log
    the results (params, metrics, dataset, tags) to MLflow.

    Parameters
    ----------
    data : pandas.DataFrame
        Source data; logged to MLflow as the training dataset input. The
        objective itself reads the module-level ``data``.
    model_family : str
        Model-family label used in the run name and tags.
    dataset_name : str
        Dataset label used in the run name, tags, and dataset metadata.
    experiment_id : str, optional
        MLflow experiment to log under; ``None`` (the default, added for
        backward compatibility with 3-argument callers) uses the active
        experiment.

    Returns
    -------
    optuna.study.Study
        The completed study; best params/value are on its attributes.
    """
    # Fix: declare the global — the assignment at the end of this function
    # previously created a dead local instead of updating module-level
    # best_param.
    global best_param
    mlflow.end_run()
    # Explicitly name runs so they are identifiable in the MLflow UI.
    today = dt.datetime.now()
    run_name = model_family + " " + str(today) + " " + dataset_name
    # Create an instance of a PandasDataset for input logging.
    dataset = mlflow.data.from_pandas(
        data, name=dataset_name
    )
    mlflow.enable_system_metrics_logging()
    mlflow.system_metrics.set_system_metrics_sampling_interval(1)
    with mlflow.start_run(run_name=run_name, experiment_id=experiment_id, log_system_metrics=True, nested=True) as run:
        # Turn autolog on to save model artifacts, requirements, etc.
        mlflow.autolog(log_models=True)
        # Run the Optuna optimization; each trial opens its own nested run.
        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=20, callbacks=[champion_callback], timeout=120, catch=(TimeoutError,))
        # Best hyperparameters
        mlflow.log_params(study.best_params)
        # NOTE(review): the objective returns a summed MAE, not an MSE; the
        # metric key is kept as-is for compatibility with existing runs.
        mlflow.log_metric("best_mse", study.best_value)
        mlflow.log_input(dataset, context="training")
        # Log tags
        mlflow.set_tags(
            tags={
                "project": "p4 workflow",
                "optimizer_engine": "optuna",
                "model_family": model_family,
                "dataset": dataset_name,
                "with_the_Best_Hyperparameters": "false",
            }
        )
        print("run id: ", run.info.run_id)
        print ("best_params: ", study.best_params)
        best_param = study.best_params
        return study
# Script entry point: run the Optuna/MLflow sweep for the SVM model family.
model_family = "Multioutput_Regression_SVM"
dataset_name = "calgary_one_day"
# Fix: the original call omitted the required fourth argument
# (experiment_id), raising a TypeError before any work was done. Passing
# None logs under the currently active MLflow experiment.
study = run_Multioutput_Regression_SVM_Hyperparameters(data, model_family, dataset_name, None)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment