Forecast accuracy on the 28-day evaluation window, averaged per hierarchy level (the output of the script's final `print`):

| method | metric | total | states | stores | cats |
|---|---|---|---|---|---|
| OLS | agg_diff | 0.926737 | 0.922922 | 0.925042 | 0.958276 |
| OLS | r2 | -6.790550 | -0.134777 | 0.861383 | 0.961364 |
| WLSS | agg_diff | 0.926805 | 0.922828 | 0.925050 | 0.957586 |
| WLSS | r2 | -6.780770 | -0.137587 | 0.860504 | 0.961340 |
| our-middle-out | agg_diff | 0.927709 | 0.923106 | 0.925246 | 0.958024 |
| our-middle-out | r2 | -6.680218 | -0.137359 | 0.861531 | 0.961585 |
| no-reconciliation | agg_diff | 0.926443 | 0.923957 | 0.925246 | 0.956866 |
| no-reconciliation | r2 | -6.824754 | -0.120379 | 0.861531 | 0.960979 |
Created March 11, 2022 15:46
hts for m5 dataset
#!/usr/bin/env python
# coding: utf-8
# # scikit-hts showcase: [M5 kaggle competition](https://www.kaggle.com/c/m5-forecasting-accuracy)
#
# In this notebook we use the data from the Kaggle competition to perform some hierarchical forecasting. The problem is particularly well suited for the library, as there is a clear hierarchical relationship among the series: we have states, stores, categories, departments, and items; item sales sum up to departments, department sales sum up to categories, and so forth.
#
# We will, however, limit the scope of the forecasting task to producing forecasts at the
# department level, rather than going all the way down to forecasting single items.
#
# The reason for this is that the notebook is meant for illustration purposes rather than as a workable solution to the challenge.
# See [Local runtimes](https://research.google.com/colaboratory/local-runtimes.html)
import os

# os.system('pip install matplotlib pandas scikit-hts kaggle')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hts
from sklearn.pipeline import Pipeline
from prophet import Prophet

# Download the raw files into this directory
data = "data"
# ## Transform the data into the format required by scikit-hts
# We will build a hierarchy tree that looks like the following:
#
#     Level  Node key                                       # of nodes
#     1      total                                          1
#     2      CA, TX, WI                                     3
#     3      CA_1 ... CA_4, TX_1 ... TX_3, WI_1 ... WI_3    10
#     4      CA_1_FOODS ... WI_3_HOUSEHOLD                  30
#     5      CA_1_FOODS_1 ... WI_3_HOUSEHOLD_2              70
#
# The steps are the following:
#
# 1. Transform the dataset into a column-oriented one
# 2. Create the hierarchy representation as a dictionary (an example sketch follows below)
#
# For a complete description of how that is done under the hood, and for a sense of what the API accepts, see the [scikit-hts docs](https://scikit-hts.readthedocs.io/en/latest/hierarchy.html)
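#
# For illustration only (the literal node names below assume the underscore
# cleanup performed in `preprocess`, where e.g. the store id "CA_1" becomes
# "CA1"), such a hierarchy dictionary maps every parent node to the list of
# its children, along the lines of:
#
#     hierarchy = {
#         "total": ["CA", "TX", "WI"],
#         "CA": ["CA_CA1", "CA_CA2", "CA_CA3", "CA_CA4"],
#         "CA_CA1": ["CA_CA1_FOODS", "CA_CA1_HOBBIES", "CA_CA1_HOUSEHOLD"],
#         # ... one entry per parent node, down to the department level
#     }
#
# In this notebook we never build the dictionary by hand; the tree structure is
# derived from the level columns we pass to `hts.functions.get_hierarchichal_df`.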
def get_data():
    os.system("mkdir -p data")
    os.system(
        "cd data && kaggle competitions download -c m5-forecasting-accuracy && unzip -o m5-forecasting-accuracy.zip"
    )


calendar = pd.read_csv(os.path.join(data, "calendar.csv"), index_col="d")
def preprocess(path: str) -> tuple[pd.DataFrame, np.ndarray, list[str]]:
    df = pd.read_csv(path).drop(columns=["item_id", "id"])
    id_cols = [col for col in df.columns if not col.startswith("d_")]
    # map the day columns d_1, d_2, ... to actual dates via the calendar table
    df.columns = [
        pd.to_datetime(calendar.loc[col].date) if col not in id_cols else col
        for col in df.columns
    ]
    df = df.melt(id_vars=id_cols, var_name="date", value_name="sales")
    # aggregate item_id to reduce the number of models
    hierarchy = ["state_id", "store_id", "cat_id", "dept_id"]
    df = df.groupby(hierarchy + ["date"], as_index=False).sum()
    # get rid of underscores to avoid colliding with hts's internal underscore concatenation
    for col in hierarchy:
        df[col] = df[col].str.replace("_", "")
    # the hierarchy argument lists the column combinations per level:
    # [["state_id"], ["state_id", "store_id"], ["state_id", "store_id", "cat_id"]]
    return hts.functions.get_hierarchichal_df(
        df,
        level_names=hierarchy,
        hierarchy=[hierarchy[0 : i + 1] for i in range(len(hierarchy) - 1)],
        date_colname="date",
        val_colname="sales",
    )
def read_data(path: str) -> tuple[pd.DataFrame, np.ndarray, list[str]]:
    if not os.path.exists(path):
        get_data()
    return preprocess(path)


df, sum_mat, sum_mat_labels = read_data(
    os.path.join(data, "sales_train_evaluation.csv")
)
# reorder columns hierarchically, matching the row order of sum_mat (sum_mat_labels)
df = df[sum_mat_labels]
# hold out the last 28 days as the evaluation window
train_df, eval_df = df[:-28], df[-28:]
train_df.head()
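# For intuition: `sum_mat` is the summing matrix S with one row per node in
# `sum_mat_labels` and one column per bottom-level (department) series.
# A toy example (illustrative only, not the actual M5 matrix) for a hierarchy
# total -> {A, B} would be
#
#     S = [[1, 1],   # total = A + B
#          [1, 0],   # A
#          [0, 1]]   # B
#
# Multiplying S by the bottom-level series reproduces every aggregate, which is
# what the reconciliation step below relies on.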
# fit one Prophet model per node and forecast the 28-day evaluation window
forecasts = pd.DataFrame(index=eval_df.index, columns=eval_df.columns)
for col in sum_mat_labels:
    m = Prophet()
    m.fit(train_df[col].reset_index().rename(columns={"date": "ds", col: "y"}))
    forecasts[col] = m.predict(
        forecasts[col].reset_index().rename(columns={"date": "ds"})
    ).yhat.values
# reconciliation: revise the base forecasts with hts's optimal-combination methods
errors = pd.DataFrame()
for method in ["OLS", "WLSS"]:
    revised_forecasts = pd.DataFrame(
        data=hts.functions.optimal_combination(
            {
                col: pd.DataFrame(data=forecasts[col].values, columns=["yhat"])
                for col in sum_mat_labels
            },
            sum_mat,
            method=method,
            mse={},
        ),
        index=forecasts.index,
        columns=sum_mat_labels,
    )
    # agg_diff: ratio of summed forecasts to summed actuals per node
    agg_diff = revised_forecasts.sum() / eval_df.sum()
    # r2: one minus the per-node squared error, scaled by the summed variance of the evaluation window
    r2 = 1 - ((revised_forecasts - eval_df) ** 2).sum() / eval_df.var().sum()
    errors = pd.concat(
        (
            errors,
            pd.concat(
                (agg_diff, r2), axis=1, keys=[(method, "agg_diff"), (method, "r2")]
            ),
        ),
        axis=1,
    )
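# For intuition, the OLS flavour of optimal combination is a least-squares
# projection of the base forecasts onto the coherent subspace spanned by the
# summing matrix. A minimal numpy sketch (illustrative only, not used further;
# it assumes `sum_mat` is the dense summing matrix returned above, with rows
# ordered like `sum_mat_labels`):
S = np.asarray(sum_mat, dtype=float)  # summing matrix, one row per node
base = forecasts[sum_mat_labels].T.values.astype(float)  # base forecasts, (n_nodes, horizon)
beta = np.linalg.lstsq(S, base, rcond=None)[0]  # bottom-level estimates, (n_bottom, horizon)
ols_sketch = S @ beta  # coherent forecasts for every node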
# compared to our own middle-out approach, built around the store-level forecasts
revised_forecasts = forecasts.copy()
# bottom-up part: the total from the state forecasts, each state from its store forecasts
states = revised_forecasts.filter(regex=r"^\w\w$").columns
revised_forecasts["total"] = revised_forecasts[states].sum(axis=1)
for state in states:
    revised_forecasts[state] = revised_forecasts.filter(
        regex=rf"^{state}_{state}\d$"
    ).sum(axis=1)
# top-down part: distribute store forecasts to categories and departments by predicted shares
stores = revised_forecasts.filter(regex=r"^\w\w_\w\w\d$").columns
for store in stores:
    cats = revised_forecasts.filter(regex=rf"^{store}_[^_]*$").columns
    pred_sums = revised_forecasts[cats].sum()
    pred_shares = pred_sums / pred_sums.sum()
    for cat in cats:
        revised_forecasts[cat] = revised_forecasts[store] * pred_shares[cat]
cats = revised_forecasts.filter(regex=r"^\w\w_\w\w\d_[^_]*$").columns
for cat in cats:
    depts = revised_forecasts.filter(regex=rf"^{cat}_").columns
    pred_sums = revised_forecasts[depts].sum()
    pred_shares = pred_sums / pred_sums.sum()
    for dept in depts:
        revised_forecasts[dept] = revised_forecasts[cat] * pred_shares[dept]
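# Optional sanity check (illustrative): after the top-down step, each store's
# revised category forecasts should add back up to that store's forecast,
# because the predicted shares sum to one.
for store in stores:
    store_cats = revised_forecasts.filter(regex=rf"^{store}_[^_]*$").columns
    assert np.allclose(
        revised_forecasts[store_cats].sum(axis=1), revised_forecasts[store]
    )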
method = "our-middle-out" | |
agg_diff = revised_forecasts.sum() / eval_df.sum() | |
r2 = 1 - ((revised_forecasts - eval_df) ** 2).sum() / eval_df.var().sum() | |
errors = pd.concat( | |
( | |
errors, | |
pd.concat((agg_diff, r2), axis=1, keys=[(method, "agg_diff"), (method, "r2")]), | |
), | |
axis=1, | |
) | |
method = "no-reconciliation" | |
agg_diff = forecasts.sum() / eval_df.sum() | |
r2 = 1 - ((forecasts - eval_df) ** 2).sum() / eval_df.var().sum() | |
errors = pd.concat( | |
( | |
errors, | |
pd.concat((agg_diff, r2), axis=1, keys=[(method, "agg_diff"), (method, "r2")]), | |
), | |
axis=1, | |
) | |
# summarise per hierarchy level: the total node plus averages over states, stores, and categories
print(
    pd.concat(
        [
            errors.loc["total"].rename("total"),
            errors.loc[states].mean().rename("states"),
            errors.loc[stores].mean().rename("stores"),
            errors.loc[cats].mean().rename("cats"),
        ],
        axis=1,
    )
)