Forecast accuracy on the 28-day evaluation window, averaged per hierarchy level (the output of the script's final `print`):

| method | metric | total | states | stores | cats |
|---|---|---|---|---|---|
| OLS | agg_diff | 0.926737 | 0.922922 | 0.925042 | 0.958276 |
| OLS | r2 | -6.790550 | -0.134777 | 0.861383 | 0.961364 |
| WLSS | agg_diff | 0.926805 | 0.922828 | 0.925050 | 0.957586 |
| WLSS | r2 | -6.780770 | -0.137587 | 0.860504 | 0.961340 |
| our-middle-out | agg_diff | 0.927709 | 0.923106 | 0.925246 | 0.958024 |
| our-middle-out | r2 | -6.680218 | -0.137359 | 0.861531 | 0.961585 |
| no-reconciliation | agg_diff | 0.926443 | 0.923957 | 0.925246 | 0.956866 |
| no-reconciliation | r2 | -6.824754 | -0.120379 | 0.861531 | 0.960979 |
Created March 11, 2022 15:46
hts for m5 dataset
#!/usr/bin/env python
# coding: utf-8
# # scikit-hts showcase: [M5 kaggle competition](https://www.kaggle.com/c/m5-forecasting-accuracy)
#
# In this notebook we use the data from the Kaggle competition to perform some hierarchical forecasting. The problem is particularly well suited for the library, as there is a clear hierarchical relationship among the series: we have states, stores, categories, departments, and items; item sales sum up to departments, department sales sum up to categories, and so forth.
#
# We will, however, limit the scope of the forecasting task to producing forecasts at the
# department level, rather than going all the way down to forecasting single items.
#
# The reason for this is that the notebook is meant for illustration purposes rather than as a workable solution to the challenge.
# See [Local runtimes](https://research.google.com/colaboratory/local-runtimes.html)
import os

# os.system('pip install matplotlib pandas scikit-hts kaggle')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hts
from sklearn.pipeline import Pipeline
from prophet import Prophet

# Download the raw files into this directory
data = "data"
# ## Transform the data into the format required by scikit-hts
# We will build a hierarchy tree that looks like the following:
#
#     Level  Node key                                       # of nodes
#     1      total                                          1
#     2      CA, TX, WI                                     3
#     3      CA_1 ... CA_4, TX_1 ... TX_3, WI_1 ... WI_3    10
#     4      CA_1_FOODS ... WI_3_HOUSEHOLD                  30
#     5      CA_1_FOODS_1 ... WI_3_HOUSEHOLD_2              70
#
# The steps are the following:
#
# 1. Transform the dataset into a column-oriented one
# 2. Create the hierarchy representation as a dictionary (an example sketch follows below)
#
# For a complete description of how that is done under the hood, and for a sense of what the API accepts, see the [scikit-hts docs](https://scikit-hts.readthedocs.io/en/latest/hierarchy.html)
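#
# For illustration only (the literal node names below assume the underscore
# cleanup performed in `preprocess`, where e.g. the store id "CA_1" becomes
# "CA1"), such a hierarchy dictionary maps every parent node to the list of
# its children, along the lines of:
#
#     hierarchy = {
#         "total": ["CA", "TX", "WI"],
#         "CA": ["CA_CA1", "CA_CA2", "CA_CA3", "CA_CA4"],
#         "CA_CA1": ["CA_CA1_FOODS", "CA_CA1_HOBBIES", "CA_CA1_HOUSEHOLD"],
#         # ... one entry per parent node, down to the department level
#     }
#
# In this notebook we never build the dictionary by hand; the tree structure is
# derived from the level columns we pass to `hts.functions.get_hierarchichal_df`.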
def get_data():
    os.system("mkdir -p data")
    os.system(
        "cd data && kaggle competitions download -c m5-forecasting-accuracy && unzip -o m5-forecasting-accuracy.zip"
    )


calendar = pd.read_csv(os.path.join(data, "calendar.csv"), index_col="d")
def preprocess(path: str) -> tuple[pd.DataFrame, np.ndarray, list[str]]:
    df = pd.read_csv(path).drop(columns=["item_id", "id"])
    id_cols = [col for col in df.columns if not col.startswith("d_")]
    # map the day columns d_1, d_2, ... to actual dates via the calendar table
    df.columns = [
        pd.to_datetime(calendar.loc[col].date) if col not in id_cols else col
        for col in df.columns
    ]
    df = df.melt(id_vars=id_cols, var_name="date", value_name="sales")
    # aggregate item_id to reduce the number of models
    hierarchy = ["state_id", "store_id", "cat_id", "dept_id"]
    df = df.groupby(hierarchy + ["date"], as_index=False).sum()
    # get rid of underscores to avoid colliding with hts's internal underscore concatenation
    for col in hierarchy:
        df[col] = df[col].str.replace("_", "")
    # the hierarchy argument lists the column combinations per level:
    # [["state_id"], ["state_id", "store_id"], ["state_id", "store_id", "cat_id"]]
    return hts.functions.get_hierarchichal_df(
        df,
        level_names=hierarchy,
        hierarchy=[hierarchy[0 : i + 1] for i in range(len(hierarchy) - 1)],
        date_colname="date",
        val_colname="sales",
    )
def read_data(path: str) -> tuple[pd.DataFrame, np.ndarray, list[str]]:
    if not os.path.exists(path):
        get_data()
    return preprocess(path)


df, sum_mat, sum_mat_labels = read_data(
    os.path.join(data, "sales_train_evaluation.csv")
)
# reorder columns hierarchically, matching the row order of sum_mat (sum_mat_labels)
df = df[sum_mat_labels]
# hold out the last 28 days as the evaluation window
train_df, eval_df = df[:-28], df[-28:]
train_df.head()
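# For intuition: `sum_mat` is the summing matrix S with one row per node in
# `sum_mat_labels` and one column per bottom-level (department) series.
# A toy example (illustrative only, not the actual M5 matrix) for a hierarchy
# total -> {A, B} would be
#
#     S = [[1, 1],   # total = A + B
#          [1, 0],   # A
#          [0, 1]]   # B
#
# Multiplying S by the bottom-level series reproduces every aggregate, which is
# what the reconciliation step below relies on.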
# fit one Prophet model per node and forecast the 28-day evaluation window
forecasts = pd.DataFrame(index=eval_df.index, columns=eval_df.columns)
for col in sum_mat_labels:
    m = Prophet()
    m.fit(train_df[col].reset_index().rename(columns={"date": "ds", col: "y"}))
    forecasts[col] = m.predict(
        forecasts[col].reset_index().rename(columns={"date": "ds"})
    ).yhat.values
# reconciliation: revise the base forecasts with hts's optimal-combination methods
errors = pd.DataFrame()
for method in ["OLS", "WLSS"]:
    revised_forecasts = pd.DataFrame(
        data=hts.functions.optimal_combination(
            {
                col: pd.DataFrame(data=forecasts[col].values, columns=["yhat"])
                for col in sum_mat_labels
            },
            sum_mat,
            method=method,
            mse={},
        ),
        index=forecasts.index,
        columns=sum_mat_labels,
    )
    # agg_diff: ratio of summed forecasts to summed actuals per node
    agg_diff = revised_forecasts.sum() / eval_df.sum()
    # r2: one minus the per-node squared error, scaled by the summed variance of the evaluation window
    r2 = 1 - ((revised_forecasts - eval_df) ** 2).sum() / eval_df.var().sum()
    errors = pd.concat(
        (
            errors,
            pd.concat(
                (agg_diff, r2), axis=1, keys=[(method, "agg_diff"), (method, "r2")]
            ),
        ),
        axis=1,
    )
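# For intuition, the OLS flavour of optimal combination is a least-squares
# projection of the base forecasts onto the coherent subspace spanned by the
# summing matrix. A minimal numpy sketch (illustrative only, not used further;
# it assumes `sum_mat` is the dense summing matrix returned above, with rows
# ordered like `sum_mat_labels`):
S = np.asarray(sum_mat, dtype=float)  # summing matrix, one row per node
base = forecasts[sum_mat_labels].T.values.astype(float)  # base forecasts, (n_nodes, horizon)
beta = np.linalg.lstsq(S, base, rcond=None)[0]  # bottom-level estimates, (n_bottom, horizon)
ols_sketch = S @ beta  # coherent forecasts for every node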
# compared to our own middle-out approach, built around the store-level forecasts
revised_forecasts = forecasts.copy()
# bottom-up part: the total from the state forecasts, each state from its store forecasts
states = revised_forecasts.filter(regex=r"^\w\w$").columns
revised_forecasts["total"] = revised_forecasts[states].sum(axis=1)
for state in states:
    revised_forecasts[state] = revised_forecasts.filter(
        regex=rf"^{state}_{state}\d$"
    ).sum(axis=1)
# top-down part: distribute store forecasts to categories and departments by predicted shares
stores = revised_forecasts.filter(regex=r"^\w\w_\w\w\d$").columns
for store in stores:
    cats = revised_forecasts.filter(regex=rf"^{store}_[^_]*$").columns
    pred_sums = revised_forecasts[cats].sum()
    pred_shares = pred_sums / pred_sums.sum()
    for cat in cats:
        revised_forecasts[cat] = revised_forecasts[store] * pred_shares[cat]
cats = revised_forecasts.filter(regex=r"^\w\w_\w\w\d_[^_]*$").columns
for cat in cats:
    depts = revised_forecasts.filter(regex=rf"^{cat}_").columns
    pred_sums = revised_forecasts[depts].sum()
    pred_shares = pred_sums / pred_sums.sum()
    for dept in depts:
        revised_forecasts[dept] = revised_forecasts[cat] * pred_shares[dept]
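# Optional sanity check (illustrative): after the top-down step, each store's
# revised category forecasts should add back up to that store's forecast,
# because the predicted shares sum to one.
for store in stores:
    store_cats = revised_forecasts.filter(regex=rf"^{store}_[^_]*$").columns
    assert np.allclose(
        revised_forecasts[store_cats].sum(axis=1), revised_forecasts[store]
    )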
method = "our-middle-out" | |
agg_diff = revised_forecasts.sum() / eval_df.sum() | |
r2 = 1 - ((revised_forecasts - eval_df) ** 2).sum() / eval_df.var().sum() | |
errors = pd.concat( | |
( | |
errors, | |
pd.concat((agg_diff, r2), axis=1, keys=[(method, "agg_diff"), (method, "r2")]), | |
), | |
axis=1, | |
) | |
method = "no-reconciliation" | |
agg_diff = forecasts.sum() / eval_df.sum() | |
r2 = 1 - ((forecasts - eval_df) ** 2).sum() / eval_df.var().sum() | |
errors = pd.concat( | |
( | |
errors, | |
pd.concat((agg_diff, r2), axis=1, keys=[(method, "agg_diff"), (method, "r2")]), | |
), | |
axis=1, | |
) | |
# summarise per hierarchy level: the total node plus averages over states, stores, and categories
print(
    pd.concat(
        [
            errors.loc["total"].rename("total"),
            errors.loc[states].mean().rename("states"),
            errors.loc[stores].mean().rename("stores"),
            errors.loc[cats].mean().rename("cats"),
        ],
        axis=1,
    )
)