Last active
June 3, 2018 22:24
-
-
Save yabyzq/a36b294739830838adeb3d03831fa7f4 to your computer and use it in GitHub Desktop.
forecast
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import os | |
import time | |
from numpy import newaxis | |
import math | |
from sklearn.metrics import mean_squared_error | |
import statsmodels.api as sm | |
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt | |
def mean_absolute_percentage_error(y_true, y_pred): | |
y_true, y_pred = np.array(y_true), np.array(y_pred) | |
return np.mean(np.abs((y_true - y_pred) / y_true)) * 100 | |
#loading data | |
df = pd.read_csv('C:/Users/Administrator/Desktop/LSTM/simple.csv') | |
df.index = pd.DatetimeIndex(pd.to_datetime(df.date,format='%Y-%m-%d'),freq='D') | |
#split train and test | |
train_size = round(0.75*len(df)) | |
train = df[1:train_size] | |
test = df[train_size:] | |
#http://www.seanabu.com/2016/03/22/time-series-seasonal-ARIMA-model-in-python/ | |
from statsmodels.tsa.stattools import adfuller | |
def test_stationarity(timeseries): | |
#Determing rolling statistics | |
rolmean = timeseries.rolling(window = 12).mean() | |
rolstd = timeseries.rolling(window = 12).std() | |
#Plot rolling statistics: | |
#fig = plt.figure(figsize=(12, 8)) | |
orig = plt.plot(timeseries, color='blue',label='Original') | |
mean = plt.plot(rolmean, color='red', label='Rolling Mean') | |
std = plt.plot(rolstd, color='black', label = 'Rolling Std') | |
plt.legend(loc='best') | |
plt.title('Rolling Mean & Standard Deviation') | |
plt.show() | |
#Perform Dickey-Fuller test: | |
print('Results of Dickey-Fuller Test:') | |
dftest = adfuller(timeseries, autolag='AIC') | |
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used']) | |
for key,value in dftest[4].items(): | |
dfoutput['Critical Value (%s)'%key] = value | |
print(dfoutput) | |
test_stationarity(df.balance) | |
# Method 1 - Baseline using last day value | |
prediction = test.copy().drop(['balance'], axis=1) | |
prediction['last_day'] = train['balance'][len(train)-1] | |
x = 30 | |
prediction['last_x_day'] = train['balance'].rolling(x).mean().iloc[-1] | |
plt.plot(train.index, train['balance'], label='Train') | |
plt.plot(test.index,test['balance'], label='Test') | |
plt.plot(prediction.index,prediction['last_day'], label='Last Day') | |
plt.plot(prediction.index,prediction['last_x_day'], label='Last %d Days'%x) | |
plt.legend(loc='best') | |
plt.show() | |
print("#Last Day\nCoefficent:" ,round(np.corrcoef(test.balance,prediction.last_day)[0,1],4),\ | |
" RMSE:", round(math.sqrt(mean_squared_error(test.balance, prediction.last_day)),4),\ | |
" MAPE:", round(mean_absolute_percentage_error(test.balance, prediction.last_day),4)) | |
print("#Last %d Days\nCoefficent:"%x ,round(np.corrcoef(test.balance,prediction.last_x_day)[0,1],4),\ | |
" RMSE:", round(math.sqrt(mean_squared_error(test.balance, prediction.last_x_day)),4),\ | |
" MAPE:", round(mean_absolute_percentage_error(test.balance, prediction.last_x_day),4)) | |
# Method 2 - capture Seasonality using Holt, ARIMA | |
# analysing training data | |
sm.tsa.seasonal_decompose(train.balance).plot() | |
plt.show() | |
#fit and predict | |
fit1 = Holt(np.asarray(train['balance'])).fit(smoothing_level = 0.05,smoothing_slope = 0.05) | |
prediction['holt_linear'] = fit1.forecast(len(test)) | |
fit2 = ExponentialSmoothing(np.asarray(train['balance']) ,seasonal_periods=7 ,trend='add', seasonal='add',).fit() | |
prediction['holt_winter'] = fit2.forecast(len(test)) | |
fit3 = sm.tsa.statespace.SARIMAX(train.balance, order=(2, 1, 4),seasonal_order=(0,1,1,7)).fit(maxiter=500) | |
prediction['arima'] = fit3.predict(start=test.index[0], end=test.index[-1], dynamic=True) | |
plt.plot(train['balance'], label='Train') | |
plt.plot(test['balance'], label='Test') | |
plt.plot(prediction['holt_linear'], label='Holt Linear') | |
plt.plot(prediction['holt_winter'], label='Holt Winter') | |
plt.plot(prediction['arima'], label='ARIMA') | |
plt.legend(loc='best') | |
plt.show() | |
print("#Holt Linear\nCoefficent:",round(np.corrcoef(test.balance,prediction.holt_linear)[0,1],4),\ | |
" RMSE:", round(math.sqrt(mean_squared_error(test.balance, prediction.holt_linear)),4),\ | |
" MAPE:", round(mean_absolute_percentage_error(test.balance, prediction.holt_linear),4)) | |
print("#Holt Winter\nCoefficent:",round(np.corrcoef(test.balance,prediction.holt_winter)[0,1],4),\ | |
" RMSE:", round(math.sqrt(mean_squared_error(test.balance, prediction.holt_winter)),4),\ | |
" MAPE:", round(mean_absolute_percentage_error(test.balance, prediction.holt_winter),4)) | |
print("#ARIMA\nCoefficent:",round(np.corrcoef(test.balance,prediction.arima)[0,1],4),\ | |
" RMSE:", round(math.sqrt(mean_squared_error(test.balance, prediction.arima)),4),\ | |
" MAPE:", round(mean_absolute_percentage_error(test.balance, prediction.arima),4)) | |
import Prophet from fbprophet | |
m = Prophet() | |
m.fit(pd.DataFrame({'ds':train.date, 'y':train.balance})) | |
forecast = m.predict(m.make_future_dataframe(periods=len(test))) | |
prediction['prophet'] = forecast['yhat'] | |
#http://pythondata.com/forecasting-time-series-data-with-prophet-part-1/ | |
# Method 3 - Facebook Prophet | |
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']] | |
type(forecast.yhat.index) | |
type(train.balance.index) | |
forecast = forecast.set_index(pd.DatetimeIndex(forecast['ds'],freq = 'D')) | |
plt.figure(figsize=(16,8)) | |
plt.plot( train['balance'], label='Train') | |
plt.plot(test['balance'], label='Test') | |
plt.plot(forecast.yhat[-35:-1], label='PROPHET') | |
plt.legend(loc='best') | |
plt.show() | |
m = Prophet() | |
m.fit(pd.DataFrame({'ds':train.date, 'y':train.balance})) | |
forecast = m.predict(m.make_future_dataframe(periods=len(test))) | |
forecast = forecast.set_index(pd.DatetimeIndex(forecast['ds'],freq = 'D')) | |
prediction['prophet'] = forecast['yhat'] | |
plt.figure(figsize=(16,8)) | |
plt.plot(train['balance'], label='Train') | |
plt.plot(test['balance'], label='Test') | |
plt.plot(prediction['prophet'], label='Prophet') | |
plt.legend(loc='best') | |
plt.show() | |
print("#Prophet\nCoefficent:",round(np.corrcoef(test.balance,prediction.prophet)[0,1],4),\ | |
" RMSE:", round(math.sqrt(mean_squared_error(test.balance, prediction.prophet)),4),\ | |
" MAPE:", round(mean_absolute_percentage_error(test.balance, prediction.prophet),4)) | |
from datetime import date, timedelta | |
import pandas as pd | |
import numpy as np | |
from sklearn.metrics import mean_squared_error | |
from keras.models import Sequential | |
from keras.layers.core import Dense, Dropout, Activation | |
from keras.layers import LSTM | |
from keras import callbacks | |
from keras.callbacks import ModelCheckpoint | |
from keras import optimizers | |
from sklearn.preprocessing import MinMaxScaler | |
import matplotlib.pyplot as plt | |
%matplotlib inline | |
#loading the data | |
data = pd.read_csv("C:/Users/eye1/Desktop/sett_data.csv",parse_dates=["date"]) | |
scaler = MinMaxScaler(feature_range=(0, 1)) | |
data['balance'] = scaler.fit_transform(data['balance'].values.reshape(-1,1)) | |
data = data.set_index(pd.DatetimeIndex(data['date'],freq = 'D')).drop(['date'], axis=1) | |
def get_timespan(input_data, date, minus, periods, freq='D'): | |
return input_data.loc[pd.date_range(date - timedelta(days=minus), periods=periods, freq=freq)] | |
train_start_date = pd.datetime(2017, 6, 10) | |
def convert_timeseries(data, start_date, is_train=True): | |
X = pd.DataFrame({"last_1": get_timespan(data, start_date, 1, 1).values.ravel(), | |
"last_3_mean": get_timespan(data, start_date, 3, 3).mean().values,#last 3 | |
"last_5_mean": get_timespan(data, start_date, 5, 5).mean().values,#last 5 | |
"last_7_mean": get_timespan(data, start_date, 7, 7).mean().values,#last 5 | |
"last_14_mean": get_timespan(data, start_date, 14, 14).mean().values}) | |
for i in range(7): | |
X['last_4_dow{}_mean'.format(i)] = get_timespan(data, start_date, 28-i, 4, freq='7D').mean().values #get every week last 7 | |
if is_train: | |
y = data.loc[pd.date_range(start_date, periods=1)].values | |
return X, y | |
return X | |
#choose cut off for train and test | |
cut_off = pd.datetime(2018, 2, 1) | |
#set up training data | |
X_train, y_train = [], [] | |
for i in range(pd.Timedelta(cut_off - train_start_date).days): | |
delta = timedelta(days=i) | |
X_tmp, y_tmp = convert_timeseries(data, train_start_date + delta) | |
X_train.append(X_tmp) | |
y_train.append(y_tmp) | |
X_train = pd.concat(X_train, axis=0) | |
y_train = np.concatenate(y_train, axis=0) | |
X_train = X_train.as_matrix() | |
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1])) | |
X_test, y_test = [], [] | |
for i in range(pd.Timedelta(data.index[-2] - cut_off).days): | |
delta = timedelta(days=i) | |
X_tmp, y_tmp = convert_timeseries(data, cut_off + delta) | |
X_test.append(X_tmp) | |
y_test.append(y_tmp) | |
X_test = pd.concat(X_test, axis=0) | |
y_test = np.concatenate(y_test, axis=0) | |
X_test = X_test.as_matrix() | |
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1])) | |
#model | |
model = Sequential() | |
model.add(LSTM(32, input_shape=(X_train.shape[1],X_train.shape[2]))) | |
model.add(Dropout(.1)) | |
model.add(Dense(8)) | |
model.add(Dropout(.1)) | |
model.add(Dense(1)) | |
#model.compile(loss = 'mse', optimizer='adam', metrics=['mse']) | |
model.compile(loss='mse',optimizer=optimizers.SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True)) | |
from numpy import newaxis | |
def predict_sequence_full(model, data, window_size): | |
#Shift the window by 1 new prediction each time, re-run predictions on new window | |
curr_frame = data | |
predicted = [] | |
for i in range(len(data)): | |
predicted.append(model.predict(curr_frame[i:i+1])) | |
curr_frame = curr_frame[1:] | |
curr_frame = np.insert(curr_frame, [window_size-1], predicted[-1], axis=0) | |
return predicted | |
model.fit(X_train, y_train, batch_size = 512, epochs = 500, verbose=2) | |
#plt.plot(model.predict(X_test)) | |
plt.plot(np.concatenate(predict_sequence_full(model, X_test, window_size = 1)).ravel()) | |
plt.plot(y_test) | |
plt.show() | |
def get_timespan(input_data, date, minus, periods, freq='D'): | |
return input_data.loc[pd.date_range(date - timedelta(days=minus), periods=periods, freq=freq)] | |
def convert_timeseries(input_data, start_date, is_train=True): | |
X = pd.DataFrame({"last_1": get_timespan(input_data, start_date, 1, 1).mean().values, | |
"last_2_mean": get_timespan(input_data, start_date, 2, 2).mean().values,#last 2 | |
"last_3_mean": get_timespan(input_data, start_date, 3, 3).mean().values,#last 3 | |
"last_4_mean": get_timespan(input_data, start_date, 4, 4).mean().values,#last 4 | |
"last_5_mean": get_timespan(input_data, start_date, 5, 5).mean().values,#last 5 | |
"last_6_mean": get_timespan(input_data, start_date, 6, 6).mean().values,#last 6 | |
"last_7_mean": get_timespan(input_data, start_date, 7, 7).mean().values,#last 7 | |
"last_14_mean": get_timespan(input_data, start_date, 14, 14).mean().values}) | |
for i in range(7): | |
X['last_4_dow{}_mean'.format(i)] = get_timespan(input_data, start_date, 28-i, 4, freq='7D').mean().values #get every week last 7 | |
if is_train: | |
y = input_data.loc[pd.date_range(start_date, periods=1)].mean().values | |
return X, y | |
return X | |
def predict_sequence_full(model, data, window_size): | |
#Shift the window by 1 new prediction each time, re-run predictions on new window | |
curr_frame = data | |
predicted = [] | |
for i in range(len(data)): | |
predicted.append(model.predict(curr_frame[i:i+1])) | |
curr_frame = curr_frame[1:] | |
curr_frame = np.insert(curr_frame, [window_size-1], predicted[-1], axis=0) | |
return predicted | |
#set up training data | |
X_train, y_train = [], [] | |
for i in range(pd.Timedelta(cut_off - train_start_date).days): | |
delta = timedelta(days=i) | |
X_tmp, y_tmp = convert_timeseries(df, train_start_date + delta) | |
X_train.append(X_tmp) | |
y_train.append(y_tmp) | |
X_train = pd.concat(X_train, axis=0) | |
y_train = np.concatenate(y_train, axis=0) | |
X_train = X_train.as_matrix() | |
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1])) | |
#set up testing data | |
X_test, y_test = [], [] | |
for i in range(pd.Timedelta(df.index[-1] - cut_off).days): | |
delta = timedelta(days=i) | |
X_tmp, y_tmp = convert_timeseries(df, cut_off + delta) | |
X_test.append(X_tmp) | |
y_test.append(y_tmp) | |
X_test = pd.concat(X_test, axis=0) | |
y_test = np.concatenate(y_test, axis=0) | |
X_test = X_test.as_matrix() | |
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1])) | |
#model | |
model = Sequential() | |
model.add(LSTM(32, input_shape=(X_train.shape[1],X_train.shape[2]))) | |
model.add(Dropout(.1)) | |
model.add(Dense(8)) | |
model.add(Dropout(.1)) | |
model.add(Dense(1)) | |
model.compile(loss='mse',optimizer=optimizers.SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True))#optimizer='adam', metrics=['mse'] | |
model.fit(X_train, y_train, batch_size = 512, epochs = 1000, verbose=2) | |
prediction['lstm'] = np.concatenate(predict_sequence_full(model, X_test, window_size = 1)).ravel() | |
plt.figure(figsize=(12,4)) | |
plt.plot(train['balance'], label='Train') | |
plt.plot(test['balance'], label='Test') | |
plt.plot(prediction['lstm'], label='LSTM') | |
plt.legend(loc='best') | |
plt.show() | |
print("#LSTM\nCoefficent:",round(np.corrcoef(test.balance,prediction.lstm)[0,1],4),\ | |
" RMSE:", round(math.sqrt(mean_squared_error(test.balance, prediction.lstm)),4),\ | |
" MAPE:", round(mean_absolute_percentage_error(test.balance, prediction.lstm),4)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment