Last active
October 12, 2022 15:47
-
-
Save vdutor/aee150243e2cad6a8f452959030c644c to your computer and use it in GitHub Desktop.
Python script to read the airline dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2021 Vincent Dutordoir
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software
# and associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial
# portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
# LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import time
import warnings

import numpy as np
import pandas as pd
def airline(n=None, *, path='airline.pickle', seed=None):
    """
    Load the Airline delay dataset and return a shuffled, normalized train/test split.

    Script adapted from James Hensman
    https://github.com/jameshensman/VFF/blob/master/experiments/airline/airline_vff_additive.py

    Data pickle file can be downloaded from:
    https://drive.google.com/file/d/1CnA6FYb8jNUckJt4VLz_ONA1KgV-bXwK/view?usp=sharing

    The full dataset contains a total of 5929413 rows and each datapoint has
    8 features (Month, DayofMonth, DayOfWeek, plane_age, AirTime, Distance,
    ArrTime, DepTime). The target is ArrDelay.

    Normalization statistics are computed on the training split only and then
    applied to both splits:

    * training features are rescaled to [-1, 1] (test features use the same
      affine map and may therefore fall slightly outside that interval);
    * the training target is standardized to zero mean and unit variance.

    A feature that is constant over the training split is mapped to -1 instead
    of producing NaN/inf through a division by zero.

    :param n: int
        total dataset size (train + test size)
        n_train = 2/3 * n = train size
        n_test = 1/3 * n = test size
        Defaults to None, which corresponds to returning all rows (5929413 in total).
        A value larger than the number of available rows is clamped to that
        number (with a warning) so that the 2/3 - 1/3 ratio is preserved.
    :param path: str
        location of the pickled DataFrame. Defaults to 'airline.pickle' in the
        current working directory.
    :param seed: int or None
        seed for the shuffle. With None (default) the global NumPy random
        state is used, so ``np.random.seed`` set by the caller is respected.
    :return:
        X: [n_train, 8], Y: [n_train, 1]
        XT: [n_test, 8], YT: [n_test, 1]
    :raises ValueError: if fewer than 2 rows are requested or available, since
        the training split would then be empty.
    """
    # NOTE: unpickling can execute arbitrary code; only load a pickle file
    # obtained from a trusted source.
    data = pd.read_pickle(path)

    # Convert time of day from hhmm to minutes since midnight
    for column in ('ArrTime', 'DepTime'):
        hhmm = data[column]
        data[column] = 60 * np.floor(hhmm / 100) + np.mod(hhmm, 100)

    # Pick out the data
    names = [
        'Month', 'DayofMonth',
        'DayOfWeek', 'plane_age',
        'AirTime', 'Distance',
        'ArrTime', 'DepTime',
    ]
    targets = data['ArrDelay'].values
    features = data[names].values
    total = len(features)

    if n is None:
        n = total
    elif n > total:
        # Slicing past the end would silently distort the train/test ratio.
        warnings.warn(
            'Requested n=%d rows but only %d are available; using all rows.'
            % (n, total)
        )
        n = total
    if n < 2:
        raise ValueError(
            'At least 2 rows are required to build a train/test split, got n=%d.' % n
        )

    n_train = (2 * n) // 3

    # Shuffle the data and only consider a subset of it. Indexing with the
    # truncated permutation selects the same rows as shuffling everything
    # first, without copying the rows that are discarded anyway.
    if seed is None:
        perm = np.random.permutation(total)
    else:
        perm = np.random.RandomState(seed).permutation(total)
    train_idx, test_idx = perm[:n_train], perm[n_train:n]

    X, Y = features[train_idx], targets[train_idx]
    XT, YT = features[test_idx], targets[test_idx]

    # Normalize Y scale and offset (statistics from the training split)
    Ymean = Y.mean()
    Ystd = Y.std()
    if Ystd == 0:
        # Constant training target: only centre it, avoid dividing by zero.
        Ystd = 1.0
    Y = ((Y - Ymean) / Ystd).reshape(-1, 1)
    YT = ((YT - Ymean) / Ystd).reshape(-1, 1)

    # Normalize X on [-1, 1] (statistics from the training split)
    Xmin, Xmax = X.min(0), X.max(0)
    Xrange = Xmax - Xmin
    # Constant training features have zero range; use 1 to avoid NaN/inf.
    Xrange = np.where(Xrange == 0, 1.0, Xrange)
    X = 2 * ((X - Xmin) / Xrange - 0.5)
    XT = 2 * ((XT - Xmin) / Xrange - 0.5)

    return X, Y, XT, YT
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The pickle doesn't work with current versions of pandas, but it can be loaded using version 0.17.1, which can be installed from conda-forge :)