Created April 8, 2015
"cells": [
import numpy as np
import pandas as pd
import matplotlib as plt
%matplotlib inline
# My notebook is running from the DAT_20_NYC folder so it's a simple relative path
local_path_to_file = 'Data/bikeshare.csv'
bikeshare = pd.read_csv(local_path_to_file)
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
<thead>
<tr style="text-align: right;">
<th></th>
<th>instant</th>
<th>dteday</th>
<th>season</th>
<th>yr</th>
<th>mnth</th>
<th>hr</th>
<th>holiday</th>
<th>weekday</th>
<th>workingday</th>
<th>weathersit</th>
<th>temp</th>
<th>atemp</th>
<th>hum</th>
<th>windspeed</th>
<th>casual</th>
<th>registered</th>
<th>cnt</th>
</tr>
</thead>
" </tr>\n",
" </thead>\n",
<tbody>
<tr>
<th>0</th>
<td> 1</td>
<td> 2011-01-01</td>
<td> 1</td>
<td> 0</td>
<td> 1</td>
<td> 0</td>
<td> 0</td>
<td> 6</td>
<td> 0</td>
<td> 1</td>
<td> 0.24</td>
<td> 0.2879</td>
<td> 0.81</td>
<td> 0</td>
<td> 3</td>
<td> 13</td>
<td> 16</td>
</tr>
" </tr>\n",
<tr>
<th>1</th>
<td> 2</td>
<td> 2011-01-01</td>
<td> 1</td>
<td> 0</td>
<td> 1</td>
<td> 1</td>
<td> 0</td>
<td> 6</td>
<td> 0</td>
<td> 1</td>
<td> 0.22</td>
<td> 0.2727</td>
<td> 0.80</td>
<td> 0</td>
<td> 8</td>
<td> 32</td>
<td> 40</td>
</tr>
" </tr>\n",
<tr>
<th>2</th>
<td> 3</td>
<td> 2011-01-01</td>
<td> 1</td>
<td> 0</td>
<td> 1</td>
<td> 2</td>
<td> 0</td>
<td> 6</td>
<td> 0</td>
<td> 1</td>
<td> 0.22</td>
<td> 0.2727</td>
<td> 0.80</td>
<td> 0</td>
<td> 5</td>
<td> 27</td>
<td> 32</td>
</tr>
" </tr>\n",
<tr>
<th>3</th>
<td> 4</td>
<td> 2011-01-01</td>
<td> 1</td>
<td> 0</td>
<td> 1</td>
<td> 3</td>
<td> 0</td>
<td> 6</td>
<td> 0</td>
<td> 1</td>
<td> 0.24</td>
<td> 0.2879</td>
<td> 0.75</td>
<td> 0</td>
<td> 3</td>
<td> 10</td>
<td> 13</td>
</tr>
" </tr>\n",
<tr>
<th>4</th>
<td> 5</td>
<td> 2011-01-01</td>
<td> 1</td>
<td> 0</td>
<td> 1</td>
<td> 4</td>
<td> 0</td>
<td> 6</td>
<td> 0</td>
<td> 1</td>
<td> 0.24</td>
<td> 0.2879</td>
<td> 0.75</td>
<td> 0</td>
<td> 0</td>
<td> 1</td>
" <td> 1</td>\n",
" </tr>\n",
" </tbody>\n",
"text": [
" instant dteday season yr mnth hr holiday weekday workingday \\\n",
"0 1 2011-01-01 1 0 1 0 0 6 0 \n",
"1 2 2011-01-01 1 0 1 1 0 6 0 \n",
"2 3 2011-01-01 1 0 1 2 0 6 0 \n",
"3 4 2011-01-01 1 0 1 3 0 6 0 \n",
"4 5 2011-01-01 1 0 1 4 0 6 0 \n",
" weathersit temp atemp hum windspeed casual registered cnt \n",
"0 1 0.24 0.2879 0.81 0 3 13 16 \n",
"1 1 0.22 0.2727 0.80 0 8 32 40 \n",
"2 1 0.22 0.2727 0.80 0 5 27 32 \n",
"3 1 0.24 0.2879 0.75 0 3 10 13 \n",
"4 1 0.24 0.2879 0.75 0 0 1 1 "
# Remove the `instant` and `dteday` columns.
# The drop is written so the cell is safe to re-run: only columns that are still
# present are dropped, and the result is rebound instead of using `inplace=True`
# (an in-place drop raises on the second run because the columns are already gone).
columns_to_drop = [col for col in ['instant', 'dteday'] if col in bikeshare.columns]
bikeshare = bikeshare.drop(columns_to_drop, axis=1)
"cell_type": "code",
"collapsed": false,
"input": [
# How many rows fall into each weathersit value?
# print(...) with parentheses runs unchanged on both Python 2 and Python 3.
print(bikeshare.weathersit.value_counts())
"1 11413\n",
"2 4544\n",
"3 1419\n",
"4 3\n",
"dtype: int64\n"
from sklearn.preprocessing import OneHotEncoder

# Request a dense array instead of a sparse matrix.
# scikit-learn renamed the keyword from `sparse` to `sparse_output` (1.2) and later
# removed the old spelling, so try the new name first and fall back for old installs.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

# Note the double brackets: this passes a one-column DataFrame, not a Series.
encoded_weathersit = ohe.fit_transform(bikeshare[['weathersit']])
"I ran into a snag in class where I ran `ohe.fit_transform(bikeshare.weathersit)` instead of `ohe.fit_transform(bikeshare[['weathersit']])`\n",
"*What's the difference between these two?* **Hint: Always pay attention to types!**"
"array([[ 1., 0., 0., 0.],\n",
" [ 1., 0., 0., 0.],\n",
" [ 1., 0., 0., 0.],\n",
" ..., \n",
" [ 1., 0., 0., 0.],\n",
" [ 1., 0., 0., 0.],\n",
" [ 1., 0., 0., 0.]])"
# Just a sanity check, these values should be the same as `bikeshare.weathersit.value_counts()`
array([11413, 4544, 1419, 3])
### How to get these encoded values back into the bikeshare dataframe?
# Wrap the encoded array in a DataFrame, keeping the labels consistent with the data.
# unique() returns values in order of first appearance; sort them so that label i
# is attached to column i (the encoder is expected to order its output columns by
# ascending category value -- see the OneHotEncoder documentation).
weathersit_factors = np.sort(bikeshare.weathersit.unique())
## I strongly recommend you rename your column labels to something that's comprehensible!
weather_labels = ['Weather Label %d' % factor for factor in weathersit_factors]
# Reuse bikeshare's index so a later column-wise concat lines the rows up by label
# even if bikeshare no longer has a default 0..n-1 index.
encoded_dataframe = pd.DataFrame(encoded_weathersit, columns=weather_labels, index=bikeshare.index)
<div style="max-height:1000px;max-width:1500px;overflow:auto;">
<table border="1" class="dataframe">
"<table border=\"1\" class=\"dataframe\">\n",
<thead>
<tr style="text-align: right;">
<th></th>
<th>Weather Label 1</th>
<th>Weather Label 2</th>
<th>Weather Label 3</th>
<th>Weather Label 4</th>
</tr>
</thead>
" </tr>\n",
" </thead>\n",
<tbody>
<tr>
<th>0</th>
<td> 1</td>
<td> 0</td>
<td> 0</td>
<td> 0</td>
</tr>
" </tr>\n",
<tr>
<th>1</th>
<td> 1</td>
<td> 0</td>
<td> 0</td>
<td> 0</td>
</tr>
" </tr>\n",
<tr>
<th>2</th>
<td> 1</td>
<td> 0</td>
<td> 0</td>
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" </tbody>\n",
"text": [
" Weather Label 1 Weather Label 2 Weather Label 3 Weather Label 4\n",
"0 1 0 0 0\n",
"1 1 0 0 0\n",
"2 1 0 0 0\n",
"3 1 0 0 0\n",
"4 1 0 0 0"
"### We'll want to merge the encoded dataframe back into the original"
"# Row counts must match before the two frames are joined side by side.\n",
"# print(...) with parentheses runs on both Python 2 and Python 3.\n",
"print(encoded_dataframe.shape)\n",
"print(bikeshare.shape)"
"(17379, 4)\n",
"(17379, 15)\n"
"# Append the one-hot columns, then discard the categorical column they replace.\n",
"encoded_bikeshare = pd.concat([bikeshare, encoded_dataframe], axis=1).drop(['weathersit'], axis=1)"
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>season</th>\n",
" <th>yr</th>\n",
" <th>mnth</th>\n",
" <th>hr</th>\n",
" <th>holiday</th>\n",
" <th>weekday</th>\n",
" <th>workingday</th>\n",
" <th>temp</th>\n",
" <th>atemp</th>\n",
" <th>hum</th>\n",
" <th>windspeed</th>\n",
" <th>casual</th>\n",
" <th>registered</th>\n",
" <th>cnt</th>\n",
" <th>Weather Label 1</th>\n",
" <th>Weather Label 2</th>\n",
" <th>Weather Label 3</th>\n",
" <th>Weather Label 4</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 0.24</td>\n",
" <td> 0.2879</td>\n",
" <td> 0.81</td>\n",
" <td> 0</td>\n",
" <td> 3</td>\n",
" <td> 13</td>\n",
" <td> 16</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 0.22</td>\n",
" <td> 0.2727</td>\n",
" <td> 0.80</td>\n",
" <td> 0</td>\n",
" <td> 8</td>\n",
" <td> 32</td>\n",
" <td> 40</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 0.22</td>\n",
" <td> 0.2727</td>\n",
" <td> 0.80</td>\n",
" <td> 0</td>\n",
" <td> 5</td>\n",
" <td> 27</td>\n",
" <td> 32</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 3</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 0.24</td>\n",
" <td> 0.2879</td>\n",
" <td> 0.75</td>\n",
" <td> 0</td>\n",
" <td> 3</td>\n",
" <td> 10</td>\n",
" <td> 13</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 4</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 0.24</td>\n",
" <td> 0.2879</td>\n",
" <td> 0.75</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" </tbody>\n",
"text": [
" season yr mnth hr holiday weekday workingday temp atemp hum \\\n",
"0 1 0 1 0 0 6 0 0.24 0.2879 0.81 \n",
"1 1 0 1 1 0 6 0 0.22 0.2727 0.80 \n",
"2 1 0 1 2 0 6 0 0.22 0.2727 0.80 \n",
"3 1 0 1 3 0 6 0 0.24 0.2879 0.75 \n",
"4 1 0 1 4 0 6 0 0.24 0.2879 0.75 \n",
" windspeed casual registered cnt Weather Label 1 Weather Label 2 \\\n",
"0 0 3 13 16 1 0 \n",
"1 0 8 32 40 1 0 \n",
"2 0 5 27 32 1 0 \n",
"3 0 3 10 13 1 0 \n",
"4 0 0 1 1 1 0 \n",
" Weather Label 3 Weather Label 4 \n",
"0 0 0 \n",
"1 0 0 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 "
