Skip to content

Instantly share code, notes, and snippets.

@Aerlinger
Created April 8, 2015 22:53
Show Gist options
  • Save Aerlinger/d6610fdaef7ebdbb0003 to your computer and use it in GitHub Desktop.
Save Aerlinger/d6610fdaef7ebdbb0003 to your computer and use it in GitHub Desktop.
OneHotEncoding
{
"metadata": {
"name": "",
"signature": "sha256:973f629a9825a5e2ab8a547b36f84aa20289633b121d810ca166ed9acdeebc6f"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# `plt` is the conventional alias for the pyplot interface; the bare `matplotlib`\n",
"# package does not expose the plotting functions (plot, hist, ...) directly.\n",
"import matplotlib.pyplot as plt\n",
"\n",
"%matplotlib inline"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# The path is relative to the DAT_20_NYC folder, which is where this notebook is run from\n",
"local_path_to_file = 'Data/bikeshare.csv'\n",
"bikeshare = pd.read_csv(local_path_to_file)\n",
"\n",
"# Preview the first rows to confirm the file was parsed as expected\n",
"bikeshare.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>instant</th>\n",
" <th>dteday</th>\n",
" <th>season</th>\n",
" <th>yr</th>\n",
" <th>mnth</th>\n",
" <th>hr</th>\n",
" <th>holiday</th>\n",
" <th>weekday</th>\n",
" <th>workingday</th>\n",
" <th>weathersit</th>\n",
" <th>temp</th>\n",
" <th>atemp</th>\n",
" <th>hum</th>\n",
" <th>windspeed</th>\n",
" <th>casual</th>\n",
" <th>registered</th>\n",
" <th>cnt</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1</td>\n",
" <td> 2011-01-01</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0.24</td>\n",
" <td> 0.2879</td>\n",
" <td> 0.81</td>\n",
" <td> 0</td>\n",
" <td> 3</td>\n",
" <td> 13</td>\n",
" <td> 16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 2</td>\n",
" <td> 2011-01-01</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0.22</td>\n",
" <td> 0.2727</td>\n",
" <td> 0.80</td>\n",
" <td> 0</td>\n",
" <td> 8</td>\n",
" <td> 32</td>\n",
" <td> 40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 3</td>\n",
" <td> 2011-01-01</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0.22</td>\n",
" <td> 0.2727</td>\n",
" <td> 0.80</td>\n",
" <td> 0</td>\n",
" <td> 5</td>\n",
" <td> 27</td>\n",
" <td> 32</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 4</td>\n",
" <td> 2011-01-01</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 3</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0.24</td>\n",
" <td> 0.2879</td>\n",
" <td> 0.75</td>\n",
" <td> 0</td>\n",
" <td> 3</td>\n",
" <td> 10</td>\n",
" <td> 13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 5</td>\n",
" <td> 2011-01-01</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 4</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0.24</td>\n",
" <td> 0.2879</td>\n",
" <td> 0.75</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": [
" instant dteday season yr mnth hr holiday weekday workingday \\\n",
"0 1 2011-01-01 1 0 1 0 0 6 0 \n",
"1 2 2011-01-01 1 0 1 1 0 6 0 \n",
"2 3 2011-01-01 1 0 1 2 0 6 0 \n",
"3 4 2011-01-01 1 0 1 3 0 6 0 \n",
"4 5 2011-01-01 1 0 1 4 0 6 0 \n",
"\n",
" weathersit temp atemp hum windspeed casual registered cnt \n",
"0 1 0.24 0.2879 0.81 0 3 13 16 \n",
"1 1 0.22 0.2727 0.80 0 8 32 40 \n",
"2 1 0.22 0.2727 0.80 0 5 27 32 \n",
"3 1 0.24 0.2879 0.75 0 3 10 13 \n",
"4 1 0.24 0.2879 0.75 0 0 1 1 "
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Drop the 'instant' and 'dteday' columns, which are not used in the rest of the notebook.\n",
"# Re-assigning (instead of inplace=True) together with errors='ignore' keeps this cell\n",
"# idempotent: running it again after the columns are gone is a no-op rather than an error.\n",
"bikeshare = bikeshare.drop(['instant', 'dteday'], axis=1, errors='ignore')\n",
"\n",
"# Remove any rows that contain missing values (also safe to run more than once)\n",
"bikeshare = bikeshare.dropna()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# What does the distribution of bikeshare.weathersit look like?\n",
"# print(...) with a single argument behaves the same on Python 2 and Python 3.\n",
"print(bikeshare.weathersit.value_counts())"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"1 11413\n",
"2 4544\n",
"3 1419\n",
"4 3\n",
"dtype: int64\n"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"# Keep the encoder's default sparse output and densify explicitly with .toarray().\n",
"# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2 and later\n",
"# removed, so this spelling works on both old and current scikit-learn releases.\n",
"ohe = OneHotEncoder()\n",
"\n",
"# Note the double brackets: the encoder is given a one-column DataFrame, not a Series.\n",
"encoded_weathersit = ohe.fit_transform(bikeshare[['weathersit']]).toarray()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I ran into a snag in class where I ran `ohe.fit_transform(bikeshare.weathersit)` instead of `ohe.fit_transform(bikeshare[['weathersit']])`\n",
"\n",
"\n",
"*What's the difference between these two?* **Hint: Always pay attention to types!**"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"encoded_weathersit"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"array([[ 1., 0., 0., 0.],\n",
" [ 1., 0., 0., 0.],\n",
" [ 1., 0., 0., 0.],\n",
" ..., \n",
" [ 1., 0., 0., 0.],\n",
" [ 1., 0., 0., 0.],\n",
" [ 1., 0., 0., 0.]])"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sanity check: the column sums should match `bikeshare.weathersit.value_counts()`.\n",
"# The builtin `int` replaces the `np.int` alias (deprecated in NumPy 1.20, removed in 1.24).\n",
"encoded_weathersit.sum(axis=0).astype(int)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"array([11413, 4544, 1419, 3])"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"### How to get these encoded values back into the bikeshare dataframe?\n",
"\n",
"# The encoder emits one column per distinct category in ascending category order, while\n",
"# unique() returns values in order of first appearance -- so sort before deriving labels.\n",
"weathersit_factors = np.sort(bikeshare.weathersit.unique())\n",
"\n",
"## I strongly recommend you rename your column labels to something that's comprehensible!\n",
"weather_labels = ['Weather Label %d' % factor for factor in weathersit_factors]\n",
"\n",
"# Reuse bikeshare's index so a later column-wise concat lines up row-for-row even when\n",
"# dropna() has removed rows and left gaps in the index (a fresh 0..n-1 index would\n",
"# misalign and introduce NaNs).\n",
"encoded_dataframe = pd.DataFrame(encoded_weathersit, index=bikeshare.index, columns=weather_labels)\n",
"\n",
"encoded_dataframe.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Weather Label 1</th>\n",
" <th>Weather Label 2</th>\n",
" <th>Weather Label 3</th>\n",
" <th>Weather Label 4</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
" Weather Label 1 Weather Label 2 Weather Label 3 Weather Label 4\n",
"0 1 0 0 0\n",
"1 1 0 0 0\n",
"2 1 0 0 0\n",
"3 1 0 0 0\n",
"4 1 0 0 0"
]
}
],
"prompt_number": 8
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### We'll want to merge the encoded dataframe back into the original"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Both frames need the same number of rows before they are joined column-wise.\n",
"# print(...) with a single argument behaves the same on Python 2 and Python 3.\n",
"print(encoded_dataframe.shape)\n",
"\n",
"print(bikeshare.shape)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(17379, 4)\n",
"(17379, 15)\n"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Append the one-hot columns side by side, then discard the original categorical\n",
"# column they replace.\n",
"combined_frames = pd.concat([bikeshare, encoded_dataframe], axis=1)\n",
"encoded_bikeshare = combined_frames.drop(['weathersit'], axis=1)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"encoded_bikeshare.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>season</th>\n",
" <th>yr</th>\n",
" <th>mnth</th>\n",
" <th>hr</th>\n",
" <th>holiday</th>\n",
" <th>weekday</th>\n",
" <th>workingday</th>\n",
" <th>temp</th>\n",
" <th>atemp</th>\n",
" <th>hum</th>\n",
" <th>windspeed</th>\n",
" <th>casual</th>\n",
" <th>registered</th>\n",
" <th>cnt</th>\n",
" <th>Weather Label 1</th>\n",
" <th>Weather Label 2</th>\n",
" <th>Weather Label 3</th>\n",
" <th>Weather Label 4</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 0.24</td>\n",
" <td> 0.2879</td>\n",
" <td> 0.81</td>\n",
" <td> 0</td>\n",
" <td> 3</td>\n",
" <td> 13</td>\n",
" <td> 16</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 0.22</td>\n",
" <td> 0.2727</td>\n",
" <td> 0.80</td>\n",
" <td> 0</td>\n",
" <td> 8</td>\n",
" <td> 32</td>\n",
" <td> 40</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 2</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 0.22</td>\n",
" <td> 0.2727</td>\n",
" <td> 0.80</td>\n",
" <td> 0</td>\n",
" <td> 5</td>\n",
" <td> 27</td>\n",
" <td> 32</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 3</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 0.24</td>\n",
" <td> 0.2879</td>\n",
" <td> 0.75</td>\n",
" <td> 0</td>\n",
" <td> 3</td>\n",
" <td> 10</td>\n",
" <td> 13</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 4</td>\n",
" <td> 0</td>\n",
" <td> 6</td>\n",
" <td> 0</td>\n",
" <td> 0.24</td>\n",
" <td> 0.2879</td>\n",
" <td> 0.75</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 1</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" <td> 0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 11,
"text": [
" season yr mnth hr holiday weekday workingday temp atemp hum \\\n",
"0 1 0 1 0 0 6 0 0.24 0.2879 0.81 \n",
"1 1 0 1 1 0 6 0 0.22 0.2727 0.80 \n",
"2 1 0 1 2 0 6 0 0.22 0.2727 0.80 \n",
"3 1 0 1 3 0 6 0 0.24 0.2879 0.75 \n",
"4 1 0 1 4 0 6 0 0.24 0.2879 0.75 \n",
"\n",
" windspeed casual registered cnt Weather Label 1 Weather Label 2 \\\n",
"0 0 3 13 16 1 0 \n",
"1 0 8 32 40 1 0 \n",
"2 0 5 27 32 1 0 \n",
"3 0 3 10 13 1 0 \n",
"4 0 0 1 1 1 0 \n",
"\n",
" Weather Label 3 Weather Label 4 \n",
"0 0 0 \n",
"1 0 0 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 "
]
}
],
"prompt_number": 11
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment