Created
May 6, 2016 18:38
-
-
Save jaircastruita/f43b68c88b5c8ebf50a1060432abde8d to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:a8398ce40087121501e924c32e900fecbc5db8dd5d2713c651e50d79a75f3012" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%matplotlib inline\n", | |
"import matplotlib.pyplot as plt\n", | |
"import pandas as pd\n", | |
"import seaborn as sns; sns.set()\n", | |
"import folium\n", | |
"\n", | |
"import vincent\n", | |
"vincent.core.initialize_notebook()\n", | |
"import json\n", | |
"from vincent import AxisProperties, PropertySet, ValueRef" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"\n", | |
" <script>\n", | |
" \n", | |
" function vct_load_lib(url, callback){\n", | |
" if(typeof d3 !== 'undefined' &&\n", | |
" url === 'http://d3js.org/d3.v3.min.js'){\n", | |
" callback()\n", | |
" }\n", | |
" var s = document.createElement('script');\n", | |
" s.src = url;\n", | |
" s.async = true;\n", | |
" s.onreadystatechange = s.onload = callback;\n", | |
" s.onerror = function(){\n", | |
" console.warn(\"failed to load library \" + url);\n", | |
" };\n", | |
" document.getElementsByTagName(\"head\")[0].appendChild(s);\n", | |
" };\n", | |
" var vincent_event = new CustomEvent(\n", | |
" \"vincent_libs_loaded\",\n", | |
" {bubbles: true, cancelable: true}\n", | |
" );\n", | |
" \n", | |
" function load_all_libs(){\n", | |
" console.log('Loading Vincent libs...')\n", | |
" vct_load_lib('http://d3js.org/d3.v3.min.js', function(){\n", | |
" vct_load_lib('http://d3js.org/d3.geo.projection.v0.min.js', function(){\n", | |
" vct_load_lib('http://wrobstory.github.io/d3-cloud/d3.layout.cloud.js', function(){\n", | |
" vct_load_lib('http://wrobstory.github.io/vega/vega.v1.3.3.js', function(){\n", | |
" window.dispatchEvent(vincent_event);\n", | |
" });\n", | |
" });\n", | |
" });\n", | |
" });\n", | |
" };\n", | |
" if(typeof define === \"function\" && define.amd){\n", | |
" if (window['d3'] === undefined ||\n", | |
" window['topojson'] === undefined){\n", | |
" require.config(\n", | |
" {paths: {\n", | |
" d3: 'http://d3js.org/d3.v3.min',\n", | |
" topojson: 'http://d3js.org/topojson.v1.min'\n", | |
" }\n", | |
" }\n", | |
" );\n", | |
" require([\"d3\"], function(d3){\n", | |
" console.log('Loading Vincent from require.js...')\n", | |
" window.d3 = d3;\n", | |
" require([\"topojson\"], function(topojson){\n", | |
" window.topojson = topojson;\n", | |
" load_all_libs();\n", | |
" });\n", | |
" });\n", | |
" } else {\n", | |
" load_all_libs();\n", | |
" };\n", | |
" }else{\n", | |
" console.log('Require.js not found, loading manually...')\n", | |
" load_all_libs();\n", | |
" };\n", | |
"\n", | |
" </script>" | |
], | |
"metadata": {}, | |
"output_type": "display_data", | |
"text": [ | |
"<IPython.core.display.HTML at 0xa554e48>" | |
] | |
} | |
], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"path = r'D:\\ownCloud\\Documentos posgrado\\Posgrado 2do semestre\\Seminario investigacion\\Experimental Data\\Cd Mex\\Movilidad\\Hist Ecobici\\ecobici.csv'\n", | |
"trips = pd.read_csv(path,\n", | |
" parse_dates=['date_removed', 'date_arrived'],\n", | |
" infer_datetime_format=True)\n", | |
"\n", | |
"trips = trips.loc[trips['action'] != 'C ']\n", | |
"\n", | |
"# Find the start date\n", | |
"ind = pd.DatetimeIndex(trips.date_arrived)\n", | |
"trips['date'] = ind.date.astype('datetime64')\n", | |
"trips['hour'] = ind.hour\n", | |
"\n", | |
"# Count trips by date\n", | |
"by_date = trips.pivot_table('bike',aggfunc='count',\n", | |
" index='date',\n", | |
" columns='station_arrived', )\n", | |
"\n", | |
"by_date.drop([col for col in list(by_date.columns.values) if col > 275], \n", | |
" axis=1, inplace=True)\n", | |
"\n", | |
"# add a flag indicating weekend\n", | |
"by_date['weekend'] = (by_date.index.dayofweek >= 5)\n", | |
"\n", | |
"by_date_weekday = by_date[by_date['weekend'] == False]\n", | |
"\n", | |
"#mavg_wd = pd.rolling_mean(by_date_weekday.sum(axis=1), 20)\n", | |
"#mstd_wd = pd.rolling_std(by_date_weekday.sum(axis=1), 20)\n", | |
"mavg_wd = by_date_weekday.sum(axis=1).rolling(window=20,center=False).mean()\n", | |
"mstd_wd = by_date_weekday.sum(axis=1).rolling(window=20,center=False).std()\n", | |
"\n", | |
"black_list = by_date_weekday.sum(axis=1)[by_date_weekday.sum(axis=1) <= mavg_wd-2*mstd_wd].index\n", | |
"\n", | |
"by_date_weekend = by_date[by_date['weekend'] == True]\n", | |
"\n", | |
"#mavg_we = pd.rolling_mean(by_date_weekend.sum(axis=1), 20)\n", | |
"#mstd_we = pd.rolling_std(by_date_weekend.sum(axis=1), 20)\n", | |
"mavg_we = by_date_weekend.sum(axis=1).rolling(window=20,center=False).mean()\n", | |
"mstd_we = by_date_weekend.sum(axis=1).rolling(window=20,center=False).std()\n", | |
"\n", | |
"black_list = black_list.union(by_date_weekend.sum(axis=1)[by_date_weekend.sum(axis=1) <= mavg_we-2*mstd_we].index)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stderr", | |
"text": [ | |
"C:\\Users\\ghost\\AppData\\Local\\Enthought\\Canopy\\App\\appdata\\canopy-1.5.2.2785.win-x86_64\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2741: DtypeWarning: Columns (0,1) have mixed types. Specify dtype option on import or set low_memory=False.\n", | |
" interactivity=interactivity, compiler=compiler)\n" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"trips_in = pd.read_csv(path,\n", | |
" parse_dates=['date_removed', 'date_arrived'],\n", | |
" infer_datetime_format=True)\n", | |
"\n", | |
"trips_in = trips_in.loc[trips_in['action'] != 'C ']\n", | |
"\n", | |
"# Find the start date\n", | |
"ind = pd.DatetimeIndex(trips.date_arrived)\n", | |
"trips_in['date'] = ind.date.astype('datetime64')\n", | |
"trips_in['hour'] = ind.hour\n", | |
"\n", | |
"# remove those values on the blacklist\n", | |
"trips_in = trips_in.loc[~trips_in.date.isin(black_list)]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"trips_out = pd.read_csv(path,\n", | |
" parse_dates=['date_removed', 'date_arrived'],\n", | |
" infer_datetime_format=True)\n", | |
"\n", | |
"trips_out = trips_out.loc[trips_out['action'] != 'C ']\n", | |
"\n", | |
"# Find the start date\n", | |
"ind = pd.DatetimeIndex(trips.date_removed)\n", | |
"trips_out['date'] = ind.date.astype('datetime64')\n", | |
"trips_out['hour'] = ind.hour\n", | |
"\n", | |
"# remove those values on the blacklist\n", | |
"trips_out = trips_out.loc[~trips_out.date.isin(black_list)]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#trips_out\n", | |
"#trips_out = trips_out.loc[~trips_out.date.isin(black_list)]\n", | |
"#trips_out.loc[~trips_out.date.isin(black_list)]\n", | |
"#black_list" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# count trips by date and by hour\n", | |
"by_hour_out = trips_out.pivot_table('bike', aggfunc='count',\n", | |
" index=['date', 'hour'],\n", | |
" columns='station_removed').fillna(0).reset_index('hour')\n", | |
"\n", | |
"# average these counts by weekend\n", | |
"by_hour_out['weekend'] = (by_hour_out.index.dayofweek >= 5)\n", | |
"by_hour_out = by_hour_out.groupby(['weekend', 'hour']).mean()\n", | |
"by_hour_out.index.set_levels([['weekday', 'weekend'],\n", | |
" [\"{0}:00\".format(i) for i in range(24)]],\n", | |
" inplace=True);\n", | |
"\n", | |
"by_hour_out.drop([col for col in list(by_hour_out.columns.values) if col > 275], axis=1, inplace=True)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# count trips by date and by hour\n", | |
"by_hour_in = trips_in.pivot_table('bike', aggfunc='count',\n", | |
" index=['date', 'hour'],\n", | |
" columns='station_arrived').fillna(0).reset_index('hour')\n", | |
"\n", | |
"# average these counts by weekend\n", | |
"by_hour_in['weekend'] = (by_hour_in.index.dayofweek >= 5)\n", | |
"by_hour_in = by_hour_in.groupby(['weekend', 'hour']).mean()\n", | |
"by_hour_in.index.set_levels([['weekday', 'weekend'],\n", | |
" [\"{0}:00\".format(i) for i in range(24)]],\n", | |
" inplace=True);\n", | |
"\n", | |
"by_hour_in.drop([col for col in list(by_hour_in.columns.values) if col > 275], axis=1, inplace=True)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#by_hour = pd.concat([by_hour_in.loc['weekday'],by_hour_out.loc['weekday']],axis=0)\n", | |
"by_hour = pd.concat([by_hour_in,by_hour_out],axis=0)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from sklearn import mixture \n", | |
"\n", | |
"X = by_hour.loc['weekday'].values.T\n", | |
"num_clusters = 6\n", | |
"cv_types = ['spherical', 'tied', 'diag', 'full']\n", | |
"\n", | |
"gmm = mixture.GMM(n_components=num_clusters, covariance_type=cv_types[3])\n", | |
"gmm.fit(X)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 9, | |
"text": [ | |
"GMM(covariance_type='full', init_params='wmc', min_covar=0.001,\n", | |
" n_components=6, n_init=1, n_iter=100, params='wmc', random_state=None,\n", | |
" thresh=None, tol=0.001, verbose=0)" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"CDMX_COORDINATES = (19.4186, -99.1686) \n", | |
"\n", | |
"#create empty map zoomed in on Mexico City\n", | |
"station_attrs = pd.read_csv('C:\\Users\\ghost\\Google Drive\\JairCastruitaGastelum\\Progress Reports\\Clustering_ECOBICI\\DATA\\estacion_coordenadas.csv')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"color_iter = ['blue', 'red', 'green', '#e017e5', '#d8f604', '#15d4e2', '#e21569']\n", | |
"by_hour_in.columns = by_hour_in.columns.astype(str) + ' in'\n", | |
"by_hour_out.columns = by_hour_out.columns.astype(str) + ' out'\n", | |
"by_hour_concat = pd.concat([by_hour_in.loc['weekday'],by_hour_out.loc['weekday']],axis=1)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"map = folium.Map(location=CDMX_COORDINATES, tiles='Stamen Toner', zoom_start=14)\n", | |
"Y_ = gmm.predict(X)\n", | |
"plt.figure(figsize=(16,8))\n", | |
"\n", | |
"for i, (mean, covar, color) in enumerate(zip(\n", | |
" gmm.means_, gmm._get_covars(), color_iter)):\n", | |
" \n", | |
" if not np.any(Y_ == i):\n", | |
" continue \n", | |
" \n", | |
" plt.plot(mean,color=color)\n", | |
" \n", | |
" for attr in range(len(Y_)):\n", | |
" if Y_[attr] == i:\n", | |
" alt = station_attrs[station_attrs['id'] == by_hour.columns[attr]]['longitud'].values[0]\n", | |
" lat = station_attrs[station_attrs['id'] == by_hour.columns[attr]]['latitud'].values[0]\n", | |
" \n", | |
" counts = vincent.Bar(by_hour_concat[[str(by_hour.columns[attr])+' in',str(by_hour.columns[attr])+' out']], width=650, height=300)\n", | |
" counts.axis_titles(x='Hora', y='Estacion '+str(int(by_hour.columns[attr]))+': Valor Promedio en un dia')\n", | |
" counts.to_json(str(by_hour.columns[attr])+'.json')\n", | |
" counts.legend(title=str(station_attrs.iloc[attr,0]))\n", | |
" \n", | |
" folium.RegularPolygonMarker(location=[lat,alt], \n", | |
" popup=folium.Popup(max_width=700).add_child(\n", | |
" folium.Vega(json.load(open(str(by_hour.columns[attr])+'.json')), \n", | |
" width=700, \n", | |
" height=350)),\n", | |
" fill_color=color,\n", | |
" number_of_sides=8,\n", | |
" radius=10).add_to(map)\n", | |
" \n", | |
"map" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment