Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save jaircastruita/f43b68c88b5c8ebf50a1060432abde8d to your computer and use it in GitHub Desktop.
Save jaircastruita/f43b68c88b5c8ebf50a1060432abde8d to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
{
"metadata": {
"name": "",
"signature": "sha256:a8398ce40087121501e924c32e900fecbc5db8dd5d2713c651e50d79a75f3012"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import seaborn as sns; sns.set()\n",
"import folium\n",
"\n",
"import vincent\n",
"vincent.core.initialize_notebook()\n",
"import json\n",
"from vincent import AxisProperties, PropertySet, ValueRef"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
" <script>\n",
" \n",
" function vct_load_lib(url, callback){\n",
" if(typeof d3 !== 'undefined' &&\n",
" url === 'http://d3js.org/d3.v3.min.js'){\n",
" callback()\n",
" }\n",
" var s = document.createElement('script');\n",
" s.src = url;\n",
" s.async = true;\n",
" s.onreadystatechange = s.onload = callback;\n",
" s.onerror = function(){\n",
" console.warn(\"failed to load library \" + url);\n",
" };\n",
" document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
" };\n",
" var vincent_event = new CustomEvent(\n",
" \"vincent_libs_loaded\",\n",
" {bubbles: true, cancelable: true}\n",
" );\n",
" \n",
" function load_all_libs(){\n",
" console.log('Loading Vincent libs...')\n",
" vct_load_lib('http://d3js.org/d3.v3.min.js', function(){\n",
" vct_load_lib('http://d3js.org/d3.geo.projection.v0.min.js', function(){\n",
" vct_load_lib('http://wrobstory.github.io/d3-cloud/d3.layout.cloud.js', function(){\n",
" vct_load_lib('http://wrobstory.github.io/vega/vega.v1.3.3.js', function(){\n",
" window.dispatchEvent(vincent_event);\n",
" });\n",
" });\n",
" });\n",
" });\n",
" };\n",
" if(typeof define === \"function\" && define.amd){\n",
" if (window['d3'] === undefined ||\n",
" window['topojson'] === undefined){\n",
" require.config(\n",
" {paths: {\n",
" d3: 'http://d3js.org/d3.v3.min',\n",
" topojson: 'http://d3js.org/topojson.v1.min'\n",
" }\n",
" }\n",
" );\n",
" require([\"d3\"], function(d3){\n",
" console.log('Loading Vincent from require.js...')\n",
" window.d3 = d3;\n",
" require([\"topojson\"], function(topojson){\n",
" window.topojson = topojson;\n",
" load_all_libs();\n",
" });\n",
" });\n",
" } else {\n",
" load_all_libs();\n",
" };\n",
" }else{\n",
" console.log('Require.js not found, loading manually...')\n",
" load_all_libs();\n",
" };\n",
"\n",
" </script>"
],
"metadata": {},
"output_type": "display_data",
"text": [
"<IPython.core.display.HTML at 0xa554e48>"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"path = r'D:\\ownCloud\\Documentos posgrado\\Posgrado 2do semestre\\Seminario investigacion\\Experimental Data\\Cd Mex\\Movilidad\\Hist Ecobici\\ecobici.csv'\n",
"trips = pd.read_csv(path,\n",
" parse_dates=['date_removed', 'date_arrived'],\n",
" infer_datetime_format=True)\n",
"\n",
"trips = trips.loc[trips['action'] != 'C ']\n",
"\n",
"# Find the start date\n",
"ind = pd.DatetimeIndex(trips.date_arrived)\n",
"trips['date'] = ind.date.astype('datetime64')\n",
"trips['hour'] = ind.hour\n",
"\n",
"# Count trips by date\n",
"by_date = trips.pivot_table('bike',aggfunc='count',\n",
" index='date',\n",
" columns='station_arrived', )\n",
"\n",
"by_date.drop([col for col in list(by_date.columns.values) if col > 275], \n",
" axis=1, inplace=True)\n",
"\n",
"# add a flag indicating weekend\n",
"by_date['weekend'] = (by_date.index.dayofweek >= 5)\n",
"\n",
"by_date_weekday = by_date[by_date['weekend'] == False]\n",
"\n",
"#mavg_wd = pd.rolling_mean(by_date_weekday.sum(axis=1), 20)\n",
"#mstd_wd = pd.rolling_std(by_date_weekday.sum(axis=1), 20)\n",
"mavg_wd = by_date_weekday.sum(axis=1).rolling(window=20,center=False).mean()\n",
"mstd_wd = by_date_weekday.sum(axis=1).rolling(window=20,center=False).std()\n",
"\n",
"black_list = by_date_weekday.sum(axis=1)[by_date_weekday.sum(axis=1) <= mavg_wd-2*mstd_wd].index\n",
"\n",
"by_date_weekend = by_date[by_date['weekend'] == True]\n",
"\n",
"#mavg_we = pd.rolling_mean(by_date_weekend.sum(axis=1), 20)\n",
"#mstd_we = pd.rolling_std(by_date_weekend.sum(axis=1), 20)\n",
"mavg_we = by_date_weekend.sum(axis=1).rolling(window=20,center=False).mean()\n",
"mstd_we = by_date_weekend.sum(axis=1).rolling(window=20,center=False).std()\n",
"\n",
"black_list = black_list.union(by_date_weekend.sum(axis=1)[by_date_weekend.sum(axis=1) <= mavg_we-2*mstd_we].index)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"C:\\Users\\ghost\\AppData\\Local\\Enthought\\Canopy\\App\\appdata\\canopy-1.5.2.2785.win-x86_64\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2741: DtypeWarning: Columns (0,1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler)\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"trips_in = pd.read_csv(path,\n",
" parse_dates=['date_removed', 'date_arrived'],\n",
" infer_datetime_format=True)\n",
"\n",
"trips_in = trips_in.loc[trips_in['action'] != 'C ']\n",
"\n",
"# Find the start date\n",
"ind = pd.DatetimeIndex(trips.date_arrived)\n",
"trips_in['date'] = ind.date.astype('datetime64')\n",
"trips_in['hour'] = ind.hour\n",
"\n",
"# remove those values on the blacklist\n",
"trips_in = trips_in.loc[~trips_in.date.isin(black_list)]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"trips_out = pd.read_csv(path,\n",
" parse_dates=['date_removed', 'date_arrived'],\n",
" infer_datetime_format=True)\n",
"\n",
"trips_out = trips_out.loc[trips_out['action'] != 'C ']\n",
"\n",
"# Find the start date\n",
"ind = pd.DatetimeIndex(trips.date_removed)\n",
"trips_out['date'] = ind.date.astype('datetime64')\n",
"trips_out['hour'] = ind.hour\n",
"\n",
"# remove those values on the blacklist\n",
"trips_out = trips_out.loc[~trips_out.date.isin(black_list)]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#trips_out\n",
"#trips_out = trips_out.loc[~trips_out.date.isin(black_list)]\n",
"#trips_out.loc[~trips_out.date.isin(black_list)]\n",
"#black_list"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# count trips by date and by hour\n",
"by_hour_out = trips_out.pivot_table('bike', aggfunc='count',\n",
" index=['date', 'hour'],\n",
" columns='station_removed').fillna(0).reset_index('hour')\n",
"\n",
"# average these counts by weekend\n",
"by_hour_out['weekend'] = (by_hour_out.index.dayofweek >= 5)\n",
"by_hour_out = by_hour_out.groupby(['weekend', 'hour']).mean()\n",
"by_hour_out.index.set_levels([['weekday', 'weekend'],\n",
" [\"{0}:00\".format(i) for i in range(24)]],\n",
" inplace=True);\n",
"\n",
"by_hour_out.drop([col for col in list(by_hour_out.columns.values) if col > 275], axis=1, inplace=True)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# count trips by date and by hour\n",
"by_hour_in = trips_in.pivot_table('bike', aggfunc='count',\n",
" index=['date', 'hour'],\n",
" columns='station_arrived').fillna(0).reset_index('hour')\n",
"\n",
"# average these counts by weekend\n",
"by_hour_in['weekend'] = (by_hour_in.index.dayofweek >= 5)\n",
"by_hour_in = by_hour_in.groupby(['weekend', 'hour']).mean()\n",
"by_hour_in.index.set_levels([['weekday', 'weekend'],\n",
" [\"{0}:00\".format(i) for i in range(24)]],\n",
" inplace=True);\n",
"\n",
"by_hour_in.drop([col for col in list(by_hour_in.columns.values) if col > 275], axis=1, inplace=True)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#by_hour = pd.concat([by_hour_in.loc['weekday'],by_hour_out.loc['weekday']],axis=0)\n",
"by_hour = pd.concat([by_hour_in,by_hour_out],axis=0)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from sklearn import mixture \n",
"\n",
"X = by_hour.loc['weekday'].values.T\n",
"num_clusters = 6\n",
"cv_types = ['spherical', 'tied', 'diag', 'full']\n",
"\n",
"gmm = mixture.GMM(n_components=num_clusters, covariance_type=cv_types[3])\n",
"gmm.fit(X)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 9,
"text": [
"GMM(covariance_type='full', init_params='wmc', min_covar=0.001,\n",
" n_components=6, n_init=1, n_iter=100, params='wmc', random_state=None,\n",
" thresh=None, tol=0.001, verbose=0)"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"CDMX_COORDINATES = (19.4186, -99.1686) \n",
"\n",
"#create empty map zoomed in on Mexico City\n",
"station_attrs = pd.read_csv('C:\\Users\\ghost\\Google Drive\\JairCastruitaGastelum\\Progress Reports\\Clustering_ECOBICI\\DATA\\estacion_coordenadas.csv')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"color_iter = ['blue', 'red', 'green', '#e017e5', '#d8f604', '#15d4e2', '#e21569']\n",
"by_hour_in.columns = by_hour_in.columns.astype(str) + ' in'\n",
"by_hour_out.columns = by_hour_out.columns.astype(str) + ' out'\n",
"by_hour_concat = pd.concat([by_hour_in.loc['weekday'],by_hour_out.loc['weekday']],axis=1)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"map = folium.Map(location=CDMX_COORDINATES, tiles='Stamen Toner', zoom_start=14)\n",
"Y_ = gmm.predict(X)\n",
"plt.figure(figsize=(16,8))\n",
"\n",
"for i, (mean, covar, color) in enumerate(zip(\n",
" gmm.means_, gmm._get_covars(), color_iter)):\n",
" \n",
" if not np.any(Y_ == i):\n",
" continue \n",
" \n",
" plt.plot(mean,color=color)\n",
" \n",
" for attr in range(len(Y_)):\n",
" if Y_[attr] == i:\n",
" alt = station_attrs[station_attrs['id'] == by_hour.columns[attr]]['longitud'].values[0]\n",
" lat = station_attrs[station_attrs['id'] == by_hour.columns[attr]]['latitud'].values[0]\n",
" \n",
" counts = vincent.Bar(by_hour_concat[[str(by_hour.columns[attr])+' in',str(by_hour.columns[attr])+' out']], width=650, height=300)\n",
" counts.axis_titles(x='Hora', y='Estacion '+str(int(by_hour.columns[attr]))+': Valor Promedio en un dia')\n",
" counts.to_json(str(by_hour.columns[attr])+'.json')\n",
" counts.legend(title=str(station_attrs.iloc[attr,0]))\n",
" \n",
" folium.RegularPolygonMarker(location=[lat,alt], \n",
" popup=folium.Popup(max_width=700).add_child(\n",
" folium.Vega(json.load(open(str(by_hour.columns[attr])+'.json')), \n",
" width=700, \n",
" height=350)),\n",
" fill_color=color,\n",
" number_of_sides=8,\n",
" radius=10).add_to(map)\n",
" \n",
"map"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment