Last active
January 31, 2019 18:19
-
-
Save 3vivekb/6d88d1e65838999c2373ba29cb46c8f1 to your computer and use it in GitHub Desktop.
Frequently used Python Pandas Data Cleaning and grouping commands
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib
import glob
import json
import os
import shutil

import pandas as pd
# Combine multiple CSVs (gps_*.csv in the current directory) into one DataFrame.
all_files = glob.glob('./gps_*.csv')
# Read each file (first row is the header) and stack them all,
# renumbering the index so it is unique across files.
# (The original also created an unused empty `frame` DataFrame — removed.)
frames = [pd.read_csv(file_, header=0) for file_ in all_files]
df = pd.concat(frames, ignore_index=True)
# Transitime data formatting
# http://strftime.org/ for reference
df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S.%f')
# Ridership data time formatting
df['Date_Time'] = pd.to_datetime(df['Date'], format='%m/%d/%Y %I:%M:%S %p')
# Build the index once instead of three times.
dt_index = pd.DatetimeIndex(df['Date_Time'])
df['Year'] = dt_index.year
df['Month'] = dt_index.month
# BUG FIX: the original computed year + month as integers (e.g. 2016 + 3 = 2019),
# which is meaningless; format a real "YYYY-MM" label instead.
df['Year-Month'] = dt_index.strftime('%Y-%m')
# There is some duplicate data we need to clean out, namely March 2016.
df = df.drop_duplicates()
# Latency between when a fix was recorded and when it was processed (timedelta).
df['latency'] = df['time_processed'] - df['time']
# FIX: converting a timedelta to float seconds via .astype('timedelta64[s]')
# was deprecated and removed in pandas 2.x; .dt.total_seconds() is the
# supported equivalent (returns fractional seconds rather than truncating).
df['latency_seconds'] = df['latency'].dt.total_seconds()
# Count the number of records broken out by source, vehicle, and hour of day.
df.groupby([df['source'], df['vehicle_id'], df['time'].dt.hour])['source'].count()
# Split messy delimited strings on '-' and keep only the leading piece.
df['long_name_with_underscores'].str.split('-').str.get(0)
# Adding a feature to a DataFrame row-by-row is very slow; it is almost always
# faster to collect plain row dicts and build a brand-new frame at the end.
od = []
for _, r in OD_unique.iterrows():
    # The JSON round-trip converts the row (incl. timestamps) to plain Python
    # types; json.loads already returns a dict, so no extra dict() is needed.
    row = json.loads(r.to_json())
    row['from_point'] = Point(r.from_longitude, r.from_latitude)
    row['to_point'] = Point(r.to_longitude, r.to_latitude)
    od.append(row)
od = pd.DataFrame(od)
# Or, if you are specifically working with geodata, this method is faster:
# https://gis.stackexchange.com/questions/174159/convert-a-pandas-dataframe-to-a-geodataframe
from geopandas import GeoDataFrame
from shapely.geometry import Point
# One Point per row from the coordinate columns, which are then dropped.
geometry = [Point(xy) for xy in zip(df.Lon, df.Lat)]
df = df.drop(['Lon', 'Lat'], axis=1)
# FIX: the {'init': 'epsg:4326'} proj4-dict CRS form is deprecated in
# pyproj/geopandas; pass an authority string instead.
geo_df = GeoDataFrame(df, crs='EPSG:4326', geometry=geometry)
# Remove rows where the geometry is missing.
geo_df = geo_df.loc[geo_df['geometry'].notna()]
# Replace every space in the column names with an underscore (in place).
geo_df.columns = [name.replace(' ', '_') for name in geo_df.columns]
# Converting to another CRS.
# FIX: the {'init': 'epsg:4326'} dict form is deprecated in pyproj 2+;
# use an authority string, which is the supported spelling.
geo_df = geo_df.to_crs('EPSG:4326')
# Remove stale outputs first so to_file() can write fresh ones.
# contextlib.suppress is the idiomatic replacement for try/except/pass.
with contextlib.suppress(OSError):
    os.remove('geospatial_output.geojson')
with contextlib.suppress(FileNotFoundError):
    shutil.rmtree('geospatial_output/')
# Write the frame out as GeoJSON and as an ESRI Shapefile directory.
geo_df.to_file('geospatial_output.geojson', driver='GeoJSON')
geo_df.to_file('geospatial_output/', driver='ESRI Shapefile')
# https://codeburst.io/dealing-with-datetimes-like-a-pro-in-pandas-b80d3d808a7f
# Localize the naive 't' timestamps to Pacific time, then convert them to UTC.
assignments = assignments.set_index('t')
localized = assignments.index.tz_localize('America/Los_Angeles')
assignments.index = localized.tz_convert('UTC')
assignments = assignments.reset_index()
# Create JSON from a DataFrame and push it to an API.
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html
data = assignments.to_json(orient='records', date_format='epoch', date_unit='ms')
# FIX: build the payload as a dict instead of concatenating raw JSON text by
# hand — string-splicing JSON is fragile and the concat/parse round trip is
# unnecessary when we can wrap the parsed records directly.
d = {'avl': json.loads(data)}
r = requests.post(url='http://url', json=d)
# Inspect the current column-display limit, then raise it past the default of 20.
pd.options.display.max_columns
pd.options.display.max_columns = 40
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment