Last active
January 31, 2019 18:19
-
-
Save 3vivekb/6d88d1e65838999c2373ba29cb46c8f1 to your computer and use it in GitHub Desktop.
Frequently used Python Pandas Data Cleaning and grouping commands
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib
import glob
import json
import os
import shutil

import pandas as pd
# Combine multiple CSVs (gps_*.csv in the current directory) into one DataFrame.
all_files = glob.glob('./gps_*.csv')
# Read each file (first row is the header) and stack them all,
# renumbering the index so it is unique across files.
# (The original also created an unused empty `frame` DataFrame — removed.)
frames = [pd.read_csv(file_, header=0) for file_ in all_files]
df = pd.concat(frames, ignore_index=True)
# Transitime data formatting
# http://strftime.org/ for reference
df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S.%f')
# Ridership data time formatting
df['Date_Time'] = pd.to_datetime(df['Date'], format='%m/%d/%Y %I:%M:%S %p')
# Build the index once instead of three times.
dt_index = pd.DatetimeIndex(df['Date_Time'])
df['Year'] = dt_index.year
df['Month'] = dt_index.month
# BUG FIX: the original computed year + month as integers (e.g. 2016 + 3 = 2019),
# which is meaningless; format a real "YYYY-MM" label instead.
df['Year-Month'] = dt_index.strftime('%Y-%m')
# There is some duplicate data we need to clean out, namely March 2016.
df = df.drop_duplicates()
# Latency between when a fix was recorded and when it was processed (timedelta).
df['latency'] = df['time_processed'] - df['time']
# FIX: converting a timedelta to float seconds via .astype('timedelta64[s]')
# was deprecated and removed in pandas 2.x; .dt.total_seconds() is the
# supported equivalent (returns fractional seconds rather than truncating).
df['latency_seconds'] = df['latency'].dt.total_seconds()
# Count the number of records broken out by source, vehicle, and hour of day.
df.groupby([df['source'], df['vehicle_id'], df['time'].dt.hour])['source'].count()
# Split messy delimited strings on '-' and keep only the leading piece.
df['long_name_with_underscores'].str.split('-').str.get(0)
# Adding a feature to a DataFrame row-by-row is very slow; it is almost always
# faster to collect plain row dicts and build a brand-new frame at the end.
od = []
for _, r in OD_unique.iterrows():
    # The JSON round-trip converts the row (incl. timestamps) to plain Python
    # types; json.loads already returns a dict, so no extra dict() is needed.
    row = json.loads(r.to_json())
    row['from_point'] = Point(r.from_longitude, r.from_latitude)
    row['to_point'] = Point(r.to_longitude, r.to_latitude)
    od.append(row)
od = pd.DataFrame(od)
# Or, if you are specifically working with geodata, this method is faster:
# https://gis.stackexchange.com/questions/174159/convert-a-pandas-dataframe-to-a-geodataframe
from geopandas import GeoDataFrame
from shapely.geometry import Point
# One Point per row from the coordinate columns, which are then dropped.
geometry = [Point(xy) for xy in zip(df.Lon, df.Lat)]
df = df.drop(['Lon', 'Lat'], axis=1)
# FIX: the {'init': 'epsg:4326'} proj4-dict CRS form is deprecated in
# pyproj/geopandas; pass an authority string instead.
geo_df = GeoDataFrame(df, crs='EPSG:4326', geometry=geometry)
# Remove rows where the geometry is missing.
geo_df = geo_df.loc[geo_df['geometry'].notna()]
# Replace every space in the column names with an underscore (in place).
geo_df.columns = [name.replace(' ', '_') for name in geo_df.columns]
# Converting to another CRS.
# FIX: the {'init': 'epsg:4326'} dict form is deprecated in pyproj 2+;
# use an authority string, which is the supported spelling.
geo_df = geo_df.to_crs('EPSG:4326')
# Remove stale outputs first so to_file() can write fresh ones.
# contextlib.suppress is the idiomatic replacement for try/except/pass.
with contextlib.suppress(OSError):
    os.remove('geospatial_output.geojson')
with contextlib.suppress(FileNotFoundError):
    shutil.rmtree('geospatial_output/')
# Write the frame out as GeoJSON and as an ESRI Shapefile directory.
geo_df.to_file('geospatial_output.geojson', driver='GeoJSON')
geo_df.to_file('geospatial_output/', driver='ESRI Shapefile')
# https://codeburst.io/dealing-with-datetimes-like-a-pro-in-pandas-b80d3d808a7f
# Localize the naive 't' timestamps to Pacific time, then convert them to UTC.
assignments = assignments.set_index('t')
localized = assignments.index.tz_localize('America/Los_Angeles')
assignments.index = localized.tz_convert('UTC')
assignments = assignments.reset_index()
# Create JSON from a DataFrame and push it to an API.
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html
data = assignments.to_json(orient='records', date_format='epoch', date_unit='ms')
# FIX: build the payload as a dict instead of concatenating raw JSON text by
# hand — string-splicing JSON is fragile and the concat/parse round trip is
# unnecessary when we can wrap the parsed records directly.
d = {'avl': json.loads(data)}
r = requests.post(url='http://url', json=d)
# Inspect the current column-display limit, then raise it past the default of 20.
pd.options.display.max_columns
pd.options.display.max_columns = 40
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment