stuntgoat · March 8, 2016 01:45
diff --git a/pandas_notes.py b/pandas_notes.py
 # Load a CSV file into a DataFrame.
 df = pd.read_csv("<filename>")

 # Flag duplicated rows, comparing only the listed columns. Returns a boolean
 # Series that is True for each row repeating an earlier one.
 # (`subset` is the keyword for the columns to compare; the old `cols`
 # spelling is no longer accepted.)
 dup_indexes = df.duplicated(subset=["colname1", "colname2"])

 # Remove duplicated rows in place. The columns to compare must be passed
 # together as one `subset` list: a second positional argument is not treated
 # as another column name.
 df.drop_duplicates(subset=["colname1", "colname2"], inplace=True)

 # Drop a column.
 del df['my_col_name']

 # Rewrite every value of a column: `f` receives one value and returns the
 # value that should replace it.
 df["NAMES"] = df["NAMES"].map(f)

 # Tally how often each distinct value occurs in the column.
 df["NAMES"].value_counts()

 # Keep only the rows whose NAMES value makes `bool_func` return True.
 keep_by_name = df["NAMES"].map(bool_func)
 df_subset = df[keep_by_name]

 # Keep only the rows that satisfy a condition.
 over_100 = df["VALUES"] > 100
 df_subset = df[over_100]
 # Several conditions: parenthesise each one and join them with `&`.
 df_subset = df[over_100 & (df["OTHER"] == True)]

 # Remove rows using a function that inspects each row's values.
 # `has_good_value` takes a row and the position within the row to check
 # (`row_idx`) and returns a bool. `apply` on its own only produces the
 # boolean mask, so the mask is used to index `df` and keep the good rows.
 df_new = df[df.apply(has_good_value, axis=1, row_idx=0)]

 # Convert a datetime64 index to a column of ISO 8601 strings.
 df['isodatetime'] = df.index.map(lambda x: x.isoformat())

 # Sort the frame by its time index. `sort_index` returns a sorted copy, so
 # the result has to be assigned back. (Calling `.sort()` on
 # `df.index.values` returns None and cannot be fed to `reindex`.)
 df = df.sort_index()

 # Drop a row in place, chosen by its position in the index. `drop` removes
 # by label, so every row carrying the label found at that position goes.
 df.drop(df.index[3109], inplace=True)

 # Change the data type of a column (here: to float).
 as_float = df['my_colname'].astype(float)
 df['my_colname'] = as_float

 # Plot a histogram of the column with bins 5 wide; the bin edges run from
 # 0 up to 95.
 bin_edges = range(0, 100, 5)
 df['my_colname'].hist(bins=bin_edges)
 plt.show()
 # Clear the figure afterwards.
 plt.clf()

 # Append a row to a DataFrame by assigning to the next unused integer label.
 # The frame starts empty with ten columns labelled 0-9, so the new row gets
 # label 0 and needs one random value per column.
 df = pd.DataFrame(columns=range(10))
 df.loc[len(df)] = np.random.randn(df.shape[1])

 # Convert ISO-formatted time strings to datetimes.
 df.times = df.times.map(pd.to_datetime)
 # Create a DatetimeIndex from the datetimes (`pd.TimeSeries` has been
 # removed from pandas).
 df.index = pd.DatetimeIndex(df.times)
 # If the times are not sorted, call sort_index; it returns a sorted copy.
 df = df.sort_index()

 # Extract the weekday name from the datetime index.
 df['weekday'] = df.index.map(lambda x: x.strftime('%A'))

 # Resample the time-indexed data into hourly buckets, summing the rows that
 # fall inside each bucket. The aggregation is a method on the resampler; the
 # old `how=` argument is no longer accepted.
 # NOTE(review): newer pandas releases prefer the lowercase alias 'h'.
 df = df.resample('H').sum()

 # Convert a timezone-naive index holding UTC times to US/Eastern.
 df.index = df.index.tz_localize('UTC')  # mark the naive times as UTC
 df.index = df.index.tz_convert('US/Eastern')  # then convert

 # Edit multiple values in place with boolean vector indexing.
 # In this case, NaN values in SOME_NUMBERS are set to 0. Naming the column
 # in `.loc` limits the assignment to that column; without it every column
 # of the matching rows would be overwritten with 0.
 df.loc[df.SOME_NUMBERS.isnull(), 'SOME_NUMBERS'] = 0

 # Raise pandas' output limits for terminal use.
 pd.options.display.max_rows = 1000
 pd.options.display.max_columns = 400
 pd.options.display.width = 1000  # default is 80

 # Print the description and current value of every `display.*` option.
 pd.describe_option('display')
 # Stop wide frames from being wrapped across several blocks of lines.
 pd.options.display.expand_frame_repr = False

 # Related NumPy setting: arrays are only summarised in their repr once they
 # hold more than this many elements.
 np.set_printoptions(threshold=5000)
	# Load a CSV file into a DataFrame.
	df = pd.read_csv("<filename>")

	# Flag duplicated rows, comparing only the listed columns. Returns a boolean
	# Series that is True for each row repeating an earlier one.
	# (`subset` is the keyword for the columns to compare; the old `cols`
	# spelling is no longer accepted.)
	dup_indexes = df.duplicated(subset=["colname1", "colname2"])

	# Remove duplicated rows in place. The columns to compare must be passed
	# together as one `subset` list: a second positional argument is not treated
	# as another column name.
	df.drop_duplicates(subset=["colname1", "colname2"], inplace=True)

	# Drop a column.
	del df['my_col_name']

	# Rewrite every value of a column: `f` receives one value and returns the
	# value that should replace it.
	df["NAMES"] = df["NAMES"].map(f)

	# Tally how often each distinct value occurs in the column.
	df["NAMES"].value_counts()

	# Keep only the rows whose NAMES value makes `bool_func` return True.
	keep_by_name = df["NAMES"].map(bool_func)
	df_subset = df[keep_by_name]

	# Keep only the rows that satisfy a condition.
	over_100 = df["VALUES"] > 100
	df_subset = df[over_100]
	# Several conditions: parenthesise each one and join them with `&`.
	df_subset = df[over_100 & (df["OTHER"] == True)]

	# Remove rows using a function that inspects each row's values.
	# `has_good_value` takes a row and the position within the row to check
	# (`row_idx`) and returns a bool. `apply` on its own only produces the
	# boolean mask, so the mask is used to index `df` and keep the good rows.
	df_new = df[df.apply(has_good_value, axis=1, row_idx=0)]

	# Convert a datetime64 index to a column of ISO 8601 strings.
	df['isodatetime'] = df.index.map(lambda x: x.isoformat())

	# Sort the frame by its time index. `sort_index` returns a sorted copy, so
	# the result has to be assigned back. (Calling `.sort()` on
	# `df.index.values` returns None and cannot be fed to `reindex`.)
	df = df.sort_index()

	# Drop a row in place, chosen by its position in the index. `drop` removes
	# by label, so every row carrying the label found at that position goes.
	df.drop(df.index[3109], inplace=True)

	# Change the data type of a column (here: to float).
	as_float = df['my_colname'].astype(float)
	df['my_colname'] = as_float

	# Plot a histogram of the column with bins 5 wide; the bin edges run from
	# 0 up to 95.
	bin_edges = range(0, 100, 5)
	df['my_colname'].hist(bins=bin_edges)
	plt.show()
	# Clear the figure afterwards.
	plt.clf()

	# Append a row to a DataFrame by assigning to the next unused integer label.
	# The frame starts empty with ten columns labelled 0-9, so the new row gets
	# label 0 and needs one random value per column.
	df = pd.DataFrame(columns=range(10))
	df.loc[len(df)] = np.random.randn(df.shape[1])

	# Convert ISO-formatted time strings to datetimes.
	df.times = df.times.map(pd.to_datetime)
	# Create a DatetimeIndex from the datetimes (`pd.TimeSeries` has been
	# removed from pandas).
	df.index = pd.DatetimeIndex(df.times)
	# If the times are not sorted, call sort_index; it returns a sorted copy.
	df = df.sort_index()

	# Extract the weekday name from the datetime index.
	df['weekday'] = df.index.map(lambda x: x.strftime('%A'))

	# Resample the time-indexed data into hourly buckets, summing the rows that
	# fall inside each bucket. The aggregation is a method on the resampler; the
	# old `how=` argument is no longer accepted.
	# NOTE(review): newer pandas releases prefer the lowercase alias 'h'.
	df = df.resample('H').sum()

	# Convert a timezone-naive index holding UTC times to US/Eastern.
	df.index = df.index.tz_localize('UTC') # mark the naive times as UTC
	df.index = df.index.tz_convert('US/Eastern') # then convert

	# Edit multiple values in place with boolean vector indexing.
	# In this case, NaN values in SOME_NUMBERS are set to 0. Naming the column
	# in `.loc` limits the assignment to that column; without it every column
	# of the matching rows would be overwritten with 0.
	df.loc[df.SOME_NUMBERS.isnull(), 'SOME_NUMBERS'] = 0

	# Raise pandas' output limits for terminal use.
	pd.options.display.max_rows = 1000
	pd.options.display.max_columns = 400
	pd.options.display.width = 1000 # default is 80

	# Print the description and current value of every `display.*` option.
	pd.describe_option('display')
	# Stop wide frames from being wrapped across several blocks of lines.
	pd.options.display.expand_frame_repr = False

	# Related NumPy setting: arrays are only summarised in their repr once they
	# hold more than this many elements.
	np.set_printoptions(threshold=5000)