Last active
March 8, 2016 01:45
-
-
Save stuntgoat/7150564 to your computer and use it in GitHub Desktop.
notes on using Pandas
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Assumes the conventional alias: import pandas as pd

# read csv
df = pd.read_csv("<filename>")

# show duplicates; returns a boolean Series that is True for each row
# repeating an earlier ("colname1", "colname2") combination
dup_indexes = df.duplicated(subset=["colname1", "colname2"])

# remove duplicates; in place. All columns to compare go in one list passed
# as `subset`; a second positional string is not treated as another column.
df.drop_duplicates(subset=["colname1", "colname2"], inplace=True)

# drop a column
del df['my_col_name']
# rename row items in a column; `f` is a function that takes a
# value and returns its replacement based on some condition
df['NAMES'] = df['NAMES'].map(f)

# count how often each unique item occurs in a column
df['NAMES'].value_counts()

# subset with map; `bool_func` takes a value and returns a bool
df_subset = df[df['NAMES'].map(bool_func)]

# select rows by condition
df_subset = df[df['VALUES'] > 100]

# multiple conditions; each comparison needs its own parentheses because
# `&` binds tighter than `>` and `==`
df_subset = df[(df['VALUES'] > 100) & (df['OTHER'] == True)]

# remove rows with a function that checks row values;
# `has_good_value` takes a row and an index of the row to check (`row_idx`)
# and returns a bool. `apply` only builds the boolean mask, so `df` has to be
# indexed with that mask to actually drop the rejected rows.
good_rows = df.apply(has_good_value, axis=1, row_idx=0)
df_new = df[good_rows]
# convert datetime64 index to iso column (one ISO 8601 string per row)
df['isodatetime'] = [stamp.isoformat() for stamp in df.index]

# sort by a DatetimeIndex; `sort_index` returns a sorted copy.
# (`df.index.values.sort()` sorts the raw array in place and returns None,
# so it must not be fed to `reindex`.)
df = df.sort_index()

# Drop a row by its position in the index; inplace
df.drop(df.index[3109], inplace=True)

# Convert a column data type
df['my_colname'] = df['my_colname'].astype(float)

# Create a histogram from a column; using a bin width of 5.
# Bin edges run 0, 5, ..., 95; values outside that range are not counted.
df['my_colname'].hist(bins=range(0, 100, 5))
plt.show()

# Then clear figure
plt.clf()

# Append row to dataframe; `len(df)` is the next free integer label
# (0 for the empty frame created here)
df = pd.DataFrame(columns=range(10))
df.loc[len(df)] = np.random.randn(10)
# Convert ISO string formatted times to datetimes (vectorized over the column)
df['times'] = pd.to_datetime(df['times'])

# Create a DatetimeIndex from datetimes
df.index = pd.DatetimeIndex(df['times'])

# If the times are not sorted you'll wanna call sort_index- returns a copy
df = df.sort_index()

# Extract weekday name from the DatetimeIndex
df['weekday'] = df.index.strftime('%A')

# Resample time series index data by hour and sum of rows between intervals.
# The aggregation is a method on the resampler (the `how=` keyword is gone);
# `numeric_only=True` skips the non-numeric `times`/`weekday` columns set above.
df = df.resample('h').sum(numeric_only=True)

# Convert UTC timeindex without timezone to US/Eastern
df.index = df.index.tz_localize('UTC')  # localize index to UTC
df.index = df.index.tz_convert('US/Eastern')  # convert
# Edit multiple values in place with boolean vector indexing.
# in this case, NaN values in SOME_NUMBERS are set to 0. Naming the column
# keeps the other columns of the matching rows untouched.
df.loc[df['SOME_NUMBERS'].isnull(), 'SOME_NUMBERS'] = 0

# Set output options in terminal.
# Use full option keys: abbreviated keys such as 'max_columns' are resolved
# by pattern match and fail when they match more than one option.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 400)  # default is 0 (auto-detect) in a terminal, 20 otherwise
pd.set_option('display.width', 1000)  # default is 80

# Info
pd.describe_option('display')

# Turn off wrap
pd.set_option('display.expand_frame_repr', False)

# numpy settings; sort of related
np.set_printoptions(threshold=5000)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment