-
a pandas table is called a dataframe
-
columns of a dataframe are usually called variables because they correlate to variables of an experiment -- so think 'variable' as in experiments, not variable as in programming
-
rows of a dataframe are often called records or cases or observations
-
one-dimensional data in pandas is called a Series, basically a list
- each item has an index label as well as a value, which looks like 2 columns when printed, but it still counts as one-dimensional because there is only one value per item -- the labels are the index, not data
-
a null in pandas is any cell that pandas counts as "missing data"; there are various forms of this
-
to impute values means to fill in missing values with a guess or estimate based on the existing data, such as filling in missing ages with the mean of the provided ages
terminal:
sudo apt install python3-pip
pip3 -V
pip3 install pandas
pip3 list
(this will also install numpy automatically)
import numpy as np
import pandas as pd
create dataframe from reading csv:
df = pd.read_csv('bestsellers.csv')
(same directory as python file)
# Get the first 5 rows of the spreadsheet
print(df.head())
# Also works on individual columns, and can provide number of rows
print(df["Rating"].head(10))
# Get the shape of the spreadsheet, aka number of rows and cols
print(df.shape)
# Get the column names of the spreadsheet
print(df.columns)
# Get summary statistics for each column - mean, min and max values, etc. (by default only numeric columns are included; pass include='all' to also summarize non-numeric columns with counts, unique values, and the most frequent value)
print(df.describe())
# List columns and their data types and count of non-null values
df.info()
filter: select columns whose name matches regular expression
df.filter(regex='channel')
sample: return the given number of randomly selected rows
df.sample(3)
query: select rows where a certain column has a certain value
# to view all records where channelTitle == "FoxStarHindi"
df_india.query('channelTitle=="FoxStarHindi"')
iloc: create a slice by index
- takes one slice argument to select rows, or add a second slice argument to select columns
# grab rows 1 through 4. second index is exclusive.
df.iloc[1:5]
# grab only columns 3-5 of rows 1 through 4
df.iloc[1:5, 3:6]
# grab the last 3 columns for all rows
df.iloc[:, -3:]
loc: create a slice by name
# to view columns from channel id to view count
df.loc[:,'channelId':'view_count']
check for duplicate rows:
df.duplicated().sum()
drop duplicates:
# Remove duplicate rows
# inplace=True means that it mutates the current dataframe instead of creating a new one
df.drop_duplicates(inplace=True)
# The Pandas .drop_duplicates() method
df.drop_duplicates(
subset=None, # Which columns to consider
keep='first', # Which duplicate record to keep
inplace=False, # Whether to drop in place
ignore_index=False # Whether to relabel the index
)
count missing values:
# check for missing values -- count number of nulls for each variable
df.isnull().sum()
drop missing values:
# .dropna() to remove data with nulls
# Exploring the Pandas .dropna() method
df.dropna(
axis=0, # Whether to drop rows or columns (rows by default)
how='any', # Drop a row/column if 'any' of its values are missing, or only if 'all' of them are
thresh=None, # Keep only rows/columns with at least this many non-null values (overrides how)
subset=None, # Labels along the other axis to consider, e.g. which columns to check when dropping rows
inplace=False # Whether to drop in place (i.e., without needing to re-assign)
)
df = df.dropna()
fill in missing values:
# fill in nulls with zeroes (or whatever else)
df = df.fillna(0)
df = df.fillna({'Name': 'Someone', 'Age': 25, 'Location': 'USA'})
# "impute" (estimate) values by filling with mean
df['Age'] = df['Age'].fillna(df['Age'].mean())
rename columns:
# Rename the columns
df.rename(columns={"Name": "Title", "Year": "Publication Year", "User Rating": "Rating"}, inplace=True)
change data type of a column:
# Price was type int by default, so change it to a float because that's how money works
df["Price"] = df["Price"].astype(float)
convert a string to a datetime object:
# Convert the Date field to a datetime type object so that we can use date-specific functions on it
df["Date"] = pd.to_datetime(df["Date"])
clean whitespace on strings:
# Trimming Whitespace from a Pandas Column
df['Favorite Color'] = df['Favorite Color'].str.strip()
change case:
df['Location'] = df['Location'].str.title()
df['Location'] = df['Location'].str.lower()
replace a sub-string, including removing a redundant word e.g.:
df['Region'] = df['Region'].str.replace('Region ', '')
split one column into two by sub-string:
# split a "lastname, firstname" column into two separate columns for first and last name
# assumes that column Name exists but columns Last Name and First Name dont exist yet
# "expand=True" is what tells it to make new columns for those new column names
# "Make note here of the use of the double square brackets. This is required since we are passing in a list of columns we want to create!"
df[['Last Name', 'First Name']] = df['Name'].str.split(',', expand=True)
#Adding column for year to the dataframe (Date is a datetime object)
df['Year']=df['Date'].dt.year
df['Month']=df.Date.dt.month
# Count how many times each author appears in the list, AKA how many rows feature that author
author_counts = df['Author'].value_counts()
print(author_counts)
# Split into groups by genre, look at Rating for each group, take the mean of the group's ratings and assign it to that group name
avg_rating_by_genre = df.groupby("Genre")["Rating"].mean()
print(avg_rating_by_genre)
# For comparison, running .mean() on Rating without grouping gives just one number as a result
df["Rating"].mean()
# Whose books cost the most?
# Group by author, and take the mean of the Price column for each author
price_by_author = df.groupby("Author")["Price"].mean()
print(price_by_author.head(10))
# This variable is a series
# Sort the result in descending order -- sort_values() returns a new sorted Series, so reassign it to keep the result
price_by_author = price_by_author.sort_values(ascending=False)
# Export top selling authors to a CSV file - can export just the head
author_counts.head(10).to_csv("top_authors.csv")
# Export average rating by genre to a CSV file
avg_rating_by_genre.to_csv("avg_rating_by_genre.csv")