jxu · June 10, 2024 15:09
diff --git a/pandas_cheatsheet.py b/pandas_cheatsheet.py
 import pandas as pd

 ### Data import ###

 # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html 
 df = pd.read_csv("file.csv")  


 ### Data types ###

 df.dtypes 

 ### DataFrame operations ###

 # Subset, rename, retype columns
 df[["a", "b"]]
 df.rename(columns={"y":"year"})
 df.astype({"a": int})

 # Unique and duplicated values
 df["col"].unique  # numpy array of unique values
 df["col"].is_unique
 df.drop_duplicates(subset=["col"])
 df.duplicated()  # Boolean Series

 # Summary statistics
 df["col"].value_counts(dropna=False)
 df["col"].isna().sum()

 # Table join 
 # Warning: NULLs are joined together, unlike SQL
 # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

 # Use on="key" if joining columns are named the same
 df.merge(df2, how="left", left_on="lk", right_on="rk")

 # SQL-style GROUP BY named aggregation functions 
 # Common functions: size, count (non-NAs), nunique (count distinct), first, max, min, mean
 df.groupby("group", as_index=False).agg(
   a_sum=('a', "sum"),
   d_range=('d', lambda x: x.max() - x.min())
 )

 ### Data export
 df.to_csv("file.csv", index=False)
	import pandas as pd

	### Data import ###

	# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html
	df = pd.read_csv("file.csv")


	### Data types ###

	df.dtypes

	### DataFrame operations ###

	# Subset, rename, retype columns
	df[["a", "b"]]
	df.rename(columns={"y":"year"})
	df.astype({"a": int})

	# Unique and duplicated values
	df["col"].unique # numpy array of unique values
	df["col"].is_unique
	df.drop_duplicates(subset=["col"])
	df.duplicated() # Boolean Series

	# Summary statistics
	df["col"].value_counts(dropna=False)
	df["col"].isna().sum()

	# Table join
	# Warning: NULLs are joined together, unlike SQL
	# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

	# Use on="key" if joining columns are named the same
	df.merge(df2, how="left", left_on="lk", right_on="rk")

	# SQL-style GROUP BY named aggregation functions
	# Common functions: size, count (non-NAs), nunique (count distinct), first, max, min, mean
	df.groupby("group", as_index=False).agg(
	a_sum=('a', "sum"),
	d_range=('d', lambda x: x.max() - x.min())
	)

	### Data export
	df.to_csv("file.csv", index=False)