import numpy as np
import pandas as pd
# --- Basic exploration of the loaded dataset (notebook-style snippets) ---
df = pd.read_csv('filename.csv')

df.shape        # (n_rows, n_columns)
df.size         # n_rows * n_columns
df.count()      # non-null count per column (FIX: was `df.count`, which is just the bound method)
df.info()       # dtypes, non-null counts, memory usage
df.describe()   # summary statistics for numeric columns

# Parse a column as datetimes — assumes the values are parseable date strings; TODO confirm format
df['column_name'] = pd.to_datetime(df['column_name'])
df['column_name'].unique()

# All rows whose 'column_name' value occurs more than once (duplicates on that key)
pd.concat(group for _, group in df.groupby("column_name") if len(group) > 1)

# Number of fully duplicated rows — two equivalent ways
df[df.duplicated()].shape[0]
df.duplicated().sum()

# Number of rows containing at least one missing value
df.shape[0] - df.dropna().shape[0]

# Count of distinct non-null values in a column
df['column_name'].dropna().unique().size

# Remove a column in place
df.drop('column_name', axis=1, inplace=True)

# NOTE(review): df_18 and df_08 are placeholders defined elsewhere — verify against caller.
# For each col2 value, the size of its largest (col1, col2) group
df_18.groupby(['col1', 'col2']).size().groupby('col2').max()
# Suffix every column of df_08 with "_2008" (e.g. ahead of a merge with a 2018 frame)
df_08.rename(columns=lambda x: x + "_2008", inplace=True)
# Bootstrap a 95% confidence interval for the mean of COLUMN within the
# QUERY_PARAM == 'QUERY_VALUE' subgroup.
# NOTE(review): FULL_DATA, SAMPLE_SIZE, and the column/value names are
# placeholders defined elsewhere — substitute before running.
means = []
for _ in range(10000):
    # Resample with replacement, then take the subgroup mean of this replicate.
    bootstrap = FULL_DATA.sample(SAMPLE_SIZE, replace=True)
    bootstrap_mean = bootstrap[bootstrap['QUERY_PARAM'] == 'QUERY_VALUE']['COLUMN'].mean()
    means.append(bootstrap_mean)

# 95% percentile interval of the bootstrap distribution.
# FIX: the original called np.percentile on an undefined name `sample`;
# the interval must come from the `means` list built by the loop above.
np.percentile(means, 2.5), np.percentile(means, 97.5)
import statsmodels.api as sm

# statsmodels OLS does not add an intercept automatically — add one explicitly.
df['intercept'] = 1
# NOTE(review): the literal `...` is a placeholder — replace it with the real
# predictor column names before running.
linear_model = sm.OLS(df['DEPENDENT_COLUMN'], df[['intercept', 'COLUMN_A', 'COLUMN_B', ...]])
regression_results = linear_model.fit()
regression_results.summary()

# One-hot encode a categorical predictor.
# FIX: drop_first=True avoids the dummy-variable trap — with the explicit
# intercept column above, keeping every level makes the design matrix
# perfectly collinear and the OLS coefficients unidentifiable.
dummies = pd.get_dummies(df['CATEGORICAL_COLUMN'], drop_first=True)
df_with_dummies = df.join(dummies)
model = sm.OLS(df_with_dummies['DEPENDENT_COLUMN'], df_with_dummies[['intercept', 'COLUMN_A', 'COLUMN_B', ...]])
regression = model.fit()
regression.summary()
import seaborn as sns

# Visual check for pairwise correlation among candidate predictors.
# Trailing semicolon suppresses the repr echo in a notebook.
sns.pairplot(df[['COLUMN_A', 'COLUMN_B', 'COLUMN_C']]);

from patsy import dmatrices
# FIX: variance_inflation_factor was called below without ever being imported.
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Build response vector and design matrix from a formula
# (patsy adds an 'Intercept' column automatically).
y, X = dmatrices('price ~ area + bedrooms + bathrooms', df, return_type='dataframe')

# Variance Inflation Factor per design-matrix column; a VIF above ~10 is a
# common rule of thumb for problematic multicollinearity.
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns
vif