Skip to content

Instantly share code, notes, and snippets.

@NathanDai5287
Created September 26, 2022 06:35
Show Gist options
  • Save NathanDai5287/d5f1865bbf3eaf5ee5cbe47b4df6a3a1 to your computer and use it in GitHub Desktop.
Save NathanDai5287/d5f1865bbf3eaf5ee5cbe47b4df6a3a1 to your computer and use it in GitHub Desktop.
Titanic Survival Classification
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Titanic survival classification: load the combined passenger data, encode the
# categorical columns, impute missing numeric values, and fit/evaluate a
# logistic regression model.
#
# The combined file can be produced by concatenating the train and test files
# (usually the data will not come pre-split):
#   pd.concat([pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')]).to_csv('data/titanic.csv', index=False)
# read data
df = pd.read_csv('data/titanic.csv')
# get summary of data; DataFrame.info() prints the summary itself and returns
# None, so wrapping it in print() would emit a stray "None" line
df.info()
# show first 5 rows
print(df.head())
le = LabelEncoder() # convert categorical string variables to integers
# NOTE(review): the encoder is re-fitted for every column, so after this section
# `le` only remembers the classes of the last column ('Embarked').
# NOTE(review): missing values are not filled in these columns before encoding;
# confirm the installed scikit-learn's LabelEncoder accepts NaN if any are present.
df['Sex'] = le.fit_transform(df['Sex']) # "female" and "male"
df['Ticket'] = le.fit_transform(df['Ticket']) # ticket type
df['Cabin'] = le.fit_transform(df['Cabin']) # cabin types
df['Embarked'] = le.fit_transform(df['Embarked']) # port of embarkation
# you can create new variables from existing ones
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1 # number of family members
# fill missing values with median; assign the result back instead of calling
# fillna(inplace=True) on a column selection, which is chained assignment:
# pandas 2.x warns about it and with Copy-on-Write it no longer updates df
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())
# drop datapoint if "survived" is missing
df.dropna(subset=['Survived'], inplace=True)
# drop passenger name because it does not affect survival
df.drop('Name', axis=1, inplace=True)
# set independent and dependent variables: every remaining column except the
# target is used as a feature
independents = df.drop(columns='Survived')
dependent = df['Survived'].values
# split training and testing data with train_test_split (80% training, 20% testing);
# random_state is fixed so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(independents, dependent, test_size=0.2, random_state=42)
# create logistic regression model
model = LogisticRegression()
# fit model and predict
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
# get accuracy of model
print(f'Accuracy of logistic regression classifier on test set: {model.score(x_test, y_test):.2f}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment