Data Dictionary
Variable Definition Key
survival Survival 0 = No, 1 = Yes
pclass Ticket class 1 = 1st, 2 = 2nd, 3 = 3rd
sex Sex
Age Age in years
sibsp # of siblings / spouses aboard the Titanic
parch # of parents / children aboard the Titanic
ticket Ticket number
fare Passenger fare
cabin Cabin number
embarked Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton
Variable Notes
pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower
age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5
sibsp: The dataset defines family relations in this way... Sibling = brother, sister, stepbrother, stepsister Spouse = husband, wife (mistresses and fiancés were ignored)
parch: The dataset defines family relations in this way... Parent = mother, father Child = daughter, son, stepdaughter, stepson Some children travelled only with a nanny, therefore parch=0 for them.
survival: Survival
PassengerId: Unique Id of a passenger.
pclass: Ticket class
sex: Sex
Age: Age in years
sibsp: # of siblings / spouses aboard the Titanic
parch: # of parents / children aboard the Titanic
ticket: Ticket number
fare: Passenger fare
cabin: Cabin number
embarked: Port of Embarkation
# Summary statistics (count/mean/std/quartiles) for the numeric columns.
# NOTE(review): in this flattened notebook the load cell appears *below*
# (pd.read_csv at the data-loading cell); this line only works if that cell
# was executed first in the original notebook order.
titanic_df.describe()
# linear algebra
import numpy as np
# data processing
import pandas as pd
# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style
# Algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import ElasticNet
from sklearn import tree
import pandas as pd
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
# Load the Titanic passenger data from the local Data directory.
titanic_df = pd.read_csv("./Data/titanic.csv")
# Encode Sex numerically for the sklearn models: female -> 1, male -> 0.
# Any other value would become NaN under .map.
titanic_df['Sex'] = titanic_df['Sex'].map({'female': 1, 'male': 0})
titanic_df.head(4)
# Drop identifier / free-text columns that are not used as model features.
data = titanic_df.drop(columns=['PassengerId', 'Name', "Ticket", "Embarked", "Cabin"])
data.sample(10)
# Histogram of passenger ages, with the mean marked in red.
# Fix: DataFrame.hist creates its own figure, so a preceding
# plt.figure(figsize=(17,10)) was silently ignored -- pass figsize to hist().
data.hist(column="Age", bins=30, figsize=(17, 10))
plt.xlabel("Age", fontsize=15)
plt.ylabel("Frequency", fontsize=15)
plt.xlim([0.0, 100.0])
plt.axvline(data["Age"].mean(), color="red")
print('Mean age of passengers = {}'.format(data["Age"].mean()))
# Impute missing ages with the column mean so the sklearn models below can
# train (they reject NaN input).
data["Age"] = data["Age"].fillna(data["Age"].mean())
data.head(10)
# data.info()
# Target vector: Survived (per the data dictionary, 0 = No, 1 = Yes).
target = data["Survived"]
# Class display names indexed by label value: 0 -> "no", 1 -> "yes".
# Fix: the original ["yes", "no"] reversed the 0/1 encoding.
target_names = ["no", "yes"]
# Feature matrix: every remaining column except the target.
variable = data.drop("Survived", axis=1)
feature_names = variable.columns
variable.head()
from sklearn.model_selection import train_test_split
# 75/25 train/test split (sklearn default test_size); random_state fixed so
# the split -- and every score below -- is reproducible.
X_train, X_test, y_train, y_test = train_test_split(variable, target, random_state=42)
# y_test holds the true Survived values for the held-out rows; y_predict
# (below) holds the decision tree's predictions for the same rows of X_test.
# Unbounded decision tree: no depth limit, so it can memorise the training set.
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
# Test accuracy, then training accuracy (near 1.0 here signals overfitting).
clf.score(X_test, y_test)
clf.score(X_train, y_train)
y_predict = clf.predict(X_test)
# Fix: sklearn.externals.six was deprecated in sklearn 0.21 and removed in
# 0.23; StringIO lives in the standard-library io module.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Render the unpruned decision tree: export graphviz "dot" source into an
# in-memory buffer, build a graph from it, display it inline and save a PNG.
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
graph.write_png('Titanic_Default_Tree.png')
X_train.columns
# Refit with max_depth=5: a smaller, more interpretable tree that overfits
# less than the unbounded tree above.
clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)
y_predict = clf.predict(X_test)
clf.score(X_train, y_train)
# Export this pruned tree to PNG via graphviz, same pipeline as above.
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
filled=True, rounded=True,
special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('Titanic_small_tree_sklearn.png')
Image(graph.create_png())
# Import the one name actually used instead of wildcard-importing the module.
from dtreeviz.trees import dtreeviz

# Visualise the pruned tree with dtreeviz (richer plots than plain graphviz).
viz = dtreeviz(clf,
               X_train,
               y_train,
               target_name='survive',
               feature_names=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'],
               class_names=['Die', 'Survive']  # need class_names for classifier
               )
# NOTE(review): this assumes the returned object exposes graphviz dot source
# as `viz.dot` -- confirm against the installed dtreeviz version; newer
# releases render SVG directly via viz.svg() / viz.save().
graph = pydotplus.graph_from_dot_data(viz.dot)
Image(graph.create_png())
graph.write_svg('Titanic_small.svg')
import itertools
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print *cm* and draw it as an annotated heatmap on the current figure.

    ``cm`` is a square confusion matrix (e.g. from
    ``sklearn.metrics.confusion_matrix``) and ``classes`` the axis tick
    labels. With ``normalize=True`` each row is rescaled to sum to 1
    before printing and plotting.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=45)
    plt.yticks(ticks, classes)

    # Annotate every cell with its value; use white text on the dark half
    # of the colormap so the numbers stay readable.
    fmt = '.2f' if normalize else 'd'
    cutoff = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            shade = "white" if cm[row, col] > cutoff else "black"
            plt.text(col, row, format(cm[row, col], fmt),
                     horizontalalignment="center",
                     color=shade)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
# Confusion matrix for the max_depth=5 tree (y_predict was last assigned
# from that model).
cnf_matrix = confusion_matrix(y_test, y_predict)
np.set_printoptions(precision=2)
# NOTE: rcParams / style changes are global and affect every later plot.
plt.rcParams.update({'font.size': 24})
plt.style.use("dark_background")
plt.figure(figsize=(9, 9))
plot_confusion_matrix(cnf_matrix, classes=['Die', 'Survive'],
normalize=False, title='Confusion Matrix')
# plt.show()
plt.savefig('Titanic_confusion_matrix.png')
# Create a random forest classifier.
# random_state fixed so the scores and feature importances below are
# reproducible, consistent with the seeded MLP later in this notebook.
rf = RandomForestClassifier(n_estimators=200, random_state=1)
rf = rf.fit(X_train, y_train)
rf.score(X_train, y_train)
# Have not done the cross validation
rf.score(X_test, y_test)
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances
# We can sort the features by their importance
sorted(zip(rf.feature_importances_, feature_names), reverse=True)
# With radial-basis-function (RBF) kernel
from sklearn.svm import SVC
# Support-vector classifier with an RBF kernel (C and gamma left at their
# defaults); reuses the existing X_train, X_test, y_train, y_test split.
model = SVC(kernel='rbf')
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
# A perfect prediction would give y_predict - y_test == 0 for every row;
# .score reports the fraction of correct predictions (accuracy).
accuracy = model.score(X_test, y_test)
print(accuracy)
# Training accuracy, for comparison against the test accuracy above.
model.score(X_train, y_train)
from sklearn.linear_model import LogisticRegression
# Logistic regression with all hyperparameters at their defaults.
classifier = LogisticRegression()
classifier
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")
predictions = classifier.predict(X_test)
# Side-by-side view of predicted vs actual Survived for the first test rows.
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head()
# KNN: k-nearest-neighbours classifier with k = 3.
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)
Y_pred = knn.predict(X_test)
# Training accuracy (printed), then held-out test accuracy.
acc_knn =knn.score(X_train, y_train)
print(acc_knn)
knn.score(X_test, y_test)
# Multi-layer perceptron with three hidden layers of sizes (64, 16, 8),
# Adam optimiser, and a fixed seed for reproducible weight initialisation.
neural_network = MLPClassifier(hidden_layer_sizes=(64,16,8), solver="adam", random_state=1)
neural_network.fit(X_train, y_train)
print(f"Training Data Score: {neural_network.score(X_train, y_train)}")
print(f"Testing Data Score: {neural_network.score(X_test, y_test)}")
# NOTE(review): ElasticNet is a *regressor*; fit on the 0/1 Survived target
# its .score() is R^2, not accuracy, so it is not directly comparable to the
# classifier scores elsewhere in this notebook.
# Fix: the `normalize` parameter was deprecated in sklearn 1.0 and removed in
# 1.2; False was its default, so dropping it preserves behaviour.
model = ElasticNet(alpha=1, l1_ratio=0.5)
model.fit(X_train, y_train)
# Create a dataframe with the features and coefficients
fc_df = pd.DataFrame(list(zip(variable.columns, model.coef_)), columns=['features', 'coefficients'])
fc_df.head()
y_pred = model.predict(X_test)
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")
from sklearn.cluster import KMeans
# Unsupervised 2-cluster KMeans on the training features; note the cluster
# labels are arbitrary and not aligned with the Survived encoding.
kmeans = KMeans(n_clusters=2)
kmeans.fit(X_train)
predicted_clusters = kmeans.predict(X_train)  # fixed 'cluters' typo
# NOTE(review): KMeans.score ignores any y argument and returns the
# *negative inertia* (opposite of summed squared distances to the nearest
# centroid), not accuracy -- the "Score" printed here is not comparable to
# the classifier scores above. Dropping the ignored y args is
# behaviour-identical.
print(f"Training Data Score: {kmeans.score(X_train)}")
print(f"Testing Data Score: {kmeans.score(X_test)}")
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes: assumes each feature is normally distributed within
# each class (a rough fit for these mixed discrete/continuous features).
naive_model = GaussianNB()
naive_model.fit(X_train, y_train)
print(f"Training Data Score: {naive_model.score(X_train, y_train)}")
print(f"Testing Data Score: {naive_model.score(X_test, y_test)}")