{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4-final"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "Python 3.7.4 64-bit ('venv')",
"display_name": "Python 3.7.4 64-bit ('venv')",
"metadata": {
"interpreter": {
"hash": "e284c72d79b42194b3fe2a0767ff9cca6d233ae03063bab113c99e4bc6bd25a8"
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"source": [
"# Titanic Model with 90% accuracy\n",
"https://www.kaggle.com/vinothan/titanic-model-with-90-accuracy"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np \n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Read the Titanic train and test splits from the local datasets folder.\n",
"train_df = pd.read_csv(\"./datasets/titanic/train.csv\")\n",
"test_df = pd.read_csv(\"./datasets/titanic/test.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" PassengerId Survived Pclass \\\n",
"0 1 0 3 \n",
"1 2 1 1 \n",
"2 3 1 3 \n",
"3 4 1 1 \n",
"4 5 0 3 \n",
"\n",
" Name Sex Age SibSp \\\n",
"0 Braund, Mr. Owen Harris male 22.0 1 \n",
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
"2 Heikkinen, Miss. Laina female 26.0 0 \n",
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
"4 Allen, Mr. William Henry male 35.0 0 \n",
"\n",
" Parch Ticket Fare Cabin Embarked \n",
"0 0 A/5 21171 7.2500 NaN S \n",
"1 0 PC 17599 71.2833 C85 C \n",
"2 0 STON/O2. 3101282 7.9250 NaN S \n",
"3 0 113803 53.1000 C123 S \n",
"4 0 373450 8.0500 NaN S "
],
"text/html": "
\n\n
\n \n \n | \n PassengerId | \n Survived | \n Pclass | \n Name | \n Sex | \n Age | \n SibSp | \n Parch | \n Ticket | \n Fare | \n Cabin | \n Embarked | \n
\n \n \n \n 0 | \n 1 | \n 0 | \n 3 | \n Braund, Mr. Owen Harris | \n male | \n 22.0 | \n 1 | \n 0 | \n A/5 21171 | \n 7.2500 | \n NaN | \n S | \n
\n \n 1 | \n 2 | \n 1 | \n 1 | \n Cumings, Mrs. John Bradley (Florence Briggs Th... | \n female | \n 38.0 | \n 1 | \n 0 | \n PC 17599 | \n 71.2833 | \n C85 | \n C | \n
\n \n 2 | \n 3 | \n 1 | \n 3 | \n Heikkinen, Miss. Laina | \n female | \n 26.0 | \n 0 | \n 0 | \n STON/O2. 3101282 | \n 7.9250 | \n NaN | \n S | \n
\n \n 3 | \n 4 | \n 1 | \n 1 | \n Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n female | \n 35.0 | \n 1 | \n 0 | \n 113803 | \n 53.1000 | \n C123 | \n S | \n
\n \n 4 | \n 5 | \n 0 | \n 3 | \n Allen, Mr. William Henry | \n male | \n 35.0 | \n 0 | \n 0 | \n 373450 | \n 8.0500 | \n NaN | \n S | \n
\n \n
\n
"
},
"metadata": {},
"execution_count": 3
}
],
"source": [
"train_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"__Test_DataSet_\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
" PassengerId Pclass Name Sex \\\n",
"0 892 3 Kelly, Mr. James male \n",
"1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n",
"2 894 2 Myles, Mr. Thomas Francis male \n",
"3 895 3 Wirz, Mr. Albert male \n",
"4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n",
"\n",
" Age SibSp Parch Ticket Fare Cabin Embarked \n",
"0 34.5 0 0 330911 7.8292 NaN Q \n",
"1 47.0 1 0 363272 7.0000 NaN S \n",
"2 62.0 0 0 240276 9.6875 NaN Q \n",
"3 27.0 0 0 315154 8.6625 NaN S \n",
"4 22.0 1 1 3101298 12.2875 NaN S "
],
"text/html": "\n\n
\n \n \n | \n PassengerId | \n Pclass | \n Name | \n Sex | \n Age | \n SibSp | \n Parch | \n Ticket | \n Fare | \n Cabin | \n Embarked | \n
\n \n \n \n 0 | \n 892 | \n 3 | \n Kelly, Mr. James | \n male | \n 34.5 | \n 0 | \n 0 | \n 330911 | \n 7.8292 | \n NaN | \n Q | \n
\n \n 1 | \n 893 | \n 3 | \n Wilkes, Mrs. James (Ellen Needs) | \n female | \n 47.0 | \n 1 | \n 0 | \n 363272 | \n 7.0000 | \n NaN | \n S | \n
\n \n 2 | \n 894 | \n 2 | \n Myles, Mr. Thomas Francis | \n male | \n 62.0 | \n 0 | \n 0 | \n 240276 | \n 9.6875 | \n NaN | \n Q | \n
\n \n 3 | \n 895 | \n 3 | \n Wirz, Mr. Albert | \n male | \n 27.0 | \n 0 | \n 0 | \n 315154 | \n 8.6625 | \n NaN | \n S | \n
\n \n 4 | \n 896 | \n 3 | \n Hirvonen, Mrs. Alexander (Helga E Lindqvist) | \n female | \n 22.0 | \n 1 | \n 1 | \n 3101298 | \n 12.2875 | \n NaN | \n S | \n
\n \n
\n
"
},
"metadata": {},
"execution_count": 4
}
],
"source": [
"print('__Test_DataSet_')\n",
"test_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def missingdata(data):\n",
"    \"\"\"Tabulate and plot the columns of ``data`` that contain missing values.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data : pandas.DataFrame\n",
"        Frame to inspect.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per column that has at least one null, ordered by descending\n",
"        null count, with columns ``Total`` (number of nulls) and ``Percent``\n",
"        (nulls as a percentage of the number of rows).\n",
"\n",
"    A bar chart of ``Percent`` per column is drawn as a side effect.\n",
"    \"\"\"\n",
"    total = data.isnull().sum().sort_values(ascending=False)\n",
"    # Derive the percentage from the sorted counts so both columns share one order.\n",
"    percent = total / len(data) * 100\n",
"    ms = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])\n",
"    ms = ms[ms['Percent'] > 0]\n",
"\n",
"    _, ax = plt.subplots(figsize=(8, 6))\n",
"    # x/y must be passed by keyword: recent seaborn releases reject positional data.\n",
"    sns.barplot(x=ms.index, y=ms['Percent'], color='green', alpha=0.8, ax=ax)\n",
"    ax.tick_params(axis='x', labelrotation=90)\n",
"    ax.set_xlabel('Features', fontsize=15)\n",
"    ax.set_ylabel('Percent of missing values', fontsize=15)\n",
"    ax.set_title('Percent missing data by feature', fontsize=15)\n",
"    return ms"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" Total Percent\n",
"Cabin 687 77.104377\n",
"Age 177 19.865320\n",
"Embarked 2 0.224467"
],
"text/html": "\n\n
\n \n \n | \n Total | \n Percent | \n
\n \n \n \n Cabin | \n 687 | \n 77.104377 | \n
\n \n Age | \n 177 | \n 19.865320 | \n
\n \n Embarked | \n 2 | \n 0.224467 | \n
\n \n
\n
"
},
"metadata": {},
"execution_count": 6
},
{
"output_type": "display_data",
"data": {
"text/plain": "",
"image/svg+xml": "\n\n\n\n",
"image/png": "\n"
},
"metadata": {
"needs_background": "light"
}
}
],
"source": [
"missingdata(train_df)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" Total Percent\n",
"Cabin 327 78.229665\n",
"Age 86 20.574163\n",
"Fare 1 0.239234"
],
"text/html": "\n\n
\n \n \n | \n Total | \n Percent | \n
\n \n \n \n Cabin | \n 327 | \n 78.229665 | \n
\n \n Age | \n 86 | \n 20.574163 | \n
\n \n Fare | \n 1 | \n 0.239234 | \n
\n \n
\n
"
},
"metadata": {},
"execution_count": 7
},
{
"output_type": "display_data",
"data": {
"text/plain": "",
"image/svg+xml": "\n\n\n\n",
"image/png": "\n"
},
"metadata": {
"needs_background": "light"
}
}
],
"source": [
"missingdata(test_df)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"30.272590361445783"
]
},
"metadata": {},
"execution_count": 8
}
],
"source": [
"test_df['Age'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Fill missing embarkation ports with the most frequent port in the training data.\n",
"# The result is assigned back: an in-place fillna on a selected column is not\n",
"# guaranteed to reach the parent frame (it is a no-op under pandas copy-on-write).\n",
"train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Fill missing test-set fares with the test-set median.\n",
"# The result is assigned back: an in-place fillna on a selected column is not\n",
"# guaranteed to reach the parent frame (it is a no-op under pandas copy-on-write).\n",
"test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Remove the Cabin column from both frames.\n",
"drop_column = ['Cabin']\n",
"train_df.drop(columns=drop_column, inplace=True)\n",
"test_df.drop(columns=drop_column, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Fill missing ages with the median age of the frame they belong to.\n",
"# The results are assigned back: an in-place fillna on a selected column is not\n",
"# guaranteed to reach the parent frame (it is a no-op under pandas copy-on-write).\n",
"test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())\n",
"train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"check the nan value in train data\nPassengerId 0\nSurvived 0\nPclass 0\nName 0\nSex 0\nAge 0\nSibSp 0\nParch 0\nTicket 0\nFare 0\nEmbarked 0\ndtype: int64\n__________________________________________________________________________________________\ncheck the nan value in test data\nPassengerId 0\nPclass 0\nName 0\nSex 0\nAge 0\nSibSp 0\nParch 0\nTicket 0\nFare 0\nEmbarked 0\ndtype: int64\n"
]
}
],
"source": [
"print('check the nan value in train data')\n",
"print(train_df.isnull().sum())\n",
"print('___'*30)\n",
"print('check the nan value in test data')\n",
"print(test_df.isnull().sum())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Collect both frames so later cells can apply the same transformation to each.\n",
"all_data = [train_df, test_df]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# FamilySize = siblings/spouses + parents/children + the passenger.\n",
"for dataset in all_data:\n",
"    relatives = dataset['SibSp'] + dataset['Parch']\n",
"    dataset['FamilySize'] = relatives + 1"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"# A title is the run of letters that follows a space and ends with a period,\n",
"# e.g. \"Braund, Mr. Owen Harris\" -> \"Mr\". Raw string: '\\.' is a regex escape,\n",
"# not a Python string escape.\n",
"TITLE_PATTERN = re.compile(r' ([A-Za-z]+)\\.')\n",
"\n",
"# Non-common titles are grouped as \"Rare\"; alternate spellings map to the common form.\n",
"TITLE_ALIASES = {\n",
"    **dict.fromkeys(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',\n",
"                     'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare'),\n",
"    'Mlle': 'Miss',\n",
"    'Ms': 'Miss',\n",
"    'Mme': 'Mrs',\n",
"}\n",
"\n",
"\n",
"def get_title(name):\n",
"    \"\"\"Return the title found in a passenger name, or \"\" when there is none.\"\"\"\n",
"    title_search = TITLE_PATTERN.search(name)\n",
"    return title_search.group(1) if title_search else \"\"\n",
"\n",
"\n",
"# Create the Title feature from Name and normalise it for each frame.\n",
"for dataset in all_data:\n",
"    dataset['Title'] = dataset['Name'].apply(get_title).replace(TITLE_ALIASES)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Bucket Age into four labelled ranges with edges at 0, 12, 20, 40 and 120.\n",
"for dataset in all_data:\n",
"    dataset['Age_bin'] = pd.cut(\n",
"        dataset['Age'],\n",
"        bins=[0, 12, 20, 40, 120],\n",
"        labels=['Children', 'Teenage', 'Adult', 'Elder'],\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Bucket Fare into four labelled ranges. pd.cut yields NaN for values outside the\n",
"# bin edges, so the lowest edge is made inclusive and the top bin open-ended:\n",
"# a fare of exactly 0 and any fare above 120 now receive a label.\n",
"for dataset in all_data:\n",
"    dataset['Fare_bin'] = pd.cut(\n",
"        dataset['Fare'],\n",
"        bins=[0, 7.91, 14.45, 31, np.inf],\n",
"        include_lowest=True,\n",
"        labels=['Low_fare', 'median_fare', 'Average_fare', 'high_fare'],\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Work on independent copies from here on. Plain assignment would only alias the\n",
"# frames, so in-place edits to traindf / testdf would also change train_df / test_df.\n",
"traindf = train_df.copy()\n",
"testdf = test_df.copy()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Pair the working frames so the next cell can process both in one loop.\n",
"all_dat = [traindf, testdf]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Drop the raw columns; Age, Fare and Name are now represented by\n",
"# Age_bin, Fare_bin and Title.\n",
"drop_column = ['Age', 'Fare', 'Name', 'Ticket']\n",
"for dataset in all_dat:\n",
"    dataset.drop(columns=drop_column, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# PassengerId is removed from the training frame only; testdf keeps it.\n",
"drop_column = ['PassengerId']\n",
"traindf.drop(columns=drop_column, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" PassengerId Pclass Sex SibSp Parch Embarked FamilySize Title \\\n",
"0 892 3 male 0 0 Q 1 Mr \n",
"1 893 3 female 1 0 S 2 Mrs \n",
"\n",
" Age_bin Fare_bin \n",
"0 Adult Low_fare \n",
"1 Elder Low_fare "
],
"text/html": "\n\n
\n \n \n | \n PassengerId | \n Pclass | \n Sex | \n SibSp | \n Parch | \n Embarked | \n FamilySize | \n Title | \n Age_bin | \n Fare_bin | \n
\n \n \n \n 0 | \n 892 | \n 3 | \n male | \n 0 | \n 0 | \n Q | \n 1 | \n Mr | \n Adult | \n Low_fare | \n
\n \n 1 | \n 893 | \n 3 | \n female | \n 1 | \n 0 | \n S | \n 2 | \n Mrs | \n Elder | \n Low_fare | \n
\n \n
\n
"
},
"metadata": {},
"execution_count": 23
}
],
"source": [
"testdf.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# One-hot encode the categorical columns of the training frame.\n",
"# Keys are the columns to encode, values the prefix for the generated columns.\n",
"dummy_prefixes = {\n",
"    \"Sex\": \"Sex\",\n",
"    \"Title\": \"Title\",\n",
"    \"Age_bin\": \"Age_type\",\n",
"    \"Embarked\": \"Em_type\",\n",
"    \"Fare_bin\": \"Fare_type\",\n",
"}\n",
"traindf = pd.get_dummies(traindf, columns=list(dummy_prefixes), prefix=dummy_prefixes)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# One-hot encode the categorical columns of the test frame.\n",
"encoded_columns = [\"Sex\", \"Title\", \"Age_bin\", \"Embarked\", \"Fare_bin\"]\n",
"column_prefixes = [\"Sex\", \"Title\", \"Age_type\", \"Em_type\", \"Fare_type\"]\n",
"testdf = pd.get_dummies(testdf, columns=encoded_columns, prefix=column_prefixes)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" PassengerId Pclass SibSp Parch FamilySize Sex_female Sex_male \\\n",
"0 892 3 0 0 1 0 1 \n",
"1 893 3 1 0 2 1 0 \n",
"2 894 2 0 0 1 0 1 \n",
"3 895 3 0 0 1 0 1 \n",
"4 896 3 1 1 3 1 0 \n",
"\n",
" Title_Master Title_Miss Title_Mr ... Age_type_Teenage Age_type_Adult \\\n",
"0 0 0 1 ... 0 1 \n",
"1 0 0 0 ... 0 0 \n",
"2 0 0 1 ... 0 0 \n",
"3 0 0 1 ... 0 1 \n",
"4 0 0 0 ... 0 1 \n",
"\n",
" Age_type_Elder Em_type_C Em_type_Q Em_type_S Fare_type_Low_fare \\\n",
"0 0 0 1 0 1 \n",
"1 1 0 0 1 1 \n",
"2 1 0 1 0 0 \n",
"3 0 0 0 1 0 \n",
"4 0 0 0 1 0 \n",
"\n",
" Fare_type_median_fare Fare_type_Average_fare Fare_type_high_fare \n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 1 0 0 \n",
"3 1 0 0 \n",
"4 1 0 0 \n",
"\n",
"[5 rows x 23 columns]"
],
"text/html": "\n\n
\n \n \n | \n PassengerId | \n Pclass | \n SibSp | \n Parch | \n FamilySize | \n Sex_female | \n Sex_male | \n Title_Master | \n Title_Miss | \n Title_Mr | \n ... | \n Age_type_Teenage | \n Age_type_Adult | \n Age_type_Elder | \n Em_type_C | \n Em_type_Q | \n Em_type_S | \n Fare_type_Low_fare | \n Fare_type_median_fare | \n Fare_type_Average_fare | \n Fare_type_high_fare | \n
\n \n \n \n 0 | \n 892 | \n 3 | \n 0 | \n 0 | \n 1 | \n 0 | \n 1 | \n 0 | \n 0 | \n 1 | \n ... | \n 0 | \n 1 | \n 0 | \n 0 | \n 1 | \n 0 | \n 1 | \n 0 | \n 0 | \n 0 | \n
\n \n 1 | \n 893 | \n 3 | \n 1 | \n 0 | \n 2 | \n 1 | \n 0 | \n 0 | \n 0 | \n 0 | \n ... | \n 0 | \n 0 | \n 1 | \n 0 | \n 0 | \n 1 | \n 1 | \n 0 | \n 0 | \n 0 | \n
\n \n 2 | \n 894 | \n 2 | \n 0 | \n 0 | \n 1 | \n 0 | \n 1 | \n 0 | \n 0 | \n 1 | \n ... | \n 0 | \n 0 | \n 1 | \n 0 | \n 1 | \n 0 | \n 0 | \n 1 | \n 0 | \n 0 | \n
\n \n 3 | \n 895 | \n 3 | \n 0 | \n 0 | \n 1 | \n 0 | \n 1 | \n 0 | \n 0 | \n 1 | \n ... | \n 0 | \n 1 | \n 0 | \n 0 | \n 0 | \n 1 | \n 0 | \n 1 | \n 0 | \n 0 | \n
\n \n 4 | \n 896 | \n 3 | \n 1 | \n 1 | \n 3 | \n 1 | \n 0 | \n 0 | \n 0 | \n 0 | \n ... | \n 0 | \n 1 | \n 0 | \n 0 | \n 0 | \n 1 | \n 0 | \n 1 | \n 0 | \n 0 | \n
\n \n
\n
5 rows × 23 columns
\n
"
},
"metadata": {},
"execution_count": 26
}
],
"source": [
"testdf.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "",
"image/svg+xml": "\n\n\n