{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4-final"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "Python 3.7.4 64-bit ('venv')",
"display_name": "Python 3.7.4 64-bit ('venv')",
"metadata": {
"interpreter": {
"hash": "e284c72d79b42194b3fe2a0767ff9cca6d233ae03063bab113c99e4bc6bd25a8"
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"source": [
"# 练习 3-3\n",
"处理Kaggle上的泰坦尼克数据集"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'./datasets/titanic'"
]
},
"metadata": {},
"execution_count": 1
}
],
"source": [
"import os\n",
"# Directory (relative to the notebook) where the Titanic CSV files are expected.\n",
"# NOTE(review): 'TITANTIC' looks like a misspelling of 'TITANIC'; the name is kept\n",
"# because the loader defined in a later cell uses it as a default argument.\n",
"TITANTIC_PATH = os.path.join('./datasets', 'titanic')\n",
"TITANTIC_PATH"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"def load_titantic_data(filename, titantic_path=TITANTIC_PATH):\n",
"    \"\"\"Read one CSV file of the Titanic dataset into a DataFrame.\n",
"\n",
"    Args:\n",
"        filename: Name of the CSV file, e.g. 'train.csv'.\n",
"        titantic_path: Directory that contains the file.\n",
"\n",
"    Returns:\n",
"        The DataFrame produced by ``pd.read_csv`` for the joined path.\n",
"    \"\"\"\n",
"    return pd.read_csv(os.path.join(titantic_path, filename))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"titanic.zip: Skipping, found more recently modified local copy (use --force to force download)\n"
]
}
],
"source": [
"# Requires the Kaggle CLI (`pip install kaggle`) and configured API credentials.\n",
"# Download the competition archive through the Kaggle API.\n",
"!kaggle competitions download -c titanic -p datasets\n",
"\n",
"# The CLI only leaves `titanic.zip` inside `datasets/`; unpack it so that\n",
"# train.csv / test.csv exist under TITANTIC_PATH, where the loader looks for them.\n",
"import zipfile\n",
"with zipfile.ZipFile(os.path.join('datasets', 'titanic.zip')) as archive:\n",
"    archive.extractall(TITANTIC_PATH)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Load both competition splits with the CSV loader defined earlier.\n",
"train_data, test_data = (\n",
"    load_titantic_data(csv_name) for csv_name in ('train.csv', 'test.csv')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" PassengerId Survived Pclass \\\n",
"0 1 0 3 \n",
"1 2 1 1 \n",
"2 3 1 3 \n",
"3 4 1 1 \n",
"4 5 0 3 \n",
"\n",
" Name Sex Age SibSp \\\n",
"0 Braund, Mr. Owen Harris male 22.0 1 \n",
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
"2 Heikkinen, Miss. Laina female 26.0 0 \n",
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
"4 Allen, Mr. William Henry male 35.0 0 \n",
"\n",
" Parch Ticket Fare Cabin Embarked \n",
"0 0 A/5 21171 7.2500 NaN S \n",
"1 0 PC 17599 71.2833 C85 C \n",
"2 0 STON/O2. 3101282 7.9250 NaN S \n",
"3 0 113803 53.1000 C123 S \n",
"4 0 373450 8.0500 NaN S "
],
"text/html": "
\n\n
\n \n \n | \n PassengerId | \n Survived | \n Pclass | \n Name | \n Sex | \n Age | \n SibSp | \n Parch | \n Ticket | \n Fare | \n Cabin | \n Embarked | \n
\n \n \n \n 0 | \n 1 | \n 0 | \n 3 | \n Braund, Mr. Owen Harris | \n male | \n 22.0 | \n 1 | \n 0 | \n A/5 21171 | \n 7.2500 | \n NaN | \n S | \n
\n \n 1 | \n 2 | \n 1 | \n 1 | \n Cumings, Mrs. John Bradley (Florence Briggs Th... | \n female | \n 38.0 | \n 1 | \n 0 | \n PC 17599 | \n 71.2833 | \n C85 | \n C | \n
\n \n 2 | \n 3 | \n 1 | \n 3 | \n Heikkinen, Miss. Laina | \n female | \n 26.0 | \n 0 | \n 0 | \n STON/O2. 3101282 | \n 7.9250 | \n NaN | \n S | \n
\n \n 3 | \n 4 | \n 1 | \n 1 | \n Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n female | \n 35.0 | \n 1 | \n 0 | \n 113803 | \n 53.1000 | \n C123 | \n S | \n
\n \n 4 | \n 5 | \n 0 | \n 3 | \n Allen, Mr. William Henry | \n male | \n 35.0 | \n 0 | \n 0 | \n 373450 | \n 8.0500 | \n NaN | \n S | \n
\n \n
\n
"
},
"metadata": {},
"execution_count": 5
}
],
"source": [
"train_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" PassengerId Pclass Name Sex \\\n",
"0 892 3 Kelly, Mr. James male \n",
"1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n",
"2 894 2 Myles, Mr. Thomas Francis male \n",
"3 895 3 Wirz, Mr. Albert male \n",
"4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n",
"\n",
" Age SibSp Parch Ticket Fare Cabin Embarked \n",
"0 34.5 0 0 330911 7.8292 NaN Q \n",
"1 47.0 1 0 363272 7.0000 NaN S \n",
"2 62.0 0 0 240276 9.6875 NaN Q \n",
"3 27.0 0 0 315154 8.6625 NaN S \n",
"4 22.0 1 1 3101298 12.2875 NaN S "
],
"text/html": "\n\n
\n \n \n | \n PassengerId | \n Pclass | \n Name | \n Sex | \n Age | \n SibSp | \n Parch | \n Ticket | \n Fare | \n Cabin | \n Embarked | \n
\n \n \n \n 0 | \n 892 | \n 3 | \n Kelly, Mr. James | \n male | \n 34.5 | \n 0 | \n 0 | \n 330911 | \n 7.8292 | \n NaN | \n Q | \n
\n \n 1 | \n 893 | \n 3 | \n Wilkes, Mrs. James (Ellen Needs) | \n female | \n 47.0 | \n 1 | \n 0 | \n 363272 | \n 7.0000 | \n NaN | \n S | \n
\n \n 2 | \n 894 | \n 2 | \n Myles, Mr. Thomas Francis | \n male | \n 62.0 | \n 0 | \n 0 | \n 240276 | \n 9.6875 | \n NaN | \n Q | \n
\n \n 3 | \n 895 | \n 3 | \n Wirz, Mr. Albert | \n male | \n 27.0 | \n 0 | \n 0 | \n 315154 | \n 8.6625 | \n NaN | \n S | \n
\n \n 4 | \n 896 | \n 3 | \n Hirvonen, Mrs. Alexander (Helga E Lindqvist) | \n female | \n 22.0 | \n 1 | \n 1 | \n 3101298 | \n 12.2875 | \n NaN | \n S | \n
\n \n
\n
"
},
"metadata": {},
"execution_count": 6
}
],
"source": [
"test_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\nRangeIndex: 891 entries, 0 to 890\nData columns (total 12 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 PassengerId 891 non-null int64 \n 1 Survived 891 non-null int64 \n 2 Pclass 891 non-null int64 \n 3 Name 891 non-null object \n 4 Sex 891 non-null object \n 5 Age 714 non-null float64\n 6 SibSp 891 non-null int64 \n 7 Parch 891 non-null int64 \n 8 Ticket 891 non-null object \n 9 Fare 891 non-null float64\n 10 Cabin 204 non-null object \n 11 Embarked 889 non-null object \ndtypes: float64(2), int64(5), object(5)\nmemory usage: 83.7+ KB\n"
]
}
],
"source": [
"train_data.info()"
]
},
{
"source": [
"可以看出**Age, Cabin, Embarked**数据是不完整的,特别是Cabin缺失了约77%的数据(891条记录中只有204条非空),因此只能暂时忽略Cabin属性。Age缺失的数据可以使用中位数(median)来填充。\n",
"\n",
"而**Name、Ticket**可能有一些价值,但是将它们转化为模型可以使用的数字比较棘手,所以也暂时忽略这两个属性"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" PassengerId Survived Pclass Age SibSp \\\n",
"count 891.000000 891.000000 891.000000 714.000000 891.000000 \n",
"mean 446.000000 0.383838 2.308642 29.699118 0.523008 \n",
"std 257.353842 0.486592 0.836071 14.526497 1.102743 \n",
"min 1.000000 0.000000 1.000000 0.420000 0.000000 \n",
"25% 223.500000 0.000000 2.000000 20.125000 0.000000 \n",
"50% 446.000000 0.000000 3.000000 28.000000 0.000000 \n",
"75% 668.500000 1.000000 3.000000 38.000000 1.000000 \n",
"max 891.000000 1.000000 3.000000 80.000000 8.000000 \n",
"\n",
" Parch Fare \n",
"count 891.000000 891.000000 \n",
"mean 0.381594 32.204208 \n",
"std 0.806057 49.693429 \n",
"min 0.000000 0.000000 \n",
"25% 0.000000 7.910400 \n",
"50% 0.000000 14.454200 \n",
"75% 0.000000 31.000000 \n",
"max 6.000000 512.329200 "
],
"text/html": "\n\n
\n \n \n | \n PassengerId | \n Survived | \n Pclass | \n Age | \n SibSp | \n Parch | \n Fare | \n
\n \n \n \n count | \n 891.000000 | \n 891.000000 | \n 891.000000 | \n 714.000000 | \n 891.000000 | \n 891.000000 | \n 891.000000 | \n
\n \n mean | \n 446.000000 | \n 0.383838 | \n 2.308642 | \n 29.699118 | \n 0.523008 | \n 0.381594 | \n 32.204208 | \n
\n \n std | \n 257.353842 | \n 0.486592 | \n 0.836071 | \n 14.526497 | \n 1.102743 | \n 0.806057 | \n 49.693429 | \n
\n \n min | \n 1.000000 | \n 0.000000 | \n 1.000000 | \n 0.420000 | \n 0.000000 | \n 0.000000 | \n 0.000000 | \n
\n \n 25% | \n 223.500000 | \n 0.000000 | \n 2.000000 | \n 20.125000 | \n 0.000000 | \n 0.000000 | \n 7.910400 | \n
\n \n 50% | \n 446.000000 | \n 0.000000 | \n 3.000000 | \n 28.000000 | \n 0.000000 | \n 0.000000 | \n 14.454200 | \n
\n \n 75% | \n 668.500000 | \n 1.000000 | \n 3.000000 | \n 38.000000 | \n 1.000000 | \n 0.000000 | \n 31.000000 | \n
\n \n max | \n 891.000000 | \n 1.000000 | \n 3.000000 | \n 80.000000 | \n 8.000000 | \n 6.000000 | \n 512.329200 | \n
\n \n
\n
"
},
"metadata": {},
"execution_count": 8
}
],
"source": [
"train_data.describe()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0 549\n",
"1 342\n",
"Name: Survived, dtype: int64"
]
},
"metadata": {},
"execution_count": 9
}
],
"source": [
"train_data['Survived'].value_counts()"
]
},
{
"source": [
"- 只有38.38%的人活了下来(891人中有342人)"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"3 491\n",
"1 216\n",
"2 184\n",
"Name: Pclass, dtype: int64"
]
},
"metadata": {},
"execution_count": 10
}
],
"source": [
"train_data['Pclass'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"male 577\n",
"female 314\n",
"Name: Sex, dtype: int64"
]
},
"metadata": {},
"execution_count": 11
}
],
"source": [
"train_data['Sex'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"S 644\n",
"C 168\n",
"Q 77\n",
"Name: Embarked, dtype: int64"
]
},
"metadata": {},
"execution_count": 12
}
],
"source": [
"train_data['Embarked'].value_counts()"
]
},
{
"source": [
"创建一个预处理Pipeline"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"\n",
"class DataFrameSelector(BaseEstimator, TransformerMixin):\n",
"    \"\"\"Pipeline step that keeps only the requested DataFrame columns.\"\"\"\n",
"\n",
"    def __init__(self, attribute_names):\n",
"        # Column label(s) used to index X in transform().\n",
"        self.attribute_names = attribute_names\n",
"\n",
"    def fit(self, X, y=None):\n",
"        \"\"\"Nothing is learned; return the transformer itself.\"\"\"\n",
"        return self\n",
"\n",
"    def transform(self, X):\n",
"        \"\"\"Return ``X`` indexed by ``self.attribute_names``.\"\"\"\n",
"        selected = X[self.attribute_names]\n",
"        return selected"
]
},
{
"source": [
"创建一个pipeline选出数值属性"
],
"cell_type": "markdown",
"metadata": {}
},
{
"source": [
"from sklearn.pipeline import Pipeline\n",
"try:\n",
" from sklearn.impute import SimpleImputer # scikit-learn 0.20+\n",
"except ImportError:\n",
" from sklearn.preprocessing import Imputer as SimpleImputer\n",
"# Numeric branch: select the numeric columns, then fill missing values with each column's median.\n",
"num_pipeline = Pipeline(steps=[\n",
"    (\"select_numeric\", DataFrameSelector(attribute_names=[\"Age\", \"SibSp\", \"Parch\", \"Fare\"])),\n",
"    (\"imputer\", SimpleImputer(strategy=\"median\")),\n",
"])"
],
"cell_type": "code",
"metadata": {},
"execution_count": 14,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[22. , 1. , 0. , 7.25 ],\n",
" [38. , 1. , 0. , 71.2833],\n",
" [26. , 0. , 0. , 7.925 ],\n",
" ...,\n",
" [28. , 1. , 2. , 23.45 ],\n",
" [26. , 0. , 0. , 30. ],\n",
" [32. , 0. , 0. , 7.75 ]])"
]
},
"metadata": {},
"execution_count": 15
}
],
"source": [
"num_pipeline.fit_transform(train_data)"
]
},
{
"source": [
"在0.20以前版本的Scikit-Learn中,需要使用`LabelBinarizer`或`CategoricalEncoder`才能将分类值转化为one-hot向量;0.20及以上的版本可以直接使用`OneHotEncoder`类"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"PassengerId 891\n",
"Survived 0\n",
"Pclass 3\n",
"Name Allen, Miss. Elisabeth Walton\n",
"Sex male\n",
"Age 24\n",
"SibSp 0\n",
"Parch 0\n",
"Ticket CA. 2343\n",
"Fare 8.05\n",
"Cabin G6\n",
"Embarked S\n",
"dtype: object"
]
},
"metadata": {},
"execution_count": 16
}
],
"source": [
"# Most frequent value of every column, keyed by column name.\n",
"most_ = pd.Series({col: train_data[col].value_counts().index[0] for col in train_data.columns})\n",
"most_"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"PassengerId\nSurvived\nPclass\nName\nSex\nAge\nSibSp\nParch\nTicket\nFare\nCabin\nEmbarked\n"
]
}
],
"source": [
"# Iterating a DataFrame yields its column labels; print one per line.\n",
"for column_name in train_data.columns:\n",
"    print(column_name)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'Allen, Miss. Elisabeth Walton'"
]
},
"metadata": {},
"execution_count": 18
}
],
"source": [
"train_data['Name'].value_counts().index[0]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"class MostFrequentImputer(BaseEstimator, TransformerMixin):\n",
"    \"\"\"Fill missing values with the most frequent value of each column.\n",
"\n",
"    Example: if ``train_data['Sex'].value_counts()`` is\n",
"\n",
"        male      577\n",
"        female    314\n",
"        Name: Sex, dtype: int64\n",
"\n",
"    then ``train_data['Sex'].value_counts().index[0]`` is ``'male'``, and that\n",
"    value is used to fill the gaps of the ``Sex`` column.\n",
"    \"\"\"\n",
"\n",
"    def fit(self, X, y=None):\n",
"        \"\"\"Record the most frequent non-null value of every column of ``X``.\"\"\"\n",
"        modes = {}\n",
"        for col in X:\n",
"            # value_counts() skips NaN and sorts by frequency (descending),\n",
"            # so its first index label is the column's most frequent value.\n",
"            counts = X[col].value_counts()\n",
"            if len(counts):  # an all-NaN column has no mode; leave it unfilled\n",
"                modes[col] = counts.index[0]\n",
"        self.most_frequent_ = pd.Series(modes, dtype=object)\n",
"        return self\n",
"\n",
"    def transform(self, X, y=None):\n",
"        \"\"\"Return ``X`` with NaNs replaced by the per-column values learned in ``fit``.\"\"\"\n",
"        return X.fillna(self.most_frequent_)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n"
]
},
{
"source": [
"现在创建一个pipeline来处理分类属性"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Build an encoder that returns a dense array on both old and new scikit-learn:\n",
"# newer releases call the argument `sparse_output`, older ones only know `sparse`.\n",
"try:\n",
"    cat_encoder = OneHotEncoder(sparse_output=False)\n",
"except TypeError:  # older scikit-learn rejects the unknown keyword\n",
"    cat_encoder = OneHotEncoder(sparse=False)\n",
"\n",
"# Categorical branch: select the columns, fill gaps with the most frequent value, one-hot encode.\n",
"cat_pipeline = Pipeline([\n",
"    ('select_cat', DataFrameSelector(['Pclass', 'Sex', 'Embarked'])),\n",
"    ('imputer', MostFrequentImputer()),\n",
"    ('cat_encoder', cat_encoder),\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[0., 0., 1., ..., 0., 0., 1.],\n",
" [1., 0., 0., ..., 1., 0., 0.],\n",
" [0., 0., 1., ..., 0., 0., 1.],\n",
" ...,\n",
" [0., 0., 1., ..., 0., 0., 1.],\n",
" [1., 0., 0., ..., 1., 0., 0.],\n",
" [0., 0., 1., ..., 0., 1., 0.]])"
]
},
"metadata": {},
"execution_count": 22
}
],
"source": [
"cat_pipeline.fit_transform(train_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_data.head()"
]
},
{
"source": [
"**将数字特征和分类特征结合起来**"
],
"cell_type": "markdown",
"metadata": {}
},
{
"source": [
"现在可以使用这个preprocess_pipeline将raw data转化为机器学习模型可以使用的数据了"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.pipeline import FeatureUnion\n",
"# Concatenate the numeric and categorical feature blocks side by side.\n",
"preprocess_pipeline = FeatureUnion([\n",
"    ('num_pipeline', num_pipeline),\n",
"    ('cat_pipeline', cat_pipeline),\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[22., 1., 0., ..., 0., 0., 1.],\n",
" [38., 1., 0., ..., 1., 0., 0.],\n",
" [26., 0., 0., ..., 0., 0., 1.],\n",
" ...,\n",
" [28., 1., 2., ..., 0., 0., 1.],\n",
" [26., 0., 0., ..., 1., 0., 0.],\n",
" [32., 0., 0., ..., 0., 1., 0.]])"
]
},
"metadata": {},
"execution_count": 25
}
],
"source": [
"# Fit the preprocessing pipeline on the training set and build the training feature matrix.\n",
"X_train = preprocess_pipeline.fit_transform(train_data)\n",
"X_train"
]
},
{
"source": [
"**不要忘了训练的标签数据**"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0 0\n",
"1 1\n",
"2 1\n",
"3 1\n",
"4 0\n",
" ..\n",
"886 0\n",
"887 1\n",
"888 0\n",
"889 1\n",
"890 0\n",
"Name: Survived, Length: 891, dtype: int64"
]
},
"metadata": {},
"execution_count": 26
}
],
"source": [
"# The training labels are the 'Survived' column of the training set.\n",
"y_train = train_data['Survived']\n",
"y_train"
]
},
{
"source": [
"### 首先使用SVC模型测试一下"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"SVC(gamma='auto')"
]
},
"metadata": {},
"execution_count": 27
}
],
"source": [
"from sklearn.svm import SVC\n",
"# Fit a support-vector classifier (gamma='auto') on the preprocessed training features.\n",
"svm_clf = SVC(gamma='auto')\n",
"svm_clf.fit(X_train, y_train)"
]
},
{
"source": [
"使用训练好的SVC模型进行预测"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,\n",
" 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,\n",
" 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,\n",
" 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1,\n",
" 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,\n",
" 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,\n",
" 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,\n",
" 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,\n",
" 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,\n",
" 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,\n",
" 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,\n",
" 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,\n",
" 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,\n",
" 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,\n",
" 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,\n",
" 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0])"
]
},
"metadata": {},
"execution_count": 28
}
],
"source": [
"# Reuse the preprocessing that was fitted on the training set: only transform here.\n",
"# fit_transform on test_data would re-learn the imputation values and one-hot categories\n",
"# from the test set, so the features would no longer match what svm_clf was trained on.\n",
"X_test = preprocess_pipeline.transform(test_data)\n",
"y_pred = svm_clf.predict(X_test)\n",
"y_pred"
]
},
{
"source": [
"此时我们可以将SVC预测的结果按照Kaggle要求的格式构建好CSV文件,上传到Kaggle查看得分,不过在此之前我们可以先使用交叉验证的方法来看看模型表现如何"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([0.66666667, 0.66292135, 0.71910112, 0.74157303, 0.76404494,\n",
" 0.71910112, 0.7752809 , 0.73033708, 0.74157303, 0.80898876])"
]
},
"metadata": {},
"execution_count": 29
}
],
"source": [
"from sklearn.model_selection import cross_val_score\n",
"# Evaluate the SVC with 10-fold cross-validation (cv=10); the result holds one score per fold.\n",
"svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)\n",
"svm_scores"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.7329588014981274"
]
},
"metadata": {},
"execution_count": 30
}
],
"source": [
"svm_scores.mean()"
]
},
{
"source": [
"也就是说模型的accuracy只有73.30%,虽然明显比随机乱猜要好,但仍然不是一个好的得分。从kaggle的[leaderboard](https://www.kaggle.com/c/titanic/leaderboard)可以看到排名靠前的成绩几乎都达到了100%,不过由于可以下载到[测试集](https://www.encyclopedia-titanica.org/titanic-victims/),这些100%的成绩有比较大的水分,我们无需理会"
],
"cell_type": "markdown",
"metadata": {}
},
{
"source": [
"### 试试RandomForestClassifier"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([0.74444444, 0.79775281, 0.75280899, 0.80898876, 0.88764045,\n",
" 0.83146067, 0.83146067, 0.7752809 , 0.85393258, 0.84269663])"
]
},
"metadata": {},
"execution_count": 31
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"# Same 10-fold evaluation for a 100-tree random forest with a fixed seed (random_state=42).\n",
"forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)\n",
"forest_scores\n"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.8126466916354558"
]
},
"metadata": {},
"execution_count": 32
}
],
"source": [
"forest_scores.mean()"
]
},
{
"source": [
"可以看到RandomForestClassifier明显好了一些"
],
"cell_type": "markdown",
"metadata": {}
},
{
"source": [
"### 试试AdaBoostClassifier"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.812621722846442"
]
},
"metadata": {},
"execution_count": 33
}
],
"source": [
"from sklearn.ensemble import AdaBoostClassifier\n",
"# Mean 10-fold cross-validation score of a seeded AdaBoost ensemble with 100 estimators.\n",
"boost_clf = AdaBoostClassifier(n_estimators=100, random_state=42)\n",
"boost_scores = cross_val_score(boost_clf, X_train, y_train, cv=10)\n",
"boost_scores.mean()"
]
},
{
"source": [
"### 试试KNeighborsClassifier"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.7172659176029963"
]
},
"metadata": {},
"execution_count": 34
}
],
"source": [
"from sklearn.neighbors import KNeighborsClassifier\n",
"# Mean 10-fold cross-validation score of a k-nearest-neighbours classifier with default settings.\n",
"knn_clf = KNeighborsClassifier()\n",
"knn_scores = cross_val_score(knn_clf, X_train, y_train, cv=10)\n",
"knn_scores.mean()"
]
},
{
"source": [
"可以看到`RandomForestClassifier`表现的还是不错的,我们来找一下随机森林的最佳参数"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import GridSearchCV\n",
"# Search 5 x 2 x 2 = 20 forest configurations with 5-fold cross-validation on all cores.\n",
"params_grid = [dict(\n",
"    n_estimators=[10, 20, 30, 40, 50],\n",
"    criterion=['gini', 'entropy'],\n",
"    max_features=['sqrt', 'log2'],\n",
")]\n",
"grid_search = GridSearchCV(estimator=forest_clf, param_grid=params_grid, cv=5, verbose=3, n_jobs=-1)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Fitting 5 folds for each of 20 candidates, totalling 100 fits\n",
"[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n",
"[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 3.6s\n",
"[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 5.8s finished\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,\n",
" param_grid=[{'criterion': ['gini', 'entropy'],\n",
" 'max_features': ['sqrt', 'log2'],\n",
" 'n_estimators': [10, 20, 30, 40, 50]}],\n",
" verbose=3)"
]
},
"metadata": {},
"execution_count": 36
}
],
"source": [
"grid_search.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 50}"
]
},
"metadata": {},
"execution_count": 37
}
],
"source": [
"grid_search.best_params_"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.8182397003745316"
]
},
"metadata": {},
"execution_count": 39
}
],
"source": [
"# Rebuild the forest with the best grid-search parameters. Keep random_state=42, as used for\n",
"# the forest that was searched, so the cross-validation scores are reproducible between runs.\n",
"forest_clf = RandomForestClassifier(random_state=42, **grid_search.best_params_)\n",
"forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)\n",
"forest_scores.mean()"
]
},
{
"source": [
"### 试试BaggingClassifier"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(0.8272034956304619, 0.7498901958778095)"
]
},
"metadata": {},
"execution_count": 46
}
],
"source": [
"from sklearn.ensemble import BaggingClassifier\n",
"# Seed the ensemble so the metrics below are reproducible between runs.\n",
"bagging_clf = BaggingClassifier(forest_clf, random_state=42)\n",
"\n",
"# Compute accuracy and F1 in a single 10-fold pass instead of refitting every fold once per metric.\n",
"from sklearn.model_selection import cross_validate\n",
"bagging_cv_results = cross_validate(bagging_clf, X_train, y_train, cv=10, scoring=['accuracy', 'f1'])\n",
"bagging_clf_acc_scores = bagging_cv_results['test_accuracy']\n",
"bagging_clf_f1_scores = bagging_cv_results['test_f1']\n",
"bagging_clf_acc_scores.mean(), bagging_clf_f1_scores.mean()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"# BaggingClassifier's `base_estimator` parameter is named `estimator` in newer scikit-learn\n",
"# releases, so look up which name this installation exposes before building the grid.\n",
"bagging_estimator_key = (\n",
"    'estimator' if 'estimator' in bagging_clf.get_params(deep=False) else 'base_estimator'\n",
")\n",
"# NOTE: 2 x 5 x 5 x 5 = 250 candidates, i.e. 1250 fits with cv=5 -- expect a long run.\n",
"bagging_grid_params = [{\n",
"    bagging_estimator_key: [svm_clf, forest_clf],\n",
"    'n_estimators': [10, 20, 30, 40, 50],\n",
"    'max_samples': [0.1, 0.3, 0.5, 0.8, 1.0],\n",
"    'max_features': [0.1, 0.3, 0.5, 0.8, 1.0],\n",
"}]\n",
"bagging_grid_search = GridSearchCV(bagging_clf, bagging_grid_params, cv=5, verbose=3, n_jobs=-1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bagging_grid_search.fit(X_train, y_train)"
]
},
{
"source": [
"这里我们不关注每个模型的10折叠的平均分,而是看一下每个模型的每次折叠的箱线图"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "",
"image/svg+xml": "\n\n\n\n",
"image/png": "\n"
},
"metadata": {
"needs_background": "light"
}
}
],
"source": [
"# Dots show the individual fold scores; the boxes summarise the same scores per model.\n",
"fig, ax = plt.subplots(figsize=(8, 4))\n",
"for position, fold_scores in enumerate([svm_scores, forest_scores], start=1):\n",
"    ax.plot([position] * 10, fold_scores, '.')\n",
"ax.boxplot([svm_scores, forest_scores], labels=('SVM', 'Random Forest'))\n",
"ax.set_ylabel('Accuracy', fontsize=14)\n",
"plt.show()"
]
},
{
"source": [
"为了进一步改善结果,可以进行如下操作:\n",
"- 对更多模型,使用cross validation和grid search调整超参数\n",
"- 使用更多的特征工程,例如:\n",
"    - 用**SibSp**和**Parch**的和代替它们\n",
"    - 尝试找出(例如名字中)与**Survived**属性相关性较强的部分\n",
"- 尝试把年龄属性更改为年龄段属性\n"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" Survived\n",
"AgeBucket \n",
"0.0 0.576923\n",
"15.0 0.362745\n",
"30.0 0.423256\n",
"45.0 0.404494\n",
"60.0 0.240000\n",
"75.0 1.000000"
],
"text/html": "\n\n
\n \n \n | \n Survived | \n
\n \n AgeBucket | \n | \n
\n \n \n \n 0.0 | \n 0.576923 | \n
\n \n 15.0 | \n 0.362745 | \n
\n \n 30.0 | \n 0.423256 | \n
\n \n 45.0 | \n 0.404494 | \n
\n \n 60.0 | \n 0.240000 | \n
\n \n 75.0 | \n 1.000000 | \n
\n \n
\n
"
},
"metadata": {},
"execution_count": 44
}
],
"source": [
"# Bucket ages into bins of width 15 (0, 15, 30, ...) and show the mean of Survived per bin.\n",
"train_data[\"AgeBucket\"] = (train_data[\"Age\"] // 15).mul(15)\n",
"train_data.groupby(\"AgeBucket\")[[\"Survived\"]].mean()"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" Survived\n",
"RelativesOnboard \n",
"0 0.303538\n",
"1 0.552795\n",
"2 0.578431\n",
"3 0.724138\n",
"4 0.200000\n",
"5 0.136364\n",
"6 0.333333\n",
"7 0.000000\n",
"10 0.000000"
],
"text/html": "\n\n
\n \n \n | \n Survived | \n
\n \n RelativesOnboard | \n | \n
\n \n \n \n 0 | \n 0.303538 | \n
\n \n 1 | \n 0.552795 | \n
\n \n 2 | \n 0.578431 | \n
\n \n 3 | \n 0.724138 | \n
\n \n 4 | \n 0.200000 | \n
\n \n 5 | \n 0.136364 | \n
\n \n 6 | \n 0.333333 | \n
\n \n 7 | \n 0.000000 | \n
\n \n 10 | \n 0.000000 | \n
\n \n
\n
"
},
"metadata": {},
"execution_count": 77
}
],
"source": [
"# Sum the SibSp and Parch columns and show the mean of Survived for each resulting value.\n",
"train_data[\"RelativesOnboard\"] = train_data[\"SibSp\"].add(train_data[\"Parch\"])\n",
"train_data.groupby(\"RelativesOnboard\")[[\"Survived\"]].mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
]
}