from sklearn
.model_selection
import train_test_split
, GridSearchCV
from sklearn
.neighbors
import KNeighborsClassifier
from sklearn
.preprocessing
import StandardScaler
from sklearn
.feature_extraction
.text
import TfidfVectorizer
from sklearn
.naive_bayes
import MultinomialNB
from sklearn
.metrics
import classification_report
from sklearn
.feature_extraction
import DictVectorizer
from sklearn
.tree
import DecisionTreeClassifier
, export_graphviz
from sklearn
.ensemble
import RandomForestClassifier
import pandas
as pd
import numpy
as np
import matplotlib
.pyplot
as plt
import re
from sklearn
import preprocessing
from sklearn
.feature_extraction
import DictVectorizer
from sklearn
.linear_model
import LogisticRegression
# Use the SimHei font so Chinese plot labels render, and draw negative tick
# labels with the ASCII hyphen instead of the Unicode minus sign.
plt.rcParams.update(
    {
        "font.sans-serif": ["SimHei"],
        "axes.unicode_minus": False,
    }
)
# Load the training and test sets.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "F:/kaggle数据/titanic/train.csv",
        "F:/kaggle数据/titanic/test.csv",
    )
)
# Inspect missing values.
#
# Findings:
#   - Training set: "Age", "Cabin" and "Embarked" have missing values.
#   - Test set: "Age", "Fare" and "Cabin" have missing values.
#   - The test set has no "Survived" label.
#
# Ways to handle missing values:
#   1. Drop the feature (when too much of it is missing, to avoid adding noise).
#   2. Drop the samples with missing values (training set only, and only when
#      the data set is large and few samples are affected).
#   3. Turn "value present / absent" into a new feature (when much is missing
#      and the presence itself is predictive).
#   4. Fill with the median or mean (when much is missing, the rows are worth
#      keeping and the feature matters; the most common approach).
#   5. Fill from correlated features with a purpose-built algorithm (likely
#      more accurate, but more complex to implement).
data_train.isna().any()
结果:
PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool
# Flag every test-set column that contains at least one missing value.
data_test.isna().any()
结果:
PassengerId    False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin           True
Embarked       False
dtype: bool
# Print the per-column non-null counts and dtypes of the training set.
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated in
# 1.2 and removed in 2.0.
data_train.info(show_counts=True)
结果:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  891 non-null    int64
 1   Survived     891 non-null    int64
 2   Pclass       891 non-null    int64
 3   Name         891 non-null    object
 4   Sex          891 non-null    object
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64
 7   Parch        891 non-null    int64
 8   Ticket       891 non-null    object
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object
 11  Embarked     889 non-null    object
dtypes: float64(2), int64(5), object(5)
# Print the per-column non-null counts and dtypes of the test set.
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated in
# 1.2 and removed in 2.0.
data_test.info(show_counts=True)
结果:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Pclass       418 non-null    int64
 2   Name         418 non-null    object
 3   Sex          418 non-null    object
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64
 6   Parch        418 non-null    int64
 7   Ticket       418 non-null    object
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object
 10  Embarked     418 non-null    object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
# I. Explore the data
# 1. Passenger class vs. survival
# 1) Stacked bars built from value_counts() within each group
bottom = np.zeros(3)
x = np.sort(data_train["Pclass"].unique())
for i in range(2):
    # Per-class head count of the passengers whose Survived value equals i.
    outcome_rows = data_train[data_train["Survived"] == i]
    y = outcome_rows.groupby(["Pclass"])["Survived"].value_counts()
    plt.bar(x, y, bottom=bottom, width=0.5, label=f"{i}")
    # Write each count at the vertical midpoint of its bar segment.
    for xi, base, count in zip(x, bottom, y):
        plt.text(xi, (base + (base + count)) / 2, count, ha="center", va="center")
    # The next outcome is stacked on top of the bars drawn so far.
    bottom += y.values
plt.legend()
plt.title("船舱等级与生还的关系")
plt.show()
# 2) Select each Pclass directly, count its Survived values, and combine them.
# value_counts() sorts by frequency, so the label order can differ between
# classes. Every class is reindexed to (0, 1) so that the positional columns
# of the tables below always mean (Survived == 0, Survived == 1); fill_value=0
# covers a class that has no rows for one outcome.
h1, h2, h3 = (
    data_train.loc[data_train["Pclass"] == pclass, "Survived"]
    .value_counts()
    .reindex(index=(0, 1), fill_value=0)
    for pclass in (1, 2, 3)
)
# Two equivalent ways to assemble the class-by-outcome table; both label the
# columns with the Survived values 0 and 1.
pd.DataFrame([list(h1), list(h2), list(h3)], index=(1, 2, 3), columns=(0, 1))
pd.DataFrame(
    [np.array(h1), np.array(h2), np.array(h3)], index=(1, 2, 3), columns=(0, 1)
)
# Same table built the other way round: one Pclass count series per outcome,
# aligned on the Pclass index by the DataFrame constructor.
merge = pd.DataFrame(
    {
        0: data_train[data_train["Survived"] == 0]["Pclass"].value_counts(),
        1: data_train[data_train["Survived"] == 1]["Pclass"].value_counts(),
    }
)
merge
# 3) Count the frequencies directly with pd.crosstab
Pclass_Survived = pd.crosstab(
    index=data_train["Pclass"], columns=data_train["Survived"]
)
Pclass_Survived.plot(kind="bar", stacked=True)
# The bars sit at positions 0..n-1; n is taken from the table rather than
# hard-coded so the annotations follow the number of classes present.
X = np.arange(len(Pclass_Survived.index))
bottom = np.zeros(len(Pclass_Survived.index))
for i in range(2):
    Z = Pclass_Survived[i]
    # Write each count at the vertical midpoint of its stacked segment.
    for xpos, base, count in zip(X, bottom, Z):
        plt.text(xpos, ((base + count) + base) / 2, count, ha="center", va="center")
    # Add the raw values so `bottom` stays a plain ndarray, as in the
    # value_counts() version of this plot.
    bottom += Z.values
plt.title("船舱等级与生还的关系")
plt.show()
# 2. Title (appellation) vs. survival
# Take the first word that is followed by a period in each name (e.g. "Mr.")
# and strip the period. The pattern is a raw string so "\w" is not an invalid
# string escape, and regex=False makes "." a literal period on every pandas
# version.
data_train["Appellation"] = (
    data_train["Name"]
    .apply(lambda x: re.search(r"\w+\.", x).group())
    .str.replace(".", "", regex=False)
)
# Fold uncommon titles into "Rare" and map the French/alternative forms onto
# their common equivalents. The result is assigned back instead of calling
# replace(inplace=True) on the selected column, which does not reliably update
# the frame under pandas Copy-on-Write.
data_train["Appellation"] = (
    data_train["Appellation"]
    .replace(
        ["Capt", "Col", "Countess", "Don", "Dr", "Jonkheer", "Lady", "Major", "Rev", "Sir"],
        "Rare",
    )
    .replace(["Mlle", "Ms"], "Miss")
    .replace("Mme", "Mrs")
)
data_train.Appellation.unique()
# Survival counts per title, drawn as grouped bars.
Appellation_Survived = pd.crosstab(
    index=data_train["Appellation"], columns=data_train["Survived"]
)
Appellation_Survived.plot(kind="bar")
# One tick per title: the count comes from the table instead of a hard-coded 5,
# so ticks and labels cannot get out of step if the set of titles changes.
plt.xticks(
    ticks=np.arange(len(Appellation_Survived.index)),
    labels=Appellation_Survived.index,
    rotation=360,  # a full turn, i.e. horizontal labels
)
plt.title("头衔与生还的关系")
plt.show()
# 3. Sex vs. survival
Sex_Survived = pd.crosstab(index=data_train["Sex"], columns=data_train["Survived"])
Sex_Survived.plot.bar()
# Two bar groups, labelled with the table's index values.
sex_labels = Sex_Survived.index
plt.xticks(ticks=np.arange(2), labels=sex_labels, rotation=360)
plt.title("性别与生还者的关系")
plt.show()
# 4. Number of siblings/spouses aboard vs. survival
Sibsp_Survived = pd.crosstab(data_train["SibSp"], data_train["Survived"])
Sibsp_Survived.plot(kind="bar")
# Tick positions are derived from the table so they always match the labels.
plt.xticks(
    ticks=np.arange(len(Sibsp_Survived.index)),
    labels=Sibsp_Survived.index,
    rotation=360,
)
plt.title("兄弟姐妹配偶数量和生还的关系")
plt.show()

# Same plot without the first row (the smallest SibSp value), so the smaller
# groups become visible. The original passed 5 ticks for what is one row fewer
# than the 7-row table above (6 labels); deriving the count removes that
# mismatch.
Sibsp_tail = Sibsp_Survived.iloc[1:, :]
Sibsp_tail.plot(kind="bar")
plt.xticks(
    ticks=np.arange(len(Sibsp_tail.index)),
    labels=Sibsp_tail.index,
    rotation=360,
)
plt.title("兄弟姐妹配偶数量和生还的关系")
plt.show()
# 5. Number of parents/children aboard vs. survival
Parch_Survived = pd.crosstab(
    index=data_train["Parch"], columns=data_train["Survived"]
)
Parch_Survived.plot(kind="bar")
# Tick count follows the table instead of a hard-coded 7.
plt.xticks(
    np.arange(len(Parch_Survived.index)), Parch_Survived.index, rotation=360
)
plt.title("同船家人数和生还的关系")
plt.show()

# Zoom in on the rows from position 3 onwards, where the counts are small.
Parch_tail = Parch_Survived.iloc[3:, :]
Parch_tail.plot(kind="bar")
# Tick count follows the slice instead of a hard-coded 4.
plt.xticks(np.arange(len(Parch_tail.index)), Parch_tail.index, rotation=360)
plt.title("同船家人数和生还的关系")
plt.show()
# 6. Shared tickets vs. survival
# The number of distinct tickets (681 in the author's run) is smaller than the
# number of passengers, so some tickets are shared. A new feature is built
# from that: 1 for a shared ticket, otherwise 0.
len(data_train["Ticket"].unique())
# Count the passengers on each ticket. as_index=False keeps "Ticket" as a
# regular column instead of making the group key the index.
Ticket_Count = data_train.groupby(by="Ticket", as_index=False)["PassengerId"].count()
结果:
          Ticket  PassengerId
0         110152            3
1         110413            3
2         110465            2
3         110564            1
4         110813            1
..           ...          ...
676   W./C. 6608            4
677   W./C. 6609            1
678  W.E.P. 5734            1
679    W/C 14208            1
680    WE/P 5735            2

681 rows × 2 columns
# Collect the tickets held by exactly one passenger. Rows of data_train whose
# ticket is in that set get GroupTicket = 0; all other rows get 1.
Ticket_Count_0 = Ticket_Count.loc[Ticket_Count.PassengerId == 1, "Ticket"]
holds_single_ticket = data_train.Ticket.isin(Ticket_Count_0)
data_train["GroupTicket"] = np.where(holds_single_ticket, 0, 1)

GroupTicket_Survived = pd.crosstab(
    index=data_train["GroupTicket"], columns=data_train["Survived"]
)
GroupTicket_Survived.plot.bar()
plt.xticks(ticks=np.arange(2), labels=GroupTicket_Survived.index, rotation=360)
plt.show()
# 7. Ticket fare vs. survival
# Cut the fare into 50-wide, right-closed intervals starting at -50, so a fare
# of exactly 0 falls into the first interval.
bins = np.arange(-50, 600, 50)
data_train["Fare_range"] = pd.cut(data_train["Fare"], bins=bins)
Fare_range_Survived = pd.crosstab(data_train["Fare_range"], data_train["Survived"])
Fare_range_Survived.plot(kind="bar")
# The number of fare intervals that appear in the table depends on the data,
# so the tick count is derived from the table instead of a hard-coded 8.
plt.xticks(np.arange(len(Fare_range_Survived.index)), Fare_range_Survived.index)
plt.show()

# Skip the first two intervals so the higher fare ranges become readable.
Fare_range_tail = Fare_range_Survived.iloc[2:, :]
Fare_range_tail.plot(kind="bar")
# Tick count follows the slice instead of a hard-coded 6.
plt.xticks(
    np.arange(len(Fare_range_tail.index)), Fare_range_tail.index, rotation=360
)
plt.show()
# II. Work on a copy before handling missing values so the loaded data stays
# intact.
train = data_train.copy()

# 1. Fill missing "Cabin" values with "No", meaning no cabin was recorded.
# The filled column is assigned back: fillna(inplace=True) on a selected
# column does not reliably update the frame under pandas Copy-on-Write.
train["Cabin"] = train["Cabin"].fillna("No")
Cabin_Survived = pd.crosstab(train["Cabin"], train["Survived"])
# Binary feature: 0 when no cabin was recorded, 1 otherwise.
train["Group_Cabin"] = np.where(train["Cabin"] == "No", 0, 1)
Group_Cabin_Survived = pd.crosstab(train["Group_Cabin"], train["Survived"])
Group_Cabin_Survived.plot(kind="bar")
plt.xticks(np.arange(2), Group_Cabin_Survived.index, rotation=360)
plt.show()
# 2. "Embarked" has two missing values; fill them with the most frequent port.
# The filled column is assigned back: fillna(inplace=True) on a selected
# column does not reliably update the frame under pandas Copy-on-Write.
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].mode()[0])
# Encode the port letters as integers.
train["Embarked_New"] = train["Embarked"].map({"S": 0, "C": 1, "Q": 2})
Embarked_New_Survived = pd.crosstab(train["Embarked_New"], train["Survived"])
Embarked_New_Survived.plot(kind="bar")
plt.xticks(np.arange(3), Embarked_New_Survived.index)
plt.show()
# 3. Fill missing "Age" values with the median age of the passenger's title.
Age_Appellation_median = train.groupby("Appellation")["Age"].median()
# Look up each row's title median and use it only where Age is missing. This
# replaces the set_index / fillna(inplace=True) sequence, whose in-place fill
# on a selected column does not reliably update the frame under pandas
# Copy-on-Write, and it leaves the "Appellation" column in place.
train["Age"] = train["Age"].fillna(train["Appellation"].map(Age_Appellation_median))
# Keep a fresh 0..n-1 index, as the original reset_index(drop=True) produced.
train = train.reset_index(drop=True)

# Group ages into right-closed 10-year intervals from 0 to 80 and plot
# survival per age group.
bins = np.arange(0, 90, 10)
train["Age_Group"] = pd.cut(train["Age"], bins=bins)
Age_Group_Survived = pd.crosstab(train["Age_Group"], train["Survived"])
Age_Group_Survived.plot(kind="bar")
# Tick count follows the table instead of a hard-coded 8.
plt.xticks(
    np.arange(len(Age_Group_Survived.index)), Age_Group_Survived.index, rotation=360
)
plt.show()
# III. Build new features
# Encode sex as 0 (male) / 1 (female).
train["Sex"] = train["Sex"].map({"male": 0, "female": 1})

# One-hot encode the title with a DictVectorizer fitted on the training set.
# NOTE(review): the name `dict` shadows the builtin. It is kept because the
# test-set section of this file calls `dict.transform(...)` on this object.
dict = DictVectorizer(sparse=False)
title_records = train[["Appellation"]].to_dict(orient="records")
Appellation_new = pd.DataFrame(dict.fit_transform(title_records))
# Attach the indicator columns to the rows by index.
train = train.merge(Appellation_new, left_index=True, right_index=True)
# Keep the raw age, then recode "Age" in place into decade codes:
# (0, 10] -> 0, (10, 20] -> 1, ..., (70, 80] -> 7.
# The bins are processed in ascending order. Codes already written (0..7)
# never exceed a later bin's lower bound, so they are not recoded again.
train["Age_reserved"] = train["Age"]
for age_code, lower in enumerate(range(0, 80, 10)):
    in_bin = (train["Age"] > lower) & (train["Age"] <= lower + 10)
    train.loc[in_bin, "Age"] = age_code
# Keep the raw fare, then recode "Fare" in place into 50-wide interval codes:
# (-50, 0] -> 0, (0, 50] -> 1, ..., (500, 550] -> 11.
# The bins are processed in ascending order. Codes already written (0..11)
# never exceed a later bin's lower bound, so they are not recoded again.
train["Fare_reserved"] = train["Fare"]
for fare_code, lower in enumerate(range(-50, 550, 50)):
    in_bin = (train["Fare"] > lower) & (train["Fare"] <= lower + 50)
    train.loc[in_bin, "Fare"] = fare_code
# Family size: siblings/spouses + parents/children + the passenger.
train["Family"] = 1 + train["SibSp"] + train["Parch"]
# Remove the identifier, raw text and helper columns that are not used as
# model features.
unused_columns = [
    "PassengerId",
    "Name",
    "Ticket",
    "Cabin",
    "Embarked",
    "Appellation",
    "Fare_range",
    "Age_Group",
]
train.drop(columns=unused_columns, inplace=True)
# IV. Model building
# Feature matrix (every column except the label) and label vector.
y = train["Survived"].values
x = train.drop(columns="Survived").values
# Hold out 25% of the rows for evaluation; the seed is fixed so the split is
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0
)
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)
结果:
(668, 17)
(668,)
(223, 17)
(223,)
# Base estimators with default settings; each is tuned by a grid search below.
knn, dec, rf = (
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
)
# k-nearest neighbours: 5-fold grid search over n_neighbors = 2..10, then
# score the refitted best model on the held-out split.
param = {"n_neighbors": list(range(2, 11))}
grid = GridSearchCV(knn, param_grid=param, cv=5)
grid.fit(x_train, y_train)
print(grid.best_params_)
print(grid.score(x_test, y_test))
结果:
{'n_neighbors': 5}
0.7399103139013453
# Decision tree: 5-fold grid search over max_depth = 2..6, then score the
# refitted best model on the held-out split.
param = {"max_depth": list(range(2, 7))}
grid = GridSearchCV(dec, param_grid=param, cv=5)
grid.fit(x_train, y_train)
print(grid.best_params_)
print(grid.score(x_test, y_test))
结果:
{'max_depth': 3}
0.7982062780269058
# Random forest: 5-fold grid search over tree depth and forest size, then
# score the refitted best model on the held-out split.
param = {
    "max_depth": list(range(2, 7)),
    "n_estimators": [10, 30, 50, 70, 100, 120, 150, 170, 200],
}
grid = GridSearchCV(rf, param_grid=param, cv=5)
grid.fit(x_train, y_train)
print(grid.best_params_)
print(grid.score(x_test, y_test))
结果:
{'max_depth': 6, 'n_estimators': 120}
0.820627802690583
# Logistic regression: 5-fold grid search over the regularisation strength C
# and the penalty type.
# The grid contains the "l1" penalty, which the current default solver
# ("lbfgs") does not support. "liblinear" handles both "l1" and "l2", so it is
# selected explicitly to keep every grid point fittable.
logis = LogisticRegression(solver="liblinear")
# Call get_params(); the original referenced the method without calling it,
# which does nothing.
logis.get_params()
param = {
    "C": [0.01, 0.03, 0.1, 0.3, 1, 2],
    "penalty": ["l1", "l2"],
}
grid = GridSearchCV(estimator=logis, param_grid=param, cv=5)
grid.fit(x_train, y_train)
print(grid.best_params_)
print(grid.score(x_test, y_test))
结果:
{'C': 1, 'penalty': 'l1'}
0.8295964125560538
# V. Apply the same feature processing to the test set
# The original ran this title extraction twice in a row with identical code;
# one pass produces the same result.
# Take the first word that is followed by a period in each name (e.g. "Mr.")
# and strip the period. The pattern is a raw string so "\w" is not an invalid
# string escape, and regex=False makes "." a literal period on every pandas
# version.
data_test["Appellation"] = (
    data_test["Name"]
    .apply(lambda x: re.search(r"\w+\.", x).group())
    .str.replace(".", "", regex=False)
)
# Fold uncommon titles into "Rare" and map the French/alternative forms onto
# their common equivalents. The result is assigned back instead of calling
# replace(inplace=True) on the selected column, which does not reliably update
# the frame under pandas Copy-on-Write.
# NOTE(review): "Dona" (feminine of "Don") is added to the rare list because a
# title the fitted vectorizer has never seen would be encoded as an all-zero
# row; presumably it occurs in the test names - confirm against the data.
data_test["Appellation"] = (
    data_test["Appellation"]
    .replace(
        ["Capt", "Col", "Countess", "Don", "Dona", "Dr", "Jonkheer", "Lady", "Major", "Rev", "Sir"],
        "Rare",
    )
    .replace(["Mlle", "Ms"], "Miss")
    .replace("Mme", "Mrs")
)
# Work on a copy so data_test keeps the loaded columns.
test = data_test.copy()
# One-hot encode the test titles with the vectorizer fitted on the training
# titles (`dict` is that DictVectorizer instance, not the builtin), then
# attach the indicator columns to the rows by index.
test_title_records = test[["Appellation"]].to_dict(orient="records")
Appellation_new_test = pd.DataFrame(dict.transform(test_title_records))
test = test.merge(Appellation_new_test, left_index=True, right_index=True)
# Fill missing "Cabin" values with "No". The filled column is assigned back:
# fillna(inplace=True) on a selected column does not reliably update the frame
# under pandas Copy-on-Write.
test["Cabin"] = test["Cabin"].fillna("No")
# Binary feature: 0 when no cabin was recorded, 1 otherwise.
test["Group_Cabin"] = np.where(test["Cabin"] == "No", 0, 1)
# Same integer encodings as used for the training set.
test["Embarked_New"] = test["Embarked"].map({"S": 0, "C": 1, "Q": 2})
test["Sex"] = test["Sex"].map({"male": 0, "female": 1})
# Fill missing "Age" values with the median age of the passenger's title.
# NOTE(review): the medians are computed from the test set itself, not from
# the training set - confirm this is intended.
test_median = test.groupby("Appellation")["Age"].median()
# Look up each row's title median and use it only where Age is missing. This
# replaces the set_index / fillna(inplace=True) / reset_index sequence, whose
# in-place fill on a selected column does not reliably update the frame under
# pandas Copy-on-Write. "Appellation" stays a regular column; only its position
# differs from the original, and the columns are selected by name afterwards.
test["Age"] = test["Age"].fillna(test["Appellation"].map(test_median))
# Group the test ages into right-closed 10-year intervals from 0 to 80 and
# list the group sizes, keeping a NaN row for ages that fall in no interval.
bins = np.arange(0, 90, 10)
test["Age_range"] = pd.cut(test["Age"], bins=bins)
age_range_column = test["Age_range"]
age_range_column.value_counts(dropna=False)
# Keep the raw age, then recode "Age" in place into decade codes:
# (0, 10] -> 0, (10, 20] -> 1, ..., (70, 80] -> 7.
# The bins are processed in ascending order. Codes already written (0..7)
# never exceed a later bin's lower bound, so they are not recoded again.
test["Age_reserved"] = test["Age"]
for age_code, lower in enumerate(range(0, 80, 10)):
    in_bin = (test["Age"] > lower) & (test["Age"] <= lower + 10)
    test.loc[in_bin, "Age"] = age_code
# Fill the missing "Fare" with the mean fare of the test set. The filled
# column is assigned back: fillna(inplace=True) on a selected column does not
# reliably update the frame under pandas Copy-on-Write.
test["Fare"] = test["Fare"].fillna(test["Fare"].mean())

# Keep the raw fare, then recode "Fare" in place into 50-wide interval codes:
# (-50, 0] -> 0, (0, 50] -> 1, ..., (500, 550] -> 11.
# The bins are processed in ascending order. Codes already written (0..11)
# never exceed a later bin's lower bound, so they are not recoded again.
test["Fare_reserved"] = test["Fare"]
for fare_code, lower in enumerate(range(-50, 550, 50)):
    in_bin = (test["Fare"] > lower) & (test["Fare"] <= lower + 50)
    test.loc[in_bin, "Fare"] = fare_code
# Family size: siblings/spouses + parents/children + the passenger.
test["Family"] = 1 + test["SibSp"] + test["Parch"]

# How often each ticket occurs in the test set.
test["Ticket"].value_counts()
# Tickets held by exactly one passenger get GroupTicket = 0; all others get 1.
test_count0 = test.groupby("Ticket", as_index=False)["PassengerId"].count()
test_count0 = test_count0.loc[test_count0["PassengerId"] == 1, "Ticket"]
test["GroupTicket"] = np.where(test["Ticket"].isin(test_count0), 0, 1)

# Remove the identifier, raw text and helper columns that are not used as
# model features.
test.drop(
    columns=[
        "PassengerId",
        "Name",
        "Ticket",
        "Cabin",
        "Embarked",
        "Appellation",
        "Age_range",
    ],
    inplace=True,
)
# Put the test columns in the same order as the training features (every
# training column except the label) so the value matrix lines up with x_train.
order_train = train.drop(columns="Survived")
order = order_train.columns.tolist()
test = test[order]
x_pre = test.values
# Final model: repeat the random-forest grid search on the training split,
# report the held-out score, predict the test set and write the predictions.
param = {
    "max_depth": list(range(2, 7)),
    "n_estimators": [10, 30, 50, 70, 100, 120, 150, 170, 200],
}
grid = GridSearchCV(rf, param_grid=param, cv=5)
grid.fit(x_train, y_train)
print(grid.score(x_test, y_test))
y_pre = grid.predict(x_pre)
print(grid.best_params_)
# One prediction per line, without an index column or a header row.
predictions = pd.DataFrame(y_pre)
predictions.to_csv("pre123.csv", index=False, header=False)