ML-决策树-泰坦尼克号
目录:流程 / 代码 / 项目总结
流程
1.数据加载 2.数据探索(查看数据特征) 3.数据预处理:数值型字段(Age、Fare)用均值填充,类别型字段(Embarked)用出现次数最多的取值填充(众数填充) 4.特征选择 5.构造ID3树(criterion='entropy',以信息熵作为划分标准) 6.决策树训练 7.决策树预测 8.准确率输出
代码
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier

# Titanic survival prediction with a decision tree.
# Steps: load -> explore -> impute missing values -> select features ->
# one-hot encode -> train -> predict -> report accuracy.

# --- 1. Load the data --------------------------------------------------------
train_data = pd.read_csv('C:\\Users\\YXJ\\Documents\\Tencent Files\\1064584707\\FileRecv\\Titanic_Data-master\\train.csv')
# Fix: this previously re-read train.csv, so the "test" predictions below were
# made on the training rows.
# NOTE(review): assumes test.csv sits next to train.csv in the dataset
# directory -- confirm before running.
test_data = pd.read_csv('C:\\Users\\YXJ\\Documents\\Tencent Files\\1064584707\\FileRecv\\Titanic_Data-master\\test.csv')

# --- 2. Explore the data -----------------------------------------------------
print(train_data.info())
print('-'*30)
print(train_data.describe())
print('-'*30)
# include=['O'] restricts the summary to object (string) columns.
print(train_data.describe(include=['O']))
print('-'*30)
print(train_data.head())
print('-'*30)
print(train_data.tail())
print('-'*30)

# --- 3. Impute missing values ------------------------------------------------
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# a column selection: that chained in-place form operates on a temporary Series
# and leaves the DataFrame unchanged when pandas Copy-on-Write is active.
# Each frame is filled with its own column mean.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())
train_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].mean())
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())

# Show how often each embarkation port occurs before choosing the fill value;
# missing ports are then filled with 'S'.
print(train_data['Embarked'].value_counts())
train_data['Embarked'] = train_data['Embarked'].fillna('S')
test_data['Embarked'] = test_data['Embarked'].fillna('S')

# --- 4. Select features ------------------------------------------------------
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_features = train_data[features]
train_labels = train_data['Survived']
test_features = test_data[features]

# DictVectorizer one-hot encodes string-valued fields and passes numeric ones
# through; sparse=False makes it return a dense array.
dvec = DictVectorizer(sparse=False)
# orient must be spelled 'records': newer pandas rejects abbreviations such as
# 'record'.
train_features = dvec.fit_transform(train_features.to_dict(orient='records'))
print(dvec.feature_names_)

# --- 5/6. Build and train the tree -------------------------------------------
# criterion='entropy' selects splits by information gain.
clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(train_features, train_labels)

# --- 7. Predict --------------------------------------------------------------
# Use transform (not fit_transform) so the test rows are encoded with the
# feature layout learned from the training rows.
test_features = dvec.transform(test_features.to_dict(orient='records'))
pred_labels = clf.predict(test_features)

# --- 8. Report accuracy ------------------------------------------------------
# This scores the model on the same rows it was trained on, so it is a
# training-set accuracy rather than an estimate of generalization.
acc_decision_tree = round(clf.score(train_features, train_labels), 6)
print(u'score 准确率为 %.4lf' % acc_decision_tree)
```python
import numpy as np
from sklearn.model_selection import cross_val_score

# Re-evaluate the classifier with 10-fold cross-validation on the training
# features and labels; cross_val_score returns one score per fold.
score = cross_val_score(
    estimator=clf,
    X=train_features,
    y=train_labels,
    cv=10,
)
print(score)

# Report the average of the per-fold scores.
print(u'score 准确率为 %.4lf' % np.mean(score))
项目总结
采用决策树方法,对泰坦尼克号乘客生存问题进行预测。模型在训练集上的准确率达0.9820(该结果是用训练数据自身评估得到的,偏乐观);使用10折交叉验证(K折交叉验证,K=10)后平均准确率为0.7801,更能反映模型在未见数据上的表现,两者的差距说明决策树对训练集存在一定程度的过拟合。