决策树主要分为两部分:创建决策树、使用决策树模型进行预测,如下图所示。
(1)年龄 (2)性别 (3)收入
(4)婚姻
1)建立树模型 2)剪枝 3)输出结果
以分类树为基准: 各个参数的意思
重要参数和剪枝
## 6. SHAP
决策树一般用于变量选择,同时也是后续学习随机森林的基础。
R^2
# Fit a shallow Gini decision tree on the training split and score it on the
# test split.
clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=2,
    min_samples_leaf=5,
    min_samples_split=2,
    random_state=0,
)
clf.fit(train_x, train_y)
score = clf.score(test_x, test_y)
score  # notebook-style bare expression: shows the score when run as a cell

# Matplotlib text settings used by the plots in these notes.
plt.rcParams['font.sans-serif'] = ['SimHei']  # lets Chinese labels render
plt.rcParams['axes.unicode_minus'] = False  # lets the minus sign render correctly
# Next section in the notes: visualising the tree.
# Export the fitted tree `clf` to Graphviz DOT, turn it into a PNG with
# pydotplus, and display the image through IPython.
import pydotplus
from IPython.display import display, Image

dot_data = tree.export_graphviz(
    clf,
    out_file=None,  # nothing is written to disk; the DOT text is used below
    feature_names=train_x.columns,
    class_names=['留存', '非留存'],
    filled=True,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
display(Image(graph.create_png()))
# Next section in the notes: hyperparameter tuning with grid search.
# Grid search over decision-tree hyperparameters.
from sklearn.model_selection import cross_val_score, GridSearchCV

# `presort` is deliberately not part of the grid: scikit-learn deprecated it
# in 0.22 (where it stopped having any effect) and later removed it, so
# listing it makes the search fail on current releases.
paramaters = {
    'criterion': ('gini', 'entropy'),
    'min_samples_split': [2, 3, 4, 5],
    'max_depth': range(1, 5),
    'class_weight': ('balanced', None),
}
tr = tree.DecisionTreeClassifier()
gsearch = GridSearchCV(tr, paramaters)
gsearch.fit(train_x, train_y)

# Keep the best estimator found by the search.
model = gsearch.best_estimator_
model  # notebook-style bare expression: shows the chosen estimator

# Score on the test split, like the other snippets in these notes, so the
# number is comparable with the untuned tree's test score.
score = model.score(test_x, test_y)
score
# NOTE(review): the original notes say tuning raised accuracy from 72% to 76%,
# but the score right before that remark was computed on train_x/train_y.
# Re-measure on the test split before quoting an improvement.
# Feature importances of the estimator bound to `model`.
model.feature_importances_

# Pair every training column with its importance. Both expressions now read
# from `model`; the pairing previously read `clf.feature_importances_`, which
# labelled the columns with a different estimator's importances.
list(zip(train_x.columns, model.feature_importances_))
# Next section in the notes: using a learning curve to choose the tree depth.
# Learning curve for choosing max_depth: fit one tree per depth from 1 to 100
# and record its score on the test split.
test = []
for i in range(100):
    clf = tree.DecisionTreeClassifier(
        max_depth=i + 1,
        criterion="entropy",
        random_state=30,
        splitter="random",
    )
    clf = clf.fit(train_x, train_y)
    score = clf.score(test_x, test_y)
    test.append(score)

# x axis: depth 1..100, y axis: the recorded test scores.
plt.plot(range(1, 101), test, color="red", label="max_depth")
plt.legend()
plt.show()
# Next section in the notes: ELI5.
# Permutation importance of `model`, computed with eli5 on the test split.
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(model, random_state=1).fit(test_x, test_y)

# Pass the fitted `perm` object to eli5 together with the column names.
# Per the original notes: the table is listed from largest to smallest
# weight, and green marks the more important features.
eli5.show_weights(perm, feature_names=test_x.columns.tolist())
shap
# SHAP values for the tree model `model`.
import shap

explainer = shap.TreeExplainer(model)

# Pass in the feature matrix and compute the SHAP values.
shap_values = explainer.shap_values(train_x)

print('*' * 80)
# Per the original notes, the computed result holds 2 rows of data.
print(shap_values)

shap.initjs()  # per the original notes: switches the display format
# Original note: use shap_values[1] here; the more red there is, the stronger
# the promoting (+) effect on the model's result.
# Random forest: sweep n_estimators and compare ROC AUC on the test split.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# Single definition of the swept values, shared by the loop, the lookup of the
# best value, and the plot, so they cannot drift apart.
n_estimators_grid = range(1, 500, 10)

score = []
for i in n_estimators_grid:
    model = RandomForestClassifier(
        n_estimators=i,
        criterion='gini',
        bootstrap=True,
        random_state=0,  # fixed seed so the sweep gives the same curve on every run
    )
    model.fit(train_x, train_y)
    # roc_auc_score is given probabilities rather than hard labels; column 1
    # is presumably the positive class (verify against model.classes_).
    rf_probs = model.predict_proba(test_x)[:, 1]
    roc_value = roc_auc_score(test_y, rf_probs)
    score.append(roc_value)

# Hard predictions are not needed for the AUC, so compute them once for the
# last fitted model instead of on every iteration; the name stays defined for
# any later code that reads it.
rf_predictions = model.predict(test_x)

# Narrow down the range: best AUC, then the number of trees that achieved it.
print(max(score))
print(n_estimators_grid[score.index(max(score))])

plt.figure(figsize=[20, 5])
plt.plot(n_estimators_grid, score)
# Original note: roc_auc_score takes probabilities as input; ROC curve.
促进作用还是抑制作用
