决策树的思想来源非常朴素,程序设计中的条件分支结构就是if-then结构,最早的决策树就是利用这类结构分割数据的一种分类学习方法。 信息和消除不确定性是相联系的。 信息增益:当得知一个特征后,信息熵减少的大小。 决策树的分类依据之一:信息增益。 泰坦尼克号数据来源:http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz


def decision() -> None:
    """Predict Titanic passenger survival with a single decision tree.

    Loads the Titanic passenger table, uses ``pclass``, ``age`` and ``sex``
    as features and ``survived`` as the target, fills missing ages with the
    mean age, one-hot encodes the categorical features, fits a
    ``DecisionTreeClassifier`` on 75% of the rows and prints its accuracy on
    the remaining 25%. The fitted tree is exported to ``./tree.dot``.

    Returns:
        None. Results are printed and the tree structure is written to disk.
    """
    # Load the data.
    titan = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

    # Pick the feature columns and the target column. The explicit copy makes
    # `x` an independent frame, so the assignment below neither triggers
    # SettingWithCopyWarning nor touches `titan`.
    x = titan[["pclass", "age", "sex"]].copy()
    y = titan["survived"]
    print(x)

    # Fill missing ages with the mean age. The result is assigned back rather
    # than using `inplace=True` on the column: the in-place form operates on
    # a temporary under pandas Copy-on-Write and would leave the NaNs in `x`.
    x["age"] = x["age"].fillna(x["age"].mean())

    # Split into training and test sets (25% held out for testing).
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering: one-hot encode the categorical columns.
    # Fit on the training rows only, then reuse the same mapping for the test
    # rows so both matrices share one column layout.
    vectorizer = DictVectorizer(sparse=False)
    x_train = vectorizer.fit_transform(x_train.to_dict(orient="records"))
    # `feature_names_` is the fitted list of column names; the old
    # `get_feature_names()` accessor no longer exists in scikit-learn >= 1.2.
    print(vectorizer.feature_names_)
    x_test = vectorizer.transform(x_test.to_dict(orient="records"))
    print(x_train)

    # Fit the decision tree and report accuracy on the held-out rows.
    dec = DecisionTreeClassifier()
    dec.fit(x_train, y_train)
    print("预测准确率:", dec.score(x_test, y_test))

    # Export the tree structure. The display labels are positional, so they
    # must follow the column order printed above.
    # NOTE(review): assumes the encoded columns come out as age, pclass=1st,
    # pclass=2nd, pclass=3rd, sex=female, sex=male — confirm against the
    # printed feature names.
    export_graphviz(
        dec,
        out_file="./tree.dot",
        feature_names=['年龄', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', '女性', '男性'],
    )


if __name__ == '__main__':
    decision()
例如:如果训练了5棵树,其中4棵树的结果都是True,1棵树的结果是False,那么最终结果就是True
随机森林建立多棵决策树的过程: N个样本 M个特征 1,单棵树建立过程:
# Randomly pick one sample out of the N samples and repeat this N times (the
# same sample may be drawn more than once); then randomly pick m of the M
# features.
# NOTE(review): the original note said "pick m features out of m" — presumably
# m out of M was meant; confirm.
# 2. Building many decision trees: the samples and features mostly differ
#    from tree to tree.
# This is random sampling with replacement (bootstrap).
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier


def decision() -> None:
    """Predict Titanic passenger survival with a tuned random forest.

    Loads the Titanic passenger table, uses ``pclass``, ``age`` and ``sex``
    as features and ``survived`` as the target, fills missing ages with the
    mean age, one-hot encodes the categorical features, and runs a 2-fold
    cross-validated grid search over ``n_estimators`` and ``max_depth`` of a
    ``RandomForestClassifier`` on 75% of the rows. Prints the accuracy on the
    remaining 25% and the selected hyper-parameters.

    Returns:
        None. Results are printed.
    """
    # Load the data.
    titan = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

    # Pick the feature columns and the target column. The explicit copy makes
    # `x` an independent frame, so the assignment below neither triggers
    # SettingWithCopyWarning nor touches `titan`.
    x = titan[["pclass", "age", "sex"]].copy()
    y = titan["survived"]
    print(x)

    # Fill missing ages with the mean age. The result is assigned back rather
    # than using `inplace=True` on the column: the in-place form operates on
    # a temporary under pandas Copy-on-Write and would leave the NaNs in `x`.
    x["age"] = x["age"].fillna(x["age"].mean())

    # Split into training and test sets (25% held out for testing).
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering: one-hot encode the categorical columns.
    # Fit on the training rows only, then reuse the same mapping for the test
    # rows so both matrices share one column layout.
    vectorizer = DictVectorizer(sparse=False)
    x_train = vectorizer.fit_transform(x_train.to_dict(orient="records"))
    # `feature_names_` is the fitted list of column names; the old
    # `get_feature_names()` accessor no longer exists in scikit-learn >= 1.2.
    print(vectorizer.feature_names_)
    x_test = vectorizer.transform(x_test.to_dict(orient="records"))
    print(x_train)

    # Random forest with hyper-parameter tuning.
    rf = RandomForestClassifier()
    param = {
        "n_estimators": [120, 200, 300, 500, 800, 1200],
        "max_depth": [5, 8, 15, 25, 30],
    }

    # Grid search with 2-fold cross-validation over every combination above.
    grid_search = GridSearchCV(rf, param_grid=param, cv=2)
    grid_search.fit(x_train, y_train)

    print("准确率:", grid_search.score(x_test, y_test))
    print("查看选择的模型:", grid_search.best_params_)


if __name__ == '__main__':
    decision()