Getting Started with scikit-learn


Learning resource: the official scikit-learn site, https://scikit-learn.org/stable/. The basic workflow is as follows.

1. Basic model workflow

from sklearn import datasets
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle
import numpy as np

iris = datasets.load_iris()
digits = datasets.load_digits()
# print(iris.data)
# print(iris.data.shape)    # (150, 4): 150 samples, 4 features each
# print(iris.target_names)  # ['setosa' 'versicolor' 'virginica']: the three label names
# print(iris.target)        # class of each of the 150 samples, encoded as 0/1/2

print(digits.data.shape)  # (1797, 64)

# Manually split training and test sets
n_test = 100                        # number of test samples
train_X = digits.data[:-n_test, :]  # all rows except the last 100, all columns
train_y = digits.target[:-n_test]   # corresponding labels
test_X = digits.data[-n_test:, :]
y_true = digits.target[-n_test:]

# Choose an SVM model (hand-picked hyperparameters) and train it
svm_model = svm.SVC(gamma=0.001, C=100.)
# svm_model = svm.SVC(gamma=100., C=1.)
svm_model.fit(train_X, train_y)

# Choose a logistic regression (LR) model and train it
lr_model = LogisticRegression(max_iter=5000)  # larger max_iter so the solver converges on digits
lr_model.fit(train_X, train_y)

# Evaluate both models on the test set
y_pred_svm = svm_model.predict(test_X)
y_pred_lr = lr_model.predict(test_X)
# print('predicted labels:', y_pred_svm)
# print('true labels:', y_true)
print('SVM accuracy:', accuracy_score(y_true, y_pred_svm))  # 0.98
print('LR accuracy:', accuracy_score(y_true, y_pred_lr))    # 0.97

# Save the model ('wb' = write in binary mode)
with open('svm_model.pkl', 'wb') as f:
    pickle.dump(svm_model, f)

# Reload the model and predict on 5 randomly chosen samples
with open('svm_model.pkl', 'rb') as f:
    model = pickle.load(f)
random_samples_index = np.random.randint(0, len(digits.data), 5)  # 5 random indices into the 1797 samples
random_samples = digits.data[random_samples_index, :]
random_targets = digits.target[random_samples_index]
random_predict = model.predict(random_samples)
print(random_predict)  # e.g. [6 2 5 6 4]
print(random_targets)  # e.g. [6 2 5 6 4]
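As a side note, the scikit-learn documentation also suggests joblib for persisting models; a minimal sketch (the filename svm_model.joblib is just an example):

import joblib

# Often more efficient than pickle for models holding large NumPy arrays
joblib.dump(svm_model, 'svm_model.joblib')
model = joblib.load('svm_model.joblib')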

2. Feature normalization

import numpy as np
from sklearn.model_selection import train_test_split

# Prepare a toy dataset
X = np.random.randint(0, 100, (10, 4))  # 10x4 matrix: 10 samples, 4 features each
y = np.random.randint(0, 3, 10)         # 10 random labels drawn from {0, 1, 2}
y.sort()
print('samples:')
print(X)
print('labels:', y)

# Split into training and test sets
# random_state makes the random split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 / 3., random_state=7)
print('training set:')
print(X_train)
print(y_train)
print('test set:')
print(X_test)
print(y_test)

# Feature normalization
from sklearn import preprocessing
x1 = np.random.randint(0, 1000, 5).reshape(5, 1)    # first feature, 5 samples
x2 = np.random.randint(0, 10, 5).reshape(5, 1)
x3 = np.random.randint(0, 100000, 5).reshape(5, 1)
X = np.concatenate([x1, x2, x3], axis=1)
print(X)
# The three features span very different ranges, so standardize them to the same scale
print(preprocessing.scale(X))  # zero-mean, unit-variance standardization by default
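Note that preprocessing.scale standardizes whatever array it is given in one pass. When there is a train/test split, a common practice is to fit the scaling statistics on the training set only and reuse them on the test set; a minimal sketch with StandardScaler (reusing X_train and X_test from above):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn mean/std on the training set only
X_test_scaled = scaler.transform(X_test)        # apply the same statistics to the test set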

Generate classification data to verify that scaling is necessary:

from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Generate a 2-feature, 2-class dataset, stretched by a factor of 100
X, y = make_classification(n_samples=300, n_features=2, n_redundant=0,
                           n_informative=2, random_state=25,
                           n_clusters_per_class=1, scale=100)
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()

from sklearn import svm, preprocessing

# Commenting out the following line disables feature normalization
# X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 / 3., random_state=7)
svm_classifier = svm.SVC()
svm_classifier.fit(X_train, y_train)
print(svm_classifier.score(X_test, y_test))

Without normalization the accuracy is 0.97; with normalization it is 0.98.

3. Cross-validation

from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier  # only one hyperparameter to choose: k
import matplotlib.pyplot as plt

iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 / 3., random_state=10)

k_range = range(1, 31)
cv_scores = []
for n in k_range:
    knn = KNeighborsClassifier(n)
    # cv=10: score on 10 different folds of the training data, then average
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')  # for classification
    # scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='neg_mean_squared_error')  # for regression
    cv_scores.append(scores.mean())

plt.plot(k_range, cv_scores)
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()

# Pick the best k
best_knn = KNeighborsClassifier(n_neighbors=5)
best_knn.fit(X_train, y_train)
print(best_knn.score(X_test, y_test))
print(best_knn.predict(X_test))
# the score is 0.96 with k=5 and 0.94 with k=27
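Instead of reading the best k off the plot, it can also be picked programmatically; a small sketch reusing k_range and cv_scores from the loop above:

import numpy as np

best_k = k_range[int(np.argmax(cv_scores))]  # k with the highest mean CV accuracy
print(best_k)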

4. Overfitting and underfitting: choosing hyperparameters

For sklearn.svm.SVC, three important hyperparameters usually need to be chosen: C, kernel, and gamma.

① Learning curve: learning_curve

from sklearn.model_selection import learning_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

# Load the data
digits = load_digits()
X = digits.data
y = digits.target

# gamma=0.001
# cv=10: split the data into 10 folds, one fold as validation set (10-fold CV);
# each entry of train_sizes gives the fraction of data used for training,
# so 10-fold CV runs once per fraction, five times in total
train_sizes, train_scores, val_scores = learning_curve(
    SVC(gamma=0.001), X, y, cv=10, scoring='accuracy',
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1])

# Average over the 10 cross-validation folds
train_scores_mean = np.mean(train_scores, axis=1)
val_scores_mean = np.mean(val_scores, axis=1)

# Plot the learning curves
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='training')
plt.plot(train_sizes, val_scores_mean, '*-', color='g', label='cross validation')
plt.xlabel('training sample size')
plt.ylabel('accuracy')
plt.legend(loc='best')
plt.show()
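To reproduce the overfitting case discussed below, only the gamma passed to SVC needs to change; a minimal sketch (re-run the averaging and plotting code above on the new scores):

# Same call as above but with gamma=0.1: training accuracy stays near 100%
# while cross-validation accuracy remains poor (the overfitting case)
train_sizes, train_scores, val_scores = learning_curve(
    SVC(gamma=0.1), X, y, cv=10, scoring='accuracy',
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1])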

With gamma=0.001, accuracy keeps improving on both the training set and the validation set as the sample size grows, which is a good fit. With gamma=0.1, training accuracy stays at 100% even with very few training samples while validation accuracy is poor, which is overfitting.

② Validation curve: validation_curve

from sklearn.model_selection import validation_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

# Load the data
digits = load_digits()
X = digits.data
y = digits.target
print(X.shape)
print(y)

# param_range = np.arange(1, 6) / 500.
param_range = np.logspace(-6.5, -2, 10)  # log-spaced values plot more readably
print(param_range)

# param_name='gamma' tells validation_curve which SVC parameter to sweep;
# for each of the 10 gamma values (10^-6.5 .. 10^-2) it runs 5-fold CV
train_scores, val_scores = validation_curve(
    SVC(), X, y, param_name='gamma', param_range=param_range,
    cv=5, scoring='accuracy')

# Average over the 5 cross-validation folds
train_scores_mean = np.mean(train_scores, axis=1)
val_scores_mean = np.mean(val_scores, axis=1)

# Plot the validation curves
plt.plot(param_range, train_scores_mean, 'o-', color='r', label='training')
plt.plot(param_range, val_scores_mean, '*-', color='g', label='cross validation')
plt.xlabel('gamma')
plt.ylabel('accuracy')
plt.legend(loc='best')
plt.show()

Beyond roughly gamma=0.001 the model is essentially overfitting.

③ Tuning C, kernel, and gamma together with a grid search (GridSearchCV)

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets, svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    test_size=0.2, random_state=0)

# Parameter grid: two model families, 4 + 8 = 12 combinations in total
param_grid = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},  # the linear kernel has no gamma parameter
    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},  # Gaussian (RBF) kernel
]
svm_model = svm.SVC()

# Hand the grid and the model to GridSearchCV for an automatic search
clf = GridSearchCV(svm_model, param_grid, cv=5)
clf.fit(X_train, y_train)

# Retrieve the best model found
best_model = clf.best_estimator_
# Inspect the best hyperparameter configuration
print(clf.best_params_)  # {'C': 10, 'kernel': 'linear'}

# Predict with the best model
y_pred = best_model.predict(X_test)
print('accuracy', accuracy_score(y_test, y_pred))  # accuracy 1.0

5. Feature selection

(Dimensionality reduction, e.g. PCA: compute the covariance matrix of the data, obtain its eigenvalues and eigenvectors, sort the eigenvalues in descending order, and keep the eigenvectors belonging to the largest ones as the new basis. The resulting features are not any of the original features; they are a mapping of the original features onto the new basis.)
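As a point of comparison, here is a minimal PCA sketch on the iris data; sklearn.decomposition.PCA performs the covariance eigendecomposition described above internally (the choice of 2 components is just for illustration):

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

X = load_iris().data
pca = PCA(n_components=2)      # keep the 2 directions with the largest eigenvalues
X_new = pca.fit_transform(X)   # projections onto the new basis, not original columns
print(X_new.shape)             # (150, 2)
print(pca.explained_variance_ratio_)  # share of variance captured by each component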

(Feature selection, by contrast, keeps one or several of the original features unchanged.)

① Removing low-variance features

# 1. Remove features with low variance
from sklearn.feature_selection import VarianceThreshold

# 6 samples, 3-dimensional feature vectors
X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]

# Drop boolean features that take the same value in more than 80% of the samples;
# for a Bernoulli feature the variance is var_thresh = p(1 - p)
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
print(sel.fit_transform(X))

Two of the three feature dimensions are kept. Output:

[[0 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]]

② Univariate statistical feature selection

# 2. Univariate statistical feature selection
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

iris = load_iris()
X, y = iris.data, iris.target
print('original features:')
print(X.shape)
print(X[:5, :])

# Use the chi-squared statistic to select 2 of the 4 feature dimensions
X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
print('selected features:')
print(X_new.shape)
print(X_new[:5, :])

Output:

original features:
(150, 4)
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
selected features:
(150, 2)
[[1.4 0.2]
 [1.4 0.2]
 [1.3 0.2]
 [1.5 0.2]
 [1.4 0.2]]

③ Model-based feature selection

# 3. Model-based feature selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel

iris = load_iris()
X, y = iris.data, iris.target
print('original features:')
print(X.shape)
print(X[:5, :])

clf = RandomForestClassifier()
clf = clf.fit(X, y)
print('feature scores:')
print(clf.feature_importances_)

# Select features based on the trained random forest.
# prefit=True means the model is already fitted; with the default threshold
# ('mean'), features whose importance exceeds the mean importance are kept,
# which here leaves two features
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
print('selected features:')
print(X_new.shape)
print(X_new[:5, :])

Output:

original features:
(150, 4)
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
feature scores:
[0.0985073  0.03100764 0.43976362 0.43072144]
selected features:
(150, 2)
[[1.4 0.2]
 [1.4 0.2]
 [1.3 0.2]
 [1.5 0.2]
 [1.4 0.2]]

6. Additional evaluation metrics

① Area under the curve (AUC). The AUC is the area under the ROC (receiver operating characteristic) curve. It is used for binary classification when the model outputs a probability rather than a hard label.

② Log loss (logloss).
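A minimal sketch of both metrics; the labels and predicted probabilities below are made up for illustration (in practice the probabilities would come from something like model.predict_proba):

from sklearn.metrics import roc_auc_score, log_loss

y_true = [0, 0, 1, 1]           # ground-truth binary labels
y_prob = [0.1, 0.4, 0.35, 0.8]  # predicted probability of the positive class

print(roc_auc_score(y_true, y_prob))  # AUC: area under the ROC curve (0.75 here)
print(log_loss(y_true, y_prob))       # logloss: penalizes confident wrong predictions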
