机器学习-回归算法

    科技2022-08-17  98

    回归模型指标

    指标误差 值越小越好

    评估回归模型的性能: * 误差类指标越小越好,代表实际值与预测值相差较小 * - 均方误差(MSE: Mean Squared Error) * 预测值和实际值之间的差异 * 所有样本预测值和实际值之差的平方和,除以样本总数 * - 均方根误差(RMSE: Root Mean Squared Error) * 即均方误差的平方根,量纲与标签一致,结果更便于描述 * - 平均绝对误差(MAE: Mean Absolute Error) * 预测值和实际值之差的绝对值的平均值,衡量预测值与实际值之间的平均差距 * - 均方根对数误差(RMSLE: Root Mean Squared Logarithmic Error)

    package com.mllib

    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionModel, LinearRegressionWithSGD}
    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.{Row, SparkSession}

    import scala.collection.Map
    import scala.util.Try

    /**
     * House-price regression demo built on the RDD-based Spark MLlib API.
     *
     * `main` reads `data/house1.csv`, one-hot encodes the categorical columns,
     * obtains a linear regression model (loaded from `model` when possible,
     * trained and saved otherwise) and prints the first ten
     * `(label, prediction)` pairs of a random 20% test split.
     */
    object HousesRegression {

      /**
       * Column indices that are one-hot encoded as features.
       *
       * NOTE(review): the original built mappings for columns 2..12 but only
       * encoded columns 2..11, leaving the slots of column 12 permanently zero.
       * Both steps now share this range, so the vector length is unchanged.
       * Confirm that column 12 is meant to be a feature.
       */
      private val FeatureColumns: Range = 2 to 12

      /** Column holding the price text; everything before the first "元" is the label. */
      private val LabelColumn = 14
      private val PriceUnit = "元"

      /** Category name used for missing (null or empty) cells. */
      private val MissingCategory = "null"

      private val ModelPath = "model"
      private val NumIterations = 100
      private val DefaultStepSize = 0.03

      /**
       * Program entry point: encode the data, obtain a model, print sample predictions.
       *
       * @param args command-line arguments (unused)
       */
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .appName("house")
          .master("local")
          .getOrCreate()

        try {
          val sc = spark.sparkContext
          // Only print errors; Spark's INFO logging drowns the program output.
          sc.setLogLevel("ERROR")

          val houseData = spark.read
            .option("header", "true") // first line is the header, not data
            .csv("data/house1.csv")
            .rdd // DataFrame -> RDD[Row]

          // 1-of-k encoding: one category -> index map per feature column.
          val mappings = FeatureColumns.map(column => getMapping(houseData, column))
          // offsets(i) is where feature column i starts inside the vector;
          // the last entry is the total vector length.
          val offsets = mappings.scanLeft(0)((total, mapping) => total + mapping.size)
          val cateLength = offsets.last
          val indexedColumns = FeatureColumns.zipWithIndex

          val labelPointRDD = houseData.map { line =>
            val features = Array.ofDim[Double](cateLength)
            for ((column, position) <- indexedColumns) {
              // Same normalisation as getMapping, so every lookup key exists.
              val idx = mappings(position)(normalizeCategory(line.get(column)))
              features(offsets(position) + idx.toInt) = 1.0
            }
            // Throws if the price cell is null or does not start with a number.
            val label = line.getString(LabelColumn).split(PriceUnit)(0).toDouble
            LabeledPoint(label, Vectors.dense(features))
          }

          // Split into training and test sets.
          val Array(trainRDD, testRDD) = labelPointRDD.randomSplit(Array(0.8, 0.2))

          // The original loaded the model and then saved it back to the same
          // path, which fails because the output path already exists. Reuse a
          // stored model when it can be loaded; otherwise train and save once.
          val model = Try(LinearRegressionModel.load(sc, ModelPath)).getOrElse {
            val trained = LinearRegressionWithSGD.train(trainRDD, NumIterations, DefaultStepSize)
            trained.save(sc, ModelPath)
            trained
          }

          val result = testRDD.map(point => (point.label, model.predict(point.features)))
          result.take(10).foreach(println(_))
        } finally {
          // Release the session even when a job fails.
          spark.stop()
        }
      }

      /**
       * Trains one model per candidate step size and prints MSE, RMSE and MAE
       * measured on the test set. Lower values mean a better fit.
       *
       * @param trainRDD labeled points used for training
       * @param testRDD  labeled points used for evaluation; an empty set yields NaN metrics
       */
      def testmodel(trainRDD: RDD[LabeledPoint], testRDD: RDD[LabeledPoint]): Unit = {
        val steps = Array(0.01, 0.02, 0.03)
        for (step <- steps) {
          val model = LinearRegressionWithSGD.train(trainRDD, NumIterations, step)

          // (actual label, predicted value); cached because three actions read it.
          val result = testRDD
            .map(point => (point.label, model.predict(point.features)))
            .cache()

          val resultCount = result.count().toDouble

          // Mean squared error: mean of the squared differences.
          val mse = result.map { case (actual, predicted) =>
            math.pow(actual - predicted, 2)
          }.sum() / resultCount

          // Root mean squared error: same unit as the label, easier to read.
          val rmse = math.sqrt(mse)

          // Mean absolute error: mean of the absolute differences.
          val mae = result.map { case (actual, predicted) =>
            math.abs(actual - predicted)
          }.sum() / resultCount

          println(s"step= $step,mse = $mse,rmse= $rmse,mae = $mae")
          result.unpersist()
        }
      }

      /**
       * Builds the category -> index map for one column.
       *
       * Distinct values are sorted and numbered from 0; null and empty cells
       * share the category "null".
       *
       * @param data  rows of the data set
       * @param index column to read; must hold strings
       * @return map from category name to its position in the one-hot segment
       */
      def getMapping(data: RDD[Row], index: Int): Map[String, Long] =
        data
          .map(line => normalizeCategory(line.getString(index)))
          .distinct()
          .sortBy(category => category)
          .zipWithIndex()
          .collectAsMap()

      /** Maps a raw cell to its category name; null and "" become "null". */
      private def normalizeCategory(value: Any): String =
        Option(value).map(_.toString).filter(_.nonEmpty).getOrElse(MissingCategory)
    }
    Processed: 0.017, SQL: 9