This page documents sections of the MLlib guide for the RDD-based API (the spark.mllib package). Please see the MLlib Main Guide for the DataFrame-based API (the spark.ml package), which is now the primary API for MLlib.
import org.apache.log4j.PropertyConfigurator
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SQLContext, types}
import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import scala.io.Source
object Regression extends App {
  // Entry point: trains an elastic-net linear regression (spark.ml) on a local
  // text file whose lines look like "<label>,<f1> <f2> <f3> ...".
  val conf = new SparkConf().setAppName("regression")
  val sc = new SparkContext(conf)
  val sqc = new SQLContext(sc)

  // Parse each line into a LabeledPoint. getLines() is lazy, so we must force
  // the iterator (toList) BEFORE closing the file, and we close it in a
  // finally block so the handle is released even if parsing throws.
  // NOTE(review): this uses mllib.regression.LabeledPoint with an
  // ml.regression.LinearRegression — verify the Spark version accepts the
  // mllib vector type for the "features" column.
  val source = Source.fromFile("data/data1.txt")
  val data: List[LabeledPoint] =
    try {
      source.getLines().map { line =>
        val parts = line.split(",")
        val features = Vectors.dense(parts(1).split(" ").map(_.toDouble))
        val label = parts(0).toDouble
        LabeledPoint(label, features)
      }.toList // materialize while the file is still open
    } finally source.close()

  // Convert the parsed points into a DataFrame with (label, features) columns.
  val df = sqc.createDataFrame(sc.parallelize(data))

  val lr = new LinearRegression()
    .setMaxIter(10)        // maximum number of optimizer iterations
    .setRegParam(0.3)      // regularization strength
    .setElasticNetParam(0.8) // ElasticNet mixing: 0 = L2 (ridge), 1 = L1 (lasso)

  // Fit the model on the training DataFrame.
  val lrModel = lr.fit(df)

  // Print the learned coefficients and intercept.
  println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

  // Summarize the model over the training set and print out some metrics.
  val trainingSummary = lrModel.summary
  println(s"numIterations: ${trainingSummary.totalIterations}")
  println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
  trainingSummary.residuals.show()
  println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
  println(s"r2: ${trainingSummary.r2}")
}
其实代码很少,整个过程都很简单,官方文档已经写得很清楚了,唯一的难点就是
val lrModel = lr.fit(df)
这时的df是DataFrame格式,官方文档是利用
val df=spark.read.format("libsvm")
.load("data/mllib/sample_linear_regression_data.txt")
// Build LabeledPoints lazily from lines of the form "<label>,<f1> <f2> ...".
val date = Source.fromFile("data/data1.txt").getLines().map { line =>
  val fields = line.split(",")
  val label = fields(0).toDouble
  val features = Vectors.dense(fields(1).split(" ").map(_.toDouble))
  LabeledPoint(label, features)
}