-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLinRegExercise.scala
77 lines (59 loc) · 2.96 KB
/
LinRegExercise.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
////////////////////////////////////////////
//// LINEAR REGRESSION EXERCISE ///////////
/// Complete the commented tasks below ///
/////////////////////////////////////////
// Import LinearRegression
import org.apache.spark.ml.regression.LinearRegression
// Session bootstrap: imports first, then the two setup statements.
import org.apache.log4j._
import org.apache.spark.sql.SparkSession
// Optional: restrict log4j output for the "org" logger hierarchy to ERROR
// so the exercise output is not buried under log messages.
Logger.getLogger("org").setLevel(Level.ERROR)
// Obtain the SparkSession (getOrCreate hands back an existing session when there is one).
val spark = SparkSession.builder().getOrCreate()
// Load the Ecommerce Customers CSV into a DataFrame.
// The first row is treated as the header and column types are inferred from the values.
val data = {
  val csvReader = spark.read.format("csv").option("header", "true").option("inferSchema", "true")
  csvReader.load("Ecommerce Customers.csv")
}
// Optional inspection helpers (uncomment whichever you prefer):
// Print the schema of the DataFrame
// data.printSchema()
// Print out an example Row
// data.head(1)
////////////////////////////////////////////////////
//// Setting Up DataFrame for Machine Learning ////
//////////////////////////////////////////////////
// A few things we need to do before Spark can accept the data!
// It needs to be in the form of two columns
// ("label","features")
// Import VectorAssembler and Vectors
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
// Build df: "Yearly Amount Spent" renamed to "label", followed by the four
// numerical predictor columns, in the same order as listed below.
val df = {
  val target = $"Yearly Amount Spent".as("label")
  val predictors = Seq("Avg Session Length", "Time on App", "Time on Website", "Length of Membership").map(name => data(name))
  data.select(target +: predictors: _*)
}
// The ML algorithm trains on a single vector-valued column, not on separate
// numeric columns. A VectorAssembler packs the listed input columns of df
// into one output column named "features".
// Call this object assembler.
val assembler = {
  val featureColumns = Array("Avg Session Length", "Time on App", "Time on Website", "Length of Membership")
  new VectorAssembler().setOutputCol("features").setInputCols(featureColumns)
}
// Run the assembler over df, then keep only the two columns the
// regression needs: label and features.
val output = {
  val assembled = assembler.transform(df)
  assembled.select("label", "features")
}
// Create a Linear Regression estimator object
val lr = new LinearRegression()
// Fit the estimator to the (label, features) data and call the fitted model lrModel
val lrModel = lr.fit(output)
// Print the coefficients and intercept for linear regression.
// Note: this must use the `s` string interpolator. The `$"..."` interpolator
// used elsewhere in this script builds a Spark Column, so println would show a
// column expression instead of the formatted message.
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
// Take the training summary from the fitted model and report its metrics.
// Call the summary object trainingSummary.
val trainingSummary = lrModel.summary
// Residuals are a DataFrame, so they are displayed with show().
println("Residuals:")
trainingSummary.residuals.show()
// The scalar metrics (RMSE, MSE, R^2) are printed one per line as "<name>: <value>".
Seq(
  "RMSE" -> trainingSummary.rootMeanSquaredError,
  "MSE" -> trainingSummary.meanSquaredError,
  "R2" -> trainingSummary.r2
).foreach { case (metric, value) => println(s"$metric: $value") }
// Great Job!