-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLogRegProject.scala
109 lines (81 loc) · 4.23 KB
/
LogRegProject.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
//////////////////////////////////////////////
// LOGISTIC REGRESSION PROJECT //////////////
////////////////////////////////////////////
// In this project we will be working with a fake advertising data set, indicating whether or not a particular internet user clicked on an Advertisement. We will try to create a model that will predict whether or not they will click on an ad based off the features of that user.
// This data set contains the following features:
// 'Daily Time Spent on Site': consumer time on site in minutes
// 'Age': customer age in years
// 'Area Income': Avg. Income of geographical area of consumer
// 'Daily Internet Usage': Avg. minutes a day consumer is on the internet
// 'Ad Topic Line': Headline of the advertisement
// 'City': City of consumer
// 'Male': Whether or not consumer was male
// 'Country': Country of consumer
// 'Timestamp': Time at which consumer clicked on Ad or closed window
// 'Clicked on Ad': 0 or 1, indicating whether the consumer clicked on the Ad
///////////////////////////////////////////
// COMPLETE THE COMMENTED TASKS BELOW ////
/////////////////////////////////////////
////////////////////////
/// GET THE DATA //////
//////////////////////
// Import SparkSession and Logistic Regression
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{VectorAssembler,OneHotEncoder,StringIndexer,VectorIndexer}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.Pipeline
// Optional: Use the following code below to set the Error reporting
import org.apache.log4j._
// Only show ERROR-level messages from loggers under "org" to keep the console readable
Logger.getLogger("org").setLevel(Level.ERROR)
// Obtain a SparkSession through the builder
val spark = SparkSession.builder().getOrCreate()
// Load advertising.csv as a DataFrame: the first row is the header and column
// types are inferred from the data. The chain is wrapped in parentheses so it is
// read as a single statement.
val data = (
  spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("advertising.csv")
)
import spark.implicits._
// Show the column names and inferred types
data.printSchema()
///////////////////////
/// Display Data /////
/////////////////////
// Print out a sample row of the data (multiple ways to do this)
// Print the first row explicitly. A bare `data.head(1)` only shows output when a
// REPL echoes the expression result; otherwise the value is silently discarded.
data.head(1).foreach(println)
////////////////////////////////////////////////////
//// Setting Up DataFrame for Machine Learning ////
//////////////////////////////////////////////////
// Do the Following:
// - Rename the Clicked on Ad column to "label"
// - Grab the following columns "Daily Time Spent on Site","Age","Area Income","Daily Internet Usage","Timestamp","Male"
// - Create a new column called Hour from the Timestamp containing the Hour of the click
// `hour` is defined in org.apache.spark.sql.functions. Import it explicitly so the
// script does not rely on whatever the shell happens to pre-import.
import org.apache.spark.sql.functions.hour
// Keep the target column (renamed to "label") plus the columns needed to build features
val df = data.select(
  $"Clicked on Ad".as("label"),
  $"Daily Time Spent on Site",
  $"Age",
  $"Area Income",
  $"Daily Internet Usage",
  $"Timestamp",
  $"Male"
)
// Add an "Hour" column holding the hour component extracted from Timestamp
val modified_df = df.withColumn("Hour", hour($"Timestamp"))
// Import VectorAssembler and Vectors
// Create a new VectorAssembler object called assembler with the feature
// columns as the input. Set the output column to be called features.
// Combine the numeric input columns into a single vector column named "features"
val assembler = {
  val featureColumns = Array(
    "Daily Time Spent on Site",
    "Age",
    "Area Income",
    "Daily Internet Usage",
    "Male",
    "Hour"
  )
  new VectorAssembler()
    .setInputCols(featureColumns)
    .setOutputCol("features")
}
// Use randomSplit to create a train test split of 70/30
// Randomly split the rows with weights 0.7 (train) and 0.3 (test), using a fixed seed of 46
val Array(train, test) = modified_df.randomSplit(Array(0.7, 0.3), 46L)
///////////////////////////////
// Set Up the Pipeline ///////
/////////////////////////////
// Import Pipeline
// Create a new LogisticRegression object called lr
// Logistic regression estimator, left at its default parameters
val lr = new LogisticRegression()
// Pipeline that first assembles the feature vector and then applies the classifier
val pipeline = (
  new Pipeline()
    .setStages(Array(assembler, lr))
)
// Fit both pipeline stages on the training rows
val model = pipeline.fit(train)
// Run the fitted pipeline on the held-out test rows
val results = model.transform(test)
////////////////////////////////////
//// MODEL EVALUATION /////////////
//////////////////////////////////
// For Metrics and Evaluation import MulticlassMetrics
// Convert the test results to an RDD using .as and .rdd
// Extract (prediction, label) pairs as an RDD of Double tuples for MulticlassMetrics
val predictionAndLabels = (
  results
    .select("prediction", "label")
    .as[(Double, Double)]
    .rdd
)
// Build the metrics object from the prediction/label pairs
val metrics = new MulticlassMetrics(predictionAndLabels)
// Report the confusion matrix for the test set
println("Confusion Matrix:")
println(metrics.confusionMatrix)