-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript.R
92 lines (69 loc) · 1.99 KB
/
script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# load library ---
library(dplyr)
library(factoextra)
library(FactoMineR)
# load data ----
# Read the property records; strings stay character so the numeric-looking
# columns can be coerced explicitly during cleansing.
property <- read.csv("data_input/nyc.csv", stringsAsFactors = FALSE)

# inspect data structure ----
str(property)

# data cleansing ----
# Drop the X, BOROUGH, BLOCK, LOT and ZIP.CODE columns, coerce the two area
# columns and the sale price to integer, keep only the integer columns and
# discard every row that still contains a missing value.
# NOTE(review): as.integer() turns anything it cannot represent (unparseable
# text, values beyond the integer range) into NA, so those rows are dropped by
# the complete.cases() filter -- confirm this is intended for SALE.PRICE.
ppt <- property %>%
  select(-c(X, BOROUGH, BLOCK, LOT, ZIP.CODE)) %>%
  mutate(
    LAND.SQUARE.FEET = as.integer(LAND.SQUARE.FEET),
    GROSS.SQUARE.FEET = as.integer(GROSS.SQUARE.FEET),
    SALE.PRICE = as.integer(SALE.PRICE)
  ) %>%
  select_if(is.integer) %>%
  filter(complete.cases(.))
head(ppt)

# data pre-process ----
# Centre every column and scale it to unit variance.
ppt_z <- scale(ppt)

# find eigen ----
# Eigen decomposition of the covariance matrix of the standardised data; this
# is the manual counterpart of the prcomp() call below.
eigen_ppt <- eigen(cov(ppt_z))
# eigen value
eigen_ppt$values
# eigen vector
eigen_ppt$vectors

# principal component analysis ----
# `scale.` is the full argument name; spelling it out avoids partial matching.
ppt.pca <- prcomp(ppt, scale. = TRUE)
summary(ppt.pca)

# visualize pca ----
# Work on the first 300 rows only. head() never pads with NA rows when fewer
# than 300 rows survived the cleansing step.
ppt.small <- head(ppt, 300)
# Fit the PCA once and reuse it for both the biplot and the printed summary.
ppt.small_pca <- prcomp(ppt.small, scale. = TRUE)
biplot(ppt.small_pca, cex = 0.5)
ppt.small_pca
# K-Means Clustering ----
# load data
# Read the whisky table and drop the RowID, Postcode, Latitude and Longitude
# columns.
whiskies <- read.csv("data_input/whiskies.txt") %>%
  select(-c(RowID, Postcode, Latitude, Longitude))
head(whiskies)
# rownames(whiskies) <- whiskies[,1]
# whiskies <- whiskies %>%
# select(-Distillery)
str(whiskies)

# scale data
# The clustering and PCA steps further down need an all-numeric table.
# NOTE(review): the commented-out lines above suggest a non-numeric Distillery
# column is still present in `whiskies`; it is left there (so it remains
# available for labelling) and only the numeric columns go into `whiskies.z`.
# as.numeric() flattens the one-column matrix returned by scale() so each
# column stays a plain numeric vector.
whiskies.z <- whiskies %>%
  select_if(is.numeric) %>%
  mutate_if(is.numeric, ~ as.numeric(scale(.x)))

# elbow plot
set.seed(100)
# Draw an elbow plot: total within-cluster sum of squares of a k-means fit for
# every number of clusters from 1 to `maxCluster`.
#
# data        numeric data frame or matrix, one observation per row
# maxCluster  largest number of clusters to try (default 9); must be >= 1
#
# Returns (invisibly) the numeric vector of within-group sums of squares, where
# element k belongs to the solution with k clusters. kmeans() uses random
# starting centres, so call set.seed() beforehand for reproducible curves.
wss <- function(data, maxCluster = 9) {
  stopifnot(length(maxCluster) == 1, maxCluster >= 1)
  # Coerce once so apply() and kmeans() both work on the same numeric matrix.
  data <- as.matrix(data)

  # Preallocate instead of growing the vector inside the loop.
  SSw <- numeric(maxCluster)
  # One cluster: the within sum of squares is the total sum of squares around
  # the column means, i.e. (n - 1) times the summed column variances.
  SSw[1] <- (nrow(data) - 1) * sum(apply(data, 2, var))
  # seq_len(...)[-1] is empty when maxCluster is 1, unlike 2:maxCluster.
  for (i in seq_len(maxCluster)[-1]) {
    SSw[i] <- sum(kmeans(data, centers = i)$withinss)
  }

  plot(
    seq_len(maxCluster), SSw,
    type = "o",
    xlab = "Number of Clusters",
    ylab = "Within groups sum of squares",
    pch = 19
  )
  invisible(SSw)
}
# Draw the elbow plot for the scaled whisky data.
wss(whiskies.z)

# kmeans
set.seed(100)
# kmeans() takes the number of clusters through `centers`; it has no `k`
# argument, so `k = 5` stops with an "unused argument" error.
whis.km <- kmeans(whiskies.z, centers = 5)

# k-means and pca
# Compute the PCA without drawing FactoMineR's default graphs.
whi.pca <- PCA(whiskies.z, graph = FALSE)

# get cluster
# Attach the cluster assignment to the unscaled table as a factor.
whiskies$cluster <- as.factor(whis.km$cluster)

# visualize
# Individuals plot coloured by cluster; the legend colours follow the factor
# levels instead of a hard-coded 1:5.
plot(whi.pca, choix = c("ind"), label = "none", col.ind = whiskies$cluster)
legend(
  "topright",
  levels(whiskies$cluster),
  pch = 19,
  col = seq_len(nlevels(whiskies$cluster))
)