-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.jl
139 lines (119 loc) · 5.56 KB
/
main.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
using DelimitedFiles, DataFrames, CSV
using Random, Printf
using ThreadTools
include("src/parzenwindow.jl")
include("src/gmm.jl")
include("src/eval_metrics.jl")
"""
    main(show_report::Bool, dataset_name::String)

Load a dataset, fit a Gaussian mixture model (GMM) and a Parzen window estimate (PWE) on
the training split, select their hyperparameters on the validation split and evaluate both
models on the test split.

The data are read from `data_anomalyproject/<dataset_name>/normal.txt` and
`data_anomalyproject/<dataset_name>/anomalous.txt`. Both matrices are transposed after
loading, so every column is one sample. The columns of the normal data are shuffled and
split into three disjoint parts: the first half is used for training, the next quarter for
validation and the remainder for testing (split points are rounded). All anomalous samples
are appended to the test part; normal samples are labelled `true`, anomalous ones `false`.

# Arguments
- `show_report::Bool`: forwarded to `eval_report`; when `true` the whole evaluation report
  is printed.
- `dataset_name::String`: name of the dataset directory inside `data_anomalyproject/`.

# Returns
A tuple `(gmm_auc, parzen_auc)` of `Float64` values returned by `eval_report` for the GMM
and the PWE (the ROC-AUC of each model on the test data).

# Throws
- `ArgumentError` if the dataset has too few normal samples to produce non-empty train,
  validation and test splits.
- `ErrorException` if no candidate model yields a comparable (non-NaN) likelihood.

# Examples
```julia
g, p = main(true, "yeast")
```
"""
function main(show_report::Bool, dataset_name::String)
    # Load all data. The adjoint turns the rows of the text files into columns.
    data_anomal_raw = readdlm(string("data_anomalyproject/", dataset_name, "/anomalous.txt"))'
    data_normal_raw = readdlm(string("data_anomalyproject/", dataset_name, "/normal.txt"))'
    data_anomal::Matrix{Float64} = data_anomal_raw[:, shuffle(1:end)]
    data_normal::Matrix{Float64} = data_normal_raw[:, shuffle(1:end)]

    # Prepare all data.
    N::Int64 = size(data_normal, 1)         # dimension of one sample
    N_normal::Int64 = size(data_normal, 2)  # number of normal samples
    train_end = round(Int, N_normal / 2)
    valid_end = round(Int, 3 * N_normal / 4)
    # The ranges are disjoint: a boundary sample must not be shared between two splits,
    # otherwise hyperparameters are tuned (and models evaluated) on already-seen data.
    trn_data::Matrix{Float64} = data_normal[:, 1:train_end]
    valid_data::Matrix{Float64} = data_normal[:, (train_end + 1):valid_end]
    test_data_n::Matrix{Float64} = data_normal[:, (valid_end + 1):end]
    if isempty(trn_data) || isempty(valid_data) || isempty(test_data_n)
        throw(ArgumentError(string(
            "dataset \"", dataset_name, "\" has only ", N_normal,
            " normal samples, which is not enough for a train/validation/test split",
        )))
    end

    # Likelihood proxy: mean log-density over the columns of `data`. Each log value is
    # clamped from below so that a single zero density (log -> -Inf) cannot dominate.
    lt(x) = max(x, -10e5)
    likelihood(model, params, data) =
        mean(tmap(x -> lt(log(model(params, Vector{Float64}(x)))), eachcol(data)))
    likelihood(model, data) =
        mean(tmap(x -> lt(log(model(Vector{Float64}(x)))), eachcol(data)))

    # Kernel used by every Parzen window model; defined once and captured by the closures.
    kernel(x) = k(x)

    # Choose the number of GMM components K in 2:10. Every candidate is created with
    # `create_gmm`, trained on `trn_data` with `EM!` and scored by the likelihood proxy on
    # `data` (the validation set). Returns `(model, params)` of the best candidate. The
    # local names deliberately differ from the variables of `main`, so that the closure
    # does not overwrite them.
    function choose_gmm_model(data::Matrix{Float64})
        best_lh = -Inf
        best_model = nothing
        best_ps = nothing
        for K::Int64 in 2:10
            ps::Dict{Symbol, Vector}, model, gm_model = create_gmm(K, N)  # prepare model
            EM!(ps, trn_data, K, model, gm_model, 60)                     # learn model params
            lh::Float64 = likelihood(model, ps, data)
            # `>=` lets a later candidate win ties; a NaN likelihood never compares true.
            if lh >= best_lh
                best_lh, best_model, best_ps = lh, model, ps
            end
        end
        best_model === nothing &&
            error("no GMM candidate produced a comparable (non-NaN) likelihood")
        return best_model, best_ps
    end

    # Choose the Parzen window size from 0.01:0.01:10. Every candidate is scored by the
    # likelihood proxy on `data` (the validation set). Returns the best window size.
    function choose_parzenwindow_model(data::Matrix{Float64})
        best_lh = -Inf
        best_h = NaN
        for step::Float64 in 0.01:0.01:10
            density(x) = create_parzen_window(step, trn_data, kernel, x)  # prepare model
            lh::Float64 = likelihood(density, data)
            # `>=` lets a later candidate win ties; a NaN likelihood never compares true.
            if lh >= best_lh
                best_lh, best_h = lh, step
            end
        end
        isnan(best_h) &&
            error("no window size produced a comparable (non-NaN) likelihood")
        return best_h
    end

    # Learn GMM on train data and choose the best count of components by validation data.
    gmm_model, params::Dict{Symbol, Vector} = choose_gmm_model(valid_data)
    @printf("Best K: %d\n", size(params[:μ])[1])

    # Learn Parzen window estimation on train data and choose the best window-size by
    # validation data.
    h::Float64 = choose_parzenwindow_model(valid_data)
    parzenwindow(x) = create_parzen_window(h, trn_data, kernel, x)
    @printf("Best window-size: %.3f\n", h)

    # Prepare testing data: normal test samples first (label true), then all anomalies
    # (label false).
    testing_data::Matrix{Float64} = hcat(test_data_n, data_anomal)
    testing_labels::Vector{Bool} = vcat(
        fill(true, size(test_data_n, 2)),
        fill(false, size(data_anomal, 2)),
    )

    # Print evaluation report.
    gmm_auc::Float64 = eval_report(gmm_model, params, testing_data, testing_labels, show_report)
    parzen_auc::Float64 = eval_report(parzenwindow, testing_data, testing_labels, show_report)
    return gmm_auc, parzen_auc
end
"""
    compare_models()

Evaluate both models on every dataset and store the results in a CSV file.

Every entry of `data_anomalyproject/` whose name does not start with a dot is treated as a
dataset name. For each of them `main(false, name)` is run 10 times; progress and the two
returned AUC values are printed after every run. All values are collected into a table with
the columns `gmm` and `parzen` (one row per run) and written to `auc_stat.csv`.

Returns the value returned by `CSV.write`.

# Examples
```julia
compare_models()
```
"""
function compare_models()
    dataset_names::Vector{String} =
        [entry for entry in readdir("data_anomalyproject/") if entry[1] != '.']
    gmm_scores = Float64[]
    parzen_scores = Float64[]

    # Outer loop over datasets, inner loop over the 10 repetitions.
    for dataset in dataset_names, run in 1:10
        println(dataset, " / ", run)
        gmm_auc::Float64, parzen_auc::Float64 = main(false, dataset)
        @printf("gmm auc: %.3f, parzen auc: %.3f\n", gmm_auc, parzen_auc)
        println()
        push!(gmm_scores, gmm_auc)
        push!(parzen_scores, parzen_auc)
    end

    results = DataFrame(gmm = gmm_scores, parzen = parzen_scores)
    return CSV.write("auc_stat.csv", results)
end