-
Notifications
You must be signed in to change notification settings - Fork 101
/
Copy pathiris.jl
120 lines (106 loc) · 4.02 KB
/
iris.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Classification Test - Iris Data Set
# https://archive.ics.uci.edu/ml/datasets/iris
#
# Exercises the classification API end-to-end on iris (150 samples, 3 classes):
# decision stump, full tree, tree pruning, n-fold CV, random forest, and
# AdaBoost stumps. Accuracy thresholds are deliberately loose except for the
# over-fit full tree, which must reach 100% on its own training data.
@testset "iris.jl" begin
    features, labels = load_data("iris")
    labels = String.(labels)
    classes = sort(unique(labels))
    n = length(labels)

    # Train a decision stump (a depth-1 tree).
    model = build_stump(labels, features)
    preds = apply_tree(model, features)
    cm = confusion_matrix(labels, preds)
    @test cm.accuracy > 0.6
    @test depth(model) == 1
    # Per-sample class probabilities must sum to one (single- and multi-threaded).
    probs = apply_tree_proba(model, features, classes)
    @test reshape(sum(probs, dims=2), n) ≈ ones(n)
    probs_m = apply_tree_proba(model, features, classes, use_multithreading=true)
    @test reshape(sum(probs_m, dims=2), n) ≈ ones(n)

    # Train a full-depth tree classifier (intentionally over-fit: must
    # perfectly memorize the training set).
    model = build_tree(labels, features)
    preds = apply_tree(model, features)
    cm = confusion_matrix(labels, preds)
    @test cm.accuracy == 1.0
    @test length(model) == 9   # leaf count of the fully grown tree
    @test depth(model) == 5
    @test typeof(preds) == Vector{String}
    print_tree(model)
    probs = apply_tree_proba(model, features, classes)
    @test reshape(sum(probs, dims=2), n) ≈ ones(n)
    probs_m = apply_tree_proba(model, features, classes, use_multithreading=true)
    @test reshape(sum(probs_m, dims=2), n) ≈ ones(n)
    # Baseline importances of the unpruned tree, compared against below.
    i1 = impurity_importance(model)
    s1 = split_importance(model)

    # Prune tree to 8 leaves: removes exactly one split, so importances drop
    # by that split's contribution only.
    pruning_purity = 0.9
    pt = prune_tree(model, pruning_purity)
    @test length(pt) == 8
    preds = apply_tree(pt, features)
    cm = confusion_matrix(labels, preds)
    @test 0.99 < cm.accuracy < 1.0
    i2 = impurity_importance(pt)
    s2 = split_importance(pt)
    # The pruned split covered 48 samples (47/1); its entropy gain, normalized
    # by n = 150, is what feature 4 loses.
    @test isapprox(i2, i1 .+ [0, 0, 0, (47*log(47/48) + log(1/48))/150])
    @test s1 == s2 .+ [0, 0, 0, 1]

    # Prune tree to 3 leaves.
    pruning_purity = 0.6
    pt = prune_tree(model, pruning_purity)
    @test length(pt) == 3
    preds = apply_tree(pt, features)
    cm = confusion_matrix(labels, preds)
    @test 0.95 < cm.accuracy < 1.0
    # NOTE(review): these proba checks run on the unpruned `model`, not `pt` —
    # possibly intended to use the pruned tree; the sum-to-one invariant holds
    # either way, so behavior is preserved here. Confirm intent upstream.
    probs = apply_tree_proba(model, features, classes)
    @test reshape(sum(probs, dims=2), n) ≈ ones(n)
    probs_m = apply_tree_proba(model, features, classes, use_multithreading=true)
    @test reshape(sum(probs_m, dims=2), n) ≈ ones(n)

    # Prune tree to a stump (2 leaves).
    pruning_purity = 0.5
    pt = prune_tree(model, pruning_purity)
    @test length(pt) == 2
    preds = apply_tree(pt, features)
    cm = confusion_matrix(labels, preds)
    @test 0.66 < cm.accuracy < 1.0

    # Run n-fold cross validation for pruned tree.
    println("\n##### nfoldCV Classification Tree #####")
    nfolds = 3
    accuracy = nfoldCV_tree(labels, features, nfolds; rng=StableRNG(1))
    @test mean(accuracy) > 0.8

    # Train random forest classifier; seeded RNG keeps the run deterministic.
    n_trees = 10
    n_subfeatures = 2
    partial_sampling = 0.5
    model = build_forest(labels, features, n_subfeatures, n_trees, partial_sampling; rng=StableRNG(2))
    preds = apply_forest(model, features)
    cm = confusion_matrix(labels, preds)
    @test cm.accuracy > 0.95
    @test typeof(preds) == Vector{String}
    probs = apply_forest_proba(model, features, classes)
    @test reshape(sum(probs, dims=2), n) ≈ ones(n)
    # Multithreaded prediction must agree exactly with single-threaded.
    preds_MT = apply_forest(model, features, use_multithreading=true)
    cm_MT = confusion_matrix(labels, preds_MT)
    @test cm_MT.accuracy > 0.95
    @test typeof(preds_MT) == Vector{String}
    @test sum(preds .!= preds_MT) == 0

    # Run n-fold cross validation for forests.
    println("\n##### nfoldCV Classification Forest #####")
    n_subfeatures = 2
    n_trees = 10
    # Was `n_folds = 3`, a dead local: the call below reads `nfolds`, which
    # silently reused the value set before the tree CV above. Name fixed so
    # the assignment actually feeds the call.
    nfolds = 3
    partial_sampling = 0.5
    accuracy = nfoldCV_forest(labels, features, nfolds, n_subfeatures, n_trees, partial_sampling; rng=StableRNG(1))
    @test mean(accuracy) > 0.9

    # Train adaptive-boosted decision stumps.
    n_iterations = 15
    model, coeffs = build_adaboost_stumps(labels, features, n_iterations; rng=StableRNG(1))
    preds = apply_adaboost_stumps(model, coeffs, features)
    cm = confusion_matrix(labels, preds)
    @test cm.accuracy > 0.9
    @test typeof(preds) == Vector{String}
    probs = apply_adaboost_stumps_proba(model, coeffs, features, classes)
    @test reshape(sum(probs, dims=2), n) ≈ ones(n)

    # Run n-fold cross validation for boosted stumps, using 15 iterations and
    # 3 folds. (Comment previously said 7 iterations; the code uses 15.)
    println("\n##### nfoldCV Classification Adaboosted Stumps #####")
    n_iterations = 15
    nfolds = 3
    accuracy = nfoldCV_stumps(labels, features, nfolds, n_iterations; rng=StableRNG(1))
    @test mean(accuracy) > 0.85
end # @testset