Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import numpy as np
import sys
from rf.randomForrest import RandomForest
from rf.decisionTree import DecisionTree
from rf.impurityMeasure import Gini, Entropy, MAE, MSE
from rf.leafFunction import Mode, Mean
from rf.featureSelection import UsersChoice, Variance, Random, MutualInformation, ANOVA, KendallTau
from rf.splitAlgorithm import CART, ID3, C45
from metric.confusionMatrix import ConfusionMatrix
from utility.timer import Time
from settings.forrestSettings import ForrestSettings
from rf.voting import Majority, Confidence, Average, Median
from data.data import Data
from rf.boosting import AdaBoosting, GradientBoosting
def getImurity(impurity: str):
    """Build the impurity-measure component selected by name.

    Recognised names are 'gini', 'entropy', 'mae' and 'mse', mapping to
    Gini, Entropy, MAE and MSE respectively.

    Returns:
        A new instance of the matching impurity class, or None when the
        name is not one of the recognised values.
    """
    if impurity == 'gini':
        return Gini()
    if impurity == 'entropy':
        return Entropy()
    if impurity == 'mae':
        return MAE()
    if impurity == 'mse':
        return MSE()
    # Unrecognised names fall through without raising.
    return None
def getLeaf(leaf: str):
    """Build the leaf-function component selected by name.

    'mode' yields a Mode instance and 'mean' yields a Mean instance.

    Returns:
        A new instance of the matching leaf class, or None when the name
        is neither 'mode' nor 'mean'.
    """
    if leaf == 'mode':
        return Mode()
    return Mean() if leaf == 'mean' else None
def getSplit(split: str, percentile: int = None):
    """Build the split-algorithm component selected by name.

    Recognised names are 'id3', 'c45' and 'cart', mapping to ID3, C45 and
    CART. The percentile argument is forwarded unchanged as the single
    constructor argument of the chosen class.

    Returns:
        A new instance of the matching split class, or None when the name
        is not one of the recognised values.
    """
    if split == 'id3':
        return ID3(percentile)
    if split == 'c45':
        return C45(percentile)
    if split == 'cart':
        return CART(percentile)
    # Unrecognised names fall through without raising.
    return None
def getVoting(voting: str, weights: list):
    """Build the voting component selected by name.

    Recognised names are 'majority', 'confidence', 'average' and 'median',
    mapping to Majority, Confidence, Average and Median. The weights
    argument is forwarded unchanged as the single constructor argument.

    Returns:
        A new instance of the matching voting class, or None when the name
        is not one of the recognised values.
    """
    if voting == 'majority':
        return Majority(weights)
    if voting == 'confidence':
        return Confidence(weights)
    if voting == 'average':
        return Average(weights)
    if voting == 'median':
        return Median(weights)
    # Unrecognised names fall through without raising.
    return None
def getFeatureSelection(selection: str, *args):
    """Build the feature-selection component selected by name.

    Recognised names are 'choice', 'variance', 'random', 'mutual', 'anova'
    and 'kendall', mapping to UsersChoice, Variance, Random,
    MutualInformation, ANOVA and KendallTau. Any extra positional
    arguments are forwarded unchanged to the chosen constructor.

    Returns:
        A new instance of the matching selector class, or None when the
        name is not one of the recognised values.
    """
    if selection == 'choice':
        return UsersChoice(*args)
    if selection == 'variance':
        return Variance(*args)
    if selection == 'random':
        return Random(*args)
    if selection == 'mutual':
        return MutualInformation(*args)
    if selection == 'anova':
        return ANOVA(*args)
    if selection == 'kendall':
        return KendallTau(*args)
    # Unrecognised names fall through without raising.
    return None
def getBooster(booster: str):
    """Build the boosting component selected by name.

    'adaptive' yields an AdaBoosting instance and 'gradient' yields a
    GradientBoosting instance.

    Returns:
        A new instance of the matching booster class, or None when the
        name is neither 'adaptive' nor 'gradient'.
    """
    if booster == 'adaptive':
        return AdaBoosting()
    return GradientBoosting() if booster == 'gradient' else None
if __name__ == "__main__":
    # Script entry point: load settings, import the data, build and train the
    # forest, predict on the evaluation set and report a confusion matrix
    # together with the recorded timings.
    settings = ForrestSettings()

    # A config file is optional and is taken from the first command-line
    # argument. Only the presence of that argument is checked here, so an
    # IndexError raised while reading or applying the config is no longer
    # silently swallowed.
    if len(sys.argv) > 1:
        configFile = sys.argv[1]
        settings.getConfig(configFile)
        settings.setConfig()
    print(settings)

    # Create a timer object to measure execution time
    timer = Time()

    print("Importing data...\n")
    timer.start()
    data = Data(
        trainAmount=settings['trainAmount'],
        evalAmount=settings['validAmount'],
        dataPath=settings['dataPath'],
        normalize=settings['normalize'],
    )
    data.inputFeatures(*settings['features'])
    data.importData(*settings['dataFiles'])
    print(data)
    timer.record("Importing Data")

    # Set up random forest
    timer.start()
    print("setting up forrest")
    forrest = RandomForest(
        bootstrapping=settings['bootstraping'],
        retrainFirst=settings['retrainFirst'],
    )
    # Booster and voting components are optional; a None setting skips them.
    if settings['booster'] is not None:
        forrest.setComponent(getBooster(settings['booster']))
    if settings['voting'] is not None:
        forrest.setComponent(getVoting(settings['voting'], settings['votingWeights']))

    # Per-tree settings are indexed by tree position, so each of these
    # settings must provide at least numTrees entries.
    for i in range(settings['numTrees']):
        tree = DecisionTree(settings['depth'][i], settings['minSamples'][i])
        tree.setComponent(getImurity(settings['impurity'][i]))
        tree.setComponent(getLeaf(settings['leaf'][i]))
        tree.setComponent(getSplit(settings['split'][i], settings['percentile'][i]))
        if settings['featSelection'][i] is not None:
            tree.setComponent(
                getFeatureSelection(settings['featSelection'][i], settings['featParameter'][i])
            )
        forrest.append(tree)
    timer.record("Forrest setup")

    # Train the random forest; labels are reduced to class indices via argmax
    # over axis 1.
    timer.start()
    print("begin training")
    forrest.train(data.trainSet.data, data.trainSet.labels.argmax(1))
    timer.record("Training")

    # Evaluate the random forest. The whole forest is used for prediction
    # rather than only the last tree built in the loop above, which also
    # avoids a NameError when numTrees is 0.
    timer.start()
    print("making predictions\n")
    prediction = forrest.eval(data.evalSet.data)
    timer.record("Prediction")
    print(forrest)
    print()

    # Calculate and print confusion matrix
    # NOTE(review): the argument 2 is hard-coded; presumably the number of
    # classes -- confirm against ConfusionMatrix before using other datasets.
    confusion = ConfusionMatrix(2)
    confusion.update(prediction, data.evalSet.labels.argmax(1))
    confusion.percentages()
    confusion.calcScores()
    print(confusion)
    print()

    # Print total execution time
    print(timer)