# (removed: "Newer"/"Older" pagination links and editor line-number residue from web extraction)
import numpy as np
import sys, random
from matplotlib import pyplot as plt
from rf.decisionTree import DecisionTree
from rf.impurityMeasure import Gini, Entropy, MAE, MSE
from rf.leafFunction import Mode, Mean
from rf.splitAlgorithm import CART, ID3, C45
from rf.featureSelection import UsersChoice, Variance, Random, MutualInformation, ANOVA, KendallTau
from metric.confusionMatrix import ConfusionMatrix
from utility.timer import Time
from settings.treeSettings import TreeSettings
from data.data import Data
def dataShift(dims):
    """Return a length-`dims` random arrangement of the base offsets.

    The pool starts as [0.25, 0.5, 0.25], is zero-padded so it holds at
    least `dims` entries, shuffled in place, and truncated to `dims`.
    """
    offsets = [0.25, 0.5, 0.25]
    # Pad with zeros so the pool is long enough before shuffling.
    offsets += [0] * abs(len(offsets) - dims)
    random.shuffle(offsets)
    return offsets[:dims]
def getImurity(impurity: str):
    """Map an impurity-measure name to a freshly constructed measure.

    Returns None for an unrecognized name (matching the original
    fall-through behavior). Only the selected class is instantiated.
    """
    measures = {
        'gini': Gini,        # Gini index
        'entropy': Entropy,  # Entropy
        'mae': MAE,          # Mean absolute error
        'mse': MSE,          # Mean squared error
    }
    cls = measures.get(impurity)
    return cls() if cls is not None else None
def getLeaf(leaf: str):
    """Map a leaf-function name to a fresh leaf instance.

    Returns None for an unrecognized name (matching the original
    fall-through behavior).
    """
    factories = {
        'mode': Mode,  # majority vote at the leaf
        'mean': Mean,  # average at the leaf
    }
    cls = factories.get(leaf)
    return cls() if cls is not None else None
def getSplit(split: str, percentile: int = None):
    """Map a split-algorithm name to an instance configured with `percentile`.

    Returns None for an unrecognized name (matching the original
    fall-through behavior).
    """
    algorithms = {
        'id3': ID3,
        'c45': C45,
        'cart': CART,
    }
    cls = algorithms.get(split)
    return cls(percentile) if cls is not None else None
def getFeatureSelection(selection: str, *args):
    """Map a feature-selection name to an instance built from `*args`.

    Returns None for an unrecognized name (matching the original
    fall-through behavior).
    """
    strategies = {
        'choice': UsersChoice,
        'variance': Variance,
        'random': Random,
        'mutual': MutualInformation,
        'anova': ANOVA,
        'kendall': KendallTau,
    }
    cls = strategies.get(selection)
    return cls(*args) if cls is not None else None
if __name__ == "__main__":
    # Entry point: load settings, import data, build/train a decision tree,
    # evaluate it, and report a confusion matrix plus stage timings.
    settings = TreeSettings()
    try:
        # An optional config-file path may be given as the first CLI argument.
        configFile = sys.argv[1]
        settings.getConfig(configFile)
        settings.setConfig()
    except IndexError:
        # No config path supplied; proceed with the default settings.
        pass
    print(settings)
    # Create a timer object to measure execution time
    timer = Time()
    print("Importing data...\n")
    timer.start()
    data = Data(trainAmount=settings['trainAmount'], evalAmount=settings['validAmount'], dataPath=settings['dataPath'], normalize=settings['normalize'])
    data.inputFeatures(*settings['features'])
    data.importData(*settings['dataFiles'])
    print(data)
    timer.record("Importing Data")
    # Create and train a decision tree
    timer.start()
    print('Setting up tree')
    tree = DecisionTree(settings['depth'], settings['minSamples'])
    # Plug in the configured impurity measure, leaf function, and split
    # algorithm via the factory helpers defined above.
    tree.setComponent(getImurity(settings['impurity']))
    tree.setComponent(getLeaf(settings['leaf']))
    tree.setComponent(getSplit(settings['split'],settings['percentile']))
    if settings['featSelection'] is not None:
        # NOTE(review): this passes the raw selection name straight through;
        # the getFeatureSelection() factory above is never called anywhere.
        # Confirm setFeatureSelection expects a name + parameter rather than
        # a constructed strategy object.
        tree.setFeatureSelection(settings['featSelection'], settings['featParameter']) # Use random feature selection
    timer.record("Tree setup")
    # Train the tree using the training data
    timer.start()
    print('begin training...')
    #tree.train(trainData,trainLabels)
    # Labels are one-hot; argmax(1) converts them to class indices.
    tree.train(data.trainSet.data,data.trainSet.labels.argmax(1))
    timer.record("Training")
    # Evaluate the tree on the validation data
    timer.start()
    print('making predictions...')
    #prediction = tree.eval(validData)
    prediction = tree.eval(data.evalSet.data)
    timer.record("Prediction")
    # Print the trained decision tree
    print(tree)
    print()
    # Compute confusion matrix to evaluate the performance of the decision tree
    # (2 classes -- presumably binary classification; verify against the data)
    confusion = ConfusionMatrix(2)
    #confusion.update(prediction, validLabels)
    confusion.update(prediction, data.evalSet.labels.argmax(1))
    confusion.percentages()
    confusion.calcScores()
    # Print the confusion matrix and the per-stage timing summary
    print(confusion)
    print()
    print(timer)