Skip to content
Snippets Groups Projects
tree-test.py 3.98 KiB
Newer Older
johannes bilk's avatar
johannes bilk committed
import numpy as np
import sys, random
from matplotlib import pyplot as plt
from machineLearning.rf import (
    DecisionTree,
    Gini, Entropy, MAE, MSE,
    Mode, Mean,
    CART, ID3, C45,
    UsersChoice, Variance, Random, MutualInformation, ANOVA, KendallTau
)
from machineLearning.metric import ConfusionMatrix
from machineLearning.utility import Time
from machineLearning.settings.treeSettings import TreeSettings
from machineLearning.data import Data
johannes bilk's avatar
johannes bilk committed


def dataShift(dims):
    """Return `dims` random offsets drawn from a shuffled base pattern.

    The base pattern [0.25, 0.5, 0.25] is zero-padded, shuffled in
    place with `random.shuffle`, and truncated to the requested length.
    """
    baseOffsets = [0.25, 0.5, 0.25]
    padding = abs(len(baseOffsets) - dims)
    baseOffsets += [0] * padding
    random.shuffle(baseOffsets)
    return baseOffsets[:dims]


def getImurity(impurity: str):
    """Map an impurity name to a freshly constructed impurity measure.

    Parameters
    ----------
    impurity : str
        One of 'gini', 'entropy', 'mae' or 'mse'.

    Returns
    -------
    The matching impurity-measure object (Gini, Entropy, MAE or MSE).

    Raises
    ------
    ValueError
        If `impurity` is not a recognized name (previously this fell
        through and silently returned None, which only failed later
        inside the tree setup).
    """
    measures = {
        'gini': Gini,        # Gini index
        'entropy': Entropy,  # information entropy
        'mae': MAE,          # mean absolute error
        'mse': MSE,          # mean squared error
    }
    try:
        return measures[impurity]()
    except KeyError:
        raise ValueError(f"unknown impurity measure: {impurity!r}") from None


def getLeaf(leaf: str):
    """Map a leaf-function name to a freshly constructed leaf object.

    Parameters
    ----------
    leaf : str
        Either 'mode' (classification-style leaves) or 'mean'
        (regression-style leaves).

    Returns
    -------
    The matching leaf object (Mode or Mean).

    Raises
    ------
    ValueError
        If `leaf` is not a recognized name (previously this fell
        through and silently returned None).
    """
    leaves = {
        'mode': Mode,  # majority vote at the leaf
        'mean': Mean,  # average value at the leaf
    }
    try:
        return leaves[leaf]()
    except KeyError:
        raise ValueError(f"unknown leaf function: {leaf!r}") from None


def getSplit(split: str, percentile: int = None):
    """Map a split-algorithm name to a constructed splitter.

    Parameters
    ----------
    split : str
        One of 'id3', 'c45' or 'cart'.
    percentile : int, optional
        Forwarded verbatim to the splitter's constructor.

    Returns
    -------
    The matching splitter object (ID3, C45 or CART).

    Raises
    ------
    ValueError
        If `split` is not a recognized name (previously this fell
        through and silently returned None).
    """
    splitters = {
        'id3': ID3,    # ID3 algorithm
        'c45': C45,    # C4.5 algorithm
        'cart': CART,  # CART algorithm
    }
    try:
        return splitters[split](percentile)
    except KeyError:
        raise ValueError(f"unknown split algorithm: {split!r}") from None


def getFeatureSelection(selection: str, *args):
    """Map a feature-selection name to a constructed selection strategy.

    Parameters
    ----------
    selection : str
        One of 'choice', 'variance', 'random', 'mutual', 'anova'
        or 'kendall'.
    *args
        Forwarded verbatim to the strategy's constructor.

    Returns
    -------
    The matching feature-selection object.

    Raises
    ------
    ValueError
        If `selection` is not a recognized name (previously this fell
        through and silently returned None).
    """
    strategies = {
        'choice': UsersChoice,
        'variance': Variance,
        'random': Random,
        'mutual': MutualInformation,
        'anova': ANOVA,
        'kendall': KendallTau,
    }
    try:
        return strategies[selection](*args)
    except KeyError:
        raise ValueError(f"unknown feature selection: {selection!r}") from None


if __name__ == "__main__":
    # Load tree settings; an optional config-file path may be passed as
    # the first CLI argument (no argument keeps the defaults).
    settings = TreeSettings()
    try:
        configFile = sys.argv[1]
        settings.getConfig(configFile)
        settings.setConfig()
    except IndexError:
        pass  # no config file supplied — run with the default settings
    print(settings)

    # Timer object used to record each phase's wall-clock duration
    timer = Time()

    print("Importing data...\n")
    timer.start()
    data = Data(trainAmount=settings['trainAmount'], evalAmount=settings['validAmount'], dataPath=settings['dataPath'], normalize=settings['normalize'])
    data.inputFeatures(*settings['features'])
    data.importData(*settings['dataFiles'])
    print(data)
    timer.record("Importing Data")

    # Assemble the decision tree from its configurable components:
    # impurity measure, leaf function and split algorithm.
    timer.start()
    print('Setting up tree')
    tree = DecisionTree(settings['depth'], settings['minSamples'])
    tree.setComponent(getImurity(settings['impurity']))
    tree.setComponent(getLeaf(settings['leaf']))
    tree.setComponent(getSplit(settings['split'], settings['percentile']))
    if settings['featSelection'] is not None:
        # NOTE(review): getFeatureSelection() defined above is never used;
        # the raw settings values are handed to the tree instead — confirm
        # setFeatureSelection resolves the selection strategy itself.
        tree.setFeatureSelection(settings['featSelection'], settings['featParameter'])
    timer.record("Tree setup")

    # Train on the training split; labels are one-hot encoded, so
    # argmax(1) converts them to integer class indices.
    timer.start()
    print('begin training...')
    tree.train(data.trainSet.data, data.trainSet.labels.argmax(1))
    timer.record("Training")

    # Predict on the evaluation split
    timer.start()
    print('making predictions...')
    prediction = tree.eval(data.evalSet.data)
    timer.record("Prediction")

    # Print the trained decision tree
    print(tree)
    print()

    # Score the predictions with a two-class confusion matrix
    confusion = ConfusionMatrix(2)
    confusion.update(prediction, data.evalSet.labels.argmax(1))
    confusion.percentages()
    confusion.calcScores()

    # Report the evaluation scores and the per-phase timings
    print(confusion)
    print()
    print(timer)