# Testing the Tree

## Importing the Basics

In [None]:
import numpy as np
import random
from matplotlib import pyplot as plt
from rf.decisionTree import DecisionTree
from rf.impurityMeasure import Gini, Entropy, MSE, MAE
from rf.leafFunction import Mode, Mean, Confidence
from rf.splitAlgorithm import CART, ID3, C45
from metric.confusionMatrix import ConfusionMatrix
from metric.regressionScores import RegressionScores
from utility.modelIO import ModelIO
from rf.pruning import ReducedError, CostComplexity, PessimisticError

## Generating Test Data

Here I generate random test data: two blocks, the second one shifted slightly in some dimensions. For classifier tasks each block gets its own label; for regressor tasks each sample gets the sum of its coordinates plus a small random value as a target. It's a very simple dummy data set meant for testing the code.

Here one can change the dimensionality and the amount of data.

In [None]:
def dataShift(dims):
    """Return a randomly ordered list of per-dimension offsets.

    The base offsets 5, 1.5 and 2.5 are padded with zeros up to ``dims``
    entries, shuffled in place with ``random.shuffle`` and truncated to
    ``dims`` entries.

    Args:
        dims: Number of dimensions the offset is needed for (non-negative).

    Returns:
        A list with exactly ``dims`` offsets. For ``dims`` of 3 or more it
        holds the three base offsets plus zeros; for smaller ``dims`` it holds
        a random selection of the base offsets.

    Raises:
        ValueError: If ``dims`` is negative.
    """
    if dims < 0:
        raise ValueError(f"dims must be non-negative, got {dims}")
    offSet = [5, 1.5, 2.5]
    # Pad only when more dimensions than base offsets are requested. Padding
    # by the absolute difference also inserted zeros for dims < 3, so the
    # truncated result could consist of zeros only (no shift at all).
    offSet.extend([0] * max(0, dims - len(offSet)))
    random.shuffle(offSet)
    return offSet[:dims]

# Initialize some parameters
totalAmount = 64000  # number of samples drawn for each of the two blocks
dims = 5
evalAmount = totalAmount // 4
trainAmount = totalAmount - evalAmount
offSet = dataShift(dims)

# Create covariance matrix
cov = np.eye(dims)  # This creates a covariance matrix with variances 1 and covariances 0

# Generate random multivariate data: the first block is centred at the origin,
# the second block is centred at the shuffled offset
oneData = np.random.multivariate_normal(np.zeros(dims), cov, totalAmount)
twoData = np.random.multivariate_normal(offSet, cov, totalAmount)

# Split the data into training and evaluation sets
trainData = np.vstack((oneData[:trainAmount], twoData[:trainAmount]))
validData = np.vstack((oneData[trainAmount:], twoData[trainAmount:]))

# Labels for classification tasks (0 for the first block, 1 for the second)
trainLabels = np.hstack((np.zeros(trainAmount), np.ones(trainAmount)))
validLabels = np.hstack((np.zeros(evalAmount), np.ones(evalAmount)))

# Targets for regression tasks: sum of the coordinates plus Gaussian noise
trainTargets = np.sum(trainData, axis=1) + np.random.normal(0, 0.1, 2*trainAmount)
validTargets = np.sum(validData, axis=1) + np.random.normal(0, 0.1, 2*evalAmount)

# Shuffle the training data. The same permutation has to be applied to the
# samples, the labels and the targets, otherwise the rows no longer line up
# with their supervision signal.
trainIndex = np.random.permutation(len(trainData))
trainData = trainData[trainIndex]
trainLabels = trainLabels[trainIndex]
trainTargets = trainTargets[trainIndex]

## Creating the Tree

Here the tree is created. One can set the maximum depth of the tree. Depending on the task, we add a different impurity function and a different leaf function. Finally we add the split algorithm and set the feature percentile. Higher numbers look at more possible splits, but decrease speed. Lower numbers look at fewer possible splits, speeding up the algorithm. Depending on the data set this can have a strong impact on the performance.

In [None]:
# Select the task: 'classifier' or 'regressor'
task = 'classifier'

# Impurity measure and leaf function for each task, listed in the order in
# which they are registered on the tree.
# Alternative leaf function for classification (left disabled): Confidence
taskComponents = {
    'regressor': (MSE, Mean),
    'classifier': (Entropy, Mode),
}

tree = DecisionTree(maxDepth=5, minSamplesSplit=2)
# An unknown task registers no impurity/leaf component at all
for componentClass in taskComponents.get(task, ()):
    tree.setComponent(componentClass())
# The split algorithm is the same for both tasks
tree.setComponent(CART(featurePercentile=90))

## Training the Tree

Again, depending on the task we train the tree with targets or labels. Then we make a prediction and plot the tree.

In [None]:
# Pick the supervision signal that belongs to the selected task
supervisionByTask = {
    'regressor': trainTargets,
    'classifier': trainLabels,
}
if task in supervisionByTask:
    tree.train(trainData, supervisionByTask[task])

# Predict on the held-out data and print the tree
prediction = tree.eval(validData)
print(tree)

In [None]:
# One bar per feature, positioned at the feature's index
featureIndices = np.arange(dims)
plt.bar(featureIndices, tree.featureImportance, color='steelblue')

# Dashed, semi-transparent grid
plt.grid(True, linestyle='--', alpha=0.6)

# Title and axis labels
plt.title('Feature Importance')
plt.xlabel('Feature Index')
plt.ylabel('Importance')

# Render the figure
plt.show()

## Evaluating predictions

Depending on the task at hand we create a confusion matrix (classification) or simple metrics (regression). Since the number of classes is fixed to two, we don't need to change anything here.

In [None]:
# Score the prediction with the metric that matches the task
if task == 'classifier':
    # Confusion matrix over the two classes, then its derived scores
    confusion = ConfusionMatrix(numClasses=2)
    confusion.update(prediction, validLabels)
    confusion.percentages()
    confusion.calcScores()
    print(confusion)
elif task == 'regressor':
    # Regression scores; the labels are passed along with the targets
    metrics = RegressionScores(numClasses=2)
    metrics.calcScores(prediction, validTargets, validLabels)
    print(metrics)

## Saving and Loading a Tree

Trees can be converted to dictionaries and then saved as a json file. This allows us to load them and re-use them. Also json is a raw text format, which is neat.

In [None]:
# Name under which the tree is stored and read back
modelName = 'test'

# Save the trained tree, load it again and print the loaded copy
saver = ModelIO()
saver.save(tree, modelName)
newTree = saver.load(modelName)
print(newTree)

In [None]:
# Predict again, this time with the tree that was loaded from disk
prediction = newTree.eval(validData)

# Score the new prediction with the metric that matches the task
if task == 'classifier':
    # Confusion matrix over the two classes, then its derived scores
    newConfusion = ConfusionMatrix(numClasses=2)
    newConfusion.update(prediction, validLabels)
    newConfusion.percentages()
    newConfusion.calcScores()
    print(newConfusion)
elif task == 'regressor':
    # Regression scores; the labels are passed along with the targets
    newMetrics = RegressionScores(numClasses=2)
    newMetrics.calcScores(prediction, validTargets, validLabels)
    print(newMetrics)

## Comment

The tree works pretty well with both regression and classification tasks. Labels shouldn't be one-hot encoded; it works, but it's still rather iffy. Targets should be 1D; I haven't tested with 2D, it might work. Training can be really fast with a percentile set in the split algorithm, otherwise it can be rather slow. Making predictions works fast and well enough.