import numpy as np
from .layer import Layer
from abc import abstractmethod
from numpy.typing import ArrayLike


class Activation(Layer):
    """
    the main activation function class containing all the methods used for activation function
    it's an abstract class, meaning it should never be used directly, but instead used a base
    """
    __slots__ = ['input', 'activation']

    def __init__(self) -> None:
        super().__init__()

    def forward(self, input: ArrayLike) -> np.ndarray:
        """
        applies the activation function element-wise, introducing non-linearity;
        the input and output are cached for the backward pass
        """
        self.input = input
        self.activation = self._function(self.input)
        return self.activation

    def backward(self, gradient: ArrayLike) -> np.ndarray:
        """
        computes the gradient with respect to the layer's input by multiplying the
        incoming (upstream) gradient by the element-wise derivative (chain rule)
        """
        return self._derivative() * gradient

    @abstractmethod
    def _function(self, input: ArrayLike) -> np.ndarray:
        """
        abstract method: each concrete activation must implement the element-wise function
        """
        pass

    @abstractmethod
    def _derivative(self) -> np.ndarray:
        """
        abstract method: each concrete activation must implement the element-wise derivative
        """
        pass
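
# A minimal sketch of how a new activation plugs into the base class above
# (hypothetical example, not part of the library): only `_function` and
# `_derivative` need to be supplied; `forward`/`backward` handle input caching
# and the chain rule.
#
#     class Cube(Activation):
#         __slots__ = []
#
#         def _function(self, input: ArrayLike) -> np.ndarray:
#             return np.asarray(input) ** 3
#
#         def _derivative(self) -> np.ndarray:
#             return 3. * np.square(self.input)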


class Relu(Activation):
    """
    Rectified Linear Unit (ReLU) activation function.

    ReLU is a commonly used activation function in neural networks, defined as f(x) = max(0, x).
    It is widely used in deep learning models because it produces sparse activations
    and helps mitigate the vanishing gradient problem.
    """
    __slots__ = []

    def __init__(self) -> None:
        super().__init__()

    def _function(self, input: ArrayLike) -> np.ndarray:
        return np.maximum(0.0, input)

    def _derivative(self) -> np.ndarray:
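        # subgradient of max(0, x): 1 for x > 0, 0 otherwise (0 is chosen at x = 0)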
        return np.where(self.input > 0, 1, 0)


class Elu(Activation):
    """
    Exponential Linear Unit (ELU) activation function.
    defined as f(x) = x for x > 0 and alpha * (exp(x) - 1) otherwise; the alpha
    parameter controls the value the function saturates to for negative inputs
    """
    __slots__ = ['alpha']

    def __init__(self, alpha: float = 1.0) -> None:
        super().__init__()
        self.alpha = alpha

    def _function(self, input: ArrayLike) -> np.ndarray:
        return np.where(input <= 0., self.alpha * (np.exp(input) - 1.), input)

    def _derivative(self) -> np.ndarray:
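        # 1 for x > 0, alpha * exp(x) otherwise (equivalently f(x) + alpha)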
        return np.where(self.input > 0, 1, self.alpha * np.exp(self.input))


class LeakyRelu(Activation):
    """
    Leaky ReLU activation function.
    like ReLU, but negative inputs are scaled by a small configurable slope instead
    of being zeroed, which keeps gradients flowing for negative activations
    """
    __slots__ = ['epsilon']

    def __init__(self, epsilon: float = 1e-1) -> None:
        super().__init__()
        self.epsilon = epsilon

    def _function(self, input: ArrayLike) -> np.ndarray:
        # scale only the non-positive side, without mutating the caller's array
        input = np.asarray(input)
        return np.where(input > 0., input, input * self.epsilon)

    def _derivative(self) -> np.ndarray:
        return np.where(self.input > 0, 1, self.epsilon)


class Tanh(Activation):
    """
    The hyperbolic tangent (tanh) activation function.

    This activation function maps input values to the range (-1, 1). It is commonly used in neural networks due to its
    ability to introduce non-linearity while still being differentiable.
    """
    __slots__ = []

    def __init__(self) -> None:
        super().__init__()

    def _function(self, input: ArrayLike) -> np.ndarray:
        return np.tanh(input)

    def _derivative(self) -> np.ndarray:
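        # tanh'(x) = 1 - tanh(x)^2, so the cached activation is reused instead of recomputing tanh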
        return 1 - np.square(self.activation)


class Sigmoid(Activation):
    """
    Sigmoid (logistic) activation function, f(x) = 1 / (1 + e^-x), mapping inputs to (0, 1).
    """
    __slots__ = []

    def __init__(self) -> None:
        super().__init__()

    def _function(self, input: ArrayLike) -> np.ndarray:
        return 1 / (1 + np.exp(-input))

    def _derivative(self) -> np.ndarray:
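        # sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)), computed from the cached activation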
        return (1 - self.activation) * self.activation


class SoftMax(Activation):
    """
    Softmax activation function.

    Softmax function normalizes the output of a neural network to a probability
    distribution over the classes in the output layer. It is commonly used in
    multi-class classification tasks.
    """
    __slots__ = []

    def __init__(self) -> None:
        super().__init__()

    def _function(self, input: ArrayLike) -> np.ndarray:
        # subtract the row-wise max for numerical stability; softmax is shift-invariant
        input = input - np.max(input, axis=1, keepdims=True)
        output = np.exp(input)
        return output / np.sum(output, axis=1, keepdims=True)

    def _derivative(self) -> np.ndarray:
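        # note: this is only the diagonal of the softmax Jacobian (ds_i/dx_i = s_i * (1 - s_i));
        # the off-diagonal terms ds_i/dx_j = -s_i * s_j (i != j) are not included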
        return self.activation * (1 - self.activation)


class SoftPlus(Activation):
    """
    The SoftPlus activation function, defined as f(x) = log(1 + e^x).
    It is a smooth approximation of ReLU and is differentiable everywhere.
    """
    __slots__ = []

    def __init__(self) -> None:
        super().__init__()

    def _function(self, input: ArrayLike) -> np.ndarray:
        # log(1 + e^x) computed via logaddexp to avoid overflow for large inputs
        return np.logaddexp(0., input)

    def _derivative(self) -> np.ndarray:
        # the derivative of softplus is the sigmoid function
        return 1. / (1. + np.exp(-np.asarray(self.input)))


class SoftSign(Activation):
    """
    SoftSign activation function.

    The SoftSign activation maps the input to the range (-1, 1) via f(x) = x / (1 + |x|).
    It approaches its asymptotes polynomially rather than exponentially, so it
    saturates more gently than tanh.
    """
    __slots__ = []

    def __init__(self) -> None:
        super().__init__()

    def _function(self, input: ArrayLike) -> np.ndarray:
        return input / (np.abs(input) + 1.)

    def _derivative(self) -> np.ndarray:
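        # d/dx [x / (1 + |x|)] = 1 / (1 + |x|)^2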
        output = np.abs(self.input) + 1.
        return 1. / (output ** 2)


class Identity(Activation):
    """
    The identity activation function.
    
    The identity function simply returns its input without any transformation.
    It is often used as the activation function for the output layer of a neural network
    when the task involves regression, i.e., predicting a continuous output value.
    """
    __slots__ = []

    def __init__(self) -> None:
        super().__init__()

    def _function(self, input: ArrayLike) -> np.ndarray:
        return np.asarray(input)

    def _derivative(self) -> np.ndarray:
        # the derivative of the identity is 1 everywhere
        return np.ones_like(self.input)
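

# A quick usage sketch (assumes this module is imported as part of its package, since
# the relative import of `Layer` above prevents running the file as a standalone
# script, and assumes `Layer.__init__` takes no required arguments):
#
#     x = np.array([[-2.0, -0.5, 0.0, 1.5]])
#     relu = Relu()
#     y = relu.forward(x)                    # [[0.,  0.,  0.,  1.5]]
#     dx = relu.backward(np.ones_like(y))    # [[0.,  0.,  0.,  1. ]]
#
#     softmax = SoftMax()
#     p = softmax.forward(np.array([[1.0, 2.0, 3.0]]))   # each row sums to 1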