import numpy as np
from .weights import Weights
from .rnn import RNN, checkDims
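

# Note on the imports above: this module assumes that Weights exposes a `values`
# array holding the parameters and a `deltas` array for their gradients, and that
# checkDims validates a (batchSize, seqLength, inputSize) shaped input; both
# assumptions are inferred from how they are used below.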
class LSTM(RNN):
    """
    An implementation of the LSTM layer.
    """
    __slots__ = ['inputSize', 'hiddenSize', 'input', 'hidden', 'cell', 'gates', 'batchSize', 'seqLength']

    def __init__(self, inputSize: int, hiddenSize: int, weights: np.ndarray = None, bias: np.ndarray = None) -> None:
        super().__init__()
        self.inputSize = inputSize
        self.hiddenSize = hiddenSize
        # Initialize weights and bias; the columns hold the input, forget,
        # output and cell gates, in that order
        self.weights = Weights((inputSize + hiddenSize, 4 * hiddenSize), values=weights)
        self.bias = Weights((4 * hiddenSize,), values=bias)
        # Initialize hidden and cell states
        self.hidden = np.zeros((hiddenSize,))
        self.cell = np.zeros((hiddenSize,))

    def forward(self, input: np.ndarray, hiddenState: np.ndarray = None, cellState: np.ndarray = None) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        forward pass of the LSTM layer
        """
        checkDims(input)
        self.input = input
        self.batchSize, self.seqLength, _ = input.shape
        # Initialize hidden and cell states if not provided
        if hiddenState is None:
            hiddenState = np.zeros((self.batchSize, self.seqLength, self.hiddenSize))
        if cellState is None:
            cellState = np.zeros((self.batchSize, self.seqLength, self.hiddenSize))
        # Initialize the output array and the gate cache used by the backward pass
        output = np.zeros((self.batchSize, self.seqLength, self.hiddenSize))
        self.gates = np.zeros((self.batchSize, self.seqLength, 4 * self.hiddenSize))
        # States carried over from the previous timestep (zeros at t == 0)
        prevHidden = np.zeros((self.batchSize, self.hiddenSize))
        prevCell = np.zeros((self.batchSize, self.hiddenSize))
        for t in range(self.seqLength):
            # Concatenate the current input with the previous hidden state,
            # matching the (input, hidden) row layout of the weight matrix
            combined = np.hstack((input[:, t, :], prevHidden))
            gates = np.matmul(combined, self.weights.values) + self.bias.values
            # Split the pre-activations into the input, forget, output and cell gates
            inputGate, forgetGate, outputGate, hiddenGate = np.split(gates, 4, axis=-1)
            # Apply sigmoid activation function for input, forget, and output gates
            inputGate = 1 / (1 + np.exp(-inputGate))
            forgetGate = 1 / (1 + np.exp(-forgetGate))
            outputGate = 1 / (1 + np.exp(-outputGate))
            # Apply tanh activation function for the cell gate
            hiddenGate = np.tanh(hiddenGate)
            # Cache the gate activations for the backward pass
            self.gates[:, t, :] = np.hstack((inputGate, forgetGate, outputGate, hiddenGate))
            # Update the cell and hidden state from the previous timestep's states
            cellState[:, t, :] = forgetGate * prevCell + inputGate * hiddenGate
            hiddenState[:, t, :] = outputGate * np.tanh(cellState[:, t, :])
            output[:, t, :] = hiddenState[:, t, :]
            prevHidden = hiddenState[:, t, :]
            prevCell = cellState[:, t, :]
        # Cache the per-timestep states for the backward pass
        self.hidden = hiddenState
        self.cell = cellState
        return output, hiddenState, cellState

    def backward(self, gradient: np.ndarray, hiddenGradient: np.ndarray = None, cellGradient: np.ndarray = None) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        backward pass of the LSTM layer
        """
        gradInputState = np.zeros_like(self.input)
        dhiddenNext = np.zeros((self.batchSize, self.hiddenSize))
        dcellNext = np.zeros((self.batchSize, self.hiddenSize))
        dW = np.zeros_like(self.weights.values)
        db = np.zeros_like(self.bias.values)
        if hiddenGradient is not None:
            dhiddenNext += hiddenGradient
        if cellGradient is not None:
            dcellNext += cellGradient
        for t in reversed(range(self.seqLength)):
            # Gate activations cached during the forward pass
            inputGate, forgetGate, outputGate, hiddenGate = np.split(self.gates[:, t, :], 4, axis=-1)
            # States from the previous timestep (zeros at t == 0)
            prevHidden = self.hidden[:, t - 1, :] if t > 0 else np.zeros((self.batchSize, self.hiddenSize))
            prevCell = self.cell[:, t - 1, :] if t > 0 else np.zeros((self.batchSize, self.hiddenSize))
            # Total gradient flowing into the hidden state at this timestep
            dh = dhiddenNext + gradient[:, t, :]
            # Partial derivative of loss w.r.t. output gate
            do = dh * np.tanh(self.cell[:, t, :])
            do_input = do * outputGate * (1 - outputGate)
            # Partial derivative of loss w.r.t. cell state
            dc = dcellNext + dh * outputGate * (1 - np.tanh(self.cell[:, t, :]) ** 2)
            dc_bar = dc * inputGate
            dc_bar_input = dc_bar * (1 - hiddenGate ** 2)
            # Partial derivative of loss w.r.t. input gate
            di = dc * hiddenGate
            di_input = di * inputGate * (1 - inputGate)
            # Partial derivative of loss w.r.t. forget gate
            df = dc * prevCell
            df_input = df * forgetGate * (1 - forgetGate)
            # Stack the gate gradients in the same order as the forward pass
            dstacked = np.hstack((di_input, df_input, do_input, dc_bar_input))
            # Gradients with respect to weights and biases
            dW += np.matmul(np.hstack((self.input[:, t, :], prevHidden)).T, dstacked)
            db += np.sum(dstacked, axis=0)
            # Gradients with respect to inputs
            gradInputState[:, t, :] = np.matmul(dstacked, self.weights.values[:self.inputSize].T)
            # Gradients carried to the previous timestep
            dhiddenNext = np.matmul(dstacked, self.weights.values[self.inputSize:].T)
            dcellNext = forgetGate * dc
        # Store the gradients
        self.weights.deltas = dW
        self.bias.deltas = db
        return gradInputState, dhiddenNext, dcellNext

    def __str__(self) -> str:
        """
        used for printing the layer in a human-readable manner
        """
        printString = self.name
        printString += ' input size: ' + str(self.inputSize)
        printString += ' hidden size: ' + str(self.hiddenSize)
        return printString
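

# A minimal usage sketch (not part of the original layer): it drives the layer
# with random data just to exercise the forward and backward passes, and assumes
# that the Weights/RNN base classes initialize their own defaults (random
# parameter values, a layer name) when none are supplied. Because of the
# relative imports above, run it as a module, e.g. `python -m <package>.lstm`.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    batchSize, seqLength, inputSize, hiddenSize = 2, 5, 3, 4
    layer = LSTM(inputSize, hiddenSize)
    x = rng.standard_normal((batchSize, seqLength, inputSize))
    # Forward pass: output plus per-timestep hidden and cell states
    output, hiddenState, cellState = layer.forward(x)
    print(layer, '-> output shape:', output.shape)
    # Backward pass with a dummy upstream gradient of ones
    gradInput, dHidden, dCell = layer.backward(np.ones_like(output))
    print('input gradient shape:', gradInput.shape)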