import numpy as np

from .layer import Layer
from .weights import Weights


class BatchNorm2D(Layer):
    """
    normalizes 2d input of shape (batchsize, channels, height, width)
    """
    __slots__ = ['input', 'channels', 'inputShape', 'runningMean', 'runningVariance', 'momentum', 'epsilon', 'batchSize', 'mean', 'variance', 'tiledMean', 'tiledVariance', 'normalized', 'weights', 'bias']

    def __init__(self, inputShape: tuple[int, int, int], momentum: float = 0.1, epsilon: float = 1e-3) -> None:
        super().__init__()
        self.channels = inputShape[0]
        self.weights = Weights(inputShape, init='ones')
        self.bias = Weights(inputShape[0], init='zeros')
        self.runningMean = np.zeros(inputShape[0])
        self.runningVariance = np.zeros(inputShape[0])
        self.momentum = momentum
        self.epsilon = epsilon
        self.inputShape = inputShape

    def params(self) -> list[Weights]:
        """
        returns weights and bias in a python list, called by optimizers
        """
        return [self.weights, self.bias]

    def forward(self, input: np.ndarray) -> np.ndarray:
        """
        normalizes the input with running averages
        this is a learning layer, thus it also has weights and biases
        it maintains image size
        """
        self.input = input
        self.batchSize = input.shape[0]
        self.mean = self.input.mean(axis=(0, 2, 3))
        # per-channel standard deviation; epsilon avoids division by zero
        self.variance = np.sqrt(self.input.var(axis=(0, 2, 3)) + self.epsilon)
        self._runningVariables()
        self.tiledMean = np.tile(self.mean, self.batchSize).reshape(self.batchSize, self.channels, 1, 1)
        self.tiledVariance = np.tile(self.variance, self.batchSize).reshape(self.batchSize, self.channels, 1, 1)
        self.normalized = (self.input - np.tile(self.runningMean, self.batchSize).reshape(self.batchSize, self.channels, 1, 1)) / np.tile(self.runningVariance, self.batchSize).reshape(self.batchSize, self.channels, 1, 1)
        return self.weights.values * self.normalized + self.bias.values[None, :, None, None]

    def _runningVariables(self) -> None:
        """
        called during the 'forward' method
        mixes the running averages with the current batch averages
        """
        if self.mode == 'train':
            self.runningMean = self.momentum * self.runningMean + (1 - self.momentum) * self.mean
            self.runningVariance = self.momentum * self.runningVariance + (1 - self.momentum) * self.variance
        # the next two if statements handle a network called in eval mode without prior training
        if self.mode == 'eval' and np.sum(self.runningMean) == 0:
            self.runningMean = self.mean
        if self.mode == 'eval' and np.sum(self.runningVariance) == 0:
            self.runningVariance = self.variance

    def backward(self, gradient: np.ndarray) -> np.ndarray:
        """
        transforms the upstream gradient through the effects of variance and mean
        calculates deltas for weights and biases
        its input and output sizes are the same
        """
        self.weights.deltas = (gradient * self.normalized).sum(axis=0)
        self.bias.deltas = gradient.sum(axis=(0, 2, 3))
        gradient = gradient * self.weights.values
        # number of elements behind each per-channel statistic
        m = self.batchSize * self.input.shape[2] * self.input.shape[3]
        # note: tiledVariance stores the standard deviation sqrt(var + epsilon),
        # so d(loss)/d(var) carries a factor of -0.5 / std**3
        deltaVariance = (gradient * (self.input - self.tiledMean)).sum(axis=(0, 2, 3)).reshape(1, self.channels, 1, 1) * (-.5 / self.tiledVariance ** 3)
        deltaMean = (gradient / (-self.tiledVariance)).sum(axis=(0, 2, 3)).reshape(1, self.channels, 1, 1)
        gradient = gradient / self.tiledVariance + deltaVariance * 2 * (self.input - self.tiledMean) / m + deltaMean / m
        return gradient
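

# A minimal usage sketch for BatchNorm2D, assuming the Layer base class provides a
# 'mode' attribute ('train' / 'eval') and that Weights exposes .values and .deltas
# as used above; the names and shapes below are illustrative only:
#
#     bn = BatchNorm2D(inputShape=(3, 32, 32))
#     bn.mode = 'train'                      # assumption: mode handling lives in Layer
#     x = np.random.randn(8, 3, 32, 32)      # (batchsize, channels, height, width)
#     y = bn.forward(x)                      # same shape as x
#     dx = bn.backward(np.ones_like(y))      # weight/bias gradients land in .deltas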


class BatchNorm1D(Layer):
    """
    normalizes 1d input of shape (batchsize, numFeatures)
    """
    __slots__ = ['input', 'numFeatures', 'runningMean', 'runningVariance', 'momentum', 'epsilon', 'batchSize', 'mean', 'variance', 'normalized', 'weights', 'bias']

    def __init__(self, numFeatures: int, momentum: float = 0.1, epsilon: float = 1e-3) -> None:
        super().__init__()
        self.weights = Weights((1, numFeatures), init='ones')
        self.bias = Weights((1, numFeatures), init='zeros')
        self.runningMean = 0
        self.runningVariance = 0
        self.momentum = momentum
        self.epsilon = epsilon
        self.numFeatures = numFeatures

    def params(self) -> list[Weights]:
        """
        returns weights and bias in a python list, called by optimizers
        """
        return [self.weights, self.bias]

    def forward(self, input: np.ndarray) -> np.ndarray:
        """
        normalizes the input with running averages
        this is a learning layer, thus it also has weights and biases
        it maintains input size
        """
        self.input = input
        self.batchSize = input.shape[0]
        self.mean = self.input.mean(axis=0)
        # per-feature standard deviation; epsilon avoids division by zero
        self.variance = np.sqrt(self.input.var(axis=0) + self.epsilon)
        self._runningVariables()
        self.normalized = (self.input - self.runningMean) / self.runningVariance
        return self.weights.values * self.normalized + self.bias.values

    def _runningVariables(self) -> None:
        """
        called during the 'forward' method
        mixes the running averages with the current batch averages
        """
        if self.mode == 'train':
            self.runningMean = self.momentum * self.runningMean + (1 - self.momentum) * self.mean
            self.runningVariance = self.momentum * self.runningVariance + (1 - self.momentum) * self.variance
        # the next two if statements handle a network called in eval mode without prior training
        if self.mode == 'eval' and np.sum(self.runningMean) == 0:
            self.runningMean = self.mean
        if self.mode == 'eval' and np.sum(self.runningVariance) == 0:
            self.runningVariance = self.variance

    def backward(self, gradient: np.ndarray) -> np.ndarray:
        """
        transforms the upstream gradient through the effects of variance and mean
        calculates deltas for weights and biases
        its input and output sizes are the same
        """
        self.weights.deltas = np.sum(gradient * self.normalized, axis=0, keepdims=True)
        self.bias.deltas = gradient.sum(axis=0, keepdims=True)
        gradient = gradient * self.weights.values
        # note: self.variance stores the standard deviation sqrt(var + epsilon),
        # so d(loss)/d(var) carries a factor of -0.5 / std**3
        deltaVariance = np.sum(gradient * (self.input - self.mean), axis=0, keepdims=True) * (-.5 / self.variance ** 3)
        deltaMean = np.sum(gradient / (-self.variance), axis=0, keepdims=True)
        gradient = gradient / self.variance + deltaVariance * 2 * (self.input - self.mean) / self.batchSize + deltaMean / self.batchSize
        return gradient
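

# A hedged smoke-test sketch for BatchNorm1D; it assumes the Layer base class lets
# callers set 'mode' directly and that Weights behaves as used above. Adjust if
# Layer exposes train()/eval() helpers instead.
if __name__ == '__main__':
    bn = BatchNorm1D(numFeatures=16)
    bn.mode = 'train'  # assumption: 'mode' is defined by the Layer base class
    x = np.random.randn(4, 16)
    y = bn.forward(x)
    dx = bn.backward(np.ones_like(y))
    print(y.shape, dx.shape)  # both should print (4, 16)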