The implementation is in `dnn/optimizers.py` of **this GitHub repository**.
As a benchmark, I trained a simple neural network on MNIST with each of the learning-rate optimization methods below and compared the accuracy on the evaluation data.
The implementation uses Theano. I have uploaded the source code to my GitHub repository.
For the learning-rate optimization methods, I referred to the following:

- https://gist.github.com/SnippyHolloW/67effa81dd1cd5a488b4
- https://gist.github.com/skaae/ae7225263ca8806868cb
- http://chainer.readthedocs.org/en/stable/reference/optimizers.html?highlight=optimizers
- http://qiita.com/skitaoka/items/e6afbe238cd69c899b2a
In the code below, `params` (or `self.params`) holds the weights and biases of the entire network. Since training uses stochastic gradient descent, we need the gradients of the loss value `loss` with respect to `params`; these are held in `gparams` (or `self.gparams`) and computed with `T.grad(loss, param)`. The `Optimizer` base class handles everything up to computing these gradients, and SGD, Momentum SGD, and the rest are implemented by inheriting from it.
```python:optimizers.py
from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T


def build_shared_zeros(shape, name):
    # Zero-initialized shared variable, used by the optimizers below
    # (a minimal version; the repository defines its own helper).
    return theano.shared(np.zeros(shape, dtype=theano.config.floatX),
                         name=name, borrow=True)


class Optimizer(object):
    def __init__(self, params=None):
        if params is None:
            raise NotImplementedError()
        self.params = params

    def updates(self, loss=None):
        if loss is None:
            raise NotImplementedError()
        # From here on, self.updates is the OrderedDict of update rules.
        self.updates = OrderedDict()
        self.gparams = [T.grad(loss, param) for param in self.params]
```
Incidentally, `self.updates` here is the dictionary of update rules used to modify the weights and the optimizer's internal state.
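For example, the dictionary returned by `updates()` can be passed directly to the `updates` argument of `theano.function`. Below is a minimal sketch with a toy softmax-regression model (784 → 10, matching MNIST dimensions); the model is only for illustration and uses the `SGD` class defined in the next section:

```python
import numpy as np
import theano
import theano.tensor as T

# Toy softmax-regression model: W and b are the `params` the optimizers expect.
x = T.matrix('x')
y = T.ivector('y')
W = theano.shared(np.zeros((784, 10), dtype=theano.config.floatX), name='W')
b = theano.shared(np.zeros(10, dtype=theano.config.floatX), name='b')
p_y = T.nnet.softmax(T.dot(x, W) + b)
loss = -T.mean(T.log(p_y)[T.arange(y.shape[0]), y])  # negative log-likelihood

optimizer = SGD(learning_rate=0.01, params=[W, b])
train = theano.function(
    inputs=[x, y],
    outputs=loss,
    updates=optimizer.updates(loss),  # update rules are compiled into the function
)
# Each call to train(batch_x, batch_y) now applies one SGD step.
```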
SGD
```python:optimizers.py
class SGD(Optimizer):
    def __init__(self, learning_rate=0.01, params=None):
        super(SGD, self).__init__(params=params)
        self.learning_rate = learning_rate

    def updates(self, loss=None):
        super(SGD, self).updates(loss=loss)
        for param, gparam in zip(self.params, self.gparams):
            # Step each parameter against its gradient.
            self.updates[param] = param - self.learning_rate * gparam
        return self.updates
```
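In update-rule form, with learning rate $\eta$ (`learning_rate`) and gradient $g = \nabla_\theta L$, this is plain gradient descent:

```math
\theta \leftarrow \theta - \eta g
```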
Momentum SGD
```python:optimizers.py
class MomentumSGD(Optimizer):
    def __init__(self, learning_rate=0.01, momentum=0.9, params=None):
        super(MomentumSGD, self).__init__(params=params)
        self.learning_rate = learning_rate
        self.momentum = momentum
        # One velocity buffer per parameter.
        self.vs = [build_shared_zeros(t.shape.eval(), 'v') for t in self.params]

    def updates(self, loss=None):
        super(MomentumSGD, self).updates(loss=loss)
        for v, param, gparam in zip(self.vs, self.params, self.gparams):
            _v = v * self.momentum
            _v = _v - self.learning_rate * gparam
            self.updates[param] = param + _v
            self.updates[v] = _v
        return self.updates
```
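The velocity `v` accumulates an exponentially decayed sum of past gradients, with momentum coefficient $\mu$ (`momentum`):

```math
v \leftarrow \mu v - \eta g, \qquad \theta \leftarrow \theta + v
```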
AdaGrad
```python:optimizers.py
class AdaGrad(Optimizer):
    def __init__(self, learning_rate=0.01, eps=1e-6, params=None):
        super(AdaGrad, self).__init__(params=params)
        self.learning_rate = learning_rate
        self.eps = eps
        # Accumulated squared gradients per parameter.
        self.accugrads = [build_shared_zeros(t.shape.eval(), 'accugrad') for t in self.params]

    def updates(self, loss=None):
        super(AdaGrad, self).updates(loss=loss)
        for accugrad, param, gparam \
                in zip(self.accugrads, self.params, self.gparams):
            agrad = accugrad + gparam * gparam
            dx = - (self.learning_rate / T.sqrt(agrad + self.eps)) * gparam
            self.updates[param] = param + dx
            self.updates[accugrad] = agrad
        return self.updates
```
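Here `accugrad` ($r$) keeps the running sum of squared gradients, so parameters that have received large gradients get smaller effective steps:

```math
r \leftarrow r + g^2, \qquad \theta \leftarrow \theta - \frac{\eta}{\sqrt{r + \varepsilon}}\, g
```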
RMSprop
```python:optimizers.py
class RMSprop(Optimizer):
    def __init__(self, learning_rate=0.001, alpha=0.99, eps=1e-8, params=None):
        super(RMSprop, self).__init__(params=params)
        self.learning_rate = learning_rate
        self.alpha = alpha
        self.eps = eps
        # Exponential moving average of squared gradients.
        self.mss = [build_shared_zeros(t.shape.eval(), 'ms') for t in self.params]

    def updates(self, loss=None):
        super(RMSprop, self).updates(loss=loss)
        for ms, param, gparam in zip(self.mss, self.params, self.gparams):
            _ms = ms * self.alpha
            _ms += (1 - self.alpha) * gparam * gparam
            self.updates[ms] = _ms
            self.updates[param] = param - self.learning_rate * gparam / T.sqrt(_ms + self.eps)
        return self.updates
```
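Unlike AdaGrad's ever-growing sum, `ms` ($r$) is an exponential moving average, so the influence of old gradients decays at rate $\alpha$:

```math
r \leftarrow \alpha r + (1 - \alpha)\, g^2, \qquad \theta \leftarrow \theta - \frac{\eta}{\sqrt{r + \varepsilon}}\, g
```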
AdaDelta
```python:optimizers.py
class AdaDelta(Optimizer):
    def __init__(self, rho=0.95, eps=1e-6, params=None):
        super(AdaDelta, self).__init__(params=params)
        self.rho = rho
        self.eps = eps
        # Running averages of squared gradients and of squared updates.
        self.accugrads = [build_shared_zeros(t.shape.eval(), 'accugrad') for t in self.params]
        self.accudeltas = [build_shared_zeros(t.shape.eval(), 'accudelta') for t in self.params]

    def updates(self, loss=None):
        super(AdaDelta, self).updates(loss=loss)
        for accugrad, accudelta, param, gparam \
                in zip(self.accugrads, self.accudeltas, self.params, self.gparams):
            agrad = self.rho * accugrad + (1 - self.rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self.eps) / (agrad + self.eps)) * gparam
            self.updates[accudelta] = self.rho * accudelta + (1 - self.rho) * dx * dx
            self.updates[param] = param + dx
            self.updates[accugrad] = agrad
        return self.updates
```
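Writing `accugrad` as $r$ and `accudelta` as $s$, AdaDelta needs no global learning rate; the ratio of the two running averages sets the step scale (note that $\Delta\theta$ is computed with the previous $s$):

```math
r \leftarrow \rho r + (1-\rho)\, g^2, \qquad
\Delta\theta = -\sqrt{\frac{s + \varepsilon}{r + \varepsilon}}\; g, \qquad
s \leftarrow \rho s + (1-\rho)\, \Delta\theta^2, \qquad
\theta \leftarrow \theta + \Delta\theta
```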
Adam
```python:optimizers.py
class Adam(Optimizer):
    def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999,
                 eps=1e-8, gamma=1 - 1e-8, params=None):
        super(Adam, self).__init__(params=params)
        self.alpha = alpha
        self.b1 = beta1
        self.b2 = beta2
        self.gamma = gamma
        self.t = theano.shared(np.float32(1))  # time step
        self.eps = eps
        # First- and second-moment estimates per parameter.
        self.ms = [build_shared_zeros(t.shape.eval(), 'm') for t in self.params]
        self.vs = [build_shared_zeros(t.shape.eval(), 'v') for t in self.params]

    def updates(self, loss=None):
        super(Adam, self).updates(loss=loss)
        # Decay the first-moment rate over time: beta1_t = beta1 * gamma^(t-1).
        self.b1_t = self.b1 * self.gamma ** (self.t - 1)
        for m, v, param, gparam \
                in zip(self.ms, self.vs, self.params, self.gparams):
            _m = self.b1_t * m + (1 - self.b1_t) * gparam
            _v = self.b2 * v + (1 - self.b2) * gparam ** 2
            # Bias-corrected moment estimates.
            m_hat = _m / (1 - self.b1 ** self.t)
            v_hat = _v / (1 - self.b2 ** self.t)
            self.updates[param] = param - self.alpha * m_hat / (T.sqrt(v_hat) + self.eps)
            self.updates[m] = _m
            self.updates[v] = _v
        self.updates[self.t] = self.t + 1.0
        return self.updates
```
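With the time-decayed first-moment rate $\beta_{1,t} = \beta_1 \gamma^{\,t-1}$, the code above maintains the two moment estimates and applies the bias-corrected update:

```math
m \leftarrow \beta_{1,t}\, m + (1-\beta_{1,t})\, g, \qquad
v \leftarrow \beta_2\, v + (1-\beta_2)\, g^2
```

```math
\hat{m} = \frac{m}{1-\beta_1^{\,t}}, \qquad
\hat{v} = \frac{v}{1-\beta_2^{\,t}}, \qquad
\theta \leftarrow \theta - \alpha\, \frac{\hat{m}}{\sqrt{\hat{v}} + \varepsilon}
```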
Using MNIST, I trained for 20 epochs and averaged the results over 30 random seeds. Please see my GitHub repository for the detailed settings of the learning rates and the network.
The plot above is too cluttered to read, so let's zoom in.
SGD has dropped off the chart; its accuracy stays below the zoomed range.
In hindsight, I should have recorded the loss values as well...
I plan to add convolutional neural networks, stacked denoising autoencoders, and more to this GitHub repository in the future.
I would appreciate it if you could point out anything that looks off.