This is a record of what I learned while taking the Rabbit Challenge, with the aim of passing the Japan Deep Learning Association (JDLA) E qualification exam held on January 19th and 20th, 2021.
Rabbit Challenge is a course that uses teaching materials edited from recorded videos of the in-person course "Deep learning that can be put to practical use in the field". It offers no support for questions, but it is an inexpensive option (the lowest price as of June 2020) for fulfilling the requirements to take the E qualification exam.
Please check the details from the link below.
--Applied Mathematics
--Machine learning
--Deep learning (day1)
--Deep learning (day2)
--Deep learning (day3)
--Deep learning (day4)
In the error backpropagation method, the gradient becomes smaller and smaller as it propagates toward the lower (earlier) layers. As a result, the parameters of the lower layers are barely changed by gradient descent updates, and training does not converge to the optimal values. This is the vanishing gradient problem.
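As an aside (my own check, not part of the course code): the derivative of the sigmoid function is $f'(x) = f(x)(1 - f(x))$, whose maximum value is 0.25, so chaining many sigmoid layers multiplies factors of at most 0.25 and the gradient shrinks exponentially.

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_grad(x):
    s = sigmoid(x)
    return s * (1.0 - s)  # maximum is 0.25, attained at x = 0

x = np.linspace(-5.0, 5.0, 101)
print(sigmoid_grad(x).max())  # about 0.25
print(0.25 ** 10)             # upper bound on the gradient factor after 10 sigmoid layers (~1e-6)
```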
--Selection of activation function
--ReLU function
--Initial weight setting
--Xavier: when the number of nodes in the previous layer is $n$, multiply the weight elements by $\sqrt{\frac{1}{n}}$. Suitable activation functions are the ReLU function, the sigmoid (logistic) function, and the hyperbolic tangent function (tanh).
--He: when the number of nodes in the previous layer is $n$, multiply the weight elements by $\sqrt{\frac{2}{n}}$. The suitable activation function is the ReLU function.
What happens if the initial weights are set to 0? → All nodes pass on exactly the same value, so the symmetry is never broken and the parameters can no longer be tuned.
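A small numpy check (my own illustration, not from the course materials) of how Xavier and He initialization preserve the spread of the signal, and of why all-zero initialization fails:

```python
import numpy as np

np.random.seed(0)
n = 100                                  # number of nodes in the previous layer
x = np.random.randn(1000, n)             # dummy inputs

W_xavier = np.random.randn(n, n) * np.sqrt(1.0 / n)  # Xavier: multiply by sqrt(1/n)
W_he = np.random.randn(n, n) * np.sqrt(2.0 / n)      # He: multiply by sqrt(2/n)
W_zero = np.zeros((n, n))                            # all-zero initialization

print(x.dot(W_xavier).std())     # roughly 1: the spread of the inputs is preserved
print(x.dot(W_he).std())         # roughly sqrt(2): compensates for ReLU zeroing half the signal
print(np.unique(x.dot(W_zero)))  # only 0: every node passes on exactly the same value
```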
--Batch normalization: a method that suppresses the bias in the input data in units of mini-batches. A layer is added that normalizes the output of an intermediate layer, so that the output is forced to follow a distribution with mean 0 and variance 1. Its advantages are faster computation and less gradient vanishing.
The normalization uses the mean and variance computed over each mini-batch.
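For reference, the computation that the BatchNormalization class below implements can be written as

$$\mu_B = \frac{1}{m}\sum_{i=1}^{m} x_i, \quad \sigma_B^2 = \frac{1}{m}\sum_{i=1}^{m}(x_i - \mu_B)^2, \quad \hat{x}_i = \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \theta}}, \quad y_i = \gamma \hat{x}_i + \beta$$

where $m$ is the mini-batch size, $\theta$ is a small constant for numerical stability, and $\gamma$ and $\beta$ are the learned scale and offset parameters.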
Effect on gradient vanishing when the activation function, the initial weight values, and the presence or absence of batch normalization are changed
import sys, os
sys.path.append(os.pardir) #Settings for importing files in the parent directory
import numpy as np
from common import layers
from collections import OrderedDict
from common import functions
from data.mnist import load_mnist
import matplotlib.pyplot as plt
from common import optimizer
class MultiLayerNet:
    '''
    input_size: number of nodes in the input layer
    hidden_size_list: list of the numbers of nodes in the hidden layers
    output_size: number of nodes in the output layer
    activation: activation function
    weight_init_std: how to initialize the weights
    weight_decay_lambda: strength of L2 regularization
    use_dropout: whether to use dropout
    dropout_ratio: dropout rate
    use_batchnorm: whether to use batch normalization
    '''
    def __init__(self, input_size, hidden_size_list, output_size, activation='relu', weight_init_std='relu', weight_decay_lambda=0,
                 use_dropout=False, dropout_ratio=0.5, use_batchnorm=False):
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size_list = hidden_size_list
        self.hidden_layer_num = len(hidden_size_list)
        self.use_dropout = use_dropout
        self.weight_decay_lambda = weight_decay_lambda
        self.use_batchnorm = use_batchnorm
        self.params = {}

        # Weight initialization
        self.__init_weight(weight_init_std)

        # Layer generation
        activation_layer = {'sigmoid': layers.Sigmoid, 'relu': layers.Relu}
        self.layers = OrderedDict()
        for idx in range(1, self.hidden_layer_num+1):
            self.layers['Affine' + str(idx)] = layers.Affine(self.params['W' + str(idx)], self.params['b' + str(idx)])
            if self.use_batchnorm:
                self.params['gamma' + str(idx)] = np.ones(hidden_size_list[idx-1])
                self.params['beta' + str(idx)] = np.zeros(hidden_size_list[idx-1])
                self.layers['BatchNorm' + str(idx)] = layers.BatchNormalization(self.params['gamma' + str(idx)], self.params['beta' + str(idx)])
            self.layers['Activation_function' + str(idx)] = activation_layer[activation]()
            if self.use_dropout:
                self.layers['Dropout' + str(idx)] = layers.Dropout(dropout_ratio)

        idx = self.hidden_layer_num + 1
        self.layers['Affine' + str(idx)] = layers.Affine(self.params['W' + str(idx)], self.params['b' + str(idx)])
        self.last_layer = layers.SoftmaxWithLoss()
    def __init_weight(self, weight_init_std):
        all_size_list = [self.input_size] + self.hidden_size_list + [self.output_size]
        for idx in range(1, len(all_size_list)):
            scale = weight_init_std
            if str(weight_init_std).lower() in ('relu', 'he'):
                scale = np.sqrt(2.0 / all_size_list[idx - 1])  # He: recommended initial value when using ReLU
            elif str(weight_init_std).lower() in ('sigmoid', 'xavier'):
                scale = np.sqrt(1.0 / all_size_list[idx - 1])  # Xavier: recommended initial value when using sigmoid
            self.params['W' + str(idx)] = scale * np.random.randn(all_size_list[idx-1], all_size_list[idx])
            self.params['b' + str(idx)] = np.zeros(all_size_list[idx])

    def predict(self, x, train_flg=False):
        for key, layer in self.layers.items():
            if "Dropout" in key or "BatchNorm" in key:
                x = layer.forward(x, train_flg)
            else:
                x = layer.forward(x)
        return x

    def loss(self, x, d, train_flg=False):
        y = self.predict(x, train_flg)

        weight_decay = 0
        for idx in range(1, self.hidden_layer_num + 2):
            W = self.params['W' + str(idx)]
            weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W**2)

        return self.last_layer.forward(y, d) + weight_decay

    def accuracy(self, X, D):
        Y = self.predict(X, train_flg=False)
        Y = np.argmax(Y, axis=1)
        if D.ndim != 1:
            D = np.argmax(D, axis=1)
        accuracy = np.sum(Y == D) / float(X.shape[0])
        return accuracy

    def gradient(self, x, d):
        # forward
        self.loss(x, d, train_flg=True)

        # backward
        dout = 1
        dout = self.last_layer.backward(dout)

        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # Collect the gradients of each parameter (the L2 regularization term is added to the weight gradients)
        grads = {}
        for idx in range(1, self.hidden_layer_num+2):
            grads['W' + str(idx)] = self.layers['Affine' + str(idx)].dW + self.weight_decay_lambda * self.params['W' + str(idx)]
            grads['b' + str(idx)] = self.layers['Affine' + str(idx)].db
            if self.use_batchnorm and idx != self.hidden_layer_num+1:
                grads['gamma' + str(idx)] = self.layers['BatchNorm' + str(idx)].dgamma
                grads['beta' + str(idx)] = self.layers['BatchNorm' + str(idx)].dbeta

        return grads
# Batch normalization layer
class BatchNormalization:
    '''
    gamma: scale coefficient
    beta: offset
    momentum: inertia
    running_mean: mean used at test time
    running_var: variance used at test time
    '''
    def __init__(self, gamma, beta, momentum=0.9, running_mean=None, running_var=None):
        self.gamma = gamma
        self.beta = beta
        self.momentum = momentum
        self.input_shape = None

        self.running_mean = running_mean
        self.running_var = running_var

        # Intermediate data used in the backward pass
        self.batch_size = None
        self.xc = None
        self.std = None
        self.dgamma = None
        self.dbeta = None

    def forward(self, x, train_flg=True):
        if self.running_mean is None:
            N, D = x.shape
            self.running_mean = np.zeros(D)
            self.running_var = np.zeros(D)

        if train_flg:
            mu = x.mean(axis=0)  # mean of the mini-batch
            xc = x - mu  # center x
            var = np.mean(xc**2, axis=0)  # variance of the mini-batch
            std = np.sqrt(var + 10e-7)  # standard deviation (with a small constant for stability)
            xn = xc / std  # normalize

            self.batch_size = x.shape[0]
            self.xc = xc
            self.xn = xn
            self.std = std
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * mu  # moving average of the mean
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * var  # moving average of the variance
        else:
            xc = x - self.running_mean
            xn = xc / ((np.sqrt(self.running_var + 10e-7)))

        out = self.gamma * xn + self.beta
        return out

    def backward(self, dout):
        dbeta = dout.sum(axis=0)
        dgamma = np.sum(self.xn * dout, axis=0)
        dxn = self.gamma * dout
        dxc = dxn / self.std
        dstd = -np.sum((dxn * self.xc) / (self.std * self.std), axis=0)
        dvar = 0.5 * dstd / self.std
        dxc += (2.0 / self.batch_size) * self.xc * dvar
        dmu = np.sum(dxc, axis=0)
        dx = dxc - dmu / self.batch_size

        self.dgamma = dgamma
        self.dbeta = dbeta

        return dx
#Data reading
# (x_train, d_train), (x_test, d_test) = load_mnist(normalize=True, one_hot_label=True)
(x_train, d_train), (x_test, d_test) = load_mnist(normalize=True)
print('Data reading completed')
activations = ['sigmoid', 'relu']
weight_init_stds = [0.01, 'Xavier', 'He']
use_batchnorms = [False, True]
iters_num = 2000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1
plot_interval = 100
plot_idx = 0
for k in range(len(activations)):
    for l in range(len(weight_init_stds)):
        for m in range(len(use_batchnorms)):
            network = MultiLayerNet(input_size=784, hidden_size_list=[40, 20], output_size=10,
                                    activation=activations[k], weight_init_std=weight_init_stds[l], use_batchnorm=use_batchnorms[m])
            train_loss_list = []
            accuracies_train = []
            accuracies_test = []
            lists = []
            plot_idx = plot_idx + 1

            for i in range(iters_num):
                batch_mask = np.random.choice(train_size, batch_size)
                x_batch = x_train[batch_mask]
                d_batch = d_train[batch_mask]

                # Gradient
                grad = network.gradient(x_batch, d_batch)

                # SGD update (note: gamma and beta of the batch normalization layers are left at their initial values here)
                for key in ('W1', 'W2', 'W3', 'b1', 'b2', 'b3'):
                    network.params[key] -= learning_rate * grad[key]

                loss = network.loss(x_batch, d_batch)
                train_loss_list.append(loss)

                if (i + 1) % plot_interval == 0:
                    accr_test = network.accuracy(x_test, d_test)
                    accuracies_test.append(accr_test)
                    accr_train = network.accuracy(x_batch, d_batch)
                    accuracies_train.append(accr_train)
                    print('Generation: ' + str(i+1) + '. Accuracy (training) = ' + str(accr_train))
                    print('          : ' + str(i+1) + '. Accuracy (test) = ' + str(accr_test))

            lists = range(0, iters_num, plot_interval)
            plt.rcParams['figure.figsize'] = (12.0, 10.0)
            plt.subplot(4, 3, plot_idx)
            plt.plot(lists, accuracies_train, label='training set')
            plt.plot(lists, accuracies_test, label='test set')
            plt.legend(loc='lower right')
            plt.title(activations[k] + ', ' + str(weight_init_stds[l]) + ', batch normalization: ' + str(use_batchnorms[m]) + ' (' + str(np.round(accuracies_test[-1], 2)) + ')')
            plt.xlabel('count')
            plt.ylabel('accuracy')
            plt.ylim(0, 1.0)

# Graph display
plt.tight_layout()
# plt.suptitle('Prediction accuracy when the activation function and initial weight values are changed', fontsize=16)
plt.show()
If the learning rate is too large, the parameters will never reach the optimal value and will diverge. If the learning rate is small, training will not diverge, but if it is too small it takes a long time to converge, or it becomes difficult to reach the global optimum because training gets stuck in a local optimum.
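For reference, the plain gradient descent (SGD) update used in the training loop above is

$$w^{(t+1)} = w^{(t)} - \epsilon \nabla E$$

where $\epsilon$ is the learning rate and $\nabla E$ is the gradient of the error with respect to the parameters.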
--Momentum
[Advantages of momentum]
--It tends to escape local optima and reach the global optimum.
--Once it reaches a valley, it takes little time to get to the lowest point (the optimal value).
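The momentum update implemented in the excerpt below can be written as

$$V_t = \mu V_{t-1} - \epsilon \nabla E, \qquad w^{(t+1)} = w^{(t)} + V_t$$

where $\mu$ is the momentum (inertia) coefficient.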
Momentum update (excerpt from the training loop)
# Gradient
grad = network.gradient(x_batch, d_batch)
if i == 0:
    v = {}
for key in ('W1', 'W2', 'W3', 'b1', 'b2', 'b3'):
    if i == 0:
        v[key] = np.zeros_like(network.params[key])
    v[key] = momentum * v[key] - learning_rate * grad[key]
    network.params[key] += v[key]

loss = network.loss(x_batch, d_batch)
train_loss_list.append(loss)
[Advantages of AdaGrad]
--It approaches the optimal value effectively even on error surfaces with gentle slopes.
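The AdaGrad update implemented in the excerpt below can be written as

$$h_t = h_{t-1} + (\nabla E)^2, \qquad w^{(t+1)} = w^{(t)} - \epsilon \frac{\nabla E}{\sqrt{h_t} + \theta}$$

In the code, $h_0$ is initialized to a small constant (1e-4) instead of adding $\theta$ in the denominator.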
AdaGrad update (excerpt from the training loop)
# Gradient
grad = network.gradient(x_batch, d_batch)
if i == 0:
    h = {}
for key in ('W1', 'W2', 'W3', 'b1', 'b2', 'b3'):
    if i == 0:
        h[key] = np.full_like(network.params[key], 1e-4)
    else:
        h[key] += np.square(grad[key])
    network.params[key] -= learning_rate * grad[key] / (np.sqrt(h[key]))

loss = network.loss(x_batch, d_batch)
train_loss_list.append(loss)
[Advantages of RMSProp]
--It tends to escape local optima and reach the global optimum.
--Its hyperparameters rarely need to be adjusted.
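The RMSProp update implemented in the excerpt below can be written as

$$h_t = \rho h_{t-1} + (1 - \rho)(\nabla E)^2, \qquad w^{(t+1)} = w^{(t)} - \epsilon \frac{\nabla E}{\sqrt{h_t} + \theta}$$

where $\rho$ corresponds to decay_rate in the code.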
RMSProp update (excerpt from the training loop)
# Gradient
grad = network.gradient(x_batch, d_batch)
if i == 0:
    h = {}
for key in ('W1', 'W2', 'W3', 'b1', 'b2', 'b3'):
    if i == 0:
        h[key] = np.zeros_like(network.params[key])
    h[key] *= decay_rate
    h[key] += (1 - decay_rate) * np.square(grad[key])
    network.params[key] -= learning_rate * grad[key] / (np.sqrt(h[key]) + 1e-7)

loss = network.loss(x_batch, d_batch)
train_loss_list.append(loss)
[Advantages of Adam]
--Adam is an algorithm that combines the advantages of momentum (an exponentially decaying average of past gradients) and RMSProp (an exponentially decaying average of past squared gradients).
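The Adam update implemented in the excerpt below can be written as

$$m_t = \beta_1 m_{t-1} + (1 - \beta_1)\nabla E, \qquad v_t = \beta_2 v_{t-1} + (1 - \beta_2)(\nabla E)^2$$
$$w^{(t+1)} = w^{(t)} - \epsilon \frac{\sqrt{1 - \beta_2^t}}{1 - \beta_1^t} \cdot \frac{m_t}{\sqrt{v_t} + \theta}$$

In the code, the bias-correction factor $\sqrt{1 - \beta_2^t}/(1 - \beta_1^t)$ is folded into learning_rate_t.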
Adam update (excerpt from the training loop)
# Gradient
grad = network.gradient(x_batch, d_batch)
if i == 0:
    m = {}
    v = {}
learning_rate_t = learning_rate * np.sqrt(1.0 - beta2 ** (i + 1)) / (1.0 - beta1 ** (i + 1))
for key in ('W1', 'W2', 'W3', 'b1', 'b2', 'b3'):
    if i == 0:
        m[key] = np.zeros_like(network.params[key])
        v[key] = np.zeros_like(network.params[key])
    m[key] += (1 - beta1) * (grad[key] - m[key])
    v[key] += (1 - beta2) * (grad[key] ** 2 - v[key])
    network.params[key] -= learning_rate_t * m[key] / (np.sqrt(v[key]) + 1e-7)

loss = network.loss(x_batch, d_batch)
train_loss_list.append(loss)
Overfitting occurs when the training error and the test error diverge on the learning curve, that is, learning becomes specialized to particular training samples. The following methods help prevent overfitting (the regularized error functions are given after the list).
--Use the L2 norm: Ridge estimation (shrinkage estimation ... the parameters are estimated to be shrunk toward 0)
--Use the L1 norm: Lasso (Least Absolute Shrinkage and Selection Operator) estimation (sparse estimation ... some parameters are estimated to be exactly 0)
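The corresponding regularized error functions (in their standard forms) are

$$E_{L2}(w) = E(w) + \frac{\lambda}{2}\|w\|_2^2, \qquad E_{L1}(w) = E(w) + \lambda\|w\|_1$$

The loss() method of MultiLayerNet above adds exactly the L2 (weight decay) term $\frac{\lambda}{2}\sum W^2$.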
Experiment settings:
--No regularization (reproduces overfitting): optimizer.SGD(learning_rate=0.01)
--L2 regularization: learning_rate=0.01
--L1 regularization: learning_rate=0.1
--Dropout:
 optimizer.SGD(learning_rate=0.01), weight_decay_lambda=0.01
 optimizer.Momentum(learning_rate=0.01, momentum=0.9), weight_decay_lambda=0.01
 optimizer.AdaGrad(learning_rate=0.01), weight_decay_lambda=0.01
 optimizer.Adam(learning_rate=0.01), weight_decay_lambda=0.01
--Dropout + L1 regularization: dropout_ratio=0.1, weight_decay_lambda=0.005
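For reference, a minimal sketch of a dropout layer of the kind used via common.layers above (the actual implementation in the course materials may differ slightly):

```python
import numpy as np

class Dropout:
    """Randomly deactivates nodes during training."""
    def __init__(self, dropout_ratio=0.5):
        self.dropout_ratio = dropout_ratio
        self.mask = None

    def forward(self, x, train_flg=True):
        if train_flg:
            # keep each node with probability (1 - dropout_ratio)
            self.mask = np.random.rand(*x.shape) > self.dropout_ratio
            return x * self.mask
        else:
            # at test time, scale the outputs to match the expected activity during training
            return x * (1.0 - self.dropout_ratio)

    def backward(self, dout):
        # gradients flow only through the nodes that were kept
        return dout * self.mask
```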
--Bias
--(Zero) padding
--Stride
If the input size is W × H, the filter size is Fw × Fh, the padding is p, and the stride is s, then the output size OW × OH of the convolution layer is calculated by the following equations.
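$$OW = \frac{W + 2p - F_w}{s} + 1, \qquad OH = \frac{H + 2p - F_h}{s} + 1$$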
Disadvantage of fully connected layers: an image is 3-dimensional data (height, width, and channels), but a fully connected layer processes it as flattened 1-dimensional data. In other words, the relationships among the RGB channels are not reflected in learning.
AlexNet
AlexNet is named after the lead author of the paper, Alex Krizhevsky. It consists of five convolution layers (with pooling layers) followed by three fully connected layers. Compared with LeNet, the CNN first devised by Yann LeCun et al. in 1998, it has a considerably deeper structure. Dropout is applied to the outputs of the fully connected layers of size 4096 to prevent overfitting.
Chainer's AlexNet example is implemented as follows.
alex.py
import chainer
import chainer.functions as F
import chainer.links as L
class Alex(chainer.Chain):

    """Single-GPU AlexNet without partition toward the channel axis."""

    insize = 227

    def __init__(self):
        super(Alex, self).__init__()
        with self.init_scope():
            self.conv1 = L.Convolution2D(None, 96, 11, stride=4)
            self.conv2 = L.Convolution2D(None, 256, 5, pad=2)
            self.conv3 = L.Convolution2D(None, 384, 3, pad=1)
            self.conv4 = L.Convolution2D(None, 384, 3, pad=1)
            self.conv5 = L.Convolution2D(None, 256, 3, pad=1)
            self.fc6 = L.Linear(None, 4096)
            self.fc7 = L.Linear(None, 4096)
            self.fc8 = L.Linear(None, 1000)

    def __call__(self, x, t):
        h = F.max_pooling_2d(F.local_response_normalization(
            F.relu(self.conv1(x))), 3, stride=2)
        h = F.max_pooling_2d(F.local_response_normalization(
            F.relu(self.conv2(h))), 3, stride=2)
        h = F.relu(self.conv3(h))
        h = F.relu(self.conv4(h))
        h = F.max_pooling_2d(F.relu(self.conv5(h)), 3, stride=2)
        h = F.dropout(F.relu(self.fc6(h)))
        h = F.dropout(F.relu(self.fc7(h)))
        h = self.fc8(h)

        loss = F.softmax_cross_entropy(h, t)
        chainer.report({'loss': loss, 'accuracy': F.accuracy(h, t)}, self)
        return loss