There is a common misunderstanding that TensorFlow is just a machine learning library, so to deepen my own understanding I wrote some code that simply estimates an average.
- Generate 100 uniform random integers between 0 and 100; their average should be about 50 (a short derivation of why the squared-error loss recovers the mean follows below).
- Try several values of the learning rate for each optimizer.
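Why does minimizing a sum of squared errors give the average? Setting the derivative of the loss with respect to m to zero immediately yields the sample mean, so whatever the optimizer does, the value it should converge to is the sample mean (about 50 here):

```math
L(m) = \sum_{i=1}^{N}(x_i - m)^2, \qquad
\frac{dL}{dm} = -2\sum_{i=1}^{N}(x_i - m) = 0
\;\Longrightarrow\;
m = \frac{1}{N}\sum_{i=1}^{N}x_i
```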
Gradient descent
```python
import matplotlib.pylab as plt
%matplotlib inline
import numpy as np
import tensorflow as tf

x_train = np.random.randint(0, 100, size=100)   # 100 uniform random integers in [0, 100)
n_itr = 100

m = tf.Variable([30.0], dtype=tf.float32)       # variable to estimate (the mean)
x = tf.placeholder(tf.float32)                  # placeholder for the data
loss = tf.reduce_sum(tf.square(x - m))          # sum of squared errors

for lr in [0.009, 0.001, 0.0001]:
    optimizer = tf.train.GradientDescentOptimizer(lr)  # steepest (vanilla) gradient descent
    train = optimizer.minimize(loss)

    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)

    est = []
    for i in range(n_itr):
        _, est_m = sess.run([train, m], {x: x_train})
        est.append(est_m)

    est = np.array(est)
    plt.plot(est.reshape(n_itr), label="lr={}".format(lr))

plt.title("batch gradient descent")
plt.legend()
plt.show();
```
The estimate converges to the true average.
- If the learning rate is large, the estimate oscillates.
- It diverges when the learning rate exceeds 0.01 (see the derivation below).
- If the learning rate is small, convergence is slow.
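The 0.01 threshold can be read off the update rule itself. With N = 100 samples the gradient of the loss is 2N(m - mean), so each gradient descent step multiplies the error between m and the sample mean by (1 - 2Nλ):

```math
m_{t+1} = m_t - \lambda\,\frac{dL}{dm}\Big|_{m_t} = m_t - 2N\lambda\,(m_t - \bar{x})
\;\Longrightarrow\;
m_{t+1} - \bar{x} = (1 - 2N\lambda)\,(m_t - \bar{x})
```

Convergence requires |1 - 2Nλ| < 1, i.e. λ < 1/N = 0.01, which matches the observed divergence point; for 1/(2N) = 0.005 < λ < 0.01 the factor is negative, so the estimate converges while oscillating, which is exactly what the lr = 0.009 curve shows.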
RMS Prop
Apart from swapping the optimizer, only the range of learning rates is changed.
```python
for lr in [5, 1, 0.1, 0.01]:
    optimizer = tf.train.RMSPropOptimizer(lr)
    train = optimizer.minimize(loss)

    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)

    est = []
    for i in range(n_itr):
        _, est_m = sess.run([train, m], {x: x_train})
        est.append(est_m)

    est = np.array(est)
    plt.plot(est.reshape(n_itr), label="lr={}".format(lr))

plt.title("batch RMS Prop")
plt.legend()
plt.show();
```
- If the learning rate is too large, the estimate keeps oscillating around the mean even after reaching it.
- The usable learning rates are considerably higher than for plain gradient descent (the sketch below shows why).
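The much larger usable learning rates make sense from the form of the update: RMSProp divides the gradient by a running RMS of recent gradients, so the step size is roughly the learning rate itself, independent of the raw gradient scale (which is in the thousands here, since the gradient is 2N(m - mean)). Below is a minimal NumPy sketch of the idea, not TensorFlow's exact implementation; the decay of 0.9 and the tiny epsilon are assumptions matching what I believe are tf.train.RMSPropOptimizer's defaults.

```python
import numpy as np

def rmsprop_mean(x, m0=30.0, lr=1.0, decay=0.9, eps=1e-10, n_itr=100):
    """Estimate the mean of x with an RMSProp-style update (illustrative sketch)."""
    m, v = m0, 0.0
    for _ in range(n_itr):
        g = -2.0 * np.sum(x - m)            # gradient of the squared-error loss
        v = decay * v + (1 - decay) * g**2  # running average of squared gradients
        m -= lr * g / (np.sqrt(v) + eps)    # step size ~ lr, independent of |g|
    return m

x = np.random.randint(0, 100, size=100)
print(rmsprop_mean(x), x.mean())
```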
Adam
```python
for lr in [5, 1, 0.1, 0.01]:
    optimizer = tf.train.AdamOptimizer(lr)
    train = optimizer.minimize(loss)

    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)

    est = []
    for i in range(n_itr):
        _, est_m = sess.run([train, m], {x: x_train})
        est.append(est_m)

    est = np.array(est)
    plt.plot(est.reshape(n_itr), label="lr={}".format(lr))

plt.title("batch Adam")
plt.legend()
plt.show();
```
- The oscillation is gentle; it is reminiscent of the waveform of a transient response.
- If the learning rate is high, the estimate overshoots (see the sketch of the update below).
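The overshoot and damped oscillation come from Adam's momentum term: it keeps an exponential moving average of past gradients, so the estimate keeps moving in the old direction for a few steps after passing the mean and then swings back. A minimal NumPy sketch of the update, not TensorFlow's exact implementation; beta1 = 0.9, beta2 = 0.999 and eps = 1e-8 are the commonly used defaults and are assumed here.

```python
import numpy as np

def adam_mean(x, m0=30.0, lr=1.0, beta1=0.9, beta2=0.999, eps=1e-8, n_itr=100):
    """Estimate the mean of x with an Adam-style update (illustrative sketch)."""
    m, m1, m2 = m0, 0.0, 0.0
    for t in range(1, n_itr + 1):
        g = -2.0 * np.sum(x - m)              # gradient of the squared-error loss
        m1 = beta1 * m1 + (1 - beta1) * g     # momentum: moving average of gradients
        m2 = beta2 * m2 + (1 - beta2) * g**2  # moving average of squared gradients
        m1_hat = m1 / (1 - beta1**t)          # bias correction
        m2_hat = m2 / (1 - beta2**t)
        m -= lr * m1_hat / (np.sqrt(m2_hat) + eps)
    return m

x = np.random.randint(0, 100, size=100)
print(adam_mean(x), x.mean())
```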
AdaGrad
```python
for lr in [20, 10, 5, 1, 0.1, 0.01]:
    optimizer = tf.train.AdagradOptimizer(lr)
    train = optimizer.minimize(loss)

    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)

    est = []
    for i in range(n_itr):
        _, est_m = sess.run([train, m], {x: x_train})
        est.append(est_m)

    est = np.array(est)
    plt.plot(est.reshape(n_itr), label="lr={}".format(lr))

plt.title("batch AdaGrad")
plt.legend()
plt.show();
```
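AdaGrad behaves like RMSProp except that the squared gradients are accumulated without decay, so the effective step size can only shrink as training proceeds; since the raw gradients here start in the thousands, fairly large learning rates (1 to 20) are needed for the estimate to move much within 100 iterations. A minimal NumPy sketch, not TensorFlow's exact implementation; the initial accumulator value of 0.1 is an assumption based on what I believe is tf.train.AdagradOptimizer's default.

```python
import numpy as np

def adagrad_mean(x, m0=30.0, lr=5.0, init_acc=0.1, n_itr=100):
    """Estimate the mean of x with an AdaGrad-style update (illustrative sketch)."""
    m, acc = m0, init_acc
    for _ in range(n_itr):
        g = -2.0 * np.sum(x - m)    # gradient of the squared-error loss
        acc += g**2                 # accumulated squared gradients (never decays)
        m -= lr * g / np.sqrt(acc)  # effective step shrinks as acc grows
    return m

x = np.random.randint(0, 100, size=100)
print(adagrad_mean(x), x.mean())
```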
AdaDelta
```python
for lr in [20000, 10000, 1000, 100, 10]:
    optimizer = tf.train.AdadeltaOptimizer(lr)
    train = optimizer.minimize(loss)

    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)

    est = []
    for i in range(n_itr):
        _, est_m = sess.run([train, m], {x: x_train})
        est.append(est_m)

    est = np.array(est)
    plt.plot(est.reshape(n_itr), label="lr={}".format(lr))

plt.title("batch AdaDelta")
plt.legend()
plt.show();
```
- The behavior is almost the same as AdaGrad.
- The required learning rates are extremely high (the sketch below suggests why).
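A possible reason the learning rate has to be so large: AdaDelta scales each step by the ratio of the RMS of past updates to the RMS of past gradients, and the update accumulator starts at zero, so the very first steps are only on the order of sqrt(epsilon). tf.train.AdadeltaOptimizer additionally multiplies the step by the learning rate (the original algorithm effectively fixes it at 1), so a huge learning rate is needed before the estimate moves noticeably within 100 iterations. A minimal NumPy sketch, not TensorFlow's exact implementation; rho = 0.95 and eps = 1e-8 are assumptions based on what I believe are the TF defaults.

```python
import numpy as np

def adadelta_mean(x, m0=30.0, lr=10000.0, rho=0.95, eps=1e-8, n_itr=100):
    """Estimate the mean of x with an AdaDelta-style update (illustrative sketch)."""
    m, eg2, ed2 = m0, 0.0, 0.0
    for _ in range(n_itr):
        g = -2.0 * np.sum(x - m)                              # gradient of the squared-error loss
        eg2 = rho * eg2 + (1 - rho) * g**2                    # running average of squared gradients
        delta = -np.sqrt(ed2 + eps) / np.sqrt(eg2 + eps) * g  # unit-consistent step
        ed2 = rho * ed2 + (1 - rho) * delta**2                # running average of squared updates
        m += lr * delta                                       # TF-style extra learning-rate factor
    return m

x = np.random.randint(0, 100, size=100)
print(adadelta_mean(x), x.mean())
```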