Recently I've been away from machine learning work (I've been writing Rails for a long time... and am about to return to the world of machine learning). I hadn't read the Adam paper yet, so I read it and put together a rough implementation.
Motivation
The goal is an optimization method that is easy to implement, computationally efficient, memory-efficient, robust to rescaling of the gradients, and well suited to problems with large amounts of data and many parameters.
Adaptive moment estimation
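For reference, the per-step update from the paper, which the implementation below follows (g_t is the gradient of the loss for the current example, alpha the step size, beta1/beta2 the decay rates, and epsilon a small constant for numerical stability):

```math
\begin{aligned}
m_t &= \beta_1 m_{t-1} + (1-\beta_1)\,g_t \\
v_t &= \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2 \\
\hat{m}_t &= m_t/(1-\beta_1^t), \qquad \hat{v}_t = v_t/(1-\beta_2^t) \\
\theta_t &= \theta_{t-1} - \alpha\,\hat{m}_t/(\sqrt{\hat{v}_t}+\epsilon)
\end{aligned}
```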
I compared Adam, SGD with Nesterov momentum, and scikit-learn's logistic regression. The data comes from https://www.kaggle.com/c/data-science-london-scikit-learn/data. I chose this dataset because I wanted to see whether the performance holds up even with a small amount of data. Since the test data is not labeled, I split the training data 8:2 into training and test sets. Adam and SGD Nesterov make only a single pass over the training data (no repeated iterations).
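For completeness, a minimal sketch of how the 800/200 files used below could be produced (my own helper; it assumes the Kaggle files are named train.csv and trainLabels.csv and simply takes the first 800 rows for training, which may differ from how the split was actually made):

```python
# Split the 1000-row Kaggle files into an 800-row training part and a
# 200-row held-out part, keeping data and labels aligned line by line.
def split_file(src, train_dst, test_dst, n_train=800):
    with open(src) as f:
        lines = f.readlines()
    with open(train_dst, 'w') as f:
        f.writelines(lines[:n_train])
    with open(test_dst, 'w') as f:
        f.writelines(lines[n_train:])

split_file('train.csv', 'train800.csv', 'test200.csv')
split_file('trainLabels.csv', 'trainLabels800.csv', 'testLabels200.csv')
```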
Sorry for the messy code, but I'll paste it below.

Adam
```python
# coding: utf-8
import numpy as np
import math
from itertools import izip
from sklearn.metrics import accuracy_score, recall_score
class Adam:
    def __init__(self, feat_dim, loss_type='log', alpha=0.001, beta1=0.9, beta2=0.999, epsilon=10**(-8)):
        self.weight = np.zeros(feat_dim)  # feature weights
        self.loss_type = loss_type        # type of loss function
        self.feat_dim = feat_dim          # number of dimensions
        self.x = np.zeros(feat_dim)       # feature vector
        self.m = np.zeros(feat_dim)       # 1st moment vector
        self.v = np.zeros(feat_dim)       # 2nd moment vector
        self.alpha = alpha                # step size
        self.beta1 = beta1                # exponential decay rate for the 1st moment estimate
        self.beta2 = beta2                # exponential decay rate for the 2nd moment estimate
        self.epsilon = epsilon
        self.t = 1                        # timestep

    def fit(self, data_fname, label_fname):
        with open(data_fname, 'r') as f_data, open(label_fname, 'r') as f_label:
            for data, label in izip(f_data, f_label):
                self.features = np.array(data.rstrip().split(','), dtype=np.float64)
                y = int(-1) if int(label.rstrip()) <= 0 else int(1)  # unify labels to positive=+1, negative=-1
                # update weight
                self.update(self.predict(self.features), y)
                self.t += 1
        return self.weight
    def predict(self, features):  # margin
        return np.dot(self.weight, features)

    def calc_loss(self, m):  # m = py = wxy
        if self.loss_type == 'hinge':
            return max(0, 1-m)
        elif self.loss_type == 'log':
            # if m <= -700: m = -700
            return math.log(1+math.exp(-m))

    # gradient of loss function
    def calc_dloss(self, m):  # m = py = wxy
        if self.loss_type == 'hinge':
            res = -1.0 if (1-m) > 0 else 0.0  # derivative w.r.t. m is -1 while the hinge loss is positive, 0 otherwise
            return res
        elif self.loss_type == 'log':
            if m < 0.0:
                return float(-1.0) / (math.exp(m) + 1.0)  # dL/dm = -e^(-m)/(1+e^(-m)); the y*x factor is applied in update()
            else:
                ez = float(math.exp(-m))
                return -ez / (ez + 1.0)                   # same value, rewritten so exp() is safe for m >= 0
    def update(self, pred, y):
        grad = y*self.calc_dloss(y*pred)*self.features               # gradient
        self.m = self.beta1*self.m + (1 - self.beta1)*grad           # update biased first moment estimate
        self.v = self.beta2*self.v + (1 - self.beta2)*grad**2        # update biased second raw moment estimate
        mhat = self.m/(1-self.beta1**self.t)                         # compute bias-corrected first moment estimate
        vhat = self.v/(1-self.beta2**self.t)                         # compute bias-corrected second raw moment estimate
        self.alpha *= np.sqrt(1-self.beta2**self.t)/(1-self.beta1**self.t)  # update stepsize (note: this compounds the factor into alpha every step; see the discussion below)
        self.weight -= self.alpha * mhat/(np.sqrt(vhat) + self.epsilon)     # update weight
if __name__ == '__main__':
    data_fname = 'train800.csv'
    label_fname = 'trainLabels800.csv'
    test_data_fname = 'test200.csv'
    test_label_fname = 'testLabels200.csv'

    adam = Adam(40, loss_type='hinge')
    adam.fit(data_fname, label_fname)

    y_true = []
    y_pred = []
    with open(test_data_fname, 'r') as f_data, open(test_label_fname, 'r') as f_label:
        for data, label in izip(f_data, f_label):
            pred_label = adam.predict(np.array(data.rstrip().split(','), dtype=np.float64))
            y_true.append(int(label))
            y_pred.append(1 if pred_label > 0 else 0)

    print 'accuracy:', accuracy_score(y_true, y_pred)
    print 'recall:', recall_score(y_true, y_pred)
```
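As an aside, the two branches of calc_dloss above compute the same quantity, the derivative of the log loss d/dm log(1+e^(-m)) = -1/(1+e^m); the branch is chosen so that exp() is never called on a large positive argument. A small standalone sanity check (my own snippet, not part of the experiment):

```python
import math

def dlogloss(m):
    # derivative of log(1 + exp(-m)) w.r.t. m, i.e. -1 / (1 + exp(m)),
    # split into two branches so exp() never sees a large positive argument
    if m < 0.0:
        return -1.0 / (math.exp(m) + 1.0)
    ez = math.exp(-m)
    return -ez / (ez + 1.0)

print(dlogloss(0.0))     # -0.5
print(dlogloss(800.0))   # ~0.0 (no overflow)
print(dlogloss(-800.0))  # ~-1.0 (no overflow)
```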
SGDNesterov
```python
# coding: utf-8
import numpy as np
import math
from itertools import izip
from sklearn.metrics import accuracy_score, recall_score
class SgdNesterov:
    def __init__(self, feat_dim, loss_type='log', mu=0.9, learning_rate=0.5):
        self.weight = np.zeros(feat_dim)  # feature weights
        self.loss_type = loss_type        # type of loss function
        self.feat_dim = feat_dim
        self.x = np.zeros(feat_dim)
        self.mu = mu                      # momentum coefficient
        self.t = 1                        # number of updates
        self.v = np.zeros(feat_dim)       # velocity
        self.learning_rate = learning_rate

    def fit(self, data_fname, label_fname):
        with open(data_fname, 'r') as f_data, open(label_fname, 'r') as f_label:
            for data, label in izip(f_data, f_label):
                self.features = np.array(data.rstrip().split(','), dtype=np.float64)
                y = int(-1) if int(label.rstrip()) <= 0 else int(1)  # unify labels to positive=+1, negative=-1
                # update weight
                self.update(y)
                self.t += 1
        return self.weight
    def predict(self, features):  # margin
        return np.dot(self.weight, features)

    def calc_loss(self, m):  # m = py = wxy
        if self.loss_type == 'hinge':
            return max(0, 1-m)
        elif self.loss_type == 'log':
            if m <= -700: m = -700
            return math.log(1+math.exp(-m))

    # gradient of loss function
    def calc_dloss(self, m):  # m = py = wxy
        if self.loss_type == 'hinge':
            res = -1.0 if (1-m) > 0 else 0.0  # derivative w.r.t. m is -1 while the hinge loss is positive, 0 otherwise
            return res
        elif self.loss_type == 'log':
            if m < 0.0:
                return float(-1.0) / (math.exp(m) + 1.0)  # dL/dm = -e^(-m)/(1+e^(-m)); the y*x factor is applied in update()
            else:
                ez = float(math.exp(-m))
                return -ez / (ez + 1.0)                   # same value, rewritten so exp() is safe for m >= 0
    def update(self, y):
        w_ahead = self.weight + self.mu * self.v               # look ahead along the current velocity
        pred = np.dot(w_ahead, self.features)
        grad = y*self.calc_dloss(y*pred)*self.features          # gradient at the look-ahead position
        self.v = self.mu * self.v - self.learning_rate * grad   # velocity update stays the same
        # update weight
        self.weight += self.v
if __name__ == '__main__':
    data_fname = 'train800.csv'
    label_fname = 'trainLabels800.csv'
    test_data_fname = 'test200.csv'
    test_label_fname = 'testLabels200.csv'

    sgd_n = SgdNesterov(40, loss_type='hinge')
    sgd_n.fit(data_fname, label_fname)

    y_true = []
    y_pred = []
    with open(test_data_fname, 'r') as f_data, open(test_label_fname, 'r') as f_label:
        for data, label in izip(f_data, f_label):
            pred_label = sgd_n.predict(np.array(data.rstrip().split(','), dtype=np.float64))
            y_true.append(int(label))
            y_pred.append(1 if pred_label > 0 else 0)

    print 'accuracy:', accuracy_score(y_true, y_pred)
    print 'recall:', recall_score(y_true, y_pred)
```
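For reference, SgdNesterov.update above implements the commonly used look-ahead form of Nesterov momentum, with momentum coefficient mu and learning rate eta:

```math
\begin{aligned}
v_{t+1} &= \mu v_t - \eta\,\nabla f(\theta_t + \mu v_t) \\
\theta_{t+1} &= \theta_t + v_{t+1}
\end{aligned}
```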
Logistic regression (scikit-learn)
```python
import numpy as np
from itertools import izip
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
def get_data(data_fname, label_fname):
    result_data = []
    result_labels = []
    with open(data_fname, 'r') as f_data, open(label_fname, 'r') as f_label:
        for data, label in izip(f_data, f_label):
            result_data.append(data.rstrip().split(','))
            result_labels.append(int(label.rstrip()))
    return np.array(result_data, dtype=np.float64), result_labels
if __name__ == '__main__':
    data_fname = 'train800.csv'
    label_fname = 'trainLabels800.csv'
    test_data_fname = 'test200.csv'
    test_label_fname = 'testLabels200.csv'

    data, labels = get_data(data_fname, label_fname)
    test_data, test_labels = get_data(test_data_fname, test_label_fname)

    lr = LogisticRegression()
    model = lr.fit(data, labels)
    y_pred = model.predict(test_data)

    print 'accuracy:', model.score(test_data, test_labels)
    print 'recall:', recall_score(test_labels, y_pred)
```
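One note on this baseline: LogisticRegression() with default arguments applies an L2 penalty. If an L1-penalized (lasso-style) logistic regression is wanted instead, it has to be requested explicitly; a sketch (the solver choice is mine, and recent scikit-learn versions require it for the L1 penalty):

```python
from sklearn.linear_model import LogisticRegression

# L1-penalized (lasso-style) logistic regression; liblinear is one of the
# solvers that supports the L1 penalty.
lr_l1 = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)
model_l1 = lr_l1.fit(data, labels)
print(model_l1.score(test_data, test_labels))  # accuracy on the held-out rows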
Because there is so little data, I suspected that the automatic adjustment of alpha would get in the way of learning, so I kept alpha fixed, and accuracy improved. (I suspect the alpha adjustment is aimed at training deep models well; removing it for this experiment seems like the right call.) The vhat values look like this:
```
[ 1.01440993 1.03180357 0.95435572 0.9297218 21.07682674
0.94186528 4.65151802 5.00409033 0.99502491 1.04799237
1.03563918 1.01860187 24.53366684 0.99717628 4.56930882
0.99764606 0.95268578 1.00007278 4.94184457 0.96486898
0.9665374 0.89604119 5.77110996 18.18369869 1.06281087
0.98975868 1.01176115 1.06529464 5.55623853 5.52265492
1.00727474 1.00094686 5.23052382 1.0256952 4.53388121
1.0003947 5.4024963 0.98662918 4.86086664 4.4993808 ]

[ 0.70211545 0.70753131 0.68225521 0.65766954 14.23198314
0.66457665 3.00986265 3.73453379 0.70920046 0.71507415
0.7611441 0.71763729 12.45908405 0.71818535 2.44396968
0.72608443 0.62573733 0.697053 3.06402831 0.64277643
0.68346131 0.59957144 3.99612146 11.69024055 0.75532095
0.68612789 0.69620363 0.75933189 3.41557243 4.05831119
0.7255359 0.72140109 3.55049677 0.73630123 2.77828369
0.69178571 3.82801224 0.68480352 3.70976494 2.96358695]
```
The per-step automatic adjustment of alpha may need to be handled differently depending on the data, but Adam appears to hold up well (at least on average) even with a small dataset. Adam is already implemented in Chainer and other frameworks, so it is worth trying it on various datasets through those libraries.
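For concreteness, my reading of "fixing alpha" is dropping the self.alpha *= ... line in Adam.update, so the step size stays at its initial value and the bias correction lives entirely in mhat and vhat. The paper's own efficiency trick instead rescales a constant base step size every iteration rather than compounding it in place; roughly like this (alpha0 here is a hypothetical attribute holding the initial step size, not a field of the class above):

```python
# Paper-style variant: scale a *constant* base step size alpha0 every step
# instead of repeatedly multiplying self.alpha in place.
alpha_t = self.alpha0 * np.sqrt(1 - self.beta2**self.t) / (1 - self.beta1**self.t)
self.weight -= alpha_t * self.m / (np.sqrt(self.v) + self.epsilon)
```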
If you spot any mistakes, I would appreciate it if you could point them out.