As I wrote last time, the company doesn't have any data stored yet. However, we assumed that once logs were being collected properly the data would become enormous, so, considering time and space complexity, we decided that an online learning algorithm was the best fit. (This post builds on the previous one. It's embarrassing in many ways... orz) I had never used online classifiers seriously before, so I tried out several of them and compared their performance (quite a while ago, though...).
For a rough overview of linear classifiers and online learning, refer to Online Learning (University of Tokyo lecture materials, I think).
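All of the classifiers compared here follow the same predict-then-update cycle over a stream of examples. Below is a minimal sketch of that loop, with a fixed learning rate and hinge loss purely for illustration; it is not part of the code that follows.

```python
# Minimal sketch of the online-learning cycle: for each incoming example,
# predict with the current weights, check the loss, and update.
import numpy as np

def online_learning_loop(stream, feat_dim, eta=0.1):
    w = np.zeros(feat_dim)
    for x, y in stream:            # x: feature vector, y: label in {-1, +1}
        margin = y * np.dot(w, x)  # predict
        if 1 - margin > 0:         # hinge loss is positive (margin violation)
            w += eta * y * x       # update; eta is fixed here only for illustration
    return w
```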
There are many flavors of online classifiers: algorithms that converge quickly and are robust to noise, ones that normalize the features automatically, ones that tune their own parameters, and so on. This time I compared Exact Soft Confidence-Weighted Learning (SCW), Stochastic Gradient Descent (SGD) with automatic learning-rate adjustment, SGD with automatic normalization, and Naive Bayes (NB).
The code I wrote is shown below. Two small touches: (1) it supports simple feature hashing, and (2) the SCW implementation keeps only the diagonal components of the covariance matrix.
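Regarding (1), the classes below simply mask the integer feature index with `2**feat_dim - 1`. The sketch below shows the same hashing-trick idea for arbitrary string keys using Python's built-in `hash`; it is only an illustration and not part of the classes. The first class is the SGD with automatic learning-rate adjustment, followed by SGD with automatic normalization, SCW, and NB.

```python
# Illustration of the hashing trick: map arbitrary feature keys into a
# fixed-size vector by hashing the key and masking the lower bits.
import numpy as np

def hash_features(kv_pairs, feat_dim_bits=20):
    dim = 2 ** feat_dim_bits
    bitmask = dim - 1
    x = np.zeros(dim)
    for key, value in kv_pairs:
        x[hash(key) & bitmask] += value   # collisions are simply summed
    return x

# e.g. hash_features([('age', 1.0), ('job=engineer', 1.0)])
```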
```python
# coding: utf-8
import numpy as np
import math

class SgdTrainWithAutoLR(object):

    def __init__(self, fname, feat_dim, loss_type):
        self.fname = fname              # input file name
        self.weight = None              # feature weights
        self.feat_dim = 2**feat_dim     # max size of feature vector
        self.bitmask = 2**feat_dim - 1  # bitmask for feature hashing
        self.loss_type = loss_type      # type of loss function
        self.lambd = None               # regularization
        self.gamma = None               # learning rate
        self.x = np.zeros(self.feat_dim)
        self.t = 1                      # number of processed examples
        self.gradbar = np.zeros(self.feat_dim)
        self.grad2bar = np.zeros(self.feat_dim)
        self.tau = np.ones(self.feat_dim)*5
        self.gbar = np.zeros(self.feat_dim)
        self.vbar = np.zeros(self.feat_dim)
        self.hbar = np.zeros(self.feat_dim)
        self.epsilon = 10**(-9)
        self.loss_weight = 10           # parameter for adjusting the class imbalance
        self.update_t = 1               # number of actual updates

    def train(self, gamma, lambd):
        self.gamma = gamma
        self.lambd = lambd
        self.initialize()
        with open(self.fname, 'r') as trainf:
            for line in trainf:
                y = line.strip().split(' ')[0]
                self.features = self.get_features(line.strip().split(' ')[1:])
                y = int(-1) if int(y) <= 0 else int(1)  # map labels to +1 (positive) / -1 (negative)
                # predict the margin
                pred = self.predict(self.weight, self.features)
                grad = y*self.calc_dloss(y*pred)*self.features
                self.gradbar = grad
                self.grad2bar = grad**2
                # update weight
                self.update(pred, y)
                self.t += 1
        print self.weight
        print self.t
        return self.weight

    def initialize(self):
        self.weight = np.zeros(self.feat_dim)

    def get_features(self, data):
        features = np.zeros(self.feat_dim)
        for kv in data:
            k, v = kv.strip().split(':')
            features[int(k) & self.bitmask] += float(v)
        return features

    def predict(self, w, features):  # margin
        return np.dot(w, features)

    def calc_loss(self, m):  # m = py = wxy
        if self.loss_type == 'hinge':
            return max(0, 1-m)
        elif self.loss_type == 'log':
            if m <= -700: m = -700
            return math.log(1+math.exp(-m))

    # gradient of loss function
    def calc_dloss(self, m):  # m = py = wxy
        if self.loss_type == 'hinge':
            res = -1.0 if (1-m) > 0 else 0.0  # 0 when the loss is 0, otherwise d(loss)/dm = -1
            return res
        elif self.loss_type == 'log':
            if m < 0.0:
                return float(-1.0) / (math.exp(m) + 1.0)  # -e^(-m)/(1+e^(-m)), numerically stable for m < 0
            else:
                ez = float(math.exp(-m))
                return -ez / (ez + 1.0)                   # same value, numerically stable for m >= 0

    def update(self, pred, y):
        m = y*pred
        self.gbar *= (1 - self.tau**(-1))
        self.gbar += self.tau**(-1)*self.gradbar
        self.vbar *= (1 - self.tau**(-1))
        self.vbar += self.tau**(-1)*self.grad2bar + self.epsilon
        self.hbar *= (1 - self.tau**(-1))
        self.hbar += self.tau**(-1)*2*self.grad2bar + self.epsilon
        tmp = self.gbar**2/self.vbar
        # update memory size
        self.tau = (1-tmp)*self.tau+1 + self.epsilon
        # update learning rate
        eta = tmp/self.hbar
        # update weight
        self.update_weight(y, m, eta)

    def update_weight(self, y, m, eta):
        loss = self.calc_loss(m)
        if loss > 0.0:  # update only when the loss is positive
            delta = self.calc_dloss(m)*self.features
            self.weight -= eta*y*delta
            self.update_t += 1

    def save_model(self, ofname, w):
        with open(ofname, 'w') as f:
            # write the type of the loss function
            f.write(self.loss_type+'\n')
            # write the weights
            weight = [str(x).encode('utf-8') for x in w]
            f.write(' '.join(weight)+'\n')
            # write the feature dimension
            f.write(str(self.feat_dim).encode('utf-8'))
```
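A usage sketch for `SgdTrainWithAutoLR` (the code above is Python 2): the file name and the arguments are placeholders, the input is expected as LIBSVM-style `label index:value` lines, and `gamma`/`lambd` are stored but not actually used by the automatic learning rate.

```python
# Hypothetical usage; file names and values are placeholders.
trainer = SgdTrainWithAutoLR('a9a.train', feat_dim=15, loss_type='log')
w = trainer.train(gamma=0.1, lambd=0.0)
trainer.save_model('sgd_auto_lr.model', w)
```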
```python
# coding: utf-8
import numpy as np
import math

class SgdTrainWithAutoNormarize(object):

    def __init__(self, fname, feat_dim, loss_type):
        self.fname = fname              # input file name
        self.weight = None              # feature weights
        self.feat_dim = 2**feat_dim     # max size of feature vector
        self.bitmask = 2**feat_dim - 1  # bitmask for feature hashing
        self.loss_type = loss_type      # type of loss function
        self.eta = 10.0                 # learning rate
        self.features = None
        self.G = np.zeros(self.feat_dim, dtype=np.float64)
        self.t = 1                      # number of processed examples
        self.epsilon = 10**(-9)
        self.maxfeature = np.zeros(self.feat_dim, dtype=np.float64)
        self.N = 0

    def train(self, gamma, lambd):
        self.gamma = gamma
        self.lambd = lambd
        self.initialize()
        with open(self.fname, 'r') as trainf:
            for line in trainf:
                y = line.strip().split(' ')[0]
                self.features = self.get_features(line.strip().split(' ')[1:])
                y = int(-1) if int(y) <= 0 else int(1)  # map labels to +1 (positive) / -1 (negative)
                # predict the margin
                pred = self.predict(self.weight, self.features)
                # normalize and update the weights
                self.NAG(y, y*pred)
                self.t += 1
        print self.weight
        return self.weight

    def initialize(self):
        self.weight = np.zeros(self.feat_dim, dtype=np.float64)

    def get_features(self, data):
        features = np.zeros(self.feat_dim, dtype=np.float64)
        for kv in data:
            k, v = kv.strip().split(':')
            features[int(k) & self.bitmask] += float(v)
        return features

    def NG(self, y, m):
        """Normalized gradient (NG) update: normalize, then take an SGD step."""
        idx = np.where((np.abs(self.features)-self.maxfeature) > 0)
        # rescale weights whose feature scale grew
        self.weight[idx] *= self.maxfeature[idx]**2/(np.abs(self.features[idx])+self.epsilon)**2
        # update the per-feature max values
        self.maxfeature[idx] = np.abs(self.features[idx])
        # update the normalization coefficient
        self.N += sum(self.features**2/(self.maxfeature+self.epsilon)**2)
        # weight update
        loss = self.calc_loss(m)
        if loss > 0.0:
            grad = y*self.calc_dloss(m)*self.features
            self.weight -= self.eta*self.t/self.N*1/(self.maxfeature+self.epsilon)**2*grad

    def NAG(self, y, m):
        """Normalized adaptive gradient (NAG) update: normalize, then take an AdaGrad-style step."""
        idx = np.where((np.abs(self.features)-self.maxfeature) > 0)
        # rescale weights whose feature scale grew
        self.weight[idx] *= self.maxfeature[idx]/(np.abs(self.features[idx])+self.epsilon)
        # update the per-feature max values
        self.maxfeature[idx] = np.abs(self.features[idx])
        # update the normalization coefficient
        self.N += sum(self.features**2/(self.maxfeature+self.epsilon)**2)
        # gradient of the weights
        grad = y*self.calc_dloss(m)*self.features
        self.G += grad**2
        self.weight -= self.eta*math.sqrt(self.t/self.N)* \
            1/(self.maxfeature+self.epsilon)/np.sqrt(self.G+self.epsilon)*grad

    def predict(self, w, features):
        ez = np.dot(w, features)
        # return 1/(1+math.exp(-ez))  # not passing through the logistic function
        return ez

    def calc_loss(self, m):  # m = py = wxy
        if self.loss_type == 'hinge':
            return max(0, 1-m)
        elif self.loss_type == 'log':
            if m <= -700: m = -700
            return math.log(1+math.exp(-m))

    # gradient of loss function
    def calc_dloss(self, m):  # m = py = wxy
        if self.loss_type == 'hinge':
            res = -1.0 if (1-m) > 0 else 0.0  # 0 when the loss is 0, otherwise d(loss)/dm = -1
            return res
        elif self.loss_type == 'log':
            if m < 0.0:
                return float(-1.0) / (math.exp(m) + 1.0)  # -e^(-m)/(1+e^(-m)), numerically stable for m < 0
            else:
                ez = float(math.exp(-m))
                return -ez / (ez + 1.0)                   # same value, numerically stable for m >= 0

    def update_weight(self, y, m):
        """Update the weights after normalization."""
        loss = self.calc_loss(m)
        if loss > 0.0:
            grad = y*self.calc_dloss(m)*self.features
            self.weight -= self.eta*self.t/self.N*1/(self.maxfeature+self.epsilon)**2*grad

    def save_model(self, ofname, w):
        with open(ofname, 'w') as f:
            # write the type of the loss function
            f.write(self.loss_type+'\n')
            # write the weights
            weight = [str(x).encode('utf-8') for x in w]
            f.write(' '.join(weight)+'\n')
            # write the feature dimension
            f.write(str(self.feat_dim).encode('utf-8'))
```
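Usage is the same as above (placeholders again); the normalization state (`maxfeature`, `N`, `G`) is accumulated on the fly as the data streams in, so no separate preprocessing pass is needed.

```python
# Hypothetical usage; file names and values are placeholders.
trainer = SgdTrainWithAutoNormarize('covtype.train', feat_dim=10, loss_type='hinge')
w = trainer.train(gamma=0.1, lambd=0.0)  # gamma/lambd are accepted but unused by NAG
trainer.save_model('sgd_auto_norm.model', w)
```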
SCW
```python
# coding: utf-8
import numpy as np
import math
from scipy.stats import norm

class ScwTrain(object):

    def __init__(self, fname, feat_dim, loss_type, eta, C):
        self.fname = fname              # input file name
        self.feat_dim = 2**feat_dim     # max size of feature vector
        self.bitmask = 2**feat_dim - 1  # bitmask for feature hashing
        self.loss_type = loss_type      # type of loss function
        self.lambd = None               # regularization
        self.gamma = None               # learning rate
        self.t = 1                      # number of updates
        self.features = np.zeros(self.feat_dim, dtype=np.float64)
        self.mean_weight = np.zeros(self.feat_dim, dtype=np.float64)
        self.sigma_weight = np.ones(self.feat_dim, dtype=np.float64)
        self.alpha = 0
        self.beta = 0
        self.sai = None
        self.pusai = None
        self.u = None
        self.v = None
        self.eta = eta
        self.phai = norm.ppf(eta)       # inverse normal CDF of the confidence parameter eta
        self.C = C

    def train(self):
        with open(self.fname, 'r') as trainf:
            ex_num = 0
            count_y = [0.0, 0.0]
            for line in trainf:
                y = line.strip().split(' ')[0]
                features = line.strip().split(' ')[1:]
                y = int(-1) if int(y) <= 0 else int(1)
                # predict the margin
                pred = self.predict(self.mean_weight, features)
                # calculate vt = x^T Sigma x
                vt = self.calc_v()
                ex_num += 1
                if self.calc_loss(y) > 0:
                    # update the mean and covariance
                    self.update_param(y, y*pred, vt)
                if y == -1: count_y[0] += 1
                else: count_y[1] += 1
        print self.mean_weight
        print "data num=", ex_num
        print "update time=", self.t
        print "count_y=", count_y
        return self.mean_weight

    def initialize(self):
        w_init = np.zeros(self.feat_dim, dtype=np.float64)
        return w_init

    def update_param(self, y, m, v):
        nt = v+float(0.5)/self.C
        gmt = self.phai*math.sqrt((self.phai*m*v)**2+4.0*nt*v*(nt+v*self.phai**2))
        self.alpha = max(0.0, (-(2.0*m*nt+self.phai**2*m*v)+gmt)/(2.0*(nt**2+nt*v*self.phai**2)))
        u = 0.25*(-self.alpha*v*self.phai+((self.alpha*v*self.phai)**2 + 4.0*v)**0.5)**2
        self.beta = self.alpha*self.phai/(u**0.5 + v*self.alpha*self.phai)
        self.mean_weight += self.alpha*y*self.sigma_weight*self.features
        self.sigma_weight -= self.beta*(self.sigma_weight*self.features)**2
        self.t += 1

    def predict(self, w, features):
        val = 0.0
        self.features = np.zeros(self.feat_dim, dtype=np.float64)
        if w is not None:
            for feature in features:
                k, v = feature.strip().split(':')
                val += w[int(k) & self.bitmask] * float(v)
                self.features[int(k) & self.bitmask] += float(v)
        return val

    def calc_v(self):
        return np.dot(self.sigma_weight, self.features**2)

    def calc_loss(self, y):
        """Loss function"""
        res = self.phai * math.sqrt(np.dot(self.sigma_weight, self.features**2)) \
            - y * np.dot(self.features, self.mean_weight)  # sigma stores only the diagonal components
        return res

    def save_model(self, ofname, w):
        with open(ofname, 'w') as f:
            f.write(self.loss_type+'\n')
            weight = [str(x).encode('utf-8') for x in w]
            f.write(' '.join(weight)+'\n')
            f.write(str(self.feat_dim).encode('utf-8'))
```
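A usage sketch for SCW (placeholders once more). `eta` is the confidence parameter fed to `norm.ppf` and `C` trades off aggressiveness against regularization; `train()` takes no arguments because everything is set in the constructor.

```python
# Hypothetical usage; file name and hyperparameter values are placeholders.
scw = ScwTrain('a9a.train', feat_dim=15, loss_type='scw', eta=0.9, C=1.0)
w = scw.train()
scw.save_model('scw.model', w)
```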
Naive Bayes
Naive Bayes is simple, so I wrote the training and the test together in one class lol
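Concretely, the class below binarizes each feature into "appeared or not" and picks the class with the larger log score (an `epsilon` is added inside the logs to avoid log 0):

$$
\hat{y} = \underset{y \in \{-1,+1\}}{\arg\max} \left[ \log P(y) + \sum_{i:\, x_i \neq 0} \log P(x_i \mid y) \right]
$$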
```python
# coding: utf-8
import numpy as np
import math

class NB(object):

    def __init__(self, fname, feat_dim):
        self.fname = fname              # input file name
        self.feat_dim = 2**feat_dim     # max size of feature vector
        self.bitmask = 2**feat_dim - 1  # bitmask for feature hashing
        self.t = 1                      # update count
        self.t_select = 1               # number of selected samples
        self.epsilon = 10**(-9)
        self.N = 0.0                    # total number of examples
        self.Ny = np.zeros(2, dtype=np.float64)                  # counts for y=-1 and y=1
        self.Nxy = np.zeros((2, 2**feat_dim), dtype=np.float64)  # co-occurrence counts of each feature with each y
        self.propy = None
        self.propxy = None

    def train(self):
        with open(self.fname, 'r') as trainf:
            for line in trainf:
                y = line.strip().split(' ')[0]
                self.features = self.get_features(line.strip().split(' ')[1:])
                #y = int(-1) if int(y) <= 0 else int(1)
                y = int(-1) if int(y) <= 1 else int(1)  # map labels to -1/+1 (note: <=1 here, unlike <=0 in test())
                # count the number of examples
                self.N += 1
                # count the occurrences of y
                self.increment_y(y)
                # count the co-occurrences of x and y
                self.increment_xy(y)
        # compute the log probabilities used for prediction
        self.propy = np.log(self.pred_y()+self.epsilon)
        self.propxy = np.log(self.pred_xy()+self.epsilon)
        for i in xrange(len(self.propy)):
            print self.propxy[i]

    def test(self, ifname):
        with open(ifname, 'r') as testf:
            ans = np.zeros(4, dtype=np.int64)
            for line in testf:
                y = line.strip().split(' ')[0]
                features = self.get_features(line.strip().split(' ')[1:])
                y = int(-1) if int(y) <= 0 else int(1)
                res = self.test_pred(features)
                # accumulate the confusion matrix
                ans = self.get_ans(ans, y, res)
        print ans

    def get_features(self, data):
        features = np.zeros(self.feat_dim)
        for kv in data:
            k, v = kv.strip().split(':')
            features[int(k) & self.bitmask] += float(v)
        return features

    def increment_y(self, y):
        if y == -1: self.Ny[0] += 1.0
        else: self.Ny[1] += 1.0

    def increment_xy(self, y):
        if y == -1:
            self.Nxy[0] += (self.features != 0)*1.0
        else:
            self.Nxy[1] += (self.features != 0)*1.0

    def test_pred(self, features):
        res = np.zeros(2, dtype=np.float64)
        for i in xrange(len(self.Ny)):
            res[i] = self.propy[i] \
                + sum(self.propxy[i]*((features != 0)*1.0))
        if res[0] > res[1]:
            return -1
        else:
            return 1

    def predict(self):
        res = np.zeros(2, dtype=np.float64)
        predy = np.log(self.pred_y())   # log probability of y
        predx = np.log(self.pred_xy())  # log probability of x conditioned on y
        res = np.zeros(2, dtype=np.float64)
        for i in xrange(len(self.Ny)):
            res[i] = predy[i]+sum(predx[i])
        if res[0] > res[1]:
            return -1
        else:
            return 1

    def pred_y(self):
        return self.Ny/self.N

    def pred_xy(self):
        res = np.zeros((2, self.feat_dim), dtype=np.float64)
        for i in xrange(len(self.Ny)):
            if self.Ny[i] == 0.0:
                res[i] = 0.0
            else:
                res[i] = self.Nxy[i]/self.Ny[i]
        return res

    def get_ans(self, ans, y, res):
        if y == 1 and res == 1:      # true positive
            ans[0] += 1
        elif y == 1 and res == -1:   # false negative
            ans[1] += 1
        elif y == -1 and res == 1:   # false positive
            ans[2] += 1
        else:                        # true negative
            ans[3] += 1
        return ans

if __name__ == '__main__':
    trfname = 'training data file name'
    tefname = 'test data file name'
    nb = NB(trfname, 6)
    nb.train()
    nb.test(tefname)
```
The data used in the experiments were a9a and covtype.binary from the LIBSVM datasets. a9a has binary features, while covtype has features whose scales differ by a factor of about 6,000. When feeding continuous values to NB, any value of 1 or more was simply quantized to 1. Also, since covtype.binary has no test set, 400,000 examples were sampled and used as training data. Hyperparameters were not tuned in any particular way, and I did not iterate over the data (a single pass only).
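The preparation of covtype.binary was roughly along the lines of the sketch below (a reconstruction after the fact, not the exact script); for NB nothing extra is needed, since the class above only checks whether a feature is non-zero, which effectively quantizes any value of 1 or more to 1.

```python
# Rough reconstruction of the data preparation; file names are placeholders.
import random

with open('covtype.libsvm.binary', 'r') as f:
    lines = f.readlines()
random.shuffle(lines)                # online results depend on the input order
with open('covtype.train', 'w') as f:
    f.writelines(lines[:400000])     # 400,000 examples for training
with open('covtype.test', 'w') as f:
    f.writelines(lines[400000:])     # remainder kept aside for evaluation (assumption)
```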
The results are as follows. It turns out that normalization matters when the variables have completely different scales. You can also see that SCW appears to converge after relatively few samples. In addition, since the result of online learning depends on the order in which the data arrives, I shuffled the data and reran the experiments, this time only for SCW and SGD with automatic normalization. SGD fluctuated by at most about 3%, so its accuracy was fairly stable, whereas SCW fluctuated by up to 15%; with only a single pass over the data it seems rather sensitive to the input order. Incidentally, this point also came up at CROSS 2014 (Machine Learning CROSS, first-half material).
By the way, normalized SGD, lasso, ridge, and so on are all implemented in Vowpal Wabbit, so I recommend just using Vowpal Wabbit. It is also blazingly fast.
If you spot any mistakes, please let me know.