The other day, I was taught a continuous space topic model at a seminar at the Institute of Statistical Mathematics. Compared to the generative model in which the conventional Latent Dirichlet Allocation uses a mixed model (sum model) The continuous space topic model is a generative model using RBM (product model). According to the paper, the performance is better than LDA.
See the following paper for details http://chasen.org/~daiti-m/paper/nl213cstm.pdf
It looks interesting, so I would like to implement it and verify it.
Initially it was implemented on the CPU, but it took more than 10 hours for 1 epoch with 1000 sentences and 8000 words. It was re-executed on GPU (cupy), and it became about 1 epoch 1 hour.
cstm_gpu.py
import numpy as np
from chainer import cuda
xp = cuda.cupy
# Γ(α + n) / Γ(α) = (α + (n-1) ) * (α + (n-2)) * ... * (α + 1) * α
#Log because the value will be large
# alpha_arr GPU
# n_arr CPU
# n_arr_gpu GPU
def log_gamma_div_gpu(alpha_arr, n_arr, n_arr_gpu):
n_max = np.max(n_arr)
alpha_arr_tmp = xp.copy(alpha_arr)
alpha_arr_tmp[np.where(n_arr==0)] = 1
log_sum_arr = xp.log(alpha_arr_tmp)
for i in range(1,int(n_max)):
n_tmp = xp.where(n_arr_gpu>i+0.5,i,0)
alpha_tmp = xp.where(n_tmp < 0.5, 1 , alpha_arr_tmp + n_tmp)
log_sum_arr += xp.log(alpha_tmp)
return log_sum_arr
def log_gamma_div_cpu(alpha_arr, n_arr):
if isinstance(alpha_arr, np.float64) != 1:
alpha_arr_cpu = cuda.to_cpu(alpha_arr)
else:
alpha_arr_cpu = alpha_arr
n_max = np.max(n_arr)
alpha_arr_tmp_cpu = np.where(alpha_arr_cpu!=0,alpha_arr_cpu,1)
log_sum_arr_cpu = np.log(alpha_arr_tmp_cpu)
for i in range(1,int(n_max)):
n_tmp = np.where(n_arr>i,i,0)
alpha_tmp_cpu = np.where(n_tmp==0, 1 , alpha_arr_tmp_cpu + n_tmp)
log_sum_arr_cpu += np.log(alpha_tmp_cpu)
return log_sum_arr_cpu
class AlphaClass:
#Substitute learning variables
def __init__(self,G0):
self.G0 = cuda.to_gpu(G0)
#Calculate α
#Return value shape(Number of sentences,Number of words)
def calculate_alpha(self,alpha0,w_emb,d_emb):
tmp = d_emb.dot(w_emb.transpose())
alpha = alpha0 * self.G0 * xp.exp(tmp)
return alpha
class ProbClass:
def __init__(self, n):
self.n = n
self.n_gpu = cuda.to_gpu(n)
#Calculate simultaneous probability for each document for all documents
#Return value shape(Number of sentences,)
def calculate_prob_log(self,alpha):
alpha2 = xp.copy(alpha)
alpha2[np.where(self.n==0)] = 0
tmp1 = log_gamma_div_cpu(xp.sum(alpha2,axis=1),np.sum(self.n,axis=1))
tmp2 = xp.sum(log_gamma_div_gpu(alpha2,self.n,self.n_gpu),axis=1)
prob = xp.asarray(-tmp1) + tmp2
return prob
def calculate_alpha_index(alpha0,G0,w_emb,d):
tmp = d.dot(w_emb.transpose())
alpha = alpha0 * G0 * np.exp(tmp)
return alpha
#Calculate simultaneous probability for each document
#Return value value
def calculate_prob_log_index(alpha,n):
alpha2 = np.copy(alpha)
alpha2[np.where(n==0)] = 0
tmp1 = log_gamma_div_cpu(np.sum(alpha2),np.sum(n))
tmp2 = np.sum(log_gamma_div_cpu(alpha2,n))
prob = - tmp1 + tmp2
return prob
def mh_accept(log_ll_old,log_ll_new):
if log_ll_old < log_ll_new:
return 1
else:
p = np.exp(cuda.to_cpu(log_ll_new-log_ll_old))
return np.random.binomial(1,p)
train_cstm_gpu.py
alpha0 = 1.0
w_emb = np.random.randn(Number of words,Number of units)
d_emb = np.random.randn(Number of sentences,Number of units)
wordcntbydoc =Number of frequent words in each sentence
G0 =Number of frequent words in all sentences/Sum of the number of frequent words
sigma_doc = 0.01
sigma_word = 0.02
sigma_alpha = 0.2
alphaclass = cstm.AlphaClass(G0)
probclass = cstm.ProbClass(wordcntbydoc)
def calculate_prob_log_all(alpha0,w,d):
alpha = alphaclass.calculate_alpha(alpha0,w,d)
prob = probclass.calculate_prob_log(alpha)
return prob
prob = calculate_prob_log_all(alpha0,xp.asarray(w_emb),xp.asarray(d_emb))
print("prob={}".format(xp.sum(prob)))
begin_time = time.time()
for epoch in range(0,args.epoch):
logwrite('epoch=' + str(epoch))
w_perm = np.random.permutation(n_vocab)
d_perm = np.random.permutation(n_wid)
for i in d_perm:
d_i = d_emb[i] + sigma_doc * np.random.randn(args.unit)
alpha_new = cstm.calculate_alpha_index(alpha0,G0,w_emb,d_i)
prob_new = cstm.calculate_prob_log_index(alpha_new, wordcntbydoc[i])
new_flag = cstm.mh_accept(prob_old,prob_new)
if new_flag == 1:
d_emb[i] = d_i
prob[i] = prob_new
logwrite(" doc i={}:ll={}".format(i,xp.sum(prob)))
end_time = time.time()
duration = end_time - begin_time
prob = calculate_prob_log_all(xp.asarray(alpha0),xp.asarray(w_emb),xp.asarray(d_emb))
logwrite("doc result={}".format(xp.sum(prob)))
logwrite('doc: {:.2f} sec'.format(duration))
begin_time = time.time()
for i in w_perm:
w_emb_new = xp.copy(w_emb)
w_emb_new[i] = w_emb[i] + sigma_word * np.random.randn(args.unit)
prob_new = calculate_prob_log_all(xp.asarray(alpha0),xp.asarray(w_emb_new),xp.asarray(d_emb))
new_flag = cstm.mh_accept(xp.sum(prob),xp.sum(prob_new))
if new_flag == 1:
prob = xp.copy(prob_new)
w_emb = np.copy(w_emb_new)
logwrite(" word i={}:ll={}".format(i,xp.sum(prob)))
end_time = time.time()
duration = end_time - begin_time
prob = calculate_prob_log_all(xp.asarray(alpha0),xp.asarray(w_emb),xp.asarray(d_emb))
logwrite("word result={}".format(xp.sum(prob)))
logwrite('word: {:.2f} sec'.format(duration))
begin_time = time.time()
z = sigma_alpha * np.random.randn(1)
alpha0_new = alpha0 * np.exp(z)
prob_new = calculate_prob_log_all(xp.asarray(alpha0_new),xp.asarray(w_emb),xp.asarray(d_emb))
new_flag = cstm.mh_accept(xp.sum(prob),np.sum(prob_new))
if new_flag == 1:
alpha0 = alpha0_new
prob = xp.copy(prob_new)
logwrite(" alpha0 : ll={} alpha0={}".format(xp.sum(prob),alpha0))
end_time = time.time()
duration = end_time - begin_time
prob = calculate_prob_log_all(xp.asarray(alpha0),xp.asarray(w_emb),xp.asarray(d_emb))
logwrite("alpha0 result={}".format(xp.sum(prob)))
logwrite('alpha0: {:.2f} sec'.format(duration))
begin_time = time.time()
logwrite('Save to pkl epoch=' + str(epoch) )
result_all = [alpha0,w_emb,d_emb]
file1 = resultdir + "/result_epoch_" + "{0:0>3}".format(epoch) + ".pkl"
with open(file1, 'wb') as output1:
six.moves.cPickle.dump(result_all, output1, -1)
The calculation of the gamma function has become forcible in order to speed up with GPU. Since it was implemented once, does the formula of the paper and the value of the source match? I would like to verify while doing a confirmation test.
Recommended Posts