[PYTHON] Continuous space topic model implementation

0. Continuous Space Topic Model (CSTM)

The other day, at a seminar at the Institute of Statistical Mathematics, I learned about the continuous space topic model. Whereas conventional Latent Dirichlet Allocation is, as a generative model, a mixture model (a sum of topic distributions), the continuous space topic model is a product model in the spirit of an RBM. According to the paper, it performs better than LDA.

See the following paper for details: http://chasen.org/~daiti-m/paper/nl213cstm.pdf
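To fix notation before the code (this is my reading of the paper, matched against the implementation below, so the notation may differ slightly from the paper's): each word $w$ has a base probability $G_0(w)$, its relative frequency over the whole corpus, and an embedding $\mathbf{u}_w$; each document $d$ has an embedding $\mathbf{v}_d$. The word-count vector $\mathbf{n}_d$ of document $d$ then follows a Polya (Dirichlet compound multinomial) distribution whose parameters are modulated by the inner product of the embeddings:

\alpha_{dw} = \alpha_0 \, G_0(w) \, \exp\left(\mathbf{u}_w^{\top} \mathbf{v}_d\right)

p(\mathbf{n}_d \mid \boldsymbol{\alpha}_d) = \frac{\Gamma\left(\sum_w \alpha_{dw}\right)}{\Gamma\left(\sum_w \alpha_{dw} + N_d\right)} \prod_w \frac{\Gamma(\alpha_{dw} + n_{dw})}{\Gamma(\alpha_{dw})}

where $N_d$ is the total word count of document $d$ (the multinomial coefficient is omitted since it does not depend on the parameters). The log_gamma_div_* helpers below evaluate the building block log(Γ(α + n) / Γ(α)) = Σ_{i=0}^{n-1} log(α + i) without calling the gamma function itself.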

It looked interesting, so I decided to implement it and verify it.

1. Implementation

My first implementation ran on the CPU, but one epoch took more than 10 hours on a corpus of 1,000 documents and 8,000 vocabulary words. After porting it to the GPU with cupy, one epoch takes about an hour.

cstm_gpu.py


import numpy as np
from chainer import cuda

xp = cuda.cupy

# Computes log( Γ(α + n) / Γ(α) ) element-wise, using
#   Γ(α + n) / Γ(α) = (α + (n-1)) * (α + (n-2)) * ... * (α + 1) * α
# and working in log space, because the raw product quickly overflows.
# alpha_arr:  parameters, on the GPU
# n_arr:      word counts, on the CPU
# n_arr_gpu:  the same counts, already copied to the GPU
def log_gamma_div_gpu(alpha_arr, n_arr, n_arr_gpu):
    n_max = np.max(n_arr)
    alpha_arr_tmp = xp.copy(alpha_arr)
    # Where n == 0 the result must be log(1) = 0, so replace α by 1 there
    alpha_arr_tmp[np.where(n_arr == 0)] = 1
    log_sum_arr = xp.log(alpha_arr_tmp)
    for i in range(1, int(n_max)):
        # Elements with n > i still need the factor (α + i); the rest get 1
        n_tmp = xp.where(n_arr_gpu > i + 0.5, i, 0)
        alpha_tmp = xp.where(n_tmp < 0.5, 1, alpha_arr_tmp + n_tmp)
        log_sum_arr += xp.log(alpha_tmp)
    return log_sum_arr

# CPU version of the same computation; also accepts scalar inputs
def log_gamma_div_cpu(alpha_arr, n_arr):
    if not isinstance(alpha_arr, np.float64):
        alpha_arr_cpu = cuda.to_cpu(alpha_arr)
    else:
        alpha_arr_cpu = alpha_arr
    n_max = np.max(n_arr)
    # α is zero exactly where n == 0 (see calculate_prob_log); map it to 1
    alpha_arr_tmp_cpu = np.where(alpha_arr_cpu != 0, alpha_arr_cpu, 1)
    log_sum_arr_cpu = np.log(alpha_arr_tmp_cpu)
    for i in range(1, int(n_max)):
        n_tmp = np.where(n_arr > i, i, 0)
        alpha_tmp_cpu = np.where(n_tmp == 0, 1, alpha_arr_tmp_cpu + n_tmp)
        log_sum_arr_cpu += np.log(alpha_tmp_cpu)
    return log_sum_arr_cpu

class AlphaClass:
    # Holds the base measure G0 (the empirical word distribution) on the GPU
    def __init__(self, G0):
        self.G0 = cuda.to_gpu(G0)

    # Compute α for every (document, word) pair
    # Return value shape: (number of documents, vocabulary size)
    def calculate_alpha(self, alpha0, w_emb, d_emb):
        tmp = d_emb.dot(w_emb.transpose())
        alpha = alpha0 * self.G0 * xp.exp(tmp)
        return alpha

class ProbClass:
    def __init__(self, n):
        self.n = n                    # document-word count matrix, on the CPU
        self.n_gpu = cuda.to_gpu(n)   # the same matrix on the GPU

    # Compute the joint log probability of each document, for all documents at once
    # Return value shape: (number of documents,)
    def calculate_prob_log(self, alpha):
        alpha2 = xp.copy(alpha)
        alpha2[np.where(self.n == 0)] = 0
        tmp1 = log_gamma_div_cpu(xp.sum(alpha2, axis=1), np.sum(self.n, axis=1))
        tmp2 = xp.sum(log_gamma_div_gpu(alpha2, self.n, self.n_gpu), axis=1)
        prob = xp.asarray(-tmp1) + tmp2
        return prob

# CPU version of calculate_alpha for a single document embedding d
def calculate_alpha_index(alpha0, G0, w_emb, d):
    tmp = d.dot(w_emb.transpose())
    alpha = alpha0 * G0 * np.exp(tmp)
    return alpha


# Compute the joint log probability of a single document
# Return value: scalar
def calculate_prob_log_index(alpha, n):
    alpha2 = np.copy(alpha)
    alpha2[np.where(n == 0)] = 0
    tmp1 = log_gamma_div_cpu(np.sum(alpha2), np.sum(n))
    tmp2 = np.sum(log_gamma_div_cpu(alpha2, n))
    prob = -tmp1 + tmp2
    return prob

# Metropolis-Hastings acceptance test: always accept when the likelihood
# improves, otherwise accept with probability exp(log_ll_new - log_ll_old)
def mh_accept(log_ll_old, log_ll_new):
    if log_ll_old < log_ll_new:
        return 1
    else:
        p = np.exp(cuda.to_cpu(log_ll_new - log_ll_old))
        return np.random.binomial(1, p)
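
Before moving on to the training script, a quick smoke test of the module on random data might look like this (my own sketch, assuming the definitions above are in scope):

# 5 documents, 20 vocabulary words, 3-dimensional embeddings
n = np.random.randint(1, 5, size=(5, 20)).astype(np.float64)
G0 = n.sum(axis=0) / n.sum()   # empirical word distribution
alphaclass = AlphaClass(G0)
probclass = ProbClass(n)
w_emb = xp.asarray(np.random.randn(20, 3))
d_emb = xp.asarray(np.random.randn(5, 3))
alpha = alphaclass.calculate_alpha(1.0, w_emb, d_emb)   # shape (5, 20)
print(probclass.calculate_prob_log(alpha))              # per-document log likelihoods, shape (5,)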

train_cstm_gpu.py


import time
import numpy as np
import six
from chainer import cuda
import cstm_gpu as cstm   # the module defined above (name assumed from the file name)

xp = cuda.cupy

alpha0 = 1.0
w_emb = np.random.randn(n_vocab, args.unit)   # word embeddings, shape (vocabulary size, units)
d_emb = np.random.randn(n_wid, args.unit)     # document embeddings, shape (number of documents, units)
wordcntbydoc = ...  # count of each word in each document, shape (n_wid, n_vocab)
G0 = ...            # count of each word over all documents / total word count
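# --- Illustrative sketch (not in the original): one way to build the two
# --- arrays above, assuming `docs` is a list of documents, each given as a
# --- list of integer word ids in [0, n_vocab):
# wordcntbydoc = np.zeros((n_wid, n_vocab))
# for di, doc in enumerate(docs):
#     for wid in doc:
#         wordcntbydoc[di, wid] += 1
# G0 = wordcntbydoc.sum(axis=0) / wordcntbydoc.sum()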

sigma_doc = 0.01
sigma_word = 0.02
sigma_alpha = 0.2

alphaclass = cstm.AlphaClass(G0)
probclass = cstm.ProbClass(wordcntbydoc)

def calculate_prob_log_all(alpha0,w,d):
    alpha = alphaclass.calculate_alpha(alpha0,w,d)
    prob = probclass.calculate_prob_log(alpha)
    return prob

prob = calculate_prob_log_all(alpha0,xp.asarray(w_emb),xp.asarray(d_emb))
print("prob={}".format(xp.sum(prob)))


begin_time = time.time()
for epoch in range(0, args.epoch):
    logwrite('epoch=' + str(epoch))
    w_perm = np.random.permutation(n_vocab)
    d_perm = np.random.permutation(n_wid)

    # Metropolis-Hastings updates of the document embeddings, one document at a time
    for i in d_perm:
        d_i = d_emb[i] + sigma_doc * np.random.randn(args.unit)
        alpha_new = cstm.calculate_alpha_index(alpha0, G0, w_emb, d_i)
        prob_new = cstm.calculate_prob_log_index(alpha_new, wordcntbydoc[i])
        # prob[i] holds the current log likelihood of document i
        new_flag = cstm.mh_accept(prob[i], prob_new)
        if new_flag == 1:
            d_emb[i] = d_i
            prob[i] = prob_new
        logwrite("  doc i={}:ll={}".format(i, xp.sum(prob)))


    end_time = time.time()
    duration = end_time - begin_time

    prob = calculate_prob_log_all(xp.asarray(alpha0),xp.asarray(w_emb),xp.asarray(d_emb))
    logwrite("doc result={}".format(xp.sum(prob)))
    logwrite('doc: {:.2f} sec'.format(duration))
    begin_time = time.time()

    # Metropolis-Hastings updates of the word embeddings, one word at a time
    for i in w_perm:
        w_emb_new = np.copy(w_emb)
        w_emb_new[i] = w_emb[i] + sigma_word * np.random.randn(args.unit)
        prob_new = calculate_prob_log_all(xp.asarray(alpha0), xp.asarray(w_emb_new), xp.asarray(d_emb))

        new_flag = cstm.mh_accept(xp.sum(prob), xp.sum(prob_new))
        if new_flag == 1:
            prob = xp.copy(prob_new)
            w_emb = w_emb_new
        logwrite("  word i={}:ll={}".format(i, xp.sum(prob)))
    
    end_time = time.time()
    duration = end_time - begin_time

    prob = calculate_prob_log_all(xp.asarray(alpha0),xp.asarray(w_emb),xp.asarray(d_emb))
    logwrite("word result={}".format(xp.sum(prob)))
    logwrite('word: {:.2f} sec'.format(duration))
    begin_time = time.time()

    # Metropolis-Hastings update of the global scale α0 (log-normal random-walk proposal)
    z = sigma_alpha * np.random.randn(1)
    alpha0_new = alpha0 * np.exp(z)
    prob_new = calculate_prob_log_all(xp.asarray(alpha0_new), xp.asarray(w_emb), xp.asarray(d_emb))
    new_flag = cstm.mh_accept(xp.sum(prob), xp.sum(prob_new))
    if new_flag == 1:
        alpha0 = alpha0_new
        prob = xp.copy(prob_new)
    logwrite("  alpha0 : ll={} alpha0={}".format(xp.sum(prob), alpha0))

    end_time = time.time()
    duration = end_time - begin_time

    prob = calculate_prob_log_all(xp.asarray(alpha0),xp.asarray(w_emb),xp.asarray(d_emb))
    logwrite("alpha0 result={}".format(xp.sum(prob)))
    logwrite('alpha0: {:.2f} sec'.format(duration))
    begin_time = time.time()

    logwrite('Save to pkl epoch=' + str(epoch) )

    result_all = [alpha0,w_emb,d_emb]
    file1 = resultdir + "/result_epoch_" + "{0:0>3}".format(epoch) + ".pkl"
    with open(file1, 'wb') as output1:
        six.moves.cPickle.dump(result_all, output1, -1)
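
To analyze the learned embeddings later, the pickle written above can be read back in; a minimal sketch (the file name follows the "{0:0>3}" pattern used when saving):

import six

with open(resultdir + "/result_epoch_000.pkl", 'rb') as input1:
    alpha0, w_emb, d_emb = six.moves.cPickle.load(input1)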

The gamma function calculation ended up rather brute-force in order to run fast on the GPU. Now that a first version is implemented, the question is whether the values the code produces really match the formulas in the paper, which I would like to verify with a confirmation test.
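As one such check, log_gamma_div_cpu should agree with gammaln(α + n) − gammaln(α) from SciPy. A minimal sketch of that comparison, assuming SciPy is installed and the module is importable as cstm_gpu:

import numpy as np
from scipy.special import gammaln
from cstm_gpu import log_gamma_div_cpu   # module name assumed

# Random positive parameters and word counts (n >= 1 to skip the n == 0 special case)
alpha = np.random.gamma(1.0, 1.0, size=100)
n = np.random.randint(1, 50, size=100).astype(np.float64)

expected = gammaln(alpha + n) - gammaln(alpha)   # log( Γ(α + n) / Γ(α) )
actual = log_gamma_div_cpu(alpha, n)
print(np.allclose(actual, expected))             # should print True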
