I was interested in automatic text generation with AI, so I tried generating sentences with Keras. Specifically, I fetched tweets from an idol group, trained a model on them, and generated new sentences.
I referred to this article.
Try to generate sentences quickly with Keras LSTM
The code is basically the same, but I'll leave notes on the parts that were a little hard to understand from that article alone.
First is the acquisition of learning data.
import json
import config
from requests_oauthlib import OAuth1Session
from time import sleep
import re
import emoji
from mongo_dao import MongoDAO
# Not used this time
# Remove emoji
def remove_emoji(src_str):
    return ''.join(c for c in src_str if c not in emoji.UNICODE_EMOJI)
# API key setting (defined in another file config.py)
CK = config.CONSUMER_KEY
CS = config.CONSUMER_SECRET
AT = config.ACCESS_TOKEN
ATS = config.ACCESS_TOKEN_SECRET
# Authentication process
twitter = OAuth1Session(CK, CS, AT, ATS)
# Timeline acquisition endpoint
url = "https://api.twitter.com/1.1/statuses/user_timeline.json"
# Acquisition account
necopla_menber = ['@yukino__NECOPLA', '@yurinaNECOPLA', '@riku_NECOPLA', '@miiNECOPLA', '@kaori_NECOPLA', '@sakuraNECOPLA', '@miriNECOPLA', '@renaNECOPLA']
# Parameter definition
params = {'q': '-filter:retweets',
          'max_id': 0,  # ID to start getting from
          'count': 200}
# arg1: DB Name
# arg2: Collection Name
mongo = MongoDAO("db", "necopla_tweets")
mongo.delete_many({})
regex_twitter_account = '@[0-9a-zA-Z_]+'
for menber in necopla_menber:
    print(menber)
    params.pop('max_id', None)  # Clear the ID to start acquisition from (pop avoids a KeyError if it is already gone)
    # The first request gets the latest 200 tweets; from the second request onwards, get tweets older than the ID set in params['max_id']
    for j in range(100):
        params['screen_name'] = menber
        res = twitter.get(url, params=params)
        if res.status_code == 200:
            # Remaining API calls
            limit = res.headers['x-rate-limit-remaining']
            print("API remain: " + limit)
            if int(limit) <= 1:  # the header value is a string, so compare it as an int
                sleep(60 * 15)
            tweets = json.loads(res.text)
            # Exit the loop if no tweets could be fetched for the account being processed
            if len(tweets) == 0:
                break
            # Process each tweet
            for tweet in tweets:
                # Register the tweet text, skipping retweets and stripping links and account mentions
                if not "RT @" in tweet['text'][0:4]:
                    mongo.insert_one({'tweet': re.sub(regex_twitter_account, '', tweet['text'].split('http')[0]).strip()})
            if len(tweets) >= 1:
                params['max_id'] = tweets[-1]['id'] - 1
For the training data in this article, I collected tweets from the group "// Necopla //". The following elements were removed from the collected tweets.
・Image links
・The account mentions added when replying
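As a quick illustration of that cleanup, here is what the regex substitution and the split on 'http' used in the collection script do to a made-up tweet (the tweet text itself is purely illustrative):

import re

regex_twitter_account = '@[0-9a-zA-Z_]+'

# Made-up example: a reply mention followed by text and an image link
raw = "@yurinaNECOPLA Thank you for coming today! https://t.co/abc123"
cleaned = re.sub(regex_twitter_account, '', raw.split('http')[0]).strip()
print(cleaned)  # -> Thank you for coming today!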
The data to be trained on is stored in mongoDB like this.
{"_id": ObjectId ("5e511a2ffac622266fb5801d"), "tweet": "I went to karaoke to practice solo songs, but I broke my throat normally"}
{"_id": ObjectId ("5e511a2ffac622266fb5801e"), "tweet": "I can still go"}
The source for the mongoDB operations is here.
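That source isn't reproduced in this post, so as a rough idea, a minimal wrapper exposing the methods used above (delete_many, insert_one, find) might look like the sketch below; it assumes pymongo and a MongoDB instance on localhost, which are my assumptions rather than details from the article.

from pymongo import MongoClient

class MongoDAO:
    # Minimal sketch of a MongoDB wrapper (assumed implementation, not the original)
    def __init__(self, db_name, collection_name, host='localhost', port=27017):
        self.client = MongoClient(host, port)
        self.collection = self.client[db_name][collection_name]

    def insert_one(self, document):
        return self.collection.insert_one(document)

    def find(self, query=None):
        return self.collection.find(query or {})

    def delete_many(self, query):
        return self.collection.delete_many(query)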
The training code below is almost the same as the source I referred to.
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import matplotlib.pyplot as plt # add
import numpy as np
import random
import sys
import io
from mongo_dao import MongoDAO
mongo = MongoDAO("db", "necopla_tweets")
results = mongo.find()
text = ''
for result in results:
text += result['tweet']
chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 3
step = 1
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
# Executed at the end of each epoch
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)
    # start_index = random.randint(0, len(text) - maxlen - 1)
    # start_index = 0  # Generate sentences from "The old man was old" every time
    for diversity in [0.2]:  # diversity = 0.2 only
        print('----- diversity:', diversity)
        generated = ''
        # sentence = text[start_index: start_index + maxlen]
        sentence = 'Tomorrow'
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)
        for i in range(120):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            generated += next_char
            sentence = sentence[1:] + next_char
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
# Executed at the end of training
def on_train_end(logs):
    print('----- saving model...')
    model.save_weights("necopla_model" + 'w.hdf5')
    model.save("necopla_model.hdf5")
print_callback = LambdaCallback(on_epoch_end=on_epoch_end,
                                on_train_end=on_train_end)
history = model.fit(x, y,
                    batch_size=128,
                    epochs=5,
                    callbacks=[print_callback])
# Plot training loss
loss = history.history["loss"]
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, "bo", label="Training loss")
plt.title("Training loss")
plt.legend()
plt.savefig("loss.png")
plt.close()
The changes from the referenced article are as follows.
・Added a step that saves the trained model at the end of training (on_train_end)
・Changed the sequence length (maxlen) from 8 to 3
The first change is because I once had the painful experience of a model trained over a whole day never being saved, so I added a save step. The second is that with 8-character sequences, generation did not work well when I seeded it with a short word like "Tomorrow" from the training data; since I want to generate tweets, I trained the model so that it can start from short words.
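To make the second change concrete, this is roughly what the maxlen = 3, step = 1 windowing in the training script produces; the sample string here is made up:

maxlen, step = 3, 1
sample_text = "Tomorrow is live day"  # made-up sample text
pairs = [(sample_text[i:i + maxlen], sample_text[i + maxlen])
         for i in range(0, len(sample_text) - maxlen, step)]
print(pairs[:3])  # [('Tom', 'o'), ('omo', 'r'), ('mor', 'r')]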
The training log looks like this.
Epoch 1/5
663305/663305 [==============================] - 401s 605us/step - loss: 3.5554
----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "Tomorrow"
tomorrow! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !!
I look forward to working with you! !! !!
"Nekopla // Nekopla
# Nekopla
# Nekopla // Nekopla // Nekopra // Nekopra // Nekopra
# Nekopla
# Nekopla // Nekopla is better
Epoch 2/5
663305/663305 [==============================] - 459s 693us/step - loss: 3.2893
----- Generating text after Epoch: 1
----- diversity: 0.2
----- Generating with seed: "Tomorrow"
Here tomorrow! !!
# It's a live performance of NECOPLA!
# There is a live performance of NECOPLA! !!
# Thank you for Nekopla! !! !!
# Thank you for all the information about NECOPLA! !!
# I would be happy if there was a cat plastic ...
# cat
Epoch 3/5
663305/663305 [==============================] - 492s 742us/step - loss: 3.2109
----- Generating text after Epoch: 2
----- diversity: 0.2
----- Generating with seed: "Tomorrow"
Here tomorrow! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Epoch 4/5
663305/663305 [==============================] - 501s 755us/step - loss: 3.1692
----- Generating text after Epoch: 3
----- diversity: 0.2
----- Generating with seed: "Tomorrow"
Here tomorrow! !!
# Nekopla
# Nekopla
# I'm glad you came to the Nekopla people! !!
# Nekopla
# Nekopla
# About Nekopla More More More More More More More More More More More More
Epoch 5/5
663305/663305 [==============================] - 490s 739us/step - loss: 3.1407
----- Generating text after Epoch: 4
----- diversity: 0.2
----- Generating with seed: "Tomorrow"
Come see me tomorrow! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !! !!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Further training using the saved weights. To continue training, just build the model and then load the weights that were saved at the end of the previous run.
The code is here
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
model.load_weights("necopla_modelw.hdf5")
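Once the weights are loaded, additional training is just another fit call. A minimal sketch, reusing x, y and the print_callback defined earlier (the extra epoch count is an arbitrary choice of mine):

# Continue training from the loaded weights; on_train_end will save the model again
history = model.fit(x, y,
                    batch_size=128,
                    epochs=5,
                    callbacks=[print_callback])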
While writing this part, I finally thought about what each piece of the code means (slow, I know). "on_epoch_end" is called at the end of each training epoch, and at that point it generates text using the weights learned so far. So to generate a tweet, you can basically imitate this process.
The code is here
def evaluate_tweet():
    for diversity in [0.2]:  # diversity = 0.2 only
        print('----- diversity:', diversity)
        generated = ''
        sentence = 'Tomorrow'
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sameCharCount = 0
        for i in range(120):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            if next_char == generated[-1]:
                sameCharCount += 1
                if sameCharCount >= 3:
                    continue
            elif sameCharCount != 0:
                sameCharCount = 0
            generated += next_char
            sentence = sentence[1:] + next_char
    return generated
for i in range(10):
    tweet = evaluate_tweet()
    print('----------------' + str(i + 1) + 'th time ----------------')
    print(tweet)
When generating a tweet, if the same character would repeat three or more times in a row, it is not appended. Tweets are generated from sentences starting with the seed word "Tomorrow".
The output result looks like this.
---------------- First time ----------------
Here tomorrow! !! !!
---------------- Second time ----------------
Here tomorrow! !! !!
---------------- Third time ----------------
I wonder if we can meet tomorrow! !! !!
---------------- 4th ----------------
Here tomorrow! !! !!
---------------- 5th time ----------------
Here tomorrow! !! !!
---------------- 6th time ----------------
Here tomorrow! !! !!
---------------- 7th time ----------------
Here tomorrow! !! !!
---------------- 8th time ----------------
Here tomorrow! !! !!
---------------- 9th time ----------------
Here tomorrow! !! !!
---------------- 10th time ----------------
Here tomorrow! !! !!
Somehow, I managed to produce something that looks about right.
I've done the same thing with Chainer before, but it is definitely quicker and easier with Keras. The outputs are still short sentences, probably because the amount of training is small, so I'll keep checking the results while training it further.
Training on text that has been split into words (rather than characters) would probably require a different approach, so I will try that as well.