While staying at home, I read ["Deep Learning from Scratch ② — Natural Language Processing"](https://www.amazon.co.jp/%E3%82%BC%E3%83%AD%E3%81%8B%E3%82%89%E4%BD%9C%E3%82%8BDeep-Learning-%E2%80%95%E8%87%AA%E7%84%B6%E8%A8%80%E8%AA%9E%E5%87%A6%E7%90%86%E7%B7%A8-%E6%96%8E%E8%97%A4-%E5%BA%B7%E6%AF%85/dp/4873118360). I managed to work through it to the end, but the book itself does not contain many application examples, so let's use the book's code to build a spam filter (a document classification model). This write-up is based on the Qiita article "Sentence classification model with an RNN built from scratch".
We use the "SMS Spam Collection Dataset" from Kaggle.
--Document classification with an LSTM
# coding: utf-8
from google.colab import drive
drive.mount('/content/drive')
import sys
sys.path.append('drive/My Drive/Colab Notebooks/spam_filter')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
%matplotlib inline
df = pd.read_csv('drive/My Drive/Colab Notebooks/spam_filter/dataset/spam.csv',encoding='latin-1')
df.head()
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1,inplace=True)
df.info()
sns.countplot(df.v1)
plt.xlabel('Label')
plt.title('Number of ham and spam messages')
X = df.v2
Y = df.v1
le = LabelEncoder()
Y = le.fit_transform(Y)
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X)
word_to_id = tok.word_index
X_ids = tok.texts_to_sequences(X)
X_ids_pad = sequence.pad_sequences(X_ids,maxlen=max_len)
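As a quick sanity check (my addition, not in the original article), we can look at what the preprocessing does to a single message: `texts_to_sequences` keeps only the `max_words` most frequent words, and `pad_sequences` left-pads every ID sequence with zeros up to `max_len`.
print(X[0])                # raw SMS text
print(X_ids[0])            # word IDs; words outside the top 1000 are dropped
print(X_ids_pad[0].shape)  # (150,)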
message_len = [len(v) for v in X_ids]
df['message_len']=message_len
plt.figure(figsize=(12, 8))
df[df.v1=='ham'].message_len.plot(bins=35, kind='hist', color='blue',
label='Ham messages', alpha=0.6)
df[df.v1=='spam'].message_len.plot(kind='hist', color='red',
label='Spam messages', alpha=0.6)
plt.legend()
plt.xlabel("Message Length")
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def softmax(x):
if x.ndim == 2:
x = x - x.max(axis=1, keepdims=True)
x = np.exp(x)
x /= x.sum(axis=1, keepdims=True)
elif x.ndim == 1:
x = x - np.max(x)
x = np.exp(x) / np.sum(np.exp(x))
return x
def cross_entropy_error(y, t):
if y.ndim == 1:
t = t.reshape(1, t.size)
y = y.reshape(1, y.size)
    # if the teacher data is a one-hot vector, convert it to the index of the correct label
if t.size == y.size:
t = t.argmax(axis=1)
batch_size = y.shape[0]
return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
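A tiny worked example (my addition) shows how `softmax` and `cross_entropy_error` fit together: for a batch of two samples, the result is the average negative log-probability assigned to the correct classes.
y_demo = softmax(np.array([[2.0, -0.2], [0.3, 0.5]]))
t_demo = np.array([0, 1])  # indices of the correct classes
print(cross_entropy_error(y_demo, t_demo))  # about 0.35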
class Affine:
def __init__(self, W, b):
self.params = [W, b]
self.grads = [np.zeros_like(W), np.zeros_like(b)]
self.x = None
def forward(self, x):
W, b = self.params
out = np.dot(x, W) + b
self.x = x
return out
def backward(self, dout):
W, b = self.params
dx = np.dot(dout, W.T)
dW = np.dot(self.x.T, dout)
db = np.sum(dout, axis=0)
self.grads[0][...] = dW
self.grads[1][...] = db
return dx
class Softmax:
def __init__(self):
self.params, self.grads = [], []
self.out = None
def forward(self, x):
self.out = softmax(x)
return self.out
def backward(self, dout):
dx = self.out * dout
sumdx = np.sum(dx, axis=1, keepdims=True)
dx -= self.out * sumdx
return dx
class SoftmaxWithLoss:
def __init__(self):
self.params, self.grads = [], []
        self.y = None  # output of softmax
        self.t = None  # teacher labels
def forward(self, x, t):
self.t = t
self.y = softmax(x)
        # if the teacher labels are one-hot vectors, convert them to indices of the correct class
if self.t.size == self.y.size:
self.t = self.t.argmax(axis=1)
loss = cross_entropy_error(self.y, self.t)
return loss
def backward(self, dout=1):
batch_size = self.t.shape[0]
dx = self.y.copy()
dx[np.arange(batch_size), self.t] -= 1
dx *= dout
dx = dx / batch_size
return dx
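The elegant part of SoftmaxWithLoss is its backward pass: the gradient is simply `(y - t_one_hot) / batch_size`, which a small check (my addition) makes visible.
swl = SoftmaxWithLoss()
swl.forward(np.array([[1.0, -1.0], [0.5, 2.0]]), np.array([0, 1]))
print(swl.backward())  # each row sums to 0; the correct class gets a negative gradient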
class Embedding:
def __init__(self, W):
self.params = [W]
self.grads = [np.zeros_like(W)]
self.idx = None
def forward(self, idx):
W, = self.params
self.idx = idx
out = W[idx]
return out
def backward(self, dout):
dW, = self.grads
dW[...] = 0
np.add.at(dW, self.idx, dout)
return None
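`Embedding.forward` is nothing more than a row lookup into the weight matrix, and `backward` scatter-adds the upstream gradient into exactly those rows; `np.add.at` makes repeated indices accumulate. A minimal demo (my addition):
W_demo = np.arange(12, dtype='f').reshape(4, 3)  # vocabulary of 4 words, 3-dim vectors
emb = Embedding(W_demo)
print(emb.forward(np.array([1, 1, 3])))  # rows 1, 1 and 3 of W_demo
emb.backward(np.ones((3, 3), dtype='f'))
print(emb.grads[0])                      # row 1 accumulated twice, row 3 once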
class TimeEmbedding:
def __init__(self, W):
self.params = [W]
self.grads = [np.zeros_like(W)]
self.layers = None
self.W = W
def forward(self, xs):
N, T = xs.shape
V, D = self.W.shape
out = np.empty((N, T, D), dtype='f')
self.layers = []
for t in range(T):
layer = Embedding(self.W)
out[:, t, :] = layer.forward(xs[:, t])
self.layers.append(layer)
return out
def backward(self, dout):
N, T, D = dout.shape
grad = 0
for t in range(T):
layer = self.layers[t]
layer.backward(dout[:, t, :])
grad += layer.grads[0]
self.grads[0][...] = grad
return None
class LSTM:
def __init__(self, Wx, Wh, b):
        '''
        Parameters
        ----------
        Wx: weight parameters for the input `x` (the four weight matrices are concatenated)
        Wh: weight parameters for the hidden state `h` (the four weight matrices are concatenated)
        b: biases (the four bias vectors are concatenated)
        '''
self.params = [Wx, Wh, b]
self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
self.cache = None
def forward(self, x, h_prev, c_prev):
Wx, Wh, b = self.params
N, H = h_prev.shape
A = np.dot(x, Wx) + np.dot(h_prev, Wh) + b
f = A[:, :H]
g = A[:, H:2*H]
i = A[:, 2*H:3*H]
o = A[:, 3*H:]
f = sigmoid(f)
g = np.tanh(g)
i = sigmoid(i)
o = sigmoid(o)
c_next = f * c_prev + g * i
h_next = o * np.tanh(c_next)
self.cache = (x, h_prev, c_prev, i, f, g, o, c_next)
return h_next, c_next
def backward(self, dh_next, dc_next):
Wx, Wh, b = self.params
x, h_prev, c_prev, i, f, g, o, c_next = self.cache
tanh_c_next = np.tanh(c_next)
ds = dc_next + (dh_next * o) * (1 - tanh_c_next ** 2)
dc_prev = ds * f
di = ds * g
df = ds * c_prev
do = dh_next * tanh_c_next
dg = ds * i
di *= i * (1 - i)
df *= f * (1 - f)
do *= o * (1 - o)
dg *= (1 - g ** 2)
dA = np.hstack((df, dg, di, do))
dWh = np.dot(h_prev.T, dA)
dWx = np.dot(x.T, dA)
db = dA.sum(axis=0)
self.grads[0][...] = dWx
self.grads[1][...] = dWh
self.grads[2][...] = db
dx = np.dot(dA, Wx.T)
dh_prev = np.dot(dA, Wh.T)
return dx, dh_prev, dc_prev
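A quick shape check of a single LSTM step (my addition): with batch size N=2, input dimension D=3 and hidden size H=4, the concatenated weights have shapes (D, 4H) and (H, 4H).
lstm_demo = LSTM(np.random.randn(3, 16), np.random.randn(4, 16), np.zeros(16))
h_demo, c_demo = lstm_demo.forward(np.random.randn(2, 3),
                                   np.zeros((2, 4)), np.zeros((2, 4)))
print(h_demo.shape, c_demo.shape)  # (2, 4) (2, 4)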
class TimeLSTM:
def __init__(self, Wx, Wh, b, stateful=False):
self.params = [Wx, Wh, b]
self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
self.layers = None
self.h, self.c = None, None
self.dh = None
self.stateful = stateful
def forward(self, xs):
Wx, Wh, b = self.params
N, T, D = xs.shape
H = Wh.shape[0]
self.layers = []
hs = np.empty((N, T, H), dtype='f')
if not self.stateful or self.h is None:
self.h = np.zeros((N, H), dtype='f')
if not self.stateful or self.c is None:
self.c = np.zeros((N, H), dtype='f')
for t in range(T):
layer = LSTM(*self.params)
self.h, self.c = layer.forward(xs[:, t, :], self.h, self.c)
hs[:, t, :] = self.h
self.layers.append(layer)
return hs
def backward(self, dhs):
Wx, Wh, b = self.params
N, T, H = dhs.shape
D = Wx.shape[0]
dxs = np.empty((N, T, D), dtype='f')
dh, dc = 0, 0
grads = [0, 0, 0]
for t in reversed(range(T)):
layer = self.layers[t]
dx, dh, dc = layer.backward(dhs[:, t, :] + dh, dc)
dxs[:, t, :] = dx
for i, grad in enumerate(layer.grads):
grads[i] += grad
for i, grad in enumerate(grads):
self.grads[i][...] = grad
self.dh = dh
return dxs
def set_state(self, h, c=None):
self.h, self.c = h, c
def reset_state(self):
self.h, self.c = None, None
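TimeLSTM simply runs this cell over all T time steps and stacks the hidden states, so an input of shape (N, T, D) becomes an output of shape (N, T, H). A quick check (my addition):
tlstm_demo = TimeLSTM(np.random.randn(3, 16).astype('f'),
                      np.random.randn(4, 16).astype('f'),
                      np.zeros(16, dtype='f'))
print(tlstm_demo.forward(np.random.randn(2, 5, 3).astype('f')).shape)  # (2, 5, 4)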
class Rnnlm():
def __init__(self, vocab_size=10000, wordvec_size=100, hidden_size=100, out_size=2):
V, D, H, O = vocab_size, wordvec_size, hidden_size, out_size
rn = np.random.randn
        # weight initialization
embed_W = (rn(V, D) / 100).astype('f')
lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
lstm_b = np.zeros(4 * H).astype('f')
affine_W = (rn(H, O) / np.sqrt(H)).astype('f')
affine_b = np.zeros(O).astype('f')
        # create the layers
self.embed_layer = TimeEmbedding(embed_W)
self.lstm_layer = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
self.affine_layer = Affine(affine_W, affine_b)
self.loss_layer = SoftmaxWithLoss()
self.softmax_layer = Softmax()
        # collect all weights and gradients in lists
self.params = self.embed_layer.params + self.lstm_layer.params + self.affine_layer.params
self.grads = self.embed_layer.grads + self.lstm_layer.grads + self.affine_layer.grads
def predict(self, xs):
self.reset_state()
xs = self.embed_layer.forward(xs)
hs = self.lstm_layer.forward(xs)
        xs = self.affine_layer.forward(hs[:,-1,:])  # affine transformation of the last hidden state
score = self.softmax_layer.forward(xs)
return score
def forward(self, xs, t):
xs = self.embed_layer.forward(xs)
hs = self.lstm_layer.forward(xs)
        x = self.affine_layer.forward(hs[:,-1,:])  # affine transformation of the last hidden state
loss = self.loss_layer.forward(x, t)
self.hs = hs
return loss
def backward(self, dout=1):
dout = self.loss_layer.backward(dout)
dhs = np.zeros_like(self.hs)
        dhs[:,-1,:] = self.affine_layer.backward(dout)  # feed the affine layer's backpropagated error into the last hidden state
dout = self.lstm_layer.backward(dhs)
dout = self.embed_layer.backward(dout)
return dout
def reset_state(self):
self.lstm_layer.reset_state()
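Before training, one more sanity check (my addition, not in the original article): with freshly initialized weights the classifier should be maximally uncertain, so the loss on a few samples should be close to ln(2) ≈ 0.693 for two classes.
model_check = Rnnlm(vocab_size=len(word_to_id) + 1, out_size=2)
print(model_check.forward(X_ids_pad[:4], Y[:4]))  # roughly 0.693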
class SGD:
'''
    Stochastic Gradient Descent
'''
def __init__(self, lr=0.01):
self.lr = lr
def update(self, params, grads):
for i in range(len(params)):
params[i] -= self.lr * grads[i]
X_train,X_test,Y_train,Y_test = train_test_split(X_ids_pad,Y,test_size=0.15)
--Hyperparameters and other settings
# hyperparameter settings
vocab_size = len(word_to_id)+1  # +1 because Keras word IDs start at 1 and ID 0 is reserved for padding
batch_size = 20
wordvec_size = 100
hidden_size = 100
out_size = 2  # binary classification: ham vs. spam
lr = 1.0
max_epoch = 10
data_size = len(X_train)
# variables used during training
max_iters = data_size // batch_size
# the data must be converted to NumPy arrays
x = np.array(X_train)
t = np.array(Y_train)
--Training
total_loss = 0
loss_count = 0
loss_list = []
# create the model
model = Rnnlm(vocab_size, wordvec_size, hidden_size, out_size)
optimizer = SGD(lr)
for epoch in range(max_epoch):
for iter in range(max_iters):
        # fetch a mini-batch
batch_x = x[iter*batch_size:(iter+1)*batch_size]
batch_t = t[iter*batch_size:(iter+1)*batch_size]
        # compute the gradients and update the parameters
loss = model.forward(batch_x, batch_t)
model.backward()
optimizer.update(model.params, model.grads)
total_loss += loss
loss_count += 1
avg_loss = total_loss / loss_count
print("| epoch %d | loss %.5f" % (epoch+1, avg_loss))
loss_list.append(float(avg_loss))
total_loss, loss_count = 0,0
x = np.arange(len(loss_list))
plt.plot(x, loss_list, label='train')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.show()
result = model.predict(X_test)
Y_pred = result.argmax(axis=1)
# calculate accuracy of class predictions
print('acc=',metrics.accuracy_score(Y_test, Y_pred))
--Confusion matrix
# print the confusion matrix
print(metrics.confusion_matrix(Y_test, Y_pred))
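The rows of the confusion matrix are the true labels (ham, spam) and the columns are the predictions, so the off-diagonal entries are the misclassified messages. Optionally (my addition), sklearn can also summarize precision and recall per class:
print(metrics.classification_report(Y_test, Y_pred, target_names=le.classes_))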
Building this tool through trial and error helped me deepen my understanding of the book. If you are reading Deep Learning from Scratch ②, I also recommend using its sample code to build some kind of app.
--Testing on self-made SMS messages
The first message is an invitation to watch a baseball game together. The second is spam I made up myself (no need to translate it). Perhaps surprisingly, the model classifies both of them correctly.
texts_add = ["I'd like to watch baseball game with you. I'm wating for your answer.",
"Do you want to meet new sex partners every night? Feel free to call 09077xx0721."
]
X_ids_add = tok.texts_to_sequences(texts_add)
X_ids_pad_add = sequence.pad_sequences(X_ids_add,maxlen=max_len)
result = model.predict(X_ids_pad_add)
Y_pred = result.argmax(axis=1)
print(Y_pred)
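Since LabelEncoder sorts the labels alphabetically, 0 means ham and 1 means spam; the predicted indices can also be mapped back to readable labels (my addition):
print(le.inverse_transform(Y_pred))  # e.g. ['ham' 'spam'] if both messages are classified correctly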