Was ich getan habe

Ich habe Tweets mit tweepy gespeichert und sie zum Herumspielen in word2vec gesteckt. Den Code habe ich mit Bezug auf verschiedene Quellen zusammenkopiert, daher gibt es nicht viel Neues. Eines der Ziele war „Shota − Mann + Frau = Lori".

Umgebung

CentOS7 Anaconda2-4.1.0

Voraussetzungen

tweepy, MeCab, mecab-python, gensim

Code

Bitte stören Sie sich nicht daran, dass der Code hier und da redundant ist.

`TwStream.py`


# -*- encoding:utf-8 -*-

import sys
import os
import re
import time
import tweepy

# Directory containing this script; stream.txt is written next to it.
HERE = os.path.abspath(os.path.dirname(__file__))

# Twitter API credentials: CK/CS are handed to tweepy.OAuthHandler and
# AT/AS to set_access_token in the __main__ block. They are empty strings
# here and have to be filled in before the script can authenticate.
CK = ''
CS = ''
AT = ''
AS = ''

class MyListener(tweepy.StreamListener):
	"""Stream listener that appends Japanese tweets to ``stream.txt``.

	Every status whose text contains at least one kana character is
	appended to ``HERE + '/stream.txt'`` (UTF-8 encoded) and echoed to
	stdout. The error/limit/timeout callbacks only log and sleep.
	"""

	# A tweet counts as Japanese when it contains hiragana (U+3041-U+3093)
	# or katakana (U+30A1-U+30F3). Written with escapes so the pattern does
	# not depend on the source file's encoding, and compiled once instead of
	# on every status.
	_KANA = re.compile(u'[\u3041-\u3093\u30a1-\u30f3]')

	def __init__(self):
		super(MyListener, self).__init__()

	def on_status(self, status):
		"""Store and print ``status.text`` if it looks Japanese.

		The text is stripped, encoded as UTF-8 and written followed by the
		record separator ``'\\n\\t\\n'``.
		"""
		try:
			tw = status.text.strip()
			# Japanese tweets only
			if self._KANA.search(tw) is not None:
				with open(HERE + '/stream.txt', 'a') as f:
					# Tweets contain no tab, so a tab on its own line
					# serves as the separator between records
					f.write(tw.encode('utf-8') + '\n\t\n')
				# Single-argument print(...) prints the same under Python 2
				print(tw.encode('utf-8'))
		except tweepy.TweepError as e:
			print(e.reason)
			# NOTE(review): code 88 is presumably the API rate-limit
			# error -- confirm against the Twitter error-code list
			if  'u\'code\': 88' in e.reason:
				print('wait 15 min')
				time.sleep(15*60)

	def on_error(self, status_code):
		"""Log an HTTP error status and pause before the stream goes on.

		A 420 status waits 15 minutes; every status then waits a further
		10 seconds. Returns None.
		"""
		# Same text the two-item print statement produced: 'error ' + ' ' + code
		print('error  %s' % (status_code,))
		if status_code == 420:
			print('wait 15 min')
			time.sleep(15*60)
		time.sleep(10)

	def on_limit(self, status):
		"""Log a limit notice and pause for 10 seconds."""
		print('limit')
		time.sleep(10)

	def on_timeout(self, status):
		"""Log a timeout and pause for 10 seconds."""
		print('timeout')
		time.sleep(10)

if __name__ == '__main__':
	# Reconnect forever; only Ctrl-C leaves the loop.
	while True:
		# Reset on every round: authentication or stream construction can
		# fail before `st` is assigned, and the handlers below must not
		# touch an unbound (or stale) stream object.
		st = None
		try:
			auth = tweepy.OAuthHandler(CK, CS)
			auth.set_access_token(AT, AS)
			print('auth set')

			st = tweepy.Stream(auth, MyListener())
			print('sampling')
			st.sample()

		except tweepy.TweepError as e:
			if st is not None:
				st.disconnect()
			print(e.reason)
			# NOTE(review): code 88 is presumably the API rate-limit
			# error -- confirm against the Twitter error-code list
			if 'u\'code\': 88' in e.reason:
				print('wait 15 min')
				time.sleep(15*60)
		except KeyboardInterrupt:
			if st is not None:
				st.disconnect()
			break
		except Exception as e:
			# Any other failure: drop the connection and retry. SystemExit
			# is deliberately no longer swallowed. The error is reported
			# and followed by a short pause so a persistent failure does
			# not turn into a silent busy loop against the API.
			if st is not None:
				st.disconnect()
			print('error: %r' % (e,))
			time.sleep(10)
			continue

Benennen Sie nun 'stream.txt' in 'raw.txt' um.

`W2V.py`


# -*- encoding:utf-8 -*-

import sys
import os
import MeCab
import gensim
from gensim.models import word2vec

# Directory containing this file
HERE = os.path.abspath(os.path.dirname(__file__))
sys.path.append(HERE)

# Self-made helper modules
from MeCabRW import *
from ProcStr import *

if __name__ == '__main__':
    # Load the saved word2vec model (or build it from raw.txt), then print
    # the words most similar to sum(pos) - sum(neg).
    MODEL = HERE + '/twitter.model'

    try:
        # Load the model if one exists
        print('loading model')
        model = word2vec.Word2Vec.load(MODEL)
        print('model loaded')
    except Exception:
        # Build it otherwise. `except Exception` rather than a bare except,
        # so Ctrl-C / SystemExit during loading abort the script instead of
        # kicking off a full rebuild.
        print('model not loaded')
        print('creating model')
        # mt = MeCab.Tagger('-Owakati') would work as well
        mt = mtWakatiNeo()
        avoid = ['RT']
        mecabParseRW(HERE + '/raw.txt', HERE + '/sep.txt', mt, avoid)

        # Read the word-segmented data
        corp = word2vec.Text8Corpus(HERE + '/sep.txt')
        # Allow analysis phrase by phrase
        phrcorp = gensim.models.Phrases(corp)
        model = word2vec.Word2Vec(phrcorp[corp], size=2000, min_count=2)
        model.save(MODEL)

        print('creating done')

    # NOTE(review): these query words look machine-translated; the model is
    # built from the tokens in raw.txt, so confirm they match its vocabulary.
    pos = [u'Shota', u'Frau']
    neg = [u'Mann']

    sim = model.most_similar(positive=pos, negative=neg)

    # Each print takes one pre-built string so the line parses under both
    # Python 2 and 3; the text is exactly what the multi-item Python 2
    # print statements emitted.
    print('+:  ' + ' '.join([i.encode('utf-8') for i in pos]))
    print('-:  ' + ' '.join([i.encode('utf-8') for i in neg]))
    print('')
    for i, j in sim:
        print('%s \t%s' % (i.encode('utf-8'), j))

`MeCabRW.py`


# -*- coding: utf-8 -*-

import re
import MeCab

def mtWakatiNeo(dicDir='/usr/lib64/mecab/dic/mecab-ipadic-neologd'):
    """Return a MeCab.Tagger configured with ``-O wakati`` and ``-d dicDir``.

    dicDir -- dictionary directory handed to MeCab's ``-d`` option. The
              default is the path this helper previously hard-coded, so
              existing zero-argument calls behave as before; pass another
              path when the dictionary is installed elsewhere.
    """
    opt = '-O wakati -d ' + dicDir
    return MeCab.Tagger(opt)

def mecabParseRW(pathIn, pathOut, mt, avoid=None):
    """Tokenize the tweets in *pathIn* with *mt* and write them to *pathOut*.

    The input file holds records separated by ``'\\n\\t\\n'``. URLs and
    ``@name`` mentions are replaced by a space, records containing any of
    the *avoid* substrings are dropped, and every remaining record is passed
    to ``mt.parse``. Results that are not ``str`` (``parse`` sometimes
    returns None) or that are byte strings which do not decode as UTF-8 are
    skipped.

    pathIn  -- path of the raw tweet file to read
    pathOut -- path the parsed text is written to (overwritten)
    mt      -- object with a ``parse(text)`` method, e.g. a MeCab.Tagger
    avoid   -- iterable of substrings; a record containing one is skipped
               (default: skip nothing)

    Returns the text written to *pathOut*: the parsed records joined with
    ``'\\n\\t\\n'``.
    """
    # None instead of a mutable [] default, which would be shared by calls
    if avoid is None:
        avoid = ()

    with open(pathIn, 'r') as f:
        sIn = f.read()

    # Remove URLs and @[id] mentions. Raw strings: the patterns are the
    # same, but the backslashes no longer rely on unknown string escapes.
    sIn = re.sub(r'https?://[A-Za-z0-9/:%#\$&\?\(\)~\.=\+\-]+', ' ', sIn)
    sIn = re.sub(r'@[A-Za-z0-9_]+', ' ', sIn)

    parsed = []
    for tweet in sIn.split('\n\t\n'):
        if any(word in tweet for word in avoid):
            continue
        p = mt.parse(tweet)  # sometimes comes back as None
        if not isinstance(p, str):
            continue
        # Python 2 `str` is a byte string: keep it only if it is valid
        # UTF-8. A Python 3 `str` is already text and needs no check.
        if isinstance(p, bytes):
            try:
                p.decode('utf-8')
            except UnicodeDecodeError:
                continue
        parsed.append(p)

    sOut = '\n\t\n'.join(parsed)
    with open(pathOut, 'w') as f:
        f.write(sOut)
    return sOut

Ergebnis

Ich habe ungefähr 60 MB Tweets gesammelt und das Skript damit ausgeführt.

loading model
model loaded
+:Shota Frau
-:Mann

Affe 0.833452105522
Macaron 0.832771897316
Lori 0.830695152283
Kompliment 0.828270435333
Sprechen_Auf 0.825944542885
Umehara 0.825801610947
Arisa 0.822319507599
Kleine Milch 0.818123817444
hundert_ 0.817329347134
Honda Tsubasa 0.816138386726

Sieht doch ganz gut aus, oder? Die umgekehrte Formel liefert ein ähnliches Ergebnis.

loading model
model loaded
+:Lori Mann
-:Frau

Lila 0.847893893719
hundert_ 0.824845731258
Shota 0.82099032402
Mach 0.81635427475
Tsumugi 0.813044965267
Prinzessin 0.812274694443
Parodie 0.809535622597
Mob 0.804774940014
Weiß 0.802413225174
Schwarzes Haar 0.800325155258

Verschiedenes

Eigentlich wollte ich es mit Windows + Python3 machen, aber wegen der Zeichenkodierung und des vorhandenen Materials ist es bei dieser Umgebung (CentOS + Python2) geblieben. Ich habe bisher nur Python3 verwendet, daher kann der Code stellenweise ungewöhnlich geschrieben sein.

Referenz

http://docs.tweepy.org/en/v3.5.0/streaming_how_to.html https://radimrehurek.com/gensim/models/phrases.html#module-gensim.models.phrases http://tjo.hatenablog.com/entry/2014/06/19/233949

Python2 + word2vec