This example uses the livedoor news corpus. The environment was prepared with the following Dockerfile.
Dockerfile
# Base image; presumably ships Python together with the MeCab bindings (not verifiable from this file).
FROM shwld/mecab-python
# Directory that holds the corpus; the archive is downloaded and extracted here.
WORKDIR /usr/data
# Download the ldcc-20140209 corpus archive and unpack it in place.
# NOTE(review): the script reads ../data/text, so the archive presumably unpacks into a text/ directory - confirm.
RUN wget http://www.rondhuit.com/download/ldcc-20140209.tar.gz \
&& tar xvfz ldcc-20140209.tar.gz
# Working directory for the script, which reaches the corpus through the relative path ../data/text.
WORKDIR /usr/src
Use gensim.
from os import listdir, path
import MeCab
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
# Train a Doc2Vec model over every document found under ../data/text and
# query it for the documents most similar to one sample article.

# MeCab tagger in wakati mode (tokens separated by spaces), using the
# NEologd dictionary installed at the given path.
mecab = MeCab.Tagger('-Owakati -d /usr/lib/mecab/dic/mecab-ipadic-neologd')

# Every entry directly under ../data/text whose name does not end in .txt is
# treated as a directory of documents.
dirs = [
    {'key': i, 'label': path.join('../data/text', x)}
    for i, x in enumerate(listdir('../data/text'))
    if not x.endswith('.txt')
]

# One [document path, containing directory] pair per file in those directories.
dir_docs = [
    [path.join(x['label'], y), x['label']]
    for x in dirs
    for y in listdir(x['label'])
]

training_docs = []
for doc_path, _doc_dir in dir_docs:
    words = []
    # Use a context manager so each file handle is closed promptly.
    with open(doc_path, 'r') as doc_file:
        for line in doc_file:
            # Lines keep their trailing newline, so test for blank content
            # rather than comparing against the empty string.
            if not line.strip():
                continue
            # The wakati output is whitespace-separated; split it so that the
            # model receives a list of tokens instead of one long string
            # (a string would be iterated character by character).
            words.extend(mecab.parse(line).split())
    # Each document is tagged with its own path, so it can be looked up by
    # that path after training.
    training_docs.append(LabeledSentence(words=words, tags=[doc_path]))

model = Doc2Vec(documents=training_docs, min_count=1, dm=0)

# Print the sample article, then the documents ranked most similar to it.
with open('../data/text/dokujo-tsushin/dokujo-tsushin-4842348.txt') as sample_file:
    print(sample_file.read())
print(model.docvecs.most_similar('../data/text/dokujo-tsushin/dokujo-tsushin-4842348.txt'))
You will get a result like this:
[('../data/text/dokujo-tsushin/dokujo-tsushin-4887920.txt', 0.8448764085769653), ('../data/text/dokujo-tsushin/dokujo-tsushin-6083306.txt', 0.7008811831474304), ('../data/text/livedoor-homme/livedoor-homme-5297934.txt', 0.6486650109291077), ('../data/text/dokujo-tsushin/dokujo-tsushin-4799908.txt', 0.6451865434646606), ('../data/text/dokujo-tsushin/dokujo-tsushin-5927658.txt', 0.6374314427375793), ('../data/text/dokujo-tsushin/dokujo-tsushin-6443618.txt', 0.627821147441864), ('../data/text/dokujo-tsushin/dokujo-tsushin-5453372.txt', 0.6207228899002075), ('../data/text/dokujo-tsushin/dokujo-tsushin-6661458.txt', 0.6123473644256592), ('../data/text/dokujo-tsushin/dokujo-tsushin-5665838.txt', 0.6056432723999023), ('../data/text/dokujo-tsushin/dokujo-tsushin-6624494.txt', 0.6007457971572876)]
The results are output in descending order of similarity, so I looked at the most similar document, and its content was indeed essentially the same.
You can look at its contents like this:
# Print the document that ranked first in the similarity result; the context
# manager ensures the file handle is closed after reading.
with open('../data/text/dokujo-tsushin/dokujo-tsushin-4887920.txt') as similar_file:
    print(similar_file.read())