This article generates a MeCab user dictionary from the "Nico Nico Encyclopedia Data" (provided by Future Search Brazil Co., Ltd.), which is distributed on the National Institute of Informatics website, and then applies it. This method may be useful for text mining for research purposes.
http://www.nii.ac.jp/cscenter/idr/nico/nicopedia-apply.html
nc2mecab.py
# -*- encoding: utf-8 -*-
import os
import csv
import re
def _word_cost(word):
    """Return the dictionary cost written for *word*.

    The cost decreases as the word gets longer (length is counted in
    characters) and is clamped so it never drops below -32768.
    """
    return int(max(-32768.0, 6000 - 200 * (len(word) ** 1.3)))


def main():
    """Convert the CSV files in the ``head`` folder into ``ncnc.csv``.

    Every file directly inside ``head`` is read as UTF-8 CSV. For each row
    whose fourth column equals ``'a'``, the second column is cleaned (see
    the pattern below), lower-cased and, if anything is left, written as one
    output row. The third column is copied into two output fields
    (presumably reading and pronunciation -- confirm against the MeCab
    dictionary format in use).

    Rows with fewer than four columns are skipped instead of raising
    ``IndexError``.
    """
    # Input folder name
    pth = 'head'
    # Output file name
    wtnme = 'ncnc.csv'
    # Parts removed from a word before it is written:
    #   * a word that is nothing but a date tag such as "1月1日"
    #     (\d{1,2} is a quantifier; the former \d[1,2] was a character class)
    #   * a trailing parenthesised genre name, written with either ASCII
    #     "(...)" or full-width "（...）" parentheses
    # The previous pattern contained an empty group "()" where the
    # full-width parenthesis belonged, which made it match (and erase)
    # every non-empty word.
    rmvptn = re.compile(r'(^\d{1,2}月\d{1,2}日$)|([(（].+[)）]$)')

    # The csv module needs text-mode files opened with newline='' on
    # Python 3; binary mode ('rb'/'wb') raises an error there.
    with open(wtnme, 'w', encoding='utf-8', newline='') as wtfh:
        wt = csv.writer(wtfh)
        # Sorted so the output order does not depend on the file system.
        for fnme in sorted(os.listdir(pth)):
            src = os.path.join(pth, fnme)
            if not os.path.isfile(src):
                # Sub-directories cannot be opened as CSV files.
                continue
            with open(src, encoding='utf-8', newline='') as rdfh:
                for row in csv.reader(rdfh):
                    # Only rows flagged 'a' in the fourth column are used;
                    # blank or truncated rows are ignored.
                    if len(row) < 4 or row[3] != 'a':
                        continue
                    wrd = rmvptn.sub('', row[1]).lower()
                    if not wrd:
                        continue
                    wt.writerow([
                        wrd, '0', '0', _word_cost(wrd),
                        'noun', 'General', '*', '*', '*', '*',
                        wrd, row[2], row[2], 'Nico Nico Pedia',
                    ])
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
python nc2mecab.py
Using the output CSV, I followed the "Adding to a user dictionary" procedure described in MeCab's "How to add words" documentation. The dictionary generation command I used is as follows.
/usr/local/libexec/mecab/mecab-dict-index -d/usr/local/lib/mecab/dic/ipadic -u ncnc.dic -f utf-8 -t utf-8 ncnc.csv
vocaloid and love live! Is the taste of Nico Kitchen.
vocaloid noun, general, *, *, *, *, vocaloid, vocaloid, vocaloid, Nico Nico Pedia And filler, *, *, *, *, *, and, to, to lovelive! Noun, general, *, *, *, *, love live! , Love Live, Love Live, Nico Nico Pedia Is a particle, a particle, *, *, *, *, is, ha, wa Nico Kitchen Noun, General, *, *, *, *, Nico Kitchen, Nico Chu, Nico Chu, Nico Nico Pedia Particles, adnominal forms, *, *, *, *, of, no, no Taste Noun, General, *, *, *, *, Taste, Tashinami, Tashinami .. Symbols, Kuten, *, *, *, * ,. ,. ,. EOS
Recommended Posts