- Acquire information on people in Japan infected with the new coronavirus (COVID-19)
- Run morphological analysis on it with MeCab
- Visualize the characteristic words with wordcloud
config
import re
import os
### MeCab
# Part-of-speech IDs whose morphemes are kept; compared against MeCab's
# ``node.posid`` by the tokenizer.
# NOTE(review): the meaning of each ID depends on the pos-id definition of the
# dictionary in use (presumably mecab-ipadic) -- confirm before changing.
POS_LIST = [10, 11, 31, 32, 34]
POS_LIST.extend(range(36, 50))
POS_LIST.extend([59, 60, 62, 67])

# Words excluded from the result even when their part of speech is accepted.
# NOTE(review): several entries look like machine-translated Japanese
# (e.g. "Canる"); verify them against the original word list.
STOP_WORDS = ["To do", "Absent", "Become", "Already", "Shiyo", "Can", "Became", "Ku", "Finally", "is there", "May", "think", "today", "It", "this", "that", "which one", "Which", "NULL", "To be", "Nari", "Ah", "Canる", "I"]

# Matches strings made up solely of ASCII letters, digits, spaces and the
# characters . , * < >  (such tokens are skipped by the tokenizer).
# NOTE(review): "0-9" appears twice in the class; the second range may have
# been full-width digits before translation -- confirm.
RE_ALPHABET = re.compile(r"^[0-9a-zA-Z0-9 .,*<>]+$")

current_dir = os.getcwd()
# No trailing space in the file name: the image format is derived from the
# extension when the PNG is written, and ".png " would not be recognised.
OUTPUT_PNG_FILE = os.path.join(current_dir, "wordcloud.png")
(Omitted)
import MeCab
from os import path
from wordcloud import WordCloud
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import re
def create_mecab_list(
    text_list,
    dic_dir="/usr/local/lib/mecab/dic/mecab-ipadic-neologd",
):
    """Tokenize texts with MeCab and return the base forms worth visualizing.

    Each text is parsed into morphemes. A morpheme's base form is kept when
    it is longer than one character, does not match ``RE_ALPHABET``, is not
    in ``STOP_WORDS`` and its part-of-speech ID is in ``POS_LIST``.

    Args:
        text_list: Iterable of strings to analyze.
        dic_dir: Directory of the MeCab dictionary passed via ``-d``. The
            default is the macOS location used by the original code.

    Returns:
        A list of base-form strings, in order of appearance across all texts
        (duplicates are preserved).
    """
    mecab = MeCab.Tagger("-Ochasen -d " + dic_dir)
    # Kept from the original: an empty parse before parseToNode().
    # NOTE(review): presumably the usual workaround for parseToNode returning
    # broken surfaces in some mecab-python3 versions -- confirm it is needed.
    mecab.parse("")

    mecab_list = []
    for text in text_list:
        node = mecab.parseToNode(text)
        while node:
            # Feature layout: part of speech, POS subcategory 1-3,
            # conjugation type, conjugation form, base form, reading,
            # pronunciation. Index 6 is therefore the base form.
            features = node.feature.split(",")
            # Guard against entries with fewer fields instead of raising
            # IndexError; an empty base form is rejected by the length test.
            morpheme = features[6] if len(features) > 6 else ""
            if (
                len(morpheme) > 1
                and not RE_ALPHABET.match(morpheme)
                and morpheme not in STOP_WORDS
                and node.posid in POS_LIST
            ):
                mecab_list.append(morpheme)
            node = node.next
    return mecab_list
wordcloud
import MeCab
from os import path
from wordcloud import WordCloud
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import re
def create_wordcloud(
    morphemes,
    font_path="/System/Library/Fonts/Hiragino Maru Go ProN W4.ttc",
):
    """Render a word cloud from morphemes and save it to ``OUTPUT_PNG_FILE``.

    Args:
        morphemes: Either a single whitespace-separated string of words, or
            an iterable of word strings (which is joined with spaces first).
        font_path: Path to a font file able to draw the words. The default is
            a macOS font; on Ubuntu something like
            "/usr/share/fonts/truetype/takao-gothic/TakaoPGothic.ttf" can be
            passed instead.

    Returns:
        None. The image is written to ``OUTPUT_PNG_FILE``; the word cloud is
        also drawn on a new pyplot figure, which is left open.
    """
    # generate() works on text, so accept a list of morphemes as well.
    if not isinstance(morphemes, str):
        morphemes = " ".join(morphemes)

    wordcloud = WordCloud(
        background_color="whitesmoke",
        collocations=False,
        stopwords=set(STOP_WORDS),
        max_font_size=80,
        relative_scaling=.5,
        width=800,
        height=500,
        font_path=font_path,
    ).generate(morphemes)

    plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")
    wordcloud.to_file(OUTPUT_PNG_FILE)
- More of the infected people are "male" than "female" → "male" is drawn in a larger font than "female"
- Many of the infected are in age groups other than their "20s"
- "Mask" is important ...
- Ministry of Health, Labour and Welfare
  - Q&A about the new coronavirus: https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/kenkou_iryou/dengue_fever_qa_00001.html
  - Guidelines for consultation and medical examination regarding new coronavirus infections: https://www.mhlw.go.jp/content/10900000/000596905.pdf
Recommended Posts