A record of solving the problems in the second half of Chapter 4. The target file is neko.txt as shown on the web page.
Use MeCab to morphologically analyze the text (neko.txt) of Natsume Soseki's novel "I am a cat" and save the result in a file called neko.txt.mecab. Use this file to implement a program that addresses the following questions. For problems 37, 38, and 39, use matplotlib or Gnuplot.
Extract noun concatenations (runs of nouns that appear consecutively), using the longest match.
# -*- coding: utf-8 -
__author__ = 'todoroki'
import problem30
def extract_seqs(sentences):
    """Collect maximal runs of two or more consecutive nouns.

    Args:
        sentences: iterable of sentences, each an iterable of morpheme
            mappings that provide the 'pos' and 'surface' keys.

    Returns:
        A list of runs. Each run is the list of surface forms of nouns
        that appeared back to back within a single sentence.
    """
    seqs = []
    for sentence in sentences:
        # The run is tracked per sentence so that nouns at the end of one
        # sentence are never glued to nouns at the start of the next.
        seq = []
        for morpheme in sentence:
            if morpheme['pos'] == "noun":
                seq.append(morpheme['surface'])
                continue
            # A non-noun ends the current run; a lone noun is not a
            # concatenation and is discarded.
            if len(seq) > 1:
                seqs.append(seq)
            seq = []
        # Flush a run that extends to the very end of the sentence;
        # without this it would be silently dropped.
        if len(seq) > 1:
            seqs.append(seq)
    return seqs
if __name__ == "__main__":
    # Script entry point: parse the MeCab output with problem30.mecab_reader
    # and write each extracted noun concatenation on its own line.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_sequences.txt'
    # The with-statement closes both files even if reading, extraction or
    # writing raises part-way through.
    with open(inputfile, "r") as f, open(outputfile, "w") as g:
        sentences = problem30.mecab_reader(f)
        for sequence in extract_seqs(sentences):
            # A sequence is a list of surface forms; join them into one word.
            g.write("".join(sequence) + '\n')
Find the words that appear in the text and how often each one appears, and list them in descending order of frequency.
# -*- coding: utf-8 -
__author__ = 'todoroki'
import problem30
from collections import Counter
def count_words(sentences):
    """Tally how often each surface form occurs across all sentences.

    Args:
        sentences: iterable of sentences, each an iterable of morpheme
            mappings that provide the 'surface' key.

    Returns:
        A collections.Counter mapping surface form to occurrence count.
    """
    tally = Counter()
    for sentence in sentences:
        # Feed the surface forms of one sentence at a time into the tally.
        tally.update(morpheme['surface'] for morpheme in sentence)
    return tally
if __name__ == "__main__":
    # Script entry point: count every surface form in the MeCab output and
    # write "<word> <count>" lines, most frequent first.
    inputfile = "neko.txt.mecab"
    outputfile = "neko.mecab_words.txt"
    # The with-statement closes both files even if an error occurs midway.
    with open(inputfile, 'r') as f, open(outputfile, 'w') as g:
        sentences = problem30.mecab_reader(f)
        counter = count_words(sentences)
        # most_common() with no argument yields every entry in descending
        # order of count.
        for word, count in counter.most_common():
            g.write("%s %s\n" % (word, count))
Display the 10 words that appear frequently and their frequency of appearance in a graph (for example, a bar graph).
# -*- coding: utf-8 -
__author__ = 'todoroki'
import problem30
import problem36
import matplotlib.pyplot as plt
def plot_words(words, counts, file):
    """Save a bar chart of word counts to *file*.

    Args:
        words: tick labels, one per bar.
        counts: bar heights, in the same order as ``words``.
        file: destination handed to ``plt.savefig``.
    """
    from matplotlib.font_manager import FontProperties
    # Hard-coded absolute font path used for the tick labels.
    # NOTE(review): looks machine-specific; confirm it exists where this runs.
    fp = FontProperties(fname='/usr/local/Cellar/ricty/3.2.4/share/fonts/Ricty-Regular.ttf')
    # Bar positions follow the data instead of a fixed 10, so the chart also
    # works when fewer (or more) than ten words are supplied.
    positions = range(len(counts))
    plt.bar(positions, counts, align='center')
    plt.xticks(positions, words, fontproperties=fp)
    plt.savefig(file)
if __name__ == '__main__':
    # Script entry point: chart the ten most frequent words as a bar graph.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_words.png'
    # The input is fully consumed by count_words inside the with-block, so
    # the file can be closed (even on error) before plotting starts.
    with open(inputfile, 'r') as f:
        sentences = problem30.mecab_reader(f)
        counter = problem36.count_words(sentences)
    words = []
    counts = []
    for word, count in counter.most_common(10):
        # Byte strings are decoded from UTF-8 for use as tick labels; words
        # that are already text are passed through unchanged.
        words.append(word.decode('utf8') if isinstance(word, bytes) else word)
        counts.append(count)
    plot_words(words, counts, outputfile)
Draw a histogram of word frequencies as a bar graph: the horizontal axis is the frequency of occurrence, and the vertical axis is the number of distinct words that occur with that frequency.
# -*- coding: utf-8 -
__author__ = 'todoroki'
import problem30
import problem36
import pandas as pd
def plot_words_hist(freq, file):
    """Draw a histogram of *freq* and save the resulting figure to *file*.

    Args:
        freq: object whose ``hist()`` returns something exposing
            ``get_figure()``; the script passes a pandas Series of counts.
        file: destination handed to the figure's ``savefig``.
    """
    axes = freq.hist()
    # The figure that owns the histogram axes is the one written to disk.
    axes.get_figure().savefig(file)
if __name__ == '__main__':
    # Script entry point: plot a histogram of per-word occurrence counts.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_words_hist.png'
    # The with-statement guarantees the input file is closed; the original
    # never closed it.
    with open(inputfile, 'r') as f:
        sentences = problem30.mecab_reader(f)
        counter = problem36.count_words(sentences)
    # One Series entry per distinct word: index is the word, value its count.
    freq = pd.Series(list(counter.values()), index=list(counter.keys()))
    plot_words_hist(freq, outputfile)
Plot a log-log graph with the frequency rank of each word on the horizontal axis and its frequency of occurrence on the vertical axis.
# -*- coding: utf-8 -
__author__ = 'todoroki'
import problem30
import problem36
import matplotlib.pyplot as plt
def plot_words_hist_log(counter, file):
    """Save a log-log plot of word frequencies against their rank.

    Args:
        counter: mapping of word to occurrence count (only the values and
            the number of entries are used).
        file: destination handed to ``plt.savefig``.
    """
    # Frequencies in descending order; the i-th entry has rank i (1-based).
    frequencies = sorted(counter.values(), reverse=True)
    ranks = range(1, len(frequencies) + 1)
    plt.figure()
    plt.xscale('log')
    plt.yscale('log')
    # NOTE(review): frequency is on the x-axis and rank on the y-axis;
    # confirm this is the intended orientation.
    plt.plot(frequencies, ranks)
    plt.savefig(file)
if __name__ == '__main__':
    # Script entry point: plot word frequency against rank on log-log axes.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_words_hist_log.png'
    # The input is fully consumed by count_words inside the with-block, so
    # the file is closed (even on error) before plotting starts.
    with open(inputfile, 'r') as f:
        sentences = problem30.mecab_reader(f)
        counter = problem36.count_words(sentences)
    plot_words_hist_log(counter, outputfile)
Recommended Posts