[PYTHON] 100 natural language processing knocks Chapter 4 Morphological analysis (first half)

A record of solving the problems in the first half of Chapter 4. The target file is neko.txt as shown on the web page.

Use MeCab to morphologically analyze the text (neko.txt) of Natsume Soseki's novel "I am a cat" and save the result in a file called neko.txt.mecab. Use this file to implement a program that addresses the following questions. For problems 37, 38, and 39, use matplotlib or Gnuplot.

30. Reading morphological analysis results

Implement a program that reads the morphological analysis result (neko.txt.mecab). Each morpheme is stored as a mapping type with the keys surface form (surface), base form (base), part of speech (pos), and part-of-speech subcategory 1 (pos1), and one sentence is represented as a list of morphemes (mapping types). For the rest of the problems in Chapter 4, use the program created here.

# -*- coding: utf-8 -
__author__ = 'todoroki'

def mecab_reader(mecabfile):
    """Parse MeCab output into a list of sentences.

    Each morpheme line is expected to have the form
    ``surface<TAB>feature0,feature1,...``; a line consisting of ``EOS``
    terminates the current sentence.

    Args:
        mecabfile: An iterable of text lines (e.g. an open file object).

    Returns:
        A list of sentences.  Each sentence is a non-empty list of dicts
        with the keys ``'surface'``, ``'base'`` (feature 6), ``'pos'``
        (feature 0) and ``'pos1'`` (feature 1).  A feature that is absent
        from the line is reported as ``'*'``.
    """
    sentences = []
    sentence = []
    for line in mecabfile:
        # Strip the line terminator up front so that the last feature never
        # carries a trailing newline and "EOS" is recognised with "\n",
        # "\r\n" or no terminator at all.
        line = line.rstrip("\r\n")
        if line == "EOS":
            if sentence:
                sentences.append(sentence)
            sentence = []
            continue

        # Split on the first tab only; a line without a tab (for example a
        # blank line) carries no morpheme and is skipped instead of raising.
        surface, tab, feature_field = line.partition("\t")
        if not tab:
            continue

        features = feature_field.split(",")
        sentence.append({
            'surface': surface,
            'base': features[6] if len(features) > 6 else '*',
            'pos': features[0],
            'pos1': features[1] if len(features) > 1 else '*',
        })

    # Keep morphemes that follow the last EOS marker (truncated input).
    if sentence:
        sentences.append(sentence)
    return sentences

if __name__ == '__main__':
    # Script entry point: parse the MeCab output and dump one sentence
    # (a list of morpheme dicts) per line of the output file.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_dic.txt'

    # The context managers close both files even if parsing or writing
    # raises part-way through.
    with open(inputfile, 'r') as f, open(outputfile, 'w') as g:
        sentences = mecab_reader(f)
        for s in sentences:
            # str(s) renders non-ASCII text as escape sequences; decoding
            # with "string-escape" turns them back into readable Japanese.
            # NOTE(review): str.decode is a Python 2 idiom.
            g.write(str(s).decode("string-escape") + "\n")

If a dictionary s that contains Japanese text is printed with a plain `print s`, the Japanese is shown as escape sequences instead of readable characters. Converting it with `str(s).decode("string-escape")` makes the Japanese display correctly.

31. Verb

Extract all the surface forms of the verb.

# -*- coding: utf-8 -
__author__ = 'todoroki'

import problem30

def extract_verb(sentences, verb_tags=('動詞', 'verb')):
    """Return the surface forms of all verbs, in order of appearance.

    Args:
        sentences: A list of sentences, each a list of morpheme dicts with
            at least the keys ``'pos'`` and ``'surface'``.
        verb_tags: Tuple of ``'pos'`` values that count as a verb.  The
            default accepts the tag written by MeCab's IPA dictionary as
            well as the English literal ``'verb'`` this function used to
            compare against, which never matches real MeCab output.

    Returns:
        A list of surface strings.
    """
    # Exact tuple membership (not substring matching), so the auxiliary-verb
    # tag, which merely contains the verb tag, is not picked up.
    return [
        morpheme['surface']
        for sentence in sentences
        for morpheme in sentence
        if morpheme['pos'] in verb_tags
    ]

if __name__ == "__main__":
    # Script entry point: write every verb surface form, one per line.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_verb.txt'

    # The context managers close both files even if an error occurs while
    # reading or writing.
    with open(inputfile, "r") as f, open(outputfile, "w") as g:
        sentences = problem30.mecab_reader(f)
        for verb in extract_verb(sentences):
            g.write(verb + '\n')

32. The base form of the verb

Extract all the original forms of the verb.

# -*- coding: utf-8 -
__author__ = 'todoroki'

import problem30

def extract_verb_base(sentences, verb_tags=('動詞', 'verb')):
    """Return the base (dictionary) forms of all verbs, in order of appearance.

    Args:
        sentences: A list of sentences, each a list of morpheme dicts with
            at least the keys ``'pos'`` and ``'base'``.
        verb_tags: Tuple of ``'pos'`` values that count as a verb.  The
            default accepts the tag written by MeCab's IPA dictionary as
            well as the English literal ``'verb'`` this function used to
            compare against, which never matches real MeCab output.

    Returns:
        A list of base-form strings (duplicates are kept).
    """
    # Exact tuple membership (not substring matching), so the auxiliary-verb
    # tag, which merely contains the verb tag, is not picked up.
    return [
        morpheme['base']
        for sentence in sentences
        for morpheme in sentence
        if morpheme['pos'] in verb_tags
    ]

if __name__ == "__main__":
    # Script entry point: write every verb base form, one per line.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_verb_base.txt'

    # The context managers close both files even if an error occurs while
    # reading or writing.
    with open(inputfile, "r") as f, open(outputfile, "w") as g:
        sentences = problem30.mecab_reader(f)
        for verb in extract_verb_base(sentences):
            g.write(verb + '\n')

33. Sa-hen nouns

Extract all nouns of the sa-hen connection type (nouns that become verbs when followed by "suru").

# -*- coding: utf-8 -
__author__ = 'todoroki'

import problem30

def extract_sahen(sentences, sahen_tags=('サ変接続', 'Change connection')):
    """Return the surface forms of all sa-hen connection morphemes.

    Args:
        sentences: A list of sentences, each a list of morpheme dicts with
            at least the keys ``'pos1'`` and ``'surface'``.
        sahen_tags: Tuple of ``'pos1'`` values that mark a sa-hen
            connection.  The default accepts the tag written by MeCab's IPA
            dictionary as well as the English literal ``'Change connection'``
            this function used to compare against, which never matches real
            MeCab output.

    Returns:
        A list of surface strings, in order of appearance.
    """
    # Only pos1 is inspected, exactly as before; pos is not checked.
    return [
        morpheme['surface']
        for sentence in sentences
        for morpheme in sentence
        if morpheme['pos1'] in sahen_tags
    ]

if __name__ == "__main__":
    # Script entry point: write every sa-hen connection noun, one per line.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_sahen.txt'

    # The context managers close both files even if an error occurs while
    # reading or writing.
    with open(inputfile, "r") as f, open(outputfile, "w") as g:
        sentences = problem30.mecab_reader(f)
        for sahen in extract_sahen(sentences):
            g.write(sahen + '\n')

34. "B of A"

Extract the noun phrases in which two nouns are connected by the particle "no" (の).

# -*- coding: utf-8 -
__author__ = 'todoroki'

import problem30

def extract_AofB(sentences, noun_tags=('名詞', 'noun'), particles=('の', 'of')):
    """Return every "noun + connecting particle + noun" triple.

    Args:
        sentences: A list of sentences, each a list of morpheme dicts with
            at least the keys ``'pos'`` and ``'surface'``.
        noun_tags: Tuple of ``'pos'`` values that count as a noun.
        particles: Tuple of surface forms that count as the connecting
            particle.  Both defaults accept the value found in MeCab (IPA
            dictionary) output as well as the English literal this function
            used to compare against, which never matches real MeCab output.

    Returns:
        A list of three-element lists of surface strings, one per match, in
        order of appearance.  Overlapping matches are all reported.
    """
    res = []
    for sentence in sentences:
        # Zipping the sentence with itself shifted by one and two positions
        # visits every window of three consecutive morphemes, including the
        # last one, which the former "len(sentence) - 3" bound skipped.
        # Sentences shorter than three morphemes produce no window.
        for first, middle, last in zip(sentence, sentence[1:], sentence[2:]):
            if (first['pos'] in noun_tags
                    and middle['surface'] in particles
                    and last['pos'] in noun_tags):
                # Store a real list so the result can be iterated more
                # than once (a generator is exhausted after one pass).
                res.append(
                    [first['surface'], middle['surface'], last['surface']]
                )
    return res

if __name__ == "__main__":
    # Script entry point: write every "A no B" noun phrase, one per line.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_AofB.txt'

    # The context managers close both files even if an error occurs while
    # reading or writing.
    with open(inputfile, "r") as f, open(outputfile, "w") as g:
        sentences = problem30.mecab_reader(f)
        for r in extract_AofB(sentences):
            # Each result is a sequence of three surface strings; join them
            # into a single phrase.
            g.write("".join(r) + '\n')

Recommended Posts