[PYTHON] 100 Language Processing Knocks, Chapter 5: Dependency Parsing (first half)

A record of solving the problems in the first half of Chapter 5. The target file is neko.txt, as provided on the problem page.

Use CaboCha to parse the text of Natsume Soseki's novel "I Am a Cat" (neko.txt) and save the result to a file called neko.txt.cabocha. Use this file to implement programs that address the following questions.
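If neko.txt.cabocha has not been created yet, one way to produce it is to pipe neko.txt through the cabocha command-line tool. This is only a minimal sketch: it assumes the `cabocha` command is installed and on the PATH, and uses the `-f1` lattice format that the programs below expect.

# Sketch: generate neko.txt.cabocha by piping neko.txt through the cabocha CLI.
# Assumes `cabocha` is installed; `-f1` selects the lattice (morpheme-level) output.
import subprocess

with open("neko.txt") as src, open("neko.txt.cabocha", "w") as dst:
    subprocess.check_call(["cabocha", "-f1"], stdin=src, stdout=dst)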

40. Reading the dependency analysis result (morphemes)

Implement a class Morph that represents a morpheme. This class has the surface form (surface), base form (base), part of speech (pos), and part-of-speech subcategory 1 (pos1) as member variables. Then read the CaboCha analysis result (neko.txt.cabocha), represent each sentence as a list of Morph objects, and display the morpheme sequence of the third sentence.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

class Morph():
    def __init__(self, surface, base, pos, pos1):
        self.surface = surface
        self.base = base
        self.pos = pos
        self.pos1 = pos1

    def print_all(self):
        return self.surface + "\t" + self.base + ", " + self.pos + ", " + self.pos1

def read_morpheme(cabochafile):
    sentences = []
    sentence = []
    for line in cabochafile:
        if line == "EOS\n":
            # if len(sentence) > 0:
            #     sentences.append(sentence)
            sentences.append(sentence)
            sentence = []
        elif line[0] == "*":
            continue
        else:
            surface, other = line.split()
            others = other.split(",")
            base, pos, pos1 = others[6], others[0], others[1]
            morph = Morph(surface, base, pos, pos1)
            sentence.append(morph)
    return sentences

if __name__ == "__main__":
    f = open("neko.txt.cabocha", "r")
    sentences = read_morpheme(f)
    for morph in sentences[2]:
        print morph.print_all()
    f.close()
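For reference, each morpheme line in the -f1 lattice output is a tab-separated surface form followed by a comma-separated feature string; that is what the split calls in read_morpheme rely on. A tiny standalone illustration follows (the feature values are illustrative, not copied from neko.txt.cabocha):

# -*- coding: utf-8 -*-
# Illustrative morpheme line from CaboCha's -f1 (lattice) output:
#   surface<TAB>pos,pos1,pos2,pos3,conj_type,conj_form,base,reading,pronunciation
line = "吾輩\t名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ\n"
surface, other = line.split()
others = other.split(",")
print surface      # surface form -> 吾輩
print others[0]    # pos          -> 名詞
print others[1]    # pos1         -> 代名詞
print others[6]    # base form    -> 吾輩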

41. Reading the dependency analysis result (phrases / dependencies)

In addition to problem 40, implement the clause class Chunk. This class has a list of morphemes (Morph objects) (morphs), the index number of the destination clause (dst), and a list of index numbers of the source clauses (srcs) as member variables. Then read the CaboCha analysis result of the input text, represent each sentence as a list of Chunk objects, and display the text of each clause in the eighth sentence together with the index of its destination clause. Use the program created here for the remaining problems in Chapter 5.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem40


class Chunk():
    def __init__(self):
        self.morphs = []
        self.dst = -1
        self.srcs = []

    def __repr__(self):
        if self.morphs:
            # Join surface forms, skipping symbols (記号) such as punctuation.
            surfs = [morph.surface for morph in self.morphs if morph.pos != "記号"]
            return "".join(surfs)
        else:
            return ""

    def include_pos(self, pos):
        return pos in [morph.pos for morph in self.morphs]

    def morphs_of_pos(self, pos):
        return [morph for morph in self.morphs if morph.pos == pos]

    def morphs_of_pos1(self, pos1):
        return [morph for morph in self.morphs if morph.pos1 == pos1]


def read_chunk(cabochafile):
    sentences = []
    sentence = []
    for line in cabochafile:
        if line == "EOS\n":
            for idx, c in enumerate(sentence[:-1]):
                if c.dst != -1:
                    sentence[c.dst].srcs.append(idx)
            # if len(sentence) > 1:
                # sentences.append(sentence)
            sentences.append(sentence)
            sentence = []
        elif line[0] == "*":
            chunk = Chunk()
            chunk.dst = int(line.split()[2].strip("D"))
            sentence.append(chunk)
        else:
            surface, other = line.split()
            others = other.split(",")
            base, pos, pos1 = others[6], others[0], others[1]
            morph = problem40.Morph(surface, base, pos, pos1)
            sentence[-1].morphs.append(morph)
    return sentences


if __name__ == "__main__":
    f = open("neko.txt.cabocha", "r")
    sentences = read_chunk(f)
    for idx, chnk in enumerate(sentences[7]):
        surfaces = ""
        for mrph in chnk.morphs:
            surfaces += mrph.surface
        print "%d" % idx, surfaces, "=>", chnk.dst
    f.close()
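For reference, each chunk in the -f1 output is preceded by a header line starting with `*`, and the third field holds the destination index followed by a `D`; that is what `line.split()[2].strip("D")` extracts in read_chunk. A tiny standalone illustration (field values are illustrative):

# Illustrative chunk header line from CaboCha's -f1 output:
#   * <chunk index> <destination index>D <head/function word positions> <score>
header = "* 0 2D 0/1 1.816431\n"
dst = int(header.split()[2].strip("D"))
print dst   # -> 2 (this chunk depends on chunk 2; -1 means no destination)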

42. Displaying source and destination clauses

Extract the text of every source clause and its destination clause in tab-separated format. However, do not output symbols such as punctuation marks.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem41

def make_chunk_pair(sentence):
    pairs = []
    for chunk in sentence:
        if chunk.dst != -1:
            pairs.append((chunk, sentence[chunk.dst]))
    return pairs

if __name__ == "__main__":
    f = open("neko.txt.cabocha")
    sentences = problem41.read_chunk(f)
    pair_sentences = []
    for sentence in sentences:
        pair = make_chunk_pair(sentence)
        pair_sentences.append(pair)
    for sentence in pair_sentences:
        for pair in sentence:
            print "\t".join([str(chunk) for chunk in pair])
    f.close()

43. Extracting clauses containing nouns that relate to clauses containing verbs

When a clause containing a noun relates to a clause containing a verb, extract the pair in tab-separated format. However, do not output symbols such as punctuation marks.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem41
import problem42

def findNtoV(chunk_pair):
    flagN = False
    flagV = False
    if "noun" in [morph.pos for morph in chunk_pair[0].morphs]:
        flagN = True
    if "verb" in [morph.pos for morph in chunk_pair[1].morphs]:
        flagV = True
    return flagN and flagV

if __name__ == "__main__":
    f = open("neko.txt.cabocha", "r")
    sentences = problem41.read_chunk(f)
    pair_sentences = []
    for sentence in sentences:
        pair = problem42.make_chunk_pair(sentence)
        pair_sentences.append(pair)
    pairs_NtoV = []
    for pair_sentence in pair_sentences:
        for chunk_pair in pair_sentence:
            if findNtoV(chunk_pair):
                pairs_NtoV.append(chunk_pair)
    for pair_NtoV in pairs_NtoV:
        noun, verb = pair_NtoV
        print "%s\t%s" % (noun, verb)
    f.close()

44. Visualizing dependency trees

Visualize the dependency tree of a given sentence as a directed graph. For visualization, convert the dependency tree to the DOT language and use Graphviz. pydot can be used to visualize directed graphs directly from Python; a small rendering sketch follows the listing below.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem41
import problem42


def sentenceToDot(idx, sentence):
    head = "digraph sentence{0} ".format(idx)
    body_head = "{ graph [rankdir = LR]; "
    body = ""
    for chunk_pair in sentence:
        former, latter = chunk_pair
        body += ('"'+str(former)+'"->"'+str(latter)+'"; ')
    dotString = head + body_head + body + '}'
    return dotString


if __name__ == "__main__":
    f = open("neko.txt.cabocha", "r")
    sentences = problem41.read_chunk(f)
    pair_sentences = []
    for sentence in sentences:
        pair = problem42.make_chunk_pair(sentence)
        pair_sentences.append(pair)
    # dotStrings = []
    for idx, sentence in enumerate(pair_sentences):
        dotString = sentenceToDot(idx, sentence)
        print dotString
        # dotStrings.append(dotString)
    f.close()
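The listing above only prints the DOT strings; rendering them to image files can be done with pydot, as the problem statement suggests. This is a minimal sketch, assuming pydot and Graphviz are installed (graph_from_dot_data returns a list of graphs in recent pydot versions and a single graph in older ones):

import pydot

# Render one DOT string (e.g. the one built for sentences[7]) to a PNG file.
graphs = pydot.graph_from_dot_data(dotString)
graph = graphs[0] if isinstance(graphs, list) else graphs
graph.write_png("sentence.png")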
