A record of solving the problems in the first half of Chapter 5. The target file is neko.txt as shown on the web page.
Use CaboCha to parse the text (neko.txt) of Natsume Soseki's novel "I am a cat" and save the result in a file called neko.txt.cabocha. Use this file to implement a program that addresses the following questions.
Implement the class Morph, which represents a morpheme. This class has the surface form (surface), base form (base), part of speech (pos), and part-of-speech subcategory 1 (pos1) as member variables. Then read the CaboCha analysis result (neko.txt.cabocha), represent each sentence as a list of Morph objects, and display the morpheme sequence of the third sentence.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
class Morph(object):
    """A single morpheme taken from one token line of CaboCha output.

    Deriving from ``object`` makes this a new-style class on Python 2 as well.

    Attributes:
        surface: Surface form, i.e. the text as it appears in the sentence.
        base: Base (uninflected) form.
        pos: Part of speech.
        pos1: Part-of-speech subcategory 1.
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface
        self.base = base
        self.pos = pos
        self.pos1 = pos1

    def __repr__(self):
        """Return a constructor-like representation for debugging."""
        return "Morph(%r, %r, %r, %r)" % (
            self.surface, self.base, self.pos, self.pos1)

    def print_all(self):
        """Return (not print) the fields as ``surface<TAB>base, pos, pos1``."""
        return "%s\t%s, %s, %s" % (
            self.surface, self.base, self.pos, self.pos1)
def read_morpheme(cabochafile):
    """Parse CaboCha lattice output into sentences of Morph objects.

    Args:
        cabochafile: Iterable of text lines, for example an open file.

    Returns:
        A list with one entry per ``EOS`` line. Each entry is the list of
        Morph objects read since the previous ``EOS``. Empty sentences are
        kept so that sentence indices line up with the input.
    """
    sentences = []
    sentence = []
    for raw_line in cabochafile:
        # Tolerate "\r\n" endings and a final line without a newline.
        line = raw_line.rstrip("\r\n")
        if line == "EOS":
            sentences.append(sentence)
            sentence = []
            continue
        # Chunk headers are "*" followed by a space; a token whose surface is
        # a literal "*" is followed by a tab and must not be skipped.
        if not line or line.startswith("* "):
            continue
        if "\t" in line:
            # Split on the tab only, so whitespace surfaces (e.g. a full-width
            # space) survive instead of being swallowed by str.split().
            surface, feature = line.split("\t", 1)
        else:
            # Fall back to the first whitespace run for inputs without a tab.
            surface, feature = line.split(None, 1)
        fields = feature.split(",")
        # Feature columns used here: 0 = pos, 1 = pos1, 6 = base form.
        sentence.append(Morph(surface, fields[6], fields[0], fields[1]))
    return sentences
if __name__ == "__main__":
    # Problem 40: show every morpheme of the third sentence (index 2).
    # "with" guarantees the file is closed even if parsing raises.
    with open("neko.txt.cabocha", "r") as f:
        sentences = read_morpheme(f)
    for morph in sentences[2]:
        # A parenthesised single argument prints the same on Python 2 and 3.
        print(morph.print_all())
In addition to problem 40, implement the class Chunk, which represents a clause. This class has a list of morphemes (Morph objects) (morphs), the index number of the clause it depends on (dst), and a list of index numbers of the clauses that depend on it (srcs) as member variables. Then read the CaboCha analysis result of the input text, represent each sentence as a list of Chunk objects, and display the text and the dependency target of each clause in the eighth sentence. For the rest of the problems in Chapter 5, use the program created here.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem40
class Chunk(object):
    """A clause of a dependency-parsed sentence.

    Attributes:
        morphs: Morph objects that make up the clause, in order.
        dst: Index of the clause this one depends on (-1 by default).
        srcs: Indices of the clauses that depend on this one.
    """

    # Part-of-speech values treated as symbols when rendering the text.
    # The IPA dictionary tags symbols as "記号"; the English label "symbol"
    # is still accepted so already-translated data keeps working.
    SYMBOL_POS = ("記号", "symbol")

    def __init__(self):
        self.morphs = []
        self.dst = -1
        self.srcs = []

    def __repr__(self):
        """Return the clause text without symbol morphemes.

        Always returns a string: an empty clause gives "" rather than None,
        which would make ``str()``/``repr()`` raise TypeError.
        """
        return "".join(
            morph.surface for morph in self.morphs
            if morph.pos not in self.SYMBOL_POS)

    def include_pos(self, pos):
        """Return True if any morpheme has part of speech ``pos``."""
        return any(morph.pos == pos for morph in self.morphs)

    def morphs_of_pos(self, pos):
        """Return the morphemes whose part of speech equals ``pos``."""
        return [morph for morph in self.morphs if morph.pos == pos]

    def morphs_of_pos1(self, pos1):
        """Return the morphemes whose subcategory 1 equals ``pos1``."""
        return [morph for morph in self.morphs if morph.pos1 == pos1]
def read_chunk(cabochafile):
    """Parse CaboCha lattice output into sentences of Chunk objects.

    Args:
        cabochafile: Iterable of text lines, for example an open file.

    Returns:
        A list with one entry per ``EOS`` line. Each entry is the list of
        Chunk objects of that sentence, with ``morphs``, ``dst`` and ``srcs``
        filled in. Empty sentences are kept so that sentence indices line up
        with the input.
    """
    sentences = []
    sentence = []
    for raw_line in cabochafile:
        # Tolerate "\r\n" endings and a final line without a newline.
        line = raw_line.rstrip("\r\n")
        if line == "EOS":
            # Every clause of the sentence is known now, so register each
            # clause with its target. Clauses with dst == -1 have no target.
            for idx, chunk in enumerate(sentence):
                if chunk.dst != -1:
                    sentence[chunk.dst].srcs.append(idx)
            sentences.append(sentence)
            sentence = []
        elif not line:
            continue
        elif line.startswith("* "):
            # Header: third whitespace-separated field is the target index
            # followed by "D" (e.g. "2D", "-1D"). Requiring the space keeps a
            # token whose surface is a literal "*" out of this branch.
            chunk = Chunk()
            chunk.dst = int(line.split()[2].rstrip("D"))
            sentence.append(chunk)
        else:
            if "\t" in line:
                # Split on the tab only, so whitespace surfaces (e.g. a
                # full-width space) are not swallowed by str.split().
                surface, feature = line.split("\t", 1)
            else:
                # Fall back to the first whitespace run for tab-less input.
                surface, feature = line.split(None, 1)
            fields = feature.split(",")
            # Feature columns used here: 0 = pos, 1 = pos1, 6 = base form.
            morph = problem40.Morph(surface, fields[6], fields[0], fields[1])
            sentence[-1].morphs.append(morph)
    return sentences
if __name__ == "__main__":
    # Problem 41: show each clause of the eighth sentence (index 7) with the
    # index of the clause it depends on.
    # "with" guarantees the file is closed even if parsing raises.
    with open("neko.txt.cabocha", "r") as f:
        sentences = read_chunk(f)
    for idx, chnk in enumerate(sentences[7]):
        surfaces = "".join(mrph.surface for mrph in chnk.morphs)
        # One pre-formatted argument prints the same on Python 2 and 3.
        print("%d %s => %d" % (idx, surfaces, chnk.dst))
Extract the text of every source clause and of the clause it depends on, in tab-delimited format. However, do not output symbols such as punctuation marks.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem41
def make_chunk_pair(sentence):
    """Pair each clause that has a dependency target with that target.

    Args:
        sentence: Sequence of clause objects exposing an integer ``dst``.

    Returns:
        A list of ``(source, target)`` tuples in sentence order, where
        ``target`` is ``sentence[source.dst]``. Clauses whose ``dst`` is -1
        are left out.
    """
    return [
        (source, sentence[source.dst])
        for source in sentence
        if source.dst != -1
    ]
if __name__ == "__main__":
    # Problem 42: print "source<TAB>target" for every dependency.
    # "with" guarantees the file is closed even if parsing raises.
    with open("neko.txt.cabocha") as f:
        sentences = problem41.read_chunk(f)
    for sentence in sentences:
        for pair in make_chunk_pair(sentence):
            # One joined argument prints the same on Python 2 and 3.
            print("\t".join(str(chunk) for chunk in pair))
When a clause containing a noun depends on a clause containing a verb, extract the pair in tab-delimited format. However, do not output symbols such as punctuation marks.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem41
import problem42
def findNtoV(chunk_pair):
    """Return True when a noun clause depends on a verb clause.

    Args:
        chunk_pair: ``(source, target)`` pair of clause objects, each
            exposing ``morphs`` whose items have a ``pos`` attribute.

    Returns:
        True if the source holds at least one noun morpheme and the target
        holds at least one verb morpheme, otherwise False.
    """
    # The IPA dictionary tags nouns as "名詞" and verbs as "動詞". The English
    # labels are still accepted so already-translated data keeps working.
    noun_tags = ("名詞", "noun")
    verb_tags = ("動詞", "verb")
    has_noun = any(morph.pos in noun_tags for morph in chunk_pair[0].morphs)
    has_verb = any(morph.pos in verb_tags for morph in chunk_pair[1].morphs)
    return has_noun and has_verb
if __name__ == "__main__":
    # Problem 43: print "noun clause<TAB>verb clause" for every dependency
    # that findNtoV accepts.
    # "with" guarantees the file is closed even if parsing raises.
    with open("neko.txt.cabocha", "r") as f:
        sentences = problem41.read_chunk(f)
    for sentence in sentences:
        for chunk_pair in problem42.make_chunk_pair(sentence):
            if findNtoV(chunk_pair):
                noun, verb = chunk_pair
                # One pre-formatted argument prints the same on Python 2 and 3.
                print("%s\t%s" % (noun, verb))
Visualize the dependency tree of a given sentence as a directed graph. For visualization, convert the dependency tree to DOT language and use Graphviz. Also, to visualize directed graphs directly from Python, use pydot.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem41
import problem42
def sentenceToDot(idx, sentence):
    """Render the dependency pairs of one sentence as a DOT digraph string.

    Args:
        idx: Sentence number, used in the graph name ``sentence<idx>``.
        sentence: Iterable of ``(former, latter)`` pairs; ``str()`` of each
            item becomes a node name and the edge runs former -> latter.

    Returns:
        A single-line DOT document with a left-to-right layout.
    """
    def quote(node):
        # Escape backslashes and double quotes so arbitrary clause text
        # cannot terminate the DOT quoted string early.
        text = str(node).replace("\\", "\\\\").replace('"', '\\"')
        return '"' + text + '"'

    edges = "".join(
        "{0}->{1}; ".format(quote(former), quote(latter))
        for former, latter in sentence)
    head = "digraph sentence{0} ".format(idx)
    return head + "{ graph [rankdir = LR]; " + edges + "}"
if __name__ == "__main__":
    # Problem 44: print one DOT digraph per sentence.
    # "with" guarantees the file is closed even if parsing raises.
    with open("neko.txt.cabocha", "r") as f:
        sentences = problem41.read_chunk(f)
    for idx, sentence in enumerate(sentences):
        pairs = problem42.make_chunk_pair(sentence)
        # A single argument prints the same on Python 2 and 3.
        print(sentenceToDot(idx, pairs))
Recommended Posts