"""
## 45. Extraction of verb case patterns ([Permalink](https://nlp100.github.io/ja/ch05.html))

Treating the sentences parsed in this chapter as a corpus, we want to investigate the cases that Japanese predicates can take. Regard a verb as a predicate and the particles of the phrases related to the verb as cases, and output predicates and cases in tab-delimited format. The output must satisfy the following specifications.

- In a phrase containing a verb, use the base form of the leftmost verb as the predicate.
- Use the particles attached to the phrases related to the predicate as cases.
- If multiple particles (phrases) are related to the predicate, list all of them in lexicographic (dictionary) order, separated by spaces.

Consider the example sentence 「吾輩はここで始めて人間というものを見た」 (the 8th sentence of neko.txt.cabocha). This sentence contains two verbs, 「始める」 and 「見る」. When the phrase related to 「始める」 is analyzed as 「ここで」 and the phrases related to 「見る」 are analyzed as 「吾輩は」 and 「ものを」, the program should produce the following output:

始める	で
見る	は を

Save the output of this program to a file and check the following items using UNIX commands:

- Combinations of predicates and case patterns that frequently appear in the corpus
- Case patterns of the verbs 「する」, 「見る」, and 「与える」 (in descending order of frequency in the corpus)
"""
from collections import defaultdict
from typing import Dict, List
def read_file(fpath: str) -> List[List[str]]:
"""Get clear format of parsed sentences.
Args:
fpath (str): File path.
Returns:
        List[List[str]]: List of sentences; each sentence is a list of lines.
            e.g. result[1]:
                ['* 0 2D 0/0 -0.764522',
                 '　\t記号,空白,*,*,*,*,　,　,　',
                 '* 1 2D 0/1 -0.764522',
                 '吾輩\t名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ',
                 'は\t助詞,係助詞,*,*,*,*,は,ハ,ワ',
                 '* 2 -1D 0/2 0.000000',
                 '猫\t名詞,一般,*,*,*,*,猫,ネコ,ネコ',
                 'で\t助動詞,*,*,*,特殊・ダ,連用形,だ,デ,デ',
                 'ある\t助動詞,*,*,*,五段・ラ行アル,基本形,ある,アル,アル',
                 '。\t記号,句点,*,*,*,*,。,。,。']
"""
with open(fpath, mode="rt", encoding="utf-8") as f:
sentences = f.read().split("EOS\n")
return [sent.strip().split("\n") for sent in sentences if sent.strip() != ""]
class Morph:
"""Morph information for each token.
Args:
        data (dict): A dictionary containing the necessary information.
    Attributes:
        surface (str): Surface form (表層形).
        base (str): Base form (基本形).
        pos (str): Part of speech (品詞).
        pos1 (str): Part-of-speech subdivision 1 (品詞細分類1).
"""
def __init__(self, data):
self.surface = data["surface"]
self.base = data["base"]
self.pos = data["pos"]
self.pos1 = data["pos1"]
def __repr__(self):
return f"Morph({self.surface})"
def __str__(self):
return "surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]".format(
self.surface, self.base, self.pos, self.pos1
)
class Chunk:
"""Containing information for Clause/phrase.
Args:
data (dict): A dictionary contains necessary information.
Attributes:
chunk_id (str): The number of clause chunk (Phrase number).
morphs List[Morph]: Morph (morpheme) list.
dst (str): The index of dependency target (Contact clause index number).
srcs (List[str]): The index list of dependency source. (Original clause index number).
"""
def __init__(self, chunk_id, dst):
self.id = chunk_id
self.morphs = []
self.dst = dst
self.srcs = []
def __repr__(self):
return "Chunk( id: {}, dst: {}, srcs: {}, morphs: {} )".format(
self.id, self.dst, self.srcs, self.morphs
)
def get_surface(self) -> str:
"""Concatenate morph surfaces in a chink.
Args:
chunk (Chunk): e.g. Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(I), Morph(Is)]
Return:
e.g. 'I am'
"""
morphs = self.morphs
res = ""
for morph in morphs:
if morph.pos != "symbol":
res += morph.surface
return res
    def validate_pos(self, pos: str) -> bool:
        """Return True if the given part of speech (e.g. '名詞' or '動詞') appears in this chunk's morphs; otherwise False."""
        return any(morph.pos == pos for morph in self.morphs)
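# A minimal, self-contained check of the two helpers above (the morpheme data here is
# made up for illustration, not taken from neko.txt.cabocha):
_demo_chunk = Chunk("0", "1")
_demo_chunk.morphs = [
    Morph({"surface": "吾輩", "base": "吾輩", "pos": "名詞", "pos1": "代名詞"}),
    Morph({"surface": "は", "base": "は", "pos": "助詞", "pos1": "係助詞"}),
]
assert _demo_chunk.get_surface() == "吾輩は"
assert _demo_chunk.validate_pos("名詞")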
def convert_sent_to_chunks(sent: List[str]) -> List[Chunk]:
    """Convert one parsed sentence into a list of Chunk objects.
Args:
        sent (List[str]): One sentence as a list of lines (dependency lines and token lines).
            e.g. sent:
                ['* 0 1D 0/1 0.000000',
                 '吾輩\t名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ',
                 'は\t助詞,係助詞,*,*,*,*,は,ハ,ワ',
                 '* 1 -1D 0/2 0.000000',
                 '猫\t名詞,一般,*,*,*,*,猫,ネコ,ネコ',
                 'で\t助動詞,*,*,*,特殊・ダ,連用形,だ,デ,デ',
                 'ある\t助動詞,*,*,*,五段・ラ行アル,基本形,ある,アル,アル',
                 '。\t記号,句点,*,*,*,*,。,。,。']
    Parsing format:
        e.g. "* 0 1D 0/1 0.000000"
        |column|meaning|
        | :----: | :----------------------------------------------------------- |
        | 1 | `*`. Indicates that this line is a dependency-analysis result. |
        | 2 | Phrase number (integer starting from 0). |
        | 3 | Number of the dependency target phrase, followed by `D`. |
        | 4 | Positions of the head/function words, and an arbitrary number of feature columns. |
        | 5 | Dependency score. In general, the larger the value, the more likely the dependency. |
Returns:
List[Chunk]: List of chunks.
"""
chunks = []
chunk = None
srcs = defaultdict(list)
for i, word in enumerate(sent):
if word[0] == "*":
# Add chunk to chunks
if chunk is not None:
chunks.append(chunk)
            # Begin a new chunk
chunk_id = word.split(" ")[1]
dst = word.split(" ")[2].rstrip("D")
chunk = Chunk(chunk_id, dst)
srcs[dst].append(chunk_id) # Add target->source to mapping list
        else:  # Add a Morph to chunk.morphs
features = word.split(",")
dic = {
"surface": features[0].split("\t")[0],
"base": features[6],
"pos": features[0].split("\t")[1],
"pos1": features[1],
}
chunk.morphs.append(Morph(dic))
if i == len(sent) - 1: # Add the last chunk
chunks.append(chunk)
# Add srcs to each chunk
for chunk in chunks:
chunk.srcs = list(srcs[chunk.id])
return chunks
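# A small usage sketch of convert_sent_to_chunks: the two-phrase sentence below is
# written by hand in CaboCha's lattice format (it is not taken from neko.txt.cabocha),
# just to show how '*' lines become Chunks and the remaining lines become Morphs.
_demo_sent = [
    "* 0 1D 0/1 0.000000",
    "吾輩\t名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ",
    "は\t助詞,係助詞,*,*,*,*,は,ハ,ワ",
    "* 1 -1D 0/1 0.000000",
    "見\t動詞,自立,*,*,一段,連用形,見る,ミ,ミ",
    "た\t助動詞,*,*,*,特殊・タ,基本形,た,タ,タ",
]
_demo_chunks = convert_sent_to_chunks(_demo_sent)
assert [c.id for c in _demo_chunks] == ["0", "1"]
assert _demo_chunks[1].srcs == ["0"]  # chunk 0 depends on chunk 1
assert _demo_chunks[0].get_surface() == "吾輩は"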
def get_predicate_pattern(chunks: List[Chunk]) -> List[Dict[str, list]]:
"""Get edges from sentence chunks.
Terms:
-predicate(predicate)
-Item(argument)
-Case(case)
Args:
chunks (List[Chunk]): A sentence contains many chunks.
e.g. [Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(I), Morph(Is)] ),
Chunk( id: 1, dst: 2, srcs: [], morphs: [Morph(here), Morph(so)] ),
Chunk( id: 2, dst: 3, srcs: ['1'], morphs: [Morph(start), Morph(hand)] ),
Chunk( id: 3, dst: 4, srcs: ['2'], morphs: [Morph(Human), Morph(That)] ),
Chunk( id: 4, dst: 5, srcs: ['3'], morphs: [Morph(thing), Morph(To)] ),
Chunk( id: 5, dst: -1, srcs: ['0', '4'], morphs: [Morph(You see), Morph(Ta), Morph(。)] )]
Returns:
List[Dict[str, list]]: Predicate and case.
e.g. [defaultdict(list, {'start': ['so']}), defaultdict(list, {'to see': ['Is', 'To']})]
"""
patterns = []
for chunk in chunks:
        # Skip chunks that have no dependents or contain no verb
        if len(chunk.srcs) == 0 or all(morph.pos != "動詞" for morph in chunk.morphs):
continue
# Initialize
pred_case = defaultdict(list)
# Get predicate
for morph in chunk.morphs:
if morph.pos == "verb":
predicate = morph.base
break
# Get case
for src in chunk.srcs:
src_chunk = chunks[int(src)]
for morph in src_chunk.morphs:
if morph.pos == "Particle":
pred_case[predicate].append(morph.base)
# Add to patterns
patterns.append(pred_case)
return patterns
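# A hand-built sanity check of get_predicate_pattern, mimicking 「ここで」→「始めて」
# (illustration only; real input comes from convert_sent_to_chunks above):
_src_chunk = Chunk("0", "1")
_src_chunk.morphs = [
    Morph({"surface": "ここ", "base": "ここ", "pos": "名詞", "pos1": "代名詞"}),
    Morph({"surface": "で", "base": "で", "pos": "助詞", "pos1": "格助詞"}),
]
_verb_chunk = Chunk("1", "-1")
_verb_chunk.srcs = ["0"]
_verb_chunk.morphs = [
    Morph({"surface": "始め", "base": "始める", "pos": "動詞", "pos1": "自立"}),
    Morph({"surface": "て", "base": "て", "pos": "助詞", "pos1": "接続助詞"}),
]
assert get_predicate_pattern([_src_chunk, _verb_chunk]) == [{"始める": ["で"]}]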
def write_to_file(pattern_sents, path):
"""Write patterns to file.
Args:
pattern_sents ([type]): predicate-case patterns.
e.g. [[defaultdict(list, {'Born': ['so']}), defaultdict(list, {'Tsukuri': ['Or', 'But']})],
[defaultdict(list, {'cry': ['so']}), defaultdict(list, {'To do': ['hand', 'Only', 'Is']})]]
"""
# convert_patterns_to_text
lines = []
for pattern_sent in pattern_sents:
        for pattern in pattern_sent:  # pattern: e.g. {'つく': ['か', 'が']}
for predicate, case_list in pattern.items():
                case_text = " ".join(sorted(case_list))  # sort particles in lexicographic order
lines.append((predicate, case_text))
# write_to_file
with open(path, "w") as f:
for line in lines:
f.write(f"{line[0]}\t{line[1]}\n")
fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
sentences = [convert_sent_to_chunks(sent) for sent in sentences] # ans41
# ans45
pattern_sents = [get_predicate_pattern(sent) for sent in sentences]
pattern_sents = list(filter(lambda x: len(x) != 0, pattern_sents))
write_to_file(pattern_sents, "patterns.txt")
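# The checks requested in the problem are done with UNIX commands in ans45.sh below;
# as a Python cross-check (a sketch, not part of the original answer), the same counts
# can be computed with collections.Counter over the lines of patterns.txt.
from collections import Counter

with open("patterns.txt", encoding="utf-8") as f:
    pair_counter = Counter(line.rstrip("\n") for line in f)

# Most frequent predicate / case-pattern combinations in the corpus
print(pair_counter.most_common(10))
# Case patterns of 「する」 in descending order of frequency
print([(pair, cnt) for pair, cnt in pair_counter.most_common() if pair.startswith("する\t")][:10])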
# "patterns.txt":
#Be born
#Tsukuka
#By crying
#Just to do
#At the beginning
#To see
#Listen
#To catch
#Boil
#Eat
ans45.sh
# Combinations of predicates and case patterns that frequently appear in the corpus
# see ans19.sh
sort patterns.txt | uniq -c | sort -k1nr > patterns_sorted.txt
# -k1: sort by the 1st column (the count produced by uniq -c)
# -n: numeric sort
# -r: reverse (descending) order
#Only "do"
grep "^To do\s" patterns.txt | sort | uniq -c | sort -k1nr > "To do.txt"
# #"See" only
grep "^to see\s" patterns.txt | sort | uniq -c | sort -k1nr > "to see.txt"
# #"Give" only
grep "^give\s" patterns.txt | sort | uniq -c | sort -k1nr > "give.txt"