"""
42. Display of the dependency-source and dependency-target clauses
Extract the text of every dependency-source clause together with the text of its dependency-target clause, in tab-delimited format. However, do not output symbols such as punctuation marks.
"""
from collections import defaultdict
from typing import List
def read_file(fpath: str) -> List[List[str]]:
    """Load a parsed file and group its lines by sentence.

    The file is read as UTF-8 text and cut at every "EOS" line terminator.
    Segments that are empty after stripping surrounding whitespace are
    dropped; every remaining segment is stripped and split into its lines.

    Args:
        fpath (str): Path of the file to read.

    Returns:
        List[List[str]]: One list per sentence, each holding that sentence's
            lines (headers starting with "*" and morpheme lines), e.g.
            ['* 0 2D 0/0 -0.764522', '<surface><TAB><features>', ...].
    """
    with open(fpath, mode="rt", encoding="utf-8") as handle:
        raw_text = handle.read()

    grouped = []
    for segment in raw_text.split("EOS\n"):
        trimmed = segment.strip()
        if trimmed:  # skip the empty pieces left by consecutive/trailing EOS
            grouped.append(trimmed.split("\n"))
    return grouped
class Morph:
    """A single morpheme (token) and the features kept for it.

    Args:
        data (dict): Mapping with the keys "surface", "base", "pos" and
            "pos1"; each value is copied onto the attribute of the same name.

    Attributes:
        surface (str): Surface form.
        base (str): Uninflected (base) form.
        pos (str): Part of speech.
        pos1 (str): Part-of-speech subcategory 1.
    """

    def __init__(self, data):
        # Copy the four fields in a fixed order; a missing key raises KeyError.
        for field in ("surface", "base", "pos", "pos1"):
            setattr(self, field, data[field])

    def __repr__(self):
        return "Morph({})".format(self.surface)

    def __str__(self):
        return (
            f"surface[{self.surface}]\tbase[{self.base}]"
            f"\tpos[{self.pos}]\tpos1[{self.pos1}]"
        )
class Chunk:
    """A clause/phrase together with its dependency links.

    Args:
        chunk_id: Number of this clause (phrase number); stored as ``id``.
        dst: Index of the clause this one depends on (dependency target).

    Attributes:
        id: The clause number passed as ``chunk_id``.
        morphs (List[Morph]): Morphemes of the clause; starts empty.
        dst: Index of the dependency target.
        srcs (list): Indices of the clauses that depend on this one
            (dependency sources); starts empty.
    """

    def __init__(self, chunk_id, dst):
        # Both lists are created per instance and filled in by the caller.
        self.id, self.morphs, self.dst, self.srcs = chunk_id, [], dst, []

    def __repr__(self):
        return (
            f"Chunk( id: {self.id}, dst: {self.dst}, "
            f"srcs: {self.srcs}, morphs: {self.morphs} )"
        )
def convert_sent_to_chunks(sent: List[str]) -> List["Chunk"]:
    """Convert the lines of one parsed sentence into a list of chunks.

    Two kinds of lines are recognised:

    * Chunk headers such as "* 0 1D 0/1 0.000000" (space separated):

      | column | meaning                                             |
      | :----: | :-------------------------------------------------- |
      |   1    | "*", marks a dependency analysis result             |
      |   2    | Phrase number (integer starting from 0)             |
      |   3    | Dependency target number followed by "D"            |
      |   4    | Head / function word positions                      |
      |   5    | Dependency score                                    |

    * Morpheme lines "<surface><TAB><features>", where the comma-separated
      features give the part of speech at index 0, its subcategory 1 at
      index 1 and the base form at index 6.

    Args:
        sent (List[str]): Lines of a single sentence.

    Returns:
        List[Chunk]: Chunks in header order. Each chunk carries its morphemes
            and, in ``srcs``, the ids of the chunks whose target it is.

    Raises:
        ValueError: If a morpheme line appears before any chunk header.
        IndexError: If a header has fewer than three fields or a morpheme
            line has fewer than seven features.
    """
    chunks = []
    current = None
    sources = defaultdict(list)  # target id -> ids of the chunks pointing at it

    for line in sent:
        if not line:
            continue  # tolerate blank lines instead of failing on line[0]

        # Header lines never contain a tab, morpheme lines always do; checking
        # both keeps a morpheme whose surface is "*" from being read as a header.
        if line.startswith("* ") and "\t" not in line:
            fields = line.split(" ")
            chunk_id = fields[1]
            dst = fields[2].rstrip("D")
            current = Chunk(chunk_id, dst)
            chunks.append(current)
            sources[dst].append(chunk_id)
            continue

        if current is None:
            raise ValueError(
                "morpheme line found before any chunk header: {!r}".format(line)
            )

        # Separate the surface at the tab first so that a surface containing
        # a comma cannot shift the feature columns.
        surface, _, feature_text = line.partition("\t")
        features = feature_text.split(",")
        current.morphs.append(
            Morph(
                {
                    "surface": surface,
                    "base": features[6],
                    "pos": features[0],
                    "pos1": features[1],
                }
            )
        )

    # Attach the collected dependency sources to each chunk.
    for chunk in chunks:
        chunk.srcs = list(sources.get(chunk.id, ()))
    return chunks
def concat_morphs_surface(chunk: "Chunk", excluded_pos=("記号", "symbol")) -> str:
    """Concatenate the surfaces of a chunk's morphemes, skipping symbols.

    The original filter compared ``pos`` only with the English word "symbol".
    IPADIC-based CaboCha output tags punctuation as "記号", so that tag is
    excluded by default as well; "symbol" is kept for backward compatibility.

    Args:
        chunk (Chunk): Object whose ``morphs`` items expose ``surface`` and
            ``pos``, e.g. Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(I), Morph(Is)] ).
        excluded_pos: Collection of part-of-speech tags whose morphemes are
            left out of the result.

    Returns:
        str: The remaining surfaces joined without a separator; an empty
            string when nothing remains.
    """
    return "".join(
        morph.surface for morph in chunk.morphs if morph.pos not in excluded_pos
    )
def concat_chunks_surface(chunks: List["Chunk"], sep: str = "\t") -> List[str]:
    """Build "source<sep>target" strings for every dependency in a sentence.

    The task statement at the top of this file asks for tab-delimited output,
    so the separator defaults to a tab (the pair used to be joined with a
    single space). Pass ``sep=" "`` to get the previous formatting.

    Args:
        chunks (List[Chunk]): Chunks of one sentence, e.g.
            [Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(I), Morph(Is)] ),
             Chunk( id: 1, dst: 2, srcs: [], morphs: [Morph(here), Morph(so)] ),
             Chunk( id: 2, dst: 3, srcs: ['1'], morphs: [Morph(start), Morph(hand)] ),
             Chunk( id: 5, dst: -1, srcs: ['0', '4'], morphs: [Morph(You see), Morph(Ta), Morph(。)] )]
        sep (str): Text placed between the source and the target clause.

    Returns:
        List[str]: One string per (source, target) pair, ordered by target
            chunk and then by the order of its ``srcs``. Chunks without any
            source contribute nothing.
    """
    pairs = []
    for target in chunks:
        if not target.srcs:
            continue
        target_text = concat_morphs_surface(target)
        for src in target.srcs:
            # Each source id is used as a position in ``chunks``; this relies
            # on chunk ids matching list positions.
            source_text = concat_morphs_surface(chunks[int(src)])
            pairs.append("{}{}{}".format(source_text, sep, target_text))
    return pairs
# Script section: parse the CaboCha file, build chunks for every sentence and
# print the dependency pairs of the first three sentences that have any.
fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
chunks = list(map(convert_sent_to_chunks, sentences))  # ans41
# ans42: keep only the sentences that produced at least one pair
result = [
    pairs for pairs in map(concat_chunks_surface, chunks) if len(pairs) != 0
]
for sent in result[:3]:
    print(sent)
# Sample output as recorded with the original listing:
# ['I am a cat']
# ['No name', 'Not yet']
# ['Where were you born', 'Born or not', 'I don't get it', 'I have no idea']
# Recommended Posts