[PYTHON] 100 Sprachverarbeitungsklopfen (2020): 40

ans40.py

"""
40. Reading the dependency parsing result (morphemes)
Implement the class Morph, which represents a morpheme. This class has the surface form (surface), base form (base), part of speech (pos), and part-of-speech subcategory 1 (pos1) as member variables.
In addition, read the CaboCha analysis result (neko.txt.cabocha), represent each sentence as a list of Morph objects, and display the morpheme sequence of the third sentence.

neko.txt.cabocha is generated with ans40_cabocha.py.
The output generated by ans40.sh is neko.txt.cabocha2. Compared side by side, neko.txt.cabocha is cleaner.

The ans40.py implementation is fast because it loads neko.txt.cabocha directly.
The ans40_2.py implementation needs analysis time. Not recommended.
"""
from typing import List


class Morph:
    """One morpheme, populated from a mapping of analysis fields.

    The mapping passed to the constructor must provide the keys
    ``"surface"``, ``"base"``, ``"pos"`` and ``"pos1"``; each value is stored
    on the instance under the attribute of the same name.
    """

    _FIELDS = ("surface", "base", "pos", "pos1")

    def __init__(self, data):
        # Copy the fields one by one; a missing key raises KeyError.
        for key in self._FIELDS:
            setattr(self, key, data[key])

    def __repr__(self):
        # Tab-separated "name[value]" pairs, in the fixed field order.
        return "\t".join(
            f"{key}[{getattr(self, key)}]" for key in self._FIELDS
        )


def read_file(fpath: str) -> List[List[str]]:
    """Load a parsed file and group its lines by sentence.

    The whole file is read as UTF-8 text and cut at every ``EOS`` marker that
    is followed by a newline. Each piece is stripped of surrounding
    whitespace; pieces that end up empty are dropped, the others are split
    into their individual lines.

    Args:
        fpath (str): File path.

    Returns:
        List[List[str]]: One entry per sentence, each a list of lines, e.g.
            ``['* 0 1D 0/1 0.000000', '<surface><TAB><feature,feature,...>', ...]``.
    """
    with open(fpath, mode="rt", encoding="utf-8") as f:
        content = f.read()

    grouped = []
    for piece in content.split("EOS\n"):
        body = piece.strip()
        if body:
            grouped.append(body.split("\n"))
    return grouped


# ans40
def convert_sent_to_morph(sent: List[str]) -> List[Morph]:
    """Convert the lines of one parsed sentence into Morph objects.

    A morpheme line has the form ``<surface><TAB><comma-separated features>``.
    Lines without a tab (chunk headers such as ``* 0 2D 0/0 -0.764522``, or
    blank lines) are not morphemes and are skipped.

    Args:
        sent (List[str]): The lines of one sentence, chunk headers included.

    Returns:
        List[Morph]: One Morph per morpheme line, in input order. ``pos`` is
            feature 0, ``pos1`` is feature 1 and ``base`` is feature 6.

    Raises:
        IndexError: If a morpheme line carries fewer than seven features.
    """
    res = []
    for line in sent:
        # Separate the surface at the tab *before* splitting on commas: the
        # surface itself may be "," or "*", so neither a comma split of the
        # whole line nor a leading-"*" test identifies the fields reliably.
        surface, tab, feature_field = line.partition("\t")
        if not tab:
            continue
        features = feature_field.split(",")
        res.append(
            Morph(
                {
                    "surface": surface,
                    "base": features[6],
                    "pos": features[0],
                    "pos1": features[1],
                }
            )
        )

    return res


# Load the pre-parsed file and turn every sentence into a list of Morph objects.
fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
morph_sents = list(map(convert_sent_to_morph, sentences))

# Show the morphemes of the third sentence (index 2), one per line.
third_sentence = morph_sents[2]
for m in third_sentence:
    print(m)  # Morph defines only __repr__, which print() falls back to

# surface[Name]   base[Name]      pos[Substantiv]       pos1[Allgemeines]
# surface[Ist]     base[Ist]        pos[Partikel]       pos1[係Partikel]
# surface[noch]   base[noch]      pos[Adverb]       pos1[Hilfsanschluss]
# surface[Nein]   base[Nein]      pos[Adjektiv]     pos1[Unabhängigkeit]
# surface[。]     base[。]        pos[Symbol]       pos1[Phrase]

ans40_2.py

from typing import List

import CaboCha


def read_file(path: str) -> List[str]:
    """Return the non-blank lines of a text file, stripped of whitespace.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.

    Args:
        path (str): Path of the text file to read.

    Returns:
        List[str]: Every line that is non-empty after ``str.strip()``, in file
            order, with surrounding whitespace removed.
    """
    with open(path, encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]


class Morph:
    """One morpheme described by four fields passed to the constructor."""

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface  # surface form
        self.base = base  # base form
        self.pos = pos  # part of speech
        self.pos1 = pos1  # part-of-speech subcategory 1

    def __str__(self):
        # Tab-separated "name[value]" pairs in a fixed order.
        labelled = (
            ("surface", self.surface),
            ("base", self.base),
            ("pos", self.pos),
            ("pos1", self.pos1),
        )
        return "\t".join(f"{label}[{value}]" for label, value in labelled)


_CABOCHA_PARSER = None


def _get_parser():
    """Return the shared CaboCha parser, creating it on first use."""
    global _CABOCHA_PARSER
    if _CABOCHA_PARSER is None:
        _CABOCHA_PARSER = CaboCha.Parser()
    return _CABOCHA_PARSER


def get_morph(sent: str) -> list:
    """Parse one sentence with CaboCha and return its morphemes.

    The sentence is parsed in lattice format. A morpheme line of that output
    has the form ``<surface><TAB><comma-separated features>``; lines without a
    tab (chunk headers starting with ``*``, the trailing ``EOS`` marker, blank
    lines) are skipped.

    Args:
        sent (str): The raw sentence text to analyse.

    Returns:
        list: One Morph per morpheme line, in output order. ``pos`` is
            feature 0, ``pos1`` is feature 1 and ``base`` is feature 6.

    Raises:
        IndexError: If a morpheme line carries fewer than seven features.
    """
    # Reuse a single parser: constructing one per sentence repeats the setup
    # cost on every call.
    parsed_sent = _get_parser().parse(sent).toString(CaboCha.FORMAT_LATTICE)
    # e.g. ['* 0 -1D 0/0 0.000000', '<surface>\t<features>', 'EOS']

    morphs = []
    for line in parsed_sent.strip().split("\n"):
        # Separate the surface at the tab *before* splitting on commas: the
        # surface itself may be "," or "*", so neither a comma split of the
        # whole line nor a leading-"*" test identifies the fields reliably.
        surface, tab, feature_field = line.partition("\t")
        if not tab:
            continue
        features = feature_field.split(",")
        morphs.append(
            Morph(
                surface,  # surface
                features[6],  # base
                features[0],  # pos
                features[1],  # pos1
            )
        )

    return morphs


file_path = "neko.txt"
# One stripped, non-blank line of the input per entry.
sentence_list = read_file(file_path)

# Each sentence is analysed by CaboCha here, which takes a while.
morphs = list(map(get_morph, sentence_list))

# Show the morphemes of the fourth sentence (index 3), one per line.
fourth_sentence = morphs[3]
for m in fourth_sentence:
    print(m)

# surface[Wo]   base[Wo]      pos[Substantiv]       pos1[代Substantiv]
# surface[damit]     base[damit]        pos[Partikel]       pos1[格Partikel]
# surface[Geboren]   base[Geborenる]    pos[Verb]       pos1[Unabhängigkeit]
# surface[Ta]     base[Ta]        pos[Hilfsverb]     pos1[*]
# surface[Oder]     base[Oder]        pos[Partikel]       pos1[副Partikel／並立Partikel／終Partikel]
# surface[Tonto] base[Tonto]    pos[Adverb]       pos1[Allgemeines]
# surface[Registrieren]   base[Registrieren]      pos[Substantiv]       pos1[Verbindung ändern]
# surface[Aber]     base[Aber]        pos[Partikel]       pos1[格Partikel]
# surface[Tsuka]   base[Tsukuri]      pos[Verb]       pos1[Unabhängigkeit]
# surface[Nu]     base[Nu]        pos[Hilfsverb]     pos1[*]
# surface[。]     base[。]        pos[Symbol]       pos1[Phrase]

ans40_parse_to_cabocha_format.py

import CaboCha


def parse_txt(file_in: str, file_out: str) -> None:
    """Parse every non-blank line of a text file with CaboCha.

    Each line of ``file_in`` is stripped; empty lines are skipped. The
    remaining lines are parsed one by one and their lattice-format output is
    written to ``file_out`` in order.

    Args:
        file_in (str): Path of the UTF-8 text file to analyse.
        file_out (str): Path of the file to (over)write with the parse results.
    """
    # Create the parser before touching the files, so a failure here does not
    # leave a truncated output file behind.
    cabocha = CaboCha.Parser()

    # Explicit UTF-8 on both sides so the result does not depend on the
    # platform's default encoding.
    with open(file_in, encoding="utf-8") as f_in, open(
        file_out, "w", encoding="utf-8"
    ) as f_out:
        for line in f_in:
            line = line.strip()
            if not line:
                continue
            parsed_sent = cabocha.parse(line).toString(CaboCha.FORMAT_LATTICE)
            f_out.write(parsed_sent)


# Input text and the file that receives its CaboCha parse results.
file_in = "neko.txt"
file_out = "neko.txt.cabocha"

parse_txt(file_in=file_in, file_out=file_out)

ans40.sh

# Feed neko.txt to cabocha on stdin (-f1 output format) and save the result.
cabocha -f1 < neko.txt > neko.txt.cabocha