[PYTHON] 100 Sprachverarbeitungsklopfen (2020): 41

"""
41.Lesen des Abhängigkeitsanalyseergebnisses (Phrase / Abhängigkeit)
Implementieren Sie zusätzlich zu 40 die Klauselklasse Chunk. Diese Klasse enthält eine Liste von Morph-Elementen (Morph-Objekten) (Morphs), eine Liste verwandter Klauselindexnummern (dst) und eine Liste verwandter ursprünglicher Klauselindexnummern (srcs) als Mitgliedsvariablen. Lesen Sie außerdem das Analyseergebnis von CaboCha des Eingabetextes, drücken Sie einen Satz als Liste von Chunk-Objekten aus und zeigen Sie die Zeichenfolge und den Kontakt der Phrase des achten Satzes an. Verwenden Sie für die restlichen Probleme in Kapitel 5 das hier erstellte Programm.
"""
from collections import defaultdict
from typing import List


def read_file(fpath: str) -> List[List[str]]:
    """Get clear format of parsed sentences.

    Args:
        fpath (str): File path.

    Returns:
        List[List[str]]: List of sentences, and each sentence contains a word list.
                         e.g. result[1]:
                            ['* 0 2D 0/0 -0.764522',
                             '\u3000\t Symbol,Leer,*,*,*,*,\u3000,\u3000,\u3000',
                             '* 1 2D 0/1 -0.764522',
                             'ich\t Substantiv,Gleichbedeutend,Allgemeines,*,*,*,ich,Wagahai,Wagahai',
                             'Ist\t Assistent,Hilfe,*,*,*,*,Ist,C.,Beeindruckend',
                             '* 2 -1D 0/2 0.000000',
                             'Katze\t Substantiv,Allgemeines,*,*,*,*,Katze,Katze,Katze',
                             'damit\t Hilfsverb,*,*,*,Besondere,Kontinuierlicher Typ,Ist,De,De',
                             'Gibt es\t Hilfsverb,*,*,*,Fünf Schritte, La Linie Al,Grundform,Gibt es,Al,Al',
                             '。\t Symbol,Phrase,*,*,*,*,。,。,。']
    """
    with open(fpath, mode="rt", encoding="utf-8") as f:
        sentences = f.read().split("EOS\n")
    return [sent.strip().split("\n") for sent in sentences if sent.strip() != ""]


class Morph:
    """Morph information for each token.

    Args:
        data (dict): A dictionary contains necessary information.

    Attributes:
        surface (str):Oberfläche
        base (str):Base
        pos (str):Teil (Basis)
        pos1 (str):Teil Teil Unterklassifizierung 1 (Pos1
    """

    def __init__(self, data):
        self.surface = data["surface"]
        self.base = data["base"]
        self.pos = data["pos"]
        self.pos1 = data["pos1"]

    def __repr__(self):
        return f"Morph({self.surface})"

    def __str__(self):
        return "surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]".format(
            self.surface, self.base, self.pos, self.pos1
        )


class Chunk:
    """Containing information for Clause/phrase.

    Args:
        data (dict): A dictionary contains necessary information.

    Attributes:
        chunk_id (str): The number of clause chunk (Phrasennummer).
        morphs List[Morph]: Morph (Morphem) list.
        dst (str): The index of dependency target (Indexnummer der Kontaktklausel).
        srcs (List[str]): The index list of dependency source. (Original-Klauselindexnummer).
    """

    def __init__(self, chunk_id, dst):
        self.id = chunk_id
        self.morphs = []
        self.dst = dst
        self.srcs = []

    def __repr__(self):
        return "Chunk( id: {}, dst: {}, srcs: {}, morphs: {} )".format(
            self.id, self.dst, self.srcs, self.morphs
        )


# ans41
def convert_sent_to_chunks(sent: List[str]) -> List[Morph]:
    """Extract word and convert to morph.

    Args:
        sent (List[str]): A sentence contains a word list.
                            e.g. sent:
                               ['* 0 1D 0/1 0.000000',
                                'ich\t Substantiv,Gleichbedeutend,Allgemeines,*,*,*,ich,Wagahai,Wagahai',
                                'Ist\t Assistent,Hilfe,*,*,*,*,Ist,C.,Beeindruckend',
                                '* 1 -1D 0/2 0.000000',
                                'Katze\t Substantiv,Allgemeines,*,*,*,*,Katze,Katze,Katze',
                                'damit\t Hilfsverb,*,*,*,Besondere,Kontinuierlicher Typ,Ist,De,De',
                                'Gibt es\t Hilfsverb,*,*,*,Fünf Schritte, La Linie Al,Grundform,Gibt es,Al,Al',
                                '。\t Symbol,Phrase,*,*,*,*,。,。,。']

    Parsing format:
        e.g. "* 0 1D 0/1 0.000000"
        |Säule|Bedeutung|
        | :----: | :----------------------------------------------------------- |
        |   1    |Die erste Spalte ist`*`.. Zeigt an, dass es sich um ein Ergebnis der Abhängigkeitsanalyse handelt.|
        |   2    |Phrasennummer (Ganzzahl ab 0)|
        |   3    |Kontaktnummer +`D`                                              |
        |   4    |Hauptadresse/Funktionswortposition und beliebig viele Identitätsspalten|
        |   5    |Verlobungspunktzahl. Im Allgemeinen ist es umso einfacher, sich zu engagieren, je größer der Wert ist.|

    Returns:
        List[Chunk]: List of chunks.
    """
    chunks = []
    chunk = None
    srcs = defaultdict(list)

    for i, word in enumerate(sent):
        if word[0] == "*":
            # Add chunk to chunks
            if chunk is not None:
                chunks.append(chunk)

            # eNw Chunk beggin
            chunk_id = word.split(" ")[1]
            dst = word.split(" ")[2].rstrip("D")
            chunk = Chunk(chunk_id, dst)
            srcs[dst].append(chunk_id)  # Add target->source to mapping list

        else:  # Add Morch to chunk.morphs
            features = word.split(",")
            dic = {
                "surface": features[0].split("\t")[0],
                "base": features[6],
                "pos": features[0].split("\t")[1],
                "pos1": features[1],
            }
            chunk.morphs.append(Morph(dic))

            if i == len(sent) - 1:  # Add the last chunk
                chunks.append(chunk)

    # Add srcs to each chunk
    for chunk in chunks:
        chunk.srcs = list(srcs[chunk.id])

    return chunks


fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
chunks = [convert_sent_to_chunks(sent) for sent in sentences]

for chunk in chunks[5]:
    print(chunk)

# Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(ich), Morph(Ist)] )
# Chunk( id: 1, dst: 2, srcs: [], morphs: [Morph(Hier), Morph(damit)] )
# Chunk( id: 2, dst: 3, srcs: ['1'], morphs: [Morph(Start), Morph(Hand)] )
# Chunk( id: 3, dst: 4, srcs: ['2'], morphs: [Morph(Mensch), Morph(Das)] )
# Chunk( id: 4, dst: 5, srcs: ['3'], morphs: [Morph(Ding), Morph(Zu)] )
# Chunk( id: 5, dst: -1, srcs: ['0', '4'], morphs: [Morph(Sie sehen), Morph(Ta), Morph(。)] )