[PYTHON] 100 Sprachverarbeitungsklopfen (2020): 48

"""
## 48.Extraktion von Pfaden von der Nomenklatur zu den Wurzeln[Permalink](https://nlp100.github.io/ja/ch05.html#48-ExtraktionvonPfadenvonderNomenklaturzudenWurzeln)

Extrahieren Sie für eine Klausel, die die gesamte Nomenklatur des Satzes enthält, den Pfad von dieser Klausel zum Stamm des Syntaxbaums. Der Pfad im Syntaxbaum muss jedoch die folgenden Spezifikationen erfüllen.

-Jede Klausel wird durch eine (oberflächliche) morphologische Sequenz dargestellt
-Von der Anfangsklausel bis zur Endklausel des Pfades lautet der Ausdruck jeder Klausel "` -> `Verbinden mit

Betrachten Sie den Beispielsatz "John McCarthy hat den Begriff künstliche Intelligenz auf der ersten Konferenz über KI geprägt." Wenn CaboCha für die Abhängigkeitsanalyse verwendet wird, kann die folgende Ausgabe erhalten werden.


John McCarthy->Erstellt
Über KI->Der Erste->Bei einem Treffen->Erstellt
Der Erste->Bei einem Treffen->Erstellt
Bei einem Treffen->Erstellt
Künstliche Intelligenz->Terminologie->Erstellt
Terminologie->Erstellt

Wenn KNP für die Abhängigkeitsanalyse verwendet wird, kann die folgende Ausgabe erhalten werden.


John McCarthy->Erstellt
Zur KI->verbunden->Bei einem Treffen->Erstellt
Bei einem Treffen->Erstellt
Mit künstlicher Intelligenz->Sagen->Terminologie->Erstellt
Terminologie->Erstellt

"""
from collections import defaultdict
from typing import List


def read_file(fpath: str) -> List[List[str]]:
    """Get clear format of parsed sentences.

    Args:
        fpath (str): File path.

    Returns:
        List[List[str]]: List of sentences, and each sentence contains a word list.
                         e.g. result[1]:
                            ['* 0 2D 0/0 -0.764522',
                             '\u3000\t Symbol,Leer,*,*,*,*,\u3000,\u3000,\u3000',
                             '* 1 2D 0/1 -0.764522',
                             'ich\t Substantiv,Gleichbedeutend,Allgemeines,*,*,*,ich,Wagahai,Wagahai',
                             'Ist\t Assistent,Hilfe,*,*,*,*,Ist,C.,Beeindruckend',
                             '* 2 -1D 0/2 0.000000',
                             'Katze\t Substantiv,Allgemeines,*,*,*,*,Katze,Katze,Katze',
                             'damit\t Hilfsverb,*,*,*,Besondere,Kontinuierlicher Typ,Ist,De,De',
                             'Gibt es\t Hilfsverb,*,*,*,Fünf Schritte, La Linie Al,Grundform,Gibt es,Al,Al',
                             '。\t Symbol,Phrase,*,*,*,*,。,。,。']
    """
    with open(fpath, mode="rt", encoding="utf-8") as f:
        sentences = f.read().split("EOS\n")
    return [sent.strip().split("\n") for sent in sentences if sent.strip() != ""]


class Morph:
    """Morph information for each token.

    Args:
        data (dict): A dictionary contains necessary information.

    Attributes:
        surface (str):Oberfläche
        base (str):Base
        pos (str):Teil (Basis)
        pos1 (str):Teil Teil Unterklassifizierung 1 (pos1)
    """

    def __init__(self, data):
        self.surface = data["surface"]
        self.base = data["base"]
        self.pos = data["pos"]
        self.pos1 = data["pos1"]

    def __repr__(self):
        return f"Morph({self.surface})"

    def __str__(self):
        return "surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]".format(
            self.surface, self.base, self.pos, self.pos1
        )


class Chunk:
    """Containing information for Clause/phrase.

    Args:
        data (dict): A dictionary contains necessary information.

    Attributes:
        chunk_id (str): The number of clause chunk (Phrasennummer).
        morphs List[Morph]: Morph (Morphem) list.
        dst (str): The index of dependency target (Indexnummer der Kontaktklausel).
        srcs (List[str]): The index list of dependency source. (Original-Klauselindexnummer).
    """

    def __init__(self, chunk_id, dst):
        self.id = chunk_id
        self.morphs = []
        self.dst = dst
        self.srcs = []

    def __repr__(self):
        return "Chunk( id: {}, dst: {}, srcs: {}, morphs: {} )".format(
            self.id, self.dst, self.srcs, self.morphs
        )

    def get_surface(self) -> str:
        """Concatenate morph surfaces in a chink.

        Args:
            chunk (Chunk): e.g. Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(ich), Morph(Ist)]
        Return:
            e.g. 'ich bin'
        """
        morphs = self.morphs
        res = ""
        for morph in morphs:
            if morph.pos != "Symbol":
                res += morph.surface
        return res

    def validate_pos(self, pos: str) -> bool:
        """Return Ture if 'Substantiv' or 'Verb' in chunk's morphs. Otherwise, return False."""
        morphs = self.morphs
        return any([morph.pos == pos for morph in morphs])


def convert_sent_to_chunks(sent: List[str]) -> List[Morph]:
    """Extract word and convert to morph.

    Args:
        sent (List[str]): A sentence contains a word list.
                            e.g. sent:
                               ['* 0 1D 0/1 0.000000',
                                'ich\t Substantiv,Gleichbedeutend,Allgemeines,*,*,*,ich,Wagahai,Wagahai',
                                'Ist\t Assistent,Hilfe,*,*,*,*,Ist,C.,Beeindruckend',
                                '* 1 -1D 0/2 0.000000',
                                'Katze\t Substantiv,Allgemeines,*,*,*,*,Katze,Katze,Katze',
                                'damit\t Hilfsverb,*,*,*,Besondere,Kontinuierlicher Typ,Ist,De,De',
                                'Gibt es\t Hilfsverb,*,*,*,Fünf Schritte, La Linie Al,Grundform,Gibt es,Al,Al',
                                '。\t Symbol,Phrase,*,*,*,*,。,。,。']

    Parsing format:
        e.g. "* 0 1D 0/1 0.000000"
        |Säule|Bedeutung|
        | :----: | :----------------------------------------------------------- |
        |   1    |Die erste Spalte ist`*`.. Zeigt an, dass es sich um ein Ergebnis der Abhängigkeitsanalyse handelt.|
        |   2    |Phrasennummer (Ganzzahl ab 0)|
        |   3    |Kontaktnummer +`D`                                              |
        |   4    |Hauptadresse/Funktionswortposition und beliebig viele Identitätsspalten|
        |   5    |Verlobungspunktzahl. Im Allgemeinen ist es umso einfacher, sich zu engagieren, je größer der Wert ist.|

    Returns:
        List[Chunk]: List of chunks.
    """
    chunks = []
    chunk = None
    srcs = defaultdict(list)

    for i, word in enumerate(sent):
        if word[0] == "*":
            # Add chunk to chunks
            if chunk is not None:
                chunks.append(chunk)

            # eNw Chunk beggin
            chunk_id = word.split(" ")[1]
            dst = word.split(" ")[2].rstrip("D")
            chunk = Chunk(chunk_id, dst)
            srcs[dst].append(chunk_id)  # Add target->source to mapping list

        else:  # Add Morch to chunk.morphs
            features = word.split(",")
            dic = {
                "surface": features[0].split("\t")[0],
                "base": features[6],
                "pos": features[0].split("\t")[1],
                "pos1": features[1],
            }
            chunk.morphs.append(Morph(dic))

            if i == len(sent) - 1:  # Add the last chunk
                chunks.append(chunk)

    # Add srcs to each chunk
    for chunk in chunks:
        chunk.srcs = list(srcs[chunk.id])

    return chunks


def get_path(chunks: List[Chunk]) -> List[List[str]]:
    """Get all paths in one sentence.

    Terms:
        -Prädikat(predicate)
        -Artikel(argument)
        -Fall(case)

    Notice:
        - Chunk.Morphen haben "sa-hen Verbindungsnomenklatur" und "o (Hilfs)"
        - Chunk.srcs haben "Verben"

    Args:
        chunks (List[Chunk]): A sentence contains many chunks.
            e.g. [Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(ich), Morph(Ist)] ),
                  Chunk( id: 1, dst: 2, srcs: [], morphs: [Morph(Hier), Morph(damit)] ),
                  Chunk( id: 2, dst: 3, srcs: ['1'], morphs: [Morph(Start), Morph(Hand)] ),
                  Chunk( id: 3, dst: 4, srcs: ['2'], morphs: [Morph(Mensch), Morph(Das)] ),
                  Chunk( id: 4, dst: 5, srcs: ['3'], morphs: [Morph(Ding), Morph(Zu)] ),
                  Chunk( id: 5, dst: -1, srcs: ['0', '4'], morphs: [Morph(Sie sehen), Morph(Ta), Morph(。)] )]

    Returns:
        List[List[str]]: [['wo', 'Wurde geboren', 'Verwende nicht'], ['Ich habe eine Ahnung', 'Verwende nicht']]
    """
    paths = []
    for chunk in chunks:
        # Skip if chunk is invalid
        if (
            not any([morph.pos == "Substantiv" for morph in chunk.morphs])
            or int(chunk.dst) == -1
        ):
            continue

        # Get path
        path = [chunk.get_surface()]
        dst = int(chunk.dst)
        while dst != -1:
            path.append(chunks[dst].get_surface())
            dst = int(chunks[dst].dst)
        paths.append(path)

    return paths


def write_to_file(sents: List[dict], path: str) -> None:
    """Write to file.

    Args:
        sents ([type]):
            e.g.   [[['ich bin', 'Sei eine Katze']],
                    [['Name ist', 'Nein']],
                    [['wo', 'Wurde geboren', 'Verwende nicht'], ['Ich habe eine Ahnung', 'Verwende nicht']]]
    """
    # convert_frame_to_text
    lines = []

    for sent in sents:
        for chunk in sent:
            lines.append(" -> ".join(chunk))

    # write_to_file
    with open(path, "w") as f:
        for line in lines:
            f.write(f"{line}\n")


fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
sentences = [convert_sent_to_chunks(sent) for sent in sentences]  # ans41

# ans48
pattern_sents = [get_path(sent) for sent in sentences]
pattern_sents = list(filter(lambda x: len(x) != 0, pattern_sents))
write_to_file(pattern_sents, "noun_paths.txt")
#ich bin->Sei eine Katze
#Name ist->Nein
#wo->Wurde geboren->Verwende nicht
#Ich habe eine Ahnung->Verwende nicht
#etwas->dim->Apropos->In Tränen->Ich erinnere mich