[PYTHON] 100 language processing knocks (2020): 41

"""
41.Reading the dependency analysis result (phrase / dependency)
In addition to 40, implement the clause Chunk class. This class has a list of morphemes (Morph objects) (morphs), a list of related clause index numbers (dst), and a list of related original clause index numbers (srcs) as member variables. In addition, read the analysis result of CaboCha of the input text, express one sentence as a list of Chunk objects, and display the character string and the contact of the phrase of the eighth sentence. For the rest of the problems in Chapter 5, use the program created here.
"""
from collections import defaultdict
from typing import List


def read_file(fpath: str) -> List[List[str]]:
    """Get clear format of parsed sentences.

    Args:
        fpath (str): File path.

    Returns:
        List[List[str]]: List of sentences, and each sentence contains a word list.
                         e.g. result[1]:
                            ['* 0 2D 0/0 -0.764522',
                             '\u3000\t sign,Blank,*,*,*,*,\u3000,\u3000,\u3000',
                             '* 1 2D 0/1 -0.764522',
                             'I\t noun,Pronoun,General,*,*,*,I,Wagamama,Wagamama',
                             'Is\t particle,Particle,*,*,*,*,Is,C,Wow',
                             '* 2 -1D 0/2 0.000000',
                             'Cat\t noun,General,*,*,*,*,Cat,cat,cat',
                             'so\t auxiliary verb,*,*,*,Special,Continuous form,Is,De,De',
                             'is there\t auxiliary verb,*,*,*,Five steps, La line Al,Uninflected word,is there,Al,Al',
                             '。\t sign,Kuten,*,*,*,*,。,。,。']
    """
    with open(fpath, mode="rt", encoding="utf-8") as f:
        sentences = f.read().split("EOS\n")
    return [sent.strip().split("\n") for sent in sentences if sent.strip() != ""]


class Morph:
    """Morph information for each token.

    Args:
        data (dict): A dictionary contains necessary information.

    Attributes:
        surface (str):Surface
        base (str):Uninflected word (base)
        pos (str):Part of speech (base)
        pos1 (str):Part of speech subcategory 1 (pos1
    """

    def __init__(self, data):
        self.surface = data["surface"]
        self.base = data["base"]
        self.pos = data["pos"]
        self.pos1 = data["pos1"]

    def __repr__(self):
        return f"Morph({self.surface})"

    def __str__(self):
        return "surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]".format(
            self.surface, self.base, self.pos, self.pos1
        )


class Chunk:
    """Containing information for Clause/phrase.

    Args:
        data (dict): A dictionary contains necessary information.

    Attributes:
        chunk_id (str): The number of clause chunk (Phrase number).
        morphs List[Morph]: Morph (morpheme) list.
        dst (str): The index of dependency target (Contact clause index number).
        srcs (List[str]): The index list of dependency source. (Original clause index number).
    """

    def __init__(self, chunk_id, dst):
        self.id = chunk_id
        self.morphs = []
        self.dst = dst
        self.srcs = []

    def __repr__(self):
        return "Chunk( id: {}, dst: {}, srcs: {}, morphs: {} )".format(
            self.id, self.dst, self.srcs, self.morphs
        )


# ans41
def convert_sent_to_chunks(sent: List[str]) -> List[Morph]:
    """Extract word and convert to morph.

    Args:
        sent (List[str]): A sentence contains a word list.
                            e.g. sent:
                               ['* 0 1D 0/1 0.000000',
                                'I\t noun,Pronoun,General,*,*,*,I,Wagamama,Wagamama',
                                'Is\t particle,Particle,*,*,*,*,Is,C,Wow',
                                '* 1 -1D 0/2 0.000000',
                                'Cat\t noun,General,*,*,*,*,Cat,cat,cat',
                                'so\t auxiliary verb,*,*,*,Special,Continuous form,Is,De,De',
                                'is there\t auxiliary verb,*,*,*,Five steps, La line Al,Uninflected word,is there,Al,Al',
                                '。\t sign,Kuten,*,*,*,*,。,。,。']

    Parsing format:
        e.g. "* 0 1D 0/1 0.000000"
        |column|meaning|
        | :----: | :----------------------------------------------------------- |
        |   1    |The first column is`*`.. Indicates that this is a dependency analysis result.|
        |   2    |Phrase number (integer starting from 0)|
        |   3    |Contact number +`D`                                              |
        |   4    |Head/Function word positions and any number of feature sequences|
        |   5    |Engagement score. In general, the larger the value, the easier it is to engage.|

    Returns:
        List[Chunk]: List of chunks.
    """
    chunks = []
    chunk = None
    srcs = defaultdict(list)

    for i, word in enumerate(sent):
        if word[0] == "*":
            # Add chunk to chunks
            if chunk is not None:
                chunks.append(chunk)

            # eNw Chunk beggin
            chunk_id = word.split(" ")[1]
            dst = word.split(" ")[2].rstrip("D")
            chunk = Chunk(chunk_id, dst)
            srcs[dst].append(chunk_id)  # Add target->source to mapping list

        else:  # Add Morch to chunk.morphs
            features = word.split(",")
            dic = {
                "surface": features[0].split("\t")[0],
                "base": features[6],
                "pos": features[0].split("\t")[1],
                "pos1": features[1],
            }
            chunk.morphs.append(Morph(dic))

            if i == len(sent) - 1:  # Add the last chunk
                chunks.append(chunk)

    # Add srcs to each chunk
    for chunk in chunks:
        chunk.srcs = list(srcs[chunk.id])

    return chunks


fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
chunks = [convert_sent_to_chunks(sent) for sent in sentences]

for chunk in chunks[5]:
    print(chunk)

# Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(I), Morph(Is)] )
# Chunk( id: 1, dst: 2, srcs: [], morphs: [Morph(here), Morph(so)] )
# Chunk( id: 2, dst: 3, srcs: ['1'], morphs: [Morph(start), Morph(hand)] )
# Chunk( id: 3, dst: 4, srcs: ['2'], morphs: [Morph(Human), Morph(That)] )
# Chunk( id: 4, dst: 5, srcs: ['3'], morphs: [Morph(thing), Morph(To)] )
# Chunk( id: 5, dst: -1, srcs: ['0', '4'], morphs: [Morph(You see), Morph(Ta), Morph(。)] )