[PYTHON] 100 language processing knocks (2020): 46

"""
## 46.Extraction of verb case frame information[Permalink](https://nlp100.github.io/ja/ch05.html#46-Extractionofverbcaseframeinformation)

Modify 45 programs and output the terms (the clauses related to the predicates themselves) in tab-delimited format following the predicate and case pattern. In addition to the 45 specifications, meet the following specifications.

-The term should be a word string of the clause related to the predicate (there is no need to remove the trailing particle).
-If there are multiple clauses related to the predicate, arrange them in the same standard and order as the particles, separated by spaces.

An example sentence (neko) that "I saw a human being for the first time here".txt.Consider the 8th sentence of cabocha). This sentence contains two verbs, "begin" and "see", when the phrase "begin" is analyzed as "here" and the phrase as "see" is analyzed as "I am" and "thing". Should produce the following output.


Get started here
See what I see


"""
from collections import defaultdict
from typing import List


def read_file(fpath: str) -> List[List[str]]:
    """Get clear format of parsed sentences.

    Args:
        fpath (str): File path.

    Returns:
        List[List[str]]: List of sentences, and each sentence contains a word list.
                         e.g. result[1]:
                            ['* 0 2D 0/0 -0.764522',
                             '\u3000\t sign,Blank,*,*,*,*,\u3000,\u3000,\u3000',
                             '* 1 2D 0/1 -0.764522',
                             'I\t noun,Pronoun,General,*,*,*,I,Wagamama,Wagamama',
                             'Is\t particle,Particle,*,*,*,*,Is,C,Wow',
                             '* 2 -1D 0/2 0.000000',
                             'Cat\t noun,General,*,*,*,*,Cat,cat,cat',
                             'so\t auxiliary verb,*,*,*,Special,Continuous form,Is,De,De',
                             'is there\t auxiliary verb,*,*,*,Five steps, La line Al,Uninflected word,is there,Al,Al',
                             '。\t sign,Kuten,*,*,*,*,。,。,。']
    """
    with open(fpath, mode="rt", encoding="utf-8") as f:
        sentences = f.read().split("EOS\n")
    return [sent.strip().split("\n") for sent in sentences if sent.strip() != ""]


class Morph:
    """Morph information for each token.

    Args:
        data (dict): A dictionary contains necessary information.

    Attributes:
        surface (str):Surface
        base (str):Uninflected word (base)
        pos (str):Part of speech (base)
        pos1 (str):Part of speech subclassification 1 (pos1)
    """

    def __init__(self, data):
        self.surface = data["surface"]
        self.base = data["base"]
        self.pos = data["pos"]
        self.pos1 = data["pos1"]

    def __repr__(self):
        return f"Morph({self.surface})"

    def __str__(self):
        return "surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]".format(
            self.surface, self.base, self.pos, self.pos1
        )


class Chunk:
    """Containing information for Clause/phrase.

    Args:
        data (dict): A dictionary contains necessary information.

    Attributes:
        chunk_id (str): The number of clause chunk (Phrase number).
        morphs List[Morph]: Morph (morpheme) list.
        dst (str): The index of dependency target (Contact clause index number).
        srcs (List[str]): The index list of dependency source. (Original clause index number).
    """

    def __init__(self, chunk_id, dst):
        self.id = chunk_id
        self.morphs = []
        self.dst = dst
        self.srcs = []

    def __repr__(self):
        return "Chunk( id: {}, dst: {}, srcs: {}, morphs: {} )".format(
            self.id, self.dst, self.srcs, self.morphs
        )

    def get_surface(self) -> str:
        """Concatenate morph surfaces in a chink.

        Args:
            chunk (Chunk): e.g. Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(I), Morph(Is)]
        Return:
            e.g. 'I am'
        """
        morphs = self.morphs
        res = ""
        for morph in morphs:
            if morph.pos != "symbol":
                res += morph.surface
        return res

    def validate_pos(self, pos: str) -> bool:
        """Return Ture if 'noun' or 'verb' in chunk's morphs. Otherwise, return False."""
        morphs = self.morphs
        return any([morph.pos == pos for morph in morphs])


def convert_sent_to_chunks(sent: List[str]) -> List[Morph]:
    """Extract word and convert to morph.

    Args:
        sent (List[str]): A sentence contains a word list.
                            e.g. sent:
                               ['* 0 1D 0/1 0.000000',
                                'I\t noun,Pronoun,General,*,*,*,I,Wagamama,Wagamama',
                                'Is\t particle,Particle,*,*,*,*,Is,C,Wow',
                                '* 1 -1D 0/2 0.000000',
                                'Cat\t noun,General,*,*,*,*,Cat,cat,cat',
                                'so\t auxiliary verb,*,*,*,Special,Continuous form,Is,De,De',
                                'is there\t auxiliary verb,*,*,*,Five steps, La line Al,Uninflected word,is there,Al,Al',
                                '。\t sign,Kuten,*,*,*,*,。,。,。']

    Parsing format:
        e.g. "* 0 1D 0/1 0.000000"
        |column|meaning|
        | :----: | :----------------------------------------------------------- |
        |   1    |The first column is`*`.. Indicates that this is a dependency analysis result.|
        |   2    |Phrase number (integer starting from 0)|
        |   3    |Contact number +`D`                                              |
        |   4    |Head/Function word positions and any number of feature sequences|
        |   5    |Engagement score. In general, the larger the value, the easier it is to engage.|

    Returns:
        List[Chunk]: List of chunks.
    """
    chunks = []
    chunk = None
    srcs = defaultdict(list)

    for i, word in enumerate(sent):
        if word[0] == "*":
            # Add chunk to chunks
            if chunk is not None:
                chunks.append(chunk)

            # eNw Chunk beggin
            chunk_id = word.split(" ")[1]
            dst = word.split(" ")[2].rstrip("D")
            chunk = Chunk(chunk_id, dst)
            srcs[dst].append(chunk_id)  # Add target->source to mapping list

        else:  # Add Morch to chunk.morphs
            features = word.split(",")
            dic = {
                "surface": features[0].split("\t")[0],
                "base": features[6],
                "pos": features[0].split("\t")[1],
                "pos1": features[1],
            }
            chunk.morphs.append(Morph(dic))

            if i == len(sent) - 1:  # Add the last chunk
                chunks.append(chunk)

    # Add srcs to each chunk
    for chunk in chunks:
        chunk.srcs = list(srcs[chunk.id])

    return chunks


def get_predicate_frame(chunks: List[Chunk]) -> List[dict]:
    """Get edges from sentence chunks.

    Terms:
        -predicate(predicate)
        -Item(argument)
        -Case(case)

    Args:
        chunks (List[Chunk]): A sentence contains many chunks.
            e.g. [Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(I), Morph(Is)] ),
                  Chunk( id: 1, dst: 2, srcs: [], morphs: [Morph(here), Morph(so)] ),
                  Chunk( id: 2, dst: 3, srcs: ['1'], morphs: [Morph(start), Morph(hand)] ),
                  Chunk( id: 3, dst: 4, srcs: ['2'], morphs: [Morph(Human), Morph(That)] ),
                  Chunk( id: 4, dst: 5, srcs: ['3'], morphs: [Morph(thing), Morph(To)] ),
                  Chunk( id: 5, dst: -1, srcs: ['0', '4'], morphs: [Morph(You see), Morph(Ta), Morph(。)] )]

    Returns:
        List[dict]: Predicate frame.
            e.g. [{'pred': 'start', 'case': ['so'], 'arg': ['ここso']},
                  {'pred': 'to see', 'case': ['Is', 'To'], 'arg': ['吾輩Is', 'ものTo']}]
    """
    patterns = []
    for chunk in chunks:
        # Skip if not valid
        if len(chunk.srcs) == 0 or all([morph.pos != "verb" for morph in chunk.morphs]):
            continue

        # Initialize
        pred_frame = {"pred": None, "case": [], "arg": []}

        # Get predicate
        for morph in chunk.morphs:
            if morph.pos == "verb":
                pred_frame["pred"] = morph.base
                break

        # Get case
        for src in chunk.srcs:
            src_chunk = chunks[int(src)]
            for morph in src_chunk.morphs:
                if morph.pos == "Particle":
                    pred_frame["case"].append(morph.base)
                    pred_frame["arg"].append(src_chunk.get_surface())

        # Add to patterns
        patterns.append(pred_frame)
    return patterns


def write_to_file(sents, path):
    """Write patterns to file.

    Args:
        pattern_sents ([type]): predicate-case patterns.
            e.g. [[{'pred': 'Born', 'case': ['so'], 'arg': ['どこso']},
                   {'pred': 'Tsukuri', 'case': ['Or', 'But'], 'arg': ['生れたOr', '見当But']}],
                  [{'pred': 'cry', 'case': ['so'], 'arg': ['所so']},
                   {'pred': 'To do', 'case': ['hand', 'Only', 'Is'], 'arg': ['泣いhand', 'いた事OnlyIs', 'いた事OnlyIs']}]]
    """
    # convert_frame_to_text
    lines = []
    for sent in sents:
        for frame in sent:  # pattern: {'pred': 'cry', 'case': ['so'], 'arg': ['所so']}
            case_text = " ".join(frame["case"])
            arg_text = " ".join(frame["arg"])
            lines.append((frame["pred"], case_text, arg_text))

    # write_to_file
    with open(path, "w") as f:
        for line in lines:
            f.write(f"{line[0]}\t{line[1]}\t{line[2]}\n")


fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
sentences = [convert_sent_to_chunks(sent) for sent in sentences]  # ans41

# ans46
pattern_sents = [get_predicate_frame(sent) for sent in sentences]
pattern_sents = list(filter(lambda x: len(x) != 0, pattern_sents))
write_to_file(pattern_sents, "predicate_frame.txt")

# "predicate_frame.txt":
#Where to be born
#I have no idea if it was born
#Where to cry
#The only thing I was crying was the only thing I was crying
#Get started here
#See what I see
#Listen later
#Catch us