[PYTHON] 100 language processing knocks (2020): 45

"""
## 45.Extraction of verb case patterns[Permalink](https://nlp100.github.io/ja/ch05.html#45-Extractionofverbcasepatterns)

I would like to consider the sentence used this time as a corpus and investigate the cases that Japanese predicates can take. Think of the verb as a predicate and the particle of the phrase related to the verb as a case, and output the predicate and case in tab-delimited format. However, make sure that the output meets the following specifications.

-In a clause containing a verb, the uninflected word of the leftmost verb is used as a predicate.
-Case particles related to predicates
-If there are multiple particles (phrases) related to the predicate, arrange all the particles in lexicographic order separated by spaces.

An example sentence (neko) that "I saw a human being for the first time here".txt.Consider the 8th sentence of cabocha). This sentence contains two verbs, "begin" and "see", when the phrase "begin" is analyzed as "here" and the phrase as "see" is analyzed as "I am" and "thing". Should produce the following output.


At the beginning
To see


Save the output of this program to a file and check the following items using UNIX commands.

-A combination of predicates and case patterns that frequently appear in the corpus
-Case patterns of the verbs "do", "see", and "give" (arrange in order of frequency of appearance in the corpus)
"""
from collections import defaultdict
from typing import Dict, List


def read_file(fpath: str) -> List[List[str]]:
    """Get clear format of parsed sentences.

    Args:
        fpath (str): File path.

    Returns:
        List[List[str]]: List of sentences, and each sentence contains a word list.
                         e.g. result[1]:
                            ['* 0 2D 0/0 -0.764522',
                             '\u3000\t sign,Blank,*,*,*,*,\u3000,\u3000,\u3000',
                             '* 1 2D 0/1 -0.764522',
                             'I\t noun,Pronoun,General,*,*,*,I,Wagamama,Wagamama',
                             'Is\t particle,Particle,*,*,*,*,Is,C,Wow',
                             '* 2 -1D 0/2 0.000000',
                             'Cat\t noun,General,*,*,*,*,Cat,cat,cat',
                             'so\t auxiliary verb,*,*,*,Special,Continuous form,Is,De,De',
                             'is there\t auxiliary verb,*,*,*,Five steps, La line Al,Uninflected word,is there,Al,Al',
                             '。\t sign,Kuten,*,*,*,*,。,。,。']
    """
    with open(fpath, mode="rt", encoding="utf-8") as f:
        sentences = f.read().split("EOS\n")
    return [sent.strip().split("\n") for sent in sentences if sent.strip() != ""]


class Morph:
    """Morph information for each token.

    Args:
        data (dict): A dictionary contains necessary information.

    Attributes:
        surface (str):Surface
        base (str):Uninflected word (base)
        pos (str):Part of speech (base)
        pos1 (str):Part of speech subclassification 1 (pos1)
    """

    def __init__(self, data):
        self.surface = data["surface"]
        self.base = data["base"]
        self.pos = data["pos"]
        self.pos1 = data["pos1"]

    def __repr__(self):
        return f"Morph({self.surface})"

    def __str__(self):
        return "surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]".format(
            self.surface, self.base, self.pos, self.pos1
        )


class Chunk:
    """Containing information for Clause/phrase.

    Args:
        data (dict): A dictionary contains necessary information.

    Attributes:
        chunk_id (str): The number of clause chunk (Phrase number).
        morphs List[Morph]: Morph (morpheme) list.
        dst (str): The index of dependency target (Contact clause index number).
        srcs (List[str]): The index list of dependency source. (Original clause index number).
    """

    def __init__(self, chunk_id, dst):
        self.id = chunk_id
        self.morphs = []
        self.dst = dst
        self.srcs = []

    def __repr__(self):
        return "Chunk( id: {}, dst: {}, srcs: {}, morphs: {} )".format(
            self.id, self.dst, self.srcs, self.morphs
        )

    def get_surface(self) -> str:
        """Concatenate morph surfaces in a chink.

        Args:
            chunk (Chunk): e.g. Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(I), Morph(Is)]
        Return:
            e.g. 'I am'
        """
        morphs = self.morphs
        res = ""
        for morph in morphs:
            if morph.pos != "symbol":
                res += morph.surface
        return res

    def validate_pos(self, pos: str) -> bool:
        """Return Ture if 'noun' or 'verb' in chunk's morphs. Otherwise, return False."""
        morphs = self.morphs
        return any([morph.pos == pos for morph in morphs])


def convert_sent_to_chunks(sent: List[str]) -> List[Morph]:
    """Extract word and convert to morph.

    Args:
        sent (List[str]): A sentence contains a word list.
                            e.g. sent:
                               ['* 0 1D 0/1 0.000000',
                                'I\t noun,Pronoun,General,*,*,*,I,Wagamama,Wagamama',
                                'Is\t particle,Particle,*,*,*,*,Is,C,Wow',
                                '* 1 -1D 0/2 0.000000',
                                'Cat\t noun,General,*,*,*,*,Cat,cat,cat',
                                'so\t auxiliary verb,*,*,*,Special,Continuous form,Is,De,De',
                                'is there\t auxiliary verb,*,*,*,Five steps, La line Al,Uninflected word,is there,Al,Al',
                                '。\t sign,Kuten,*,*,*,*,。,。,。']

    Parsing format:
        e.g. "* 0 1D 0/1 0.000000"
        |column|meaning|
        | :----: | :----------------------------------------------------------- |
        |   1    |The first column is`*`.. Indicates that this is a dependency analysis result.|
        |   2    |Phrase number (integer starting from 0)|
        |   3    |Contact number +`D`                                              |
        |   4    |Head/Function word positions and any number of feature sequences|
        |   5    |Engagement score. In general, the larger the value, the easier it is to engage.|

    Returns:
        List[Chunk]: List of chunks.
    """
    chunks = []
    chunk = None
    srcs = defaultdict(list)

    for i, word in enumerate(sent):
        if word[0] == "*":
            # Add chunk to chunks
            if chunk is not None:
                chunks.append(chunk)

            # eNw Chunk beggin
            chunk_id = word.split(" ")[1]
            dst = word.split(" ")[2].rstrip("D")
            chunk = Chunk(chunk_id, dst)
            srcs[dst].append(chunk_id)  # Add target->source to mapping list

        else:  # Add Morch to chunk.morphs
            features = word.split(",")
            dic = {
                "surface": features[0].split("\t")[0],
                "base": features[6],
                "pos": features[0].split("\t")[1],
                "pos1": features[1],
            }
            chunk.morphs.append(Morph(dic))

            if i == len(sent) - 1:  # Add the last chunk
                chunks.append(chunk)

    # Add srcs to each chunk
    for chunk in chunks:
        chunk.srcs = list(srcs[chunk.id])

    return chunks


def get_predicate_pattern(chunks: List[Chunk]) -> List[Dict[str, list]]:
    """Get edges from sentence chunks.

    Terms:
        -predicate(predicate)
        -Item(argument)
        -Case(case)

    Args:
        chunks (List[Chunk]): A sentence contains many chunks.
            e.g. [Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(I), Morph(Is)] ),
                  Chunk( id: 1, dst: 2, srcs: [], morphs: [Morph(here), Morph(so)] ),
                  Chunk( id: 2, dst: 3, srcs: ['1'], morphs: [Morph(start), Morph(hand)] ),
                  Chunk( id: 3, dst: 4, srcs: ['2'], morphs: [Morph(Human), Morph(That)] ),
                  Chunk( id: 4, dst: 5, srcs: ['3'], morphs: [Morph(thing), Morph(To)] ),
                  Chunk( id: 5, dst: -1, srcs: ['0', '4'], morphs: [Morph(You see), Morph(Ta), Morph(。)] )]

    Returns:
        List[Dict[str, list]]: Predicate and case.
            e.g. [defaultdict(list, {'start': ['so']}), defaultdict(list, {'to see': ['Is', 'To']})]
    """
    patterns = []
    for chunk in chunks:
        # Skip if not valid
        if len(chunk.srcs) == 0 or all([morph.pos != "verb" for morph in chunk.morphs]):
            continue

        # Initialize
        pred_case = defaultdict(list)

        # Get predicate
        for morph in chunk.morphs:
            if morph.pos == "verb":
                predicate = morph.base
                break

        # Get case
        for src in chunk.srcs:
            src_chunk = chunks[int(src)]
            for morph in src_chunk.morphs:
                if morph.pos == "Particle":
                    pred_case[predicate].append(morph.base)

        # Add to patterns
        patterns.append(pred_case)
    return patterns


def write_to_file(pattern_sents, path):
    """Write patterns to file.

    Args:
        pattern_sents ([type]): predicate-case patterns.
            e.g. [[defaultdict(list, {'Born': ['so']}), defaultdict(list, {'Tsukuri': ['Or', 'But']})],
                  [defaultdict(list, {'cry': ['so']}), defaultdict(list, {'To do': ['hand', 'Only', 'Is']})]]
    """
    # convert_patterns_to_text
    lines = []
    for pattern_sent in pattern_sents:
        for pattern in pattern_sent:  # pattern: {'Tsukuri': ['Or', 'But']}
            for predicate, case_list in pattern.items():
                case_text = " ".join(sorted(case_list))  #Sort characters in ascending order
                lines.append((predicate, case_text))

    # write_to_file
    with open(path, "w") as f:
        for line in lines:
            f.write(f"{line[0]}\t{line[1]}\n")


fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
sentences = [convert_sent_to_chunks(sent) for sent in sentences]  # ans41

# ans45
pattern_sents = [get_predicate_pattern(sent) for sent in sentences]
pattern_sents = list(filter(lambda x: len(x) != 0, pattern_sents))
write_to_file(pattern_sents, "patterns.txt")

# "patterns.txt":
#Be born
#Tsukuka
#By crying
#Just to do
#At the beginning
#To see
#Listen
#To catch
#Boil
#Eat

ans45.sh


#A combination of predicates and case patterns that frequently appear in the corpus
# see ans19.sh
sort patterns.txt | uniq -c | sort -k1nr > patterns_sorted.txt
# -k 3: sort as the 3rd column
# -n: numeric sort
# -r: reverse order


#Only "do"
grep "^To do\s" patterns.txt | sort | uniq -c | sort -k1nr > "To do.txt"

# #"See" only
grep "^to see\s" patterns.txt | sort | uniq -c | sort -k1nr > "to see.txt"

# #"Give" only
grep "^give\s" patterns.txt | sort | uniq -c | sort -k1nr > "give.txt"