[PYTHON] 100 language processing knocks (2020): 47

"""
## 47. Mining of functional verb constructions [Permalink](https://nlp100.github.io/ja/ch05.html#47-Functionalverbsyntaxmining)

We want to focus only on the cases where the ヲ case of a verb is filled by a sa-hen connection noun (サ変接続名詞). Modify the program of problem 46 so that it satisfies the following specifications:

- Consider only the cases where a phrase consisting of "sa-hen connection noun + を (particle)" modifies a verb.
- The predicate is "sa-hen connection noun + を + base form of the verb"; when a phrase contains more than one verb, use the leftmost verb.
- If more than one particle (phrase) modifies the predicate, list all the particles in dictionary order, separated by spaces.
- If more than one phrase modifies the predicate, list all the arguments separated by spaces (aligned with the order of the particles).

For example, from the sentence "There is also a method called reinforcement learning, in which learning is performed based on one's own experience", an output line with the predicate 学習を行う ("perform learning"), followed by its particles and arguments, should be obtained.

"""
from collections import defaultdict
from typing import List


def read_file(fpath: str) -> List[List[str]]:
    """Get clear format of parsed sentences.

    Args:
        fpath (str): File path.

    Returns:
        List[List[str]]: List of sentences; each sentence is a list of CaboCha output lines.
                         e.g. result[1]:
                            ['* 0 2D 0/0 -0.764522',
                             '\u3000\t記号,空白,*,*,*,*,\u3000,\u3000,\u3000',
                             '* 1 2D 0/1 -0.764522',
                             '吾輩\t名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ',
                             'は\t助詞,係助詞,*,*,*,*,は,ハ,ワ',
                             '* 2 -1D 0/2 0.000000',
                             '猫\t名詞,一般,*,*,*,*,猫,ネコ,ネコ',
                             'で\t助動詞,*,*,*,特殊・ダ,連用形,だ,デ,デ',
                             'ある\t助動詞,*,*,*,五段・ラ行アル,基本形,ある,アル,アル',
                             '。\t記号,句点,*,*,*,*,。,。,。']
    """
    with open(fpath, mode="rt", encoding="utf-8") as f:
        sentences = f.read().split("EOS\n")
    return [sent.strip().split("\n") for sent in sentences if sent.strip() != ""]
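

# A quick sanity check of the splitting rule above: a minimal sketch using hypothetical
# in-memory data rather than neko.txt.cabocha. Every block delimited by an "EOS" line
# becomes one sentence, and empty trailing blocks are dropped.
def _demo_read_file_split() -> List[List[str]]:
    sample = (
        "* 0 -1D 0/0 0.000000\n"
        "猫\t名詞,一般,*,*,*,*,猫,ネコ,ネコ\n"
        "EOS\n"
        "EOS\n"
    )
    sentences = [s.strip().split("\n") for s in sample.split("EOS\n") if s.strip() != ""]
    # -> [['* 0 -1D 0/0 0.000000', '猫\t名詞,一般,*,*,*,*,猫,ネコ,ネコ']]
    return sentences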


class Morph:
    """Morph information for each token.

    Args:
        data (dict): A dictionary containing the necessary information.

    Attributes:
        surface (str): Surface form.
        base (str): Base form.
        pos (str): Part of speech.
        pos1 (str): Part-of-speech subdivision 1.
    """

    def __init__(self, data):
        self.surface = data["surface"]
        self.base = data["base"]
        self.pos = data["pos"]
        self.pos1 = data["pos1"]

    def __repr__(self):
        return f"Morph({self.surface})"

    def __str__(self):
        return "surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]".format(
            self.surface, self.base, self.pos, self.pos1
        )


class Chunk:
    """Containing information for Clause/phrase.

    Args:
        data (dict): A dictionary contains necessary information.

    Attributes:
        chunk_id (str): The number of clause chunk (Phrase number).
        morphs List[Morph]: Morph (morpheme) list.
        dst (str): The index of dependency target (Contact clause index number).
        srcs (List[str]): The index list of dependency source. (Original clause index number).
    """

    def __init__(self, chunk_id, dst):
        self.id = chunk_id
        self.morphs = []
        self.dst = dst
        self.srcs = []

    def __repr__(self):
        return "Chunk( id: {}, dst: {}, srcs: {}, morphs: {} )".format(
            self.id, self.dst, self.srcs, self.morphs
        )

    def get_surface(self) -> str:
        """Concatenate morph surfaces in a chink.

        Args:
            chunk (Chunk): e.g. Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(I), Morph(Is)]
        Return:
            e.g. 'I am'
        """
        morphs = self.morphs
        res = ""
        for morph in morphs:
            if morph.pos != "symbol":
                res += morph.surface
        return res

    def validate_pos(self, pos: str) -> bool:
        """Return True if any morph in the chunk has the given part of speech (e.g. '名詞' or '動詞')."""
        return any(morph.pos == pos for morph in self.morphs)
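

# A small usage sketch of the Chunk helpers above, built from hypothetical hand-made
# morphemes: get_surface drops punctuation ("記号") morphs, and validate_pos reports
# whether a given part of speech occurs anywhere in the chunk.
def _demo_chunk_helpers() -> None:
    chunk = Chunk(chunk_id="0", dst="-1")
    chunk.morphs = [
        Morph({"surface": "猫", "base": "猫", "pos": "名詞", "pos1": "一般"}),
        Morph({"surface": "。", "base": "。", "pos": "記号", "pos1": "句点"}),
    ]
    assert chunk.get_surface() == "猫"  # the punctuation morph is skipped
    assert chunk.validate_pos("名詞") is True
    assert chunk.validate_pos("動詞") is False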


def convert_sent_to_chunks(sent: List[str]) -> List[Chunk]:
    """Convert one parsed sentence into a list of Chunk objects.

    Args:
        sent (List[str]): A sentence contains a word list.
                            e.g. sent:
                               ['* 0 1D 0/1 0.000000',
                                '吾輩\t名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ',
                                'は\t助詞,係助詞,*,*,*,*,は,ハ,ワ',
                                '* 1 -1D 0/2 0.000000',
                                '猫\t名詞,一般,*,*,*,*,猫,ネコ,ネコ',
                                'で\t助動詞,*,*,*,特殊・ダ,連用形,だ,デ,デ',
                                'ある\t助動詞,*,*,*,五段・ラ行アル,基本形,ある,アル,アル',
                                '。\t記号,句点,*,*,*,*,。,。,。']

    Parsing format:
        e.g. "* 0 1D 0/1 0.000000"
        | column | meaning                                                                  |
        | :----: | :----------------------------------------------------------------------- |
        |   1    | `*`, marking this line as a dependency-analysis result                   |
        |   2    | phrase (bunsetsu) number (an integer starting from 0)                     |
        |   3    | head phrase number followed by `D`                                        |
        |   4    | head/function-word positions and an arbitrary number of feature columns   |
        |   5    | dependency score (the larger the value, the more likely the dependency)   |

    Returns:
        List[Chunk]: List of chunks.
    """
    chunks = []
    chunk = None
    srcs = defaultdict(list)

    for i, word in enumerate(sent):
        if word[0] == "*":
            # Add chunk to chunks
            if chunk is not None:
                chunks.append(chunk)

            # Begin a new chunk
            chunk_id = word.split(" ")[1]
            dst = word.split(" ")[2].rstrip("D")
            chunk = Chunk(chunk_id, dst)
            srcs[dst].append(chunk_id)  # Add target->source to mapping list

        else:  # Add a Morph to chunk.morphs
            features = word.split(",")
            dic = {
                "surface": features[0].split("\t")[0],
                "base": features[6],
                "pos": features[0].split("\t")[1],
                "pos1": features[1],
            }
            chunk.morphs.append(Morph(dic))

            if i == len(sent) - 1:  # Add the last chunk
                chunks.append(chunk)

    # Add srcs to each chunk
    for chunk in chunks:
        chunk.srcs = list(srcs[chunk.id])

    return chunks
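

# A minimal sketch of how a dependency header line such as "* 1 2D 0/1 -0.764522" is
# taken apart (mirroring the logic above): the second field is the phrase number, and
# the third field, with the trailing "D" removed, is the index of the head phrase.
def _demo_header_parsing() -> None:
    header = "* 1 2D 0/1 -0.764522"
    chunk_id = header.split(" ")[1]
    dst = header.split(" ")[2].rstrip("D")
    assert (chunk_id, dst) == ("1", "2")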


def validate_chunk(chunk: Chunk, chunks: List[Chunk]) -> bool:
    """Validate chunk contains s-irregular connection nouns(sa_connect_noun)and (particle)(particle).

    Args:
        chunk (Chunk): Chunk object.
            e.g. Chunk( id: 4, dst: 5, srcs: ['3'], morphs: [Morph(thing), Morph(To)] )

    Returns:
        bool: True or False
    """
    sa_hen_flag = False
    jyo_shi_flag = False
    verb_flag = False

    for morph in chunk.morphs:
        if morph.pos1 == "サ変接続":  # sa-hen connection noun
            sa_hen_flag = True
        if morph.pos == "助詞" and morph.surface == "を":  # the particle を
            jyo_shi_flag = True

    if (not sa_hen_flag) and (not jyo_shi_flag):
        return False

    for src in chunk.srcs:
        src_chunk = chunks[int(src)]
        for morph in src_chunk.morphs:
            if morph.pos == "verb":
                verb_flag = True

    return all([sa_hen_flag, jyo_shi_flag, verb_flag])
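

# A minimal sketch of validate_chunk on two hypothetical hand-built chunks: the target
# chunk holds a sa-hen connection noun plus the particle を, and one of its source chunks
# contains a verb, so the chunk passes validation.
def _demo_validate_chunk() -> None:
    verb_chunk = Chunk(chunk_id="0", dst="1")
    verb_chunk.morphs = [
        Morph({"surface": "行う", "base": "行う", "pos": "動詞", "pos1": "自立"})
    ]
    sahen_chunk = Chunk(chunk_id="1", dst="-1")
    sahen_chunk.srcs = ["0"]
    sahen_chunk.morphs = [
        Morph({"surface": "学習", "base": "学習", "pos": "名詞", "pos1": "サ変接続"}),
        Morph({"surface": "を", "base": "を", "pos": "助詞", "pos1": "格助詞"}),
    ]
    assert validate_chunk(sahen_chunk, [verb_chunk, sahen_chunk]) is True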


def get_verb_frame(chunks: List[Chunk]) -> dict:
    """Get edges from sentence chunks.

    Terms:
        -predicate(predicate)
        -Item(argument)
        -Case(case)

    Notice:
        - Chunk.morphs have "sa-hen connection noun" and "o (particle)"
        - Chunk.srcs have "verbs"

    Args:
        chunks (List[Chunk]): A sentence contains many chunks.
            e.g. [Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(I), Morph(Is)] ),
                  Chunk( id: 1, dst: 2, srcs: [], morphs: [Morph(here), Morph(so)] ),
                  Chunk( id: 2, dst: 3, srcs: ['1'], morphs: [Morph(start), Morph(hand)] ),
                  Chunk( id: 3, dst: 4, srcs: ['2'], morphs: [Morph(Human), Morph(That)] ),
                  Chunk( id: 4, dst: 5, srcs: ['3'], morphs: [Morph(thing), Morph(To)] ),
                  Chunk( id: 5, dst: -1, srcs: ['0', '4'], morphs: [Morph(You see), Morph(Ta), Morph(。)] )]

    Returns:
        dict: Predicate frame.
            e.g. {'pred': 'talk', 'case': [], 'arg': []}
              or {'pred': 'See a laugh', 'case': ['hand'], 'arg': ['見hand']}
              or {'pred': 'Have a demonstration', 'case': ['hand', 'To'], 'arg': ['持っhand', '性格To']},
    """
    frame = {"pred": None, "case": [], "arg": []}
    for chunk in chunks:
        # Initialize
        sa_hen_surface = None

        # Skip if not valid
        if not validate_chunk(chunk, chunks):
            continue

        # Get sa_hen
        for morph in chunk.morphs:
            if morph.pos1 == "サ変接続":  # sa-hen connection noun
                sa_hen_surface = morph.surface

        # Get verb
        src_verb_chunks = [
            morph
            for src in chunk.srcs
            for morph in chunks[int(src)].morphs
            if morph.pos == "verb"
        ]

        # Get predicate
        frame["pred"] = sa_hen_surface + "To" + src_verb_chunks[0].base

        # Get case
        for src in chunk.srcs:
            src_chunk = chunks[int(src)]
            for morph in src_chunk.morphs:
                if morph.pos == "Particle":
                    frame["case"].append(morph.base)
                    frame["arg"].append(src_chunk.get_surface())

    return frame


def write_to_file(sents: List[dict], path: str):
    """Write the predicate frames to a file, one tab-separated line per frame.

    Args:
        sents (List[dict]): Predicate frames.
            e.g.   [{'pred': 'talk', 'case': [], 'arg': []},
                    {'pred': 'See a laugh', 'case': ['て'], 'arg': ['見て']},
                    {'pred': 'Have a demonstration', 'case': ['て', 'を'], 'arg': ['持って', '性格を']}]
        path (str): Output file path.
    """
    # convert_frame_to_text
    lines = []

    for frame in sents:
        case_text = " ".join(frame["case"])
        arg_text = " ".join(frame["arg"])
        lines.append((frame["pred"], case_text, arg_text))

    # write_to_file
    with open(path, mode="w", encoding="utf-8") as f:
        for line in lines:
            f.write(f"{line[0]}\t{line[1]}\t{line[2]}\n")


fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
sentences = [convert_sent_to_chunks(sent) for sent in sentences]  # ans41

# ans47
pattern_sents = [get_verb_frame(sent) for sent in sentences]
pattern_sents = list(filter(lambda x: x["pred"] is not None, pattern_sents))

write_to_file(pattern_sents, "predicate_verb_frame.txt")

# "predicate_verb_frame.txt":
#talk
#Look at the laughter
#Grief the press
#I wonder if I will listen to my imagination and write it down.
#Controversy
#Misguided greetings
#After reading aloud
#Have a personality
