[PYTHON] 100 language processing knocks (2020): 48

"""
## 48. Extracting paths from nouns to roots [Permalink](https://nlp100.github.io/ja/ch05.html#48-Extractingpathsfromnounstoroots)

For each clause that contains a noun in the sentence, extract the path from that clause to the root of the syntax tree. The path on the syntax tree must satisfy the following specifications:

- Each clause is represented by its (surface) morpheme sequence
- The expressions of the clauses, from the start of the path to its end, are concatenated with ` -> `

Consider the example sentence "John McCarthy coined the term artificial intelligence at the first conference on AI." When CaboCha is used for dependency analysis, output like the following may be obtained.


John McCarthy -> coined
on AI -> the first -> at the conference -> coined
the first -> at the conference -> coined
at the conference -> coined
called artificial intelligence -> the term -> coined
the term -> coined

When KNP is used for dependency analysis, output like the following may be obtained.


John McCarthy -> coined
to AI -> related -> at the conference -> coined
at the conference -> coined
artificial intelligence -> called -> the term -> coined
the term -> coined
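
In short: for each clause that contains a noun, the program follows the
dependency links (`dst`) upward until it reaches the root (dst == -1) and joins
the clause surfaces with " -> ". A minimal sketch of that final join step (the
surfaces below are hypothetical English stand-ins for the Japanese clauses):

    path = ["John McCarthy", "coined"]
    print(" -> ".join(path))  # John McCarthy -> coined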

"""
from collections import defaultdict
from typing import List


def read_file(fpath: str) -> List[List[str]]:
    """Get clear format of parsed sentences.

    Args:
        fpath (str): File path.

    Returns:
        List[List[str]]: List of sentences; each sentence is a list of parsed lines
                         (dependency headers and morpheme lines).
                         e.g. result[1]:
                            ['* 0 2D 0/0 -0.764522',
                             '\u3000\t sign,Blank,*,*,*,*,\u3000,\u3000,\u3000',
                             '* 1 2D 0/1 -0.764522',
                             'I\t noun,Pronoun,General,*,*,*,I,Wagamama,Wagamama',
                             'Is\t particle,Particle,*,*,*,*,Is,C,Wow',
                             '* 2 -1D 0/2 0.000000',
                             'Cat\t noun,General,*,*,*,*,Cat,cat,cat',
                             'so\t auxiliary verb,*,*,*,Special,Continuous form,Is,De,De',
                             'is there\t auxiliary verb,*,*,*,Five steps, La line Al,Uninflected word,is there,Al,Al',
                             '。\t sign,Kuten,*,*,*,*,。,。,。']
    """
    with open(fpath, mode="rt", encoding="utf-8") as f:
        sentences = f.read().split("EOS\n")
    return [sent.strip().split("\n") for sent in sentences if sent.strip() != ""]
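

# A minimal usage sketch for read_file (hedged; it assumes the CaboCha output
# file "neko.txt.cabocha" that the script reads at the bottom):
#
#     sents = read_file("neko.txt.cabocha")
#     print(len(sents))   # number of sentences (blocks separated by "EOS")
#     print(sents[1][0])  # e.g. "* 0 2D 0/0 -0.764522" (a dependency header line)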


class Morph:
    """Morph information for each token.

    Args:
        data (dict): A dictionary containing the fields below.

    Attributes:
        surface (str): Surface form.
        base (str): Base form (uninflected form).
        pos (str): Part of speech.
        pos1 (str): Part-of-speech subdivision 1.
    """

    def __init__(self, data):
        self.surface = data["surface"]
        self.base = data["base"]
        self.pos = data["pos"]
        self.pos1 = data["pos1"]

    def __repr__(self):
        return f"Morph({self.surface})"

    def __str__(self):
        return "surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]".format(
            self.surface, self.base, self.pos, self.pos1
        )


class Chunk:
    """Containing information for Clause/phrase.

    Args:
        data (dict): A dictionary contains necessary information.

    Attributes:
        chunk_id (str): The number of clause chunk (Phrase number).
        morphs List[Morph]: Morph (morpheme) list.
        dst (str): The index of dependency target (Contact clause index number).
        srcs (List[str]): The index list of dependency source. (Original clause index number).
    """

    def __init__(self, chunk_id, dst):
        self.id = chunk_id
        self.morphs = []
        self.dst = dst
        self.srcs = []

    def __repr__(self):
        return "Chunk( id: {}, dst: {}, srcs: {}, morphs: {} )".format(
            self.id, self.dst, self.srcs, self.morphs
        )

    def get_surface(self) -> str:
        """Concatenate morph surfaces in a chink.

        Args:
            chunk (Chunk): e.g. Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(I), Morph(Is)]
        Return:
            e.g. 'I am'
        """
        morphs = self.morphs
        res = ""
        for morph in morphs:
            if morph.pos != "symbol":
                res += morph.surface
        return res

    def validate_pos(self, pos: str) -> bool:
        """Return Ture if 'noun' or 'verb' in chunk's morphs. Otherwise, return False."""
        morphs = self.morphs
        return any([morph.pos == pos for morph in morphs])
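

# A quick sanity sketch for Chunk and Morph (hedged; the tokens are hypothetical
# English stand-ins, and the pos values mirror the strings checked in the code).
_demo_chunk = Chunk("0", "-1")
_demo_chunk.morphs = [
    Morph({"surface": "cat", "base": "cat", "pos": "noun", "pos1": "general"}),
    Morph({"surface": ".", "base": ".", "pos": "symbol", "pos1": "period"}),
]
assert _demo_chunk.get_surface() == "cat"  # the symbol morph is dropped
assert _demo_chunk.validate_pos("noun")    # the chunk contains a noun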


def convert_sent_to_chunks(sent: List[str]) -> List[Chunk]:
    """Convert a parsed sentence into a list of chunks.

    Args:
        sent (List[str]): A sentence given as a list of parsed lines.
                            e.g. sent:
                               ['* 0 1D 0/1 0.000000',
                                'I\t noun,Pronoun,General,*,*,*,I,Wagamama,Wagamama',
                                'Is\t particle,Particle,*,*,*,*,Is,C,Wow',
                                '* 1 -1D 0/2 0.000000',
                                'Cat\t noun,General,*,*,*,*,Cat,cat,cat',
                                'so\t auxiliary verb,*,*,*,Special,Continuous form,Is,De,De',
                                'is there\t auxiliary verb,*,*,*,Five steps, La line Al,Uninflected word,is there,Al,Al',
                                '。\t sign,Kuten,*,*,*,*,。,。,。']

    Parsing format:
        e.g. "* 0 1D 0/1 0.000000"
        | column | meaning |
        | :----: | :----------------------------------------------------------- |
        |   1    | The first column is `*`, marking this as a dependency-analysis line. |
        |   2    | Phrase (chunk) number (integer starting from 0). |
        |   3    | Dependency target number followed by `D`. |
        |   4    | Head/function word positions and an arbitrary number of feature values. |
        |   5    | Dependency score. In general, the larger the value, the more likely the dependency. |

    Returns:
        List[Chunk]: List of chunks.
    """
    chunks = []
    chunk = None
    srcs = defaultdict(list)

    for i, word in enumerate(sent):
        if word[0] == "*":
            # Add chunk to chunks
            if chunk is not None:
                chunks.append(chunk)

            # Begin a new chunk
            chunk_id = word.split(" ")[1]
            dst = word.split(" ")[2].rstrip("D")
            chunk = Chunk(chunk_id, dst)
            srcs[dst].append(chunk_id)  # Add target->source to mapping list

        else:  # Add Morch to chunk.morphs
            features = word.split(",")
            dic = {
                "surface": features[0].split("\t")[0],
                "base": features[6],
                "pos": features[0].split("\t")[1],
                "pos1": features[1],
            }
            chunk.morphs.append(Morph(dic))

            if i == len(sent) - 1:  # Add the last chunk
                chunks.append(chunk)

    # Add srcs to each chunk
    for chunk in chunks:
        chunk.srcs = list(srcs[chunk.id])

    return chunks
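

# Illustration of how a dependency header line is split into the chunk id and
# the dependency target (a hedged sketch; the header comes from the docstring
# example above).
_demo_header = "* 0 1D 0/1 0.000000"
assert _demo_header.split(" ")[1] == "0"              # phrase (chunk) number
assert _demo_header.split(" ")[2].rstrip("D") == "1"  # dependency target index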


def get_path(chunks: List[Chunk]) -> List[List[str]]:
    """Get all paths in one sentence.

    Terms:
        - predicate
        - argument
        - case

    Notice:
        - Chunk.morphs may contain a sahen-connecting noun followed by the particle "wo".
        - Chunk.srcs may contain verbs.

    Args:
        chunks (List[Chunk]): A sentence contains many chunks.
            e.g. [Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(I), Morph(Is)] ),
                  Chunk( id: 1, dst: 2, srcs: [], morphs: [Morph(here), Morph(so)] ),
                  Chunk( id: 2, dst: 3, srcs: ['1'], morphs: [Morph(start), Morph(hand)] ),
                  Chunk( id: 3, dst: 4, srcs: ['2'], morphs: [Morph(Human), Morph(That)] ),
                  Chunk( id: 4, dst: 5, srcs: ['3'], morphs: [Morph(thing), Morph(To)] ),
                  Chunk( id: 5, dst: -1, srcs: ['0', '4'], morphs: [Morph(You see), Morph(Ta), Morph(。)] )]

    Returns:
        List[List[str]]: [['where', 'Was born', 'Do not use'], ['I have a clue', 'Do not use']]
    """
    paths = []
    for chunk in chunks:
        # Skip if chunk is invalid
        if (
            not any([morph.pos == "noun" for morph in chunk.morphs])
            or int(chunk.dst) == -1
        ):
            continue

        # Get path
        path = [chunk.get_surface()]
        dst = int(chunk.dst)
        while dst != -1:
            path.append(chunks[dst].get_surface())
            dst = int(chunks[dst].dst)
        paths.append(path)

    return paths
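

# A small self-contained check of get_path (hedged; the surfaces are hypothetical
# English stand-ins for the Japanese clauses, e.g. "where" -> "was born").
_demo_src = Chunk("0", "1")
_demo_src.morphs = [Morph({"surface": "where", "base": "where", "pos": "noun", "pos1": ""})]
_demo_root = Chunk("1", "-1")
_demo_root.morphs = [Morph({"surface": "was born", "base": "be born", "pos": "verb", "pos1": ""})]
assert get_path([_demo_src, _demo_root]) == [["where", "was born"]]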


def write_to_file(sents: List[List[List[str]]], path: str) -> None:
    """Write the extracted paths to a file, one path per line.

    Args:
        sents (List[List[List[str]]]): Paths per sentence; each path is a list of clause surfaces.
            e.g.   [[['I am', 'Be a cat']],
                    [['Name is', 'No']],
                    [['where', 'Was born', 'Do not use'], ['I have a clue', 'Do not use']]]
        path (str): Output file path.
    """
    # convert_frame_to_text
    lines = []

    for sent in sents:
        for chunk_path in sent:
            lines.append(" -> ".join(chunk_path))

    # write_to_file
    with open(path, "w") as f:
        for line in lines:
            f.write(f"{line}\n")


fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
sentences = [convert_sent_to_chunks(sent) for sent in sentences]  # ans41

# ans48
pattern_sents = [get_path(sent) for sent in sentences]
pattern_sents = list(filter(lambda x: len(x) != 0, pattern_sents))
write_to_file(pattern_sents, "noun_paths.txt")
# I am -> Be a cat
# Name is -> No
# where -> Was born -> Do not use
# I have a clue -> Do not use
# anything -> dim -> By the way -> In tears -> I remember
