[PYTHON] 100 Language Processing Knock (2020): 28

"""
28.MediaWiki markup removal
In addition to the processing of problem 27, remove MediaWiki markup from the template values as thoroughly as possible and format the basic country information.
"""

import json
import re

import utils


def get_uk_text(path, title="England"):
    """Return the ``text`` field of the first article with the given title.

    The file at *path* is read line by line; every non-blank line is parsed
    with ``json.loads`` and is expected to be an object that has ``"title"``
    and ``"text"`` keys.

    Args:
        path: Location of the JSON Lines file.
        title: Article title to look for. Defaults to ``"England"``.

    Returns:
        The ``"text"`` value of the first article whose ``"title"`` equals
        *title*.

    Raises:
        ValueError: If no article in the file has the requested title.
    """
    # Open explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            article = json.loads(line)
            if article["title"] == title:
                return article["text"]
    raise ValueError(f"no article titled {title!r} found in {path!r}")


def get_basic_info(string: str) -> str:
    """Return the body of the first ``{{Basic information ...}}`` template.

    The template starts on a line beginning with ``{{Basic information`` and
    ends on the first following line that consists solely of ``}}``.

    Args:
        string: Wiki text to search.

    Returns:
        Everything between the header line and the closing ``}}`` line,
        including the newline that ends the header line.

    Raises:
        IndexError: If *string* contains no such template.
    """
    pattern = re.compile(
        r"""
            ^\{\{Basic[ ]information.*?$    # header line; the space sits in a class because VERBOSE drops bare blanks
            (.*?)                           # group 1: template body, shortest match (may span lines)
            ^\}\}$                          # a line consisting solely of the closing braces
        """,
        re.MULTILINE | re.DOTALL | re.VERBOSE,
    )

    # search() stops at the first template instead of collecting every match.
    match = pattern.search(string)
    if match is None:
        raise IndexError("no '{{Basic information' template found")
    return match.group(1)


def get_content(string: str) -> dict:
    r"""Split the body of a template into a ``{field name: value}`` dict.

    A field starts with ``|`` at the beginning of a line and has the form
    ``|name = value``.  A value may span several lines; it ends right before
    the next line that starts with ``|``, before a newline that is followed
    by an empty line or the end of the text, or at the end of the text.

    Regular-expression notes
    (https://docs.python.org/3/library/re.html#regular-expression-syntax):
        - re.VERBOSE    ignore whitespace in the pattern and allow comments
        - re.MULTILINE  let ``^`` / ``$`` match at every line boundary
        - re.DOTALL     let ``.`` match ``\n`` as well
        - ``.+?``       one or more characters, as few as possible
        - ``(?=...)``   lookahead: must match next, but consumes nothing
        - ``\Z``        matches only at the very end of the string

    Args:
        string: Template body, e.g. the return value of ``get_basic_info``.

    Returns:
        A dict mapping field names to raw values, in order of appearance
        (dicts keep insertion order from Python 3.7 on).  If a name occurs
        more than once the last value wins.

    Example:
        Input:
            '|National emblem link=([[British coat of arms|National emblem]])'
        Return:
            {'National emblem link': '([[British coat of arms|National emblem]])'}
    """
    pattern = re.compile(
        r"""
            ^\|         # a field starts with a pipe at the beginning of a line
            (.+?)       # group 1: field name, shortest match
            \s*=\s*     # equals sign with optional surrounding whitespace
            (.+?)       # group 2: value, shortest match (may span lines)
            (?=         # the value ends right before ...
                \n\|    #   a newline followed by the next field,
                | \n$   #   a newline followed by an empty line or the end of the text,
                | \Z    #   or the end of the text (no trailing newline)
            )
        """,
        re.MULTILINE | re.DOTALL | re.VERBOSE,
    )
    return dict(pattern.findall(string))


# Ordered (pattern, replacement) rules applied by remove_markup().
# The order matters: links are unwrapped before templates, and self-closing
# <ref ... /> tags are dropped before paired <ref>...</ref> elements so that
# a paired match can never start at a self-closing tag.
_MARKUP_RULES = (
    # ans26: emphasis markup -> emphasised text.
    (
        re.compile(
            r"""
                ('{2,5})    # group 1: opening run of 2-5 apostrophes
                (.*?)       # group 2: emphasised text, shortest match on one line
                \1          # closing run identical to the opening run
            """,
            re.VERBOSE,
        ),
        r"\2",
    ),
    # ans27: internal links and files -> display text (the last segment).
    (
        re.compile(
            r"""
                \[\[            # opening double bracket
                (?:[^|]*?\|)*?  # leading segments, each ended by a pipe
                ([^|]*?)        # group 1: last segment = text to display
                \]\]            # closing double bracket
            """,
            re.VERBOSE,
        ),
        r"\1",
    ),
    # ans28: templates -> their last pipe-separated segment.
    (
        re.compile(
            r"""
                \{\{            # opening double brace
                (?:[^|]*?\|)*?  # template name and leading arguments
                ([^|]*?)        # group 1: last argument
                \}\}            # closing double brace
            """,
            re.VERBOSE,
        ),
        r"\1",
    ),
    # Self-closing references: <ref name="x" />
    (re.compile(r"<ref\b[^>]*/>"), ""),
    # Paired references including their content; DOTALL lets them span lines.
    (re.compile(r"<ref\b[^>]*>.*?</ref>", re.DOTALL), ""),
    # Any opening or closing ref-like tag that is still left over.
    (re.compile(r"</?ref[^>]*>"), ""),
    # Line breaks become a blank so the neighbouring words stay separated.
    (re.compile(r"</?br[^>]*>"), " "),
    # External links: keep the label, drop the URL.
    (
        re.compile(
            r"""
                \[https?://         # opening bracket and scheme
                [^\s\]]*            # URL: up to the first blank or closing bracket
                (?:\s+([^\]]*))?    # group 1 (optional): label after the blank
                \]                  # closing bracket
            """,
            re.VERBOSE,
        ),
        r"\1",
    ),
)


def remove_markup(target: str) -> str:
    r"""Strip MediaWiki markup from *target* (answers 26, 27 and 28).

    The rules in ``_MARKUP_RULES`` are applied one after another, each as a
    single regular-expression substitution:

        '''Great britain'''                         -> Great britain
        [[London]]                                  -> London
        [[British Prime Minister|Prime Minister]]   -> Prime Minister
        [[File:x.svg|85px|British coat of arms]]    -> British coat of arms
        {{lang|fr|Dieu et mon droit}}               -> Dieu et mon droit
        66,435,600<ref>{{Cite web|...}}</ref>       -> 66,435,600
        2,316.2 billion<ref name="gdp" />           -> 2,316.2 billion
        Established<br />(1707 Act)                 -> Established (1707 Act)
        [http://www.example.org]                    -> (empty string)
        [http://www.example.org display character]  -> display character

    Because every rule is one non-recursive pass, nested constructs such as
    a template inside a template are only partially unwrapped and may leave
    stray closing braces or brackets behind.

    Args:
        target: Raw field value containing MediaWiki markup.

    Returns:
        The value with the markup listed above removed.
    """
    for pattern, replacement in _MARKUP_RULES:
        target = pattern.sub(replacement, target)
    return target


# ans20: raw wiki text of the article (see uk_text.txt)
uk_text = get_uk_text("jawiki-country.json")

# ans25: cut out the basic-information template and split it into fields
# (see 25_en_basic_info.json)
basic_info = get_basic_info(uk_text)
fields = get_content(basic_info)

# ans26, ans27, ans28: strip the markup from every field value, keeping the
# field order, and write the cleaned mapping to 28_no_markup.json
result = dict(zip(fields.keys(), map(remove_markup, fields.values())))
utils.save_json(result, "28_no_markup.json")


# Sample (input, expected output) pairs for remove_markup() (answers 26-28).
data = [
    ("[[London]]", "London"),
    ("[[British Prime Minister|Prime Minister]]", "Prime Minister"),
    ("[[File:Royal Coat of Arms of the United Kingdom.svg|85px|British coat of arms]]", "British coat of arms"),
    (
        "{{lang|fr|[[Dieu et mon droit]]}}<br />([[French]]:[[Dieu et mon droit|God and my rights]])",
        "Dieu et mon droit (French:God and my rights)",
    ),
    ("{{Temporary link|Lindsay Foil|en|Lindsay Hoyle}}", "Lindsay Hoyle"),
    # <br /> is replaced by a single blank, not removed.
    ("Established<br />(1707 Act)", "Established (1707 Act)"),
    ('2,316.2 billion<ref name="imf-statistics-gdp" />', "2,316.2 billion"),
    (
        "66,435,600<ref>{{Cite web|url=https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates|title=Population estimates - Office for National Statistics|accessdate=2019-06-26|date=2019-06-26}}</ref>",
        "66,435,600",
    ),
]


# Manual check: print each expected value next to the actual result.
# for target, answer in data:
#     print(answer)
#     print(remove_markup(target))
#     print()

Recommended Posts

100 Language Processing Knock (2020): 28
100 Language Processing Knock (2020): 38
100 language processing knock 00 ~ 02
100 language processing knock 2020 [00 ~ 39 answer]
100 language processing knock 2020 [00-79 answer]
100 language processing knock 2020 [00 ~ 69 answer]
100 Language Processing Knock 2020 Chapter 1
100 Amateur Language Processing Knock: 17
100 language processing knock 2020 [00 ~ 49 answer]
100 Language Processing Knock-52: Stemming
100 Language Processing Knock Chapter 1
100 Amateur Language Processing Knock: 07
100 Language Processing Knock 2020 Chapter 3
100 Language Processing Knock 2020 Chapter 2
100 Amateur Language Processing Knock: 09
100 Amateur Language Processing Knock: 47
100 Language Processing Knock-53: Tokenization
100 Amateur Language Processing Knock: 97
100 language processing knock 2020 [00 ~ 59 answer]
100 Amateur Language Processing Knock: 67
100 Language Processing with Python Knock 2015
100 Language Processing Knock-51: Word Clipping
100 Language Processing Knock-58: Tuple Extraction
100 Language Processing Knock-57: Dependency Analysis
100 language processing knock-50: sentence break
100 Language Processing Knock Chapter 1 (Python)
100 Language Processing Knock Chapter 2 (Python)
100 Language Processing Knock-25: Template Extraction
100 Language Processing Knock-87: Word Similarity
I tried 100 language processing knock 2020
100 language processing knock-56: co-reference analysis
Solving 100 Language Processing Knock 2020 (01. "Patatokukashi")
100 Amateur Language Processing Knock: Summary
100 language processing knocks (2020): 40
100 language processing knocks (2020): 32
100 Language Processing Knock 2020 Chapter 2: UNIX Commands
100 Language Processing Knock 2015 Chapter 5 Dependency Analysis (40-49)
100 language processing knocks (2020): 35
100 language processing knocks (2020): 47
100 language processing knocks (2020): 39
100 Language Processing Knock with Python (Chapter 1)
100 Language Processing Knock Chapter 1 in Python
100 language processing knocks (2020): 22
100 language processing knocks (2020): 26
100 language processing knocks (2020): 34
100 Language Processing Knock 2020 Chapter 4: Morphological Analysis
100 Language Processing Knock 2020 Chapter 9: RNN, CNN
100 language processing knocks (2020): 42
100 language processing knock-76 (using scikit-learn): labeling
100 language processing knock-55: named entity extraction
100 language processing knocks (2020): 29
100 language processing knocks (2020): 49
100 language processing knocks 06 ~ 09
100 language processing knocks (2020): 43
100 language processing knocks (2020): 24
I tried 100 language processing knock 2020: Chapter 3
100 Language Processing Knock-82 (Context Word): Context Extraction
100 Language Processing Knock with Python (Chapter 3)
100 language processing knocks (2020): 45
100 Language Processing Knock: Chapter 1 Preparatory Movement
100 Language Processing Knock 2020 Chapter 6: Machine Learning