# [PYTHON] 100 coups de traitement linguistique (NLP 100 exercises, 2020): problem 28

"""
28.Suppression du balisage MediaWiki
En plus des 27 processus, supprimez autant que possible le balisage MediaWiki des valeurs de modèle et formatez les informations de base sur le pays.
"""

import json
import re

import utils


def get_uk_text(path, title="Angleterre"):
    """Return the ``text`` field of the first article whose title matches.

    The file is read line by line; every non-blank line is parsed as one JSON
    object that must provide the ``"title"`` and ``"text"`` keys.

    Args:
        path: Location of the JSON-lines file.
        title: Exact article title to look for.  The default keeps the value
            that used to be hard-coded.

    Returns:
        The ``"text"`` value of the first matching article.

    Raises:
        ValueError: If no line carries the requested title (previously this
            surfaced as an ``UnboundLocalError``).
    """
    # NOTE(review): the script passes a file named "jawiki-country.json";
    # confirm that titles in that dump are really spelled like the default.

    # Be explicit about the encoding so the result does not depend on the
    # platform's default locale.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines instead of failing inside json.loads.
                continue
            article = json.loads(line)
            if article["title"] == title:
                return article["text"]
    raise ValueError(f"no article titled {title!r} found in {path!r}")


def get_basic_info(string: str) -> str:
    """Extract the body of the first ``{{Informations de base ...}}`` template.

    The match starts on a line beginning with ``{{Informations de base`` and
    stops at the first following line that consists solely of ``}}``.

    Args:
        string: Full article text.

    Returns:
        Everything between the template's opening line and its closing
        ``}}`` line (including the surrounding newlines).

    Raises:
        IndexError: If the text contains no such template.
    """
    # NOTE(review): the script feeds this from "jawiki-country.json"; confirm
    # that the template name in that data is really spelled as below.
    pattern = re.compile(
        r"""
            ^\{\{Informations\ de\ base.*?$   # '{{Informations de base'Lignes commençant par
            (.*?)       #Capturer la cible, n'importe quel 0 caractère ou plus, non gourmand
            ^\}\}$      # '}}'Lignes se terminant par
        """,
        # The spaces in the template name are escaped above because
        # re.VERBOSE silently drops unescaped whitespace from the pattern.
        re.MULTILINE | re.DOTALL | re.VERBOSE,
    )

    # Only the first occurrence is needed, so search() is enough.
    match = pattern.search(string)
    if match is None:
        # Same exception type the former ``findall(...)[0]`` produced.
        raise IndexError("no '{{Informations de base' template found")
    return match.group(1)


def get_content(string: str) -> dict:
    r"""Split a template body into a ``{field name: raw value}`` mapping.

    A field starts on a line beginning with ``|``; name and value are
    separated by ``=`` (surrounding whitespace is skipped).  The value runs
    until the next line that starts with ``|``, until a newline that is
    followed by an empty line or the end of the text, or until the end of
    the text itself, so values may span several lines.

    Regular-expression notes
    (https://docs.python.org/3/library/re.html#regular-expression-syntax):
        - re.X (re.VERBOSE)     Allows comments and layout inside the pattern.
        - re.M (re.MULTILINE)   '^' and '$' also match at every line boundary.
        - re.S (re.DOTALL)      '.' also matches '\n'.
        - ^\|       A line that begins with '|'.
        - .+?       One or more characters, non-greedy (as few as possible).
        - (...)     Capturing group.
        - (?:...)   Non-capturing group.
        - (?=...)   Lookahead: must match next, but consumes nothing.

    Args:
        string: Template body, e.g.
            '|Lien de l'emblème national=（[[Emblème national britannique|emblème national]]）'

    Returns:
        Mapping of field name to raw value in order of appearance, e.g.
        {'Lien de l'emblème national': '（[[Emblème national britannique|emblème national]]）'}

    NOTE(review): for a field whose value is empty, the ``\s*`` after ``=``
    can consume the line break, so the value capture may swallow the next
    field line.  Confirm whether the data contains empty fields.
    """
    pattern = re.compile(
        r"""
            ^\|         # '|'Lignes commençant par
            (.+?)       #Cible de capture (nom du champ), un ou plusieurs caractères, non gourmand
            \s*         #0 ou plusieurs caractères vides
            =
            \s*         #0 ou plusieurs caractères vides
            (.+?)       #Capturer la cible (valeur), un ou plusieurs caractères, non gourmand
            (?:         #Démarrer un groupe qui n'est pas capturé
                (?=\n\|)    #nouvelle ligne+'|'Avant (anticipation affirmative)
                |           #Ou
                (?=\n$)     #nouvelle ligne+Avant la fin (anticipation affirmative)
                |
                (?=\Z)      # end of input: keeps a last field that has no trailing newline
            )           #Fin du groupe
            """,
        re.MULTILINE | re.DOTALL | re.VERBOSE,
    )
    # findall yields (name, value) tuples; dict() keeps their order.
    return dict(pattern.findall(string))


# Ordered (compiled pattern, replacement) rules used by remove_markup().
# They are compiled once at import time instead of on every call.
_MARKUP_RULES = (
    # ans26 -- emphasis markup:
    #   '''Grande Bretagne''' -> Grande Bretagne
    (
        re.compile(
            r"""
                (\'{2,5})   #2-5'(Début du balisage)
                (.*?)       #Un ou plusieurs caractères (chaîne de caractères cible)
                (\1)        #Identique à la première capture (fin du balisage)
            """,
            re.MULTILINE | re.VERBOSE,
        ),
        r"\2",
    ),
    # ans27 -- internal links and files; the text after the last '|' is kept:
    #   [[Londres]] -> Londres
    #   [[Premier ministre britannique|premier ministre]] -> premier ministre
    #   [[Fichier:Royal Coat of Arms of the United Kingdom.svg|85px|Emblème national britannique]]
    #       -> Emblème national britannique
    (
        re.compile(
            r"""
                \[\[        # '[['(Début du balisage)
                (?:         #Démarrer un groupe qui n'est pas capturé
                    [^|]*?  # '|'0 ou plus de caractères autres que, non gourmand
                    \|      # '|'
                )*?          #Le groupe est égal ou supérieur à 0, non gourmand
                ([^|]*?)    #Capturer la cible,'|'Autre que 0 caractère, non gourmand (chaîne de caractères à afficher)
                \]\]        # ']]'(Fin du balisage)
            """,
            re.MULTILINE | re.VERBOSE,
        ),
        r"\1",
    ),
    # ans28 -- templates; the text after the last '|' is kept:
    #   {{lang|fr|Dieu et mon droit}} -> Dieu et mon droit
    #   {{Lien temporaire|Feuille de Lindsay|en|Lindsay Hoyle}} -> Lindsay Hoyle
    (
        re.compile(
            r"""
                \{\{    # '{{'(Début du balisage)
                .*?     #0 caractère ou plus, non gourmand
                (?:
                    [^|]*?
                    \|
                )*?
                ([^|]*?)
                \}\}        # '}}'(Fin du balisage)
            """,
            re.MULTILINE | re.VERBOSE,
        ),
        r"\1",
    ),
    # Self-closing references:
    #   2316,2 milliards<ref name="imf-statistics-gdp" /> -> 2316,2 milliards
    # These must go before the paired rule below; otherwise "<ref.*?</ref>"
    # would start at a self-closing tag and delete everything up to the
    # closing tag of a later, unrelated reference.
    (re.compile(r"<ref[^>]*?/\s*>"), ""),
    # Paired references including their content:
    #   66 435 600<ref>...</ref> -> 66 435 600
    (re.compile(r"<ref.*?</ref>"), ""),
    # Any opening or closing ref tag that is still left over.
    (
        re.compile(
            r"""
                <           # '<'(Début du balisage)
                \/?         # '/'Apparaît 0 ou 1 (dans le cas de la balise de fin/Il y a)
                ref         # 'ref'
                [^>]*?      # '>'Autre que 0 caractère, non gourmand
                >           # '>'(Fin du balisage)
            """,
            re.MULTILINE | re.VERBOSE,
        ),
        "",
    ),
    # Line breaks become a single space so the joined text stays readable:
    #   Établi<br />(1707 Loi commune) -> Établi (1707 Loi commune)
    (
        re.compile(
            r"""
                <           # '<'(Début du balisage)
                \/?         # '/'Apparaît 0 ou 1 (dans le cas de la balise de fin/Il y a)
                br         # 'br'
                [^>]*?      # '>'Autre que 0 caractère, non gourmand
                >           # '>'(Fin du balisage)
            """,
            re.MULTILINE | re.VERBOSE,
        ),
        " ",
    ),
    # External links; only the label (if any) is kept:
    #   [http://www.example.org] -> ''
    #   [http://www.example.org label text] -> label text
    (
        re.compile(
            r"""
                \[https?:\/\/   # '[http://' or '[https://'
                [^\s\]]*        # the URL: everything up to whitespace or ']'
                (?:
                    \s          # separator between URL and label
                    ([^\]]*)    # label shown to the reader (captured)
                )?              # a bare URL has no label
                \]              # closing ']'
            """,
            re.VERBOSE,
        ),
        # An unmatched group expands to '' in re.sub, so bare links vanish.
        r"\1",
    ),
)


def remove_markup(target: str) -> str:
    """Strip MediaWiki markup from *target* and return the plain text.

    The rules in ``_MARKUP_RULES`` are applied one after another: emphasis
    quotes, internal links/files, ``{{...}}`` templates, ``<ref>`` tags
    (self-closing, paired, stray), ``<br>`` tags (replaced by one space) and
    finally external ``[http(s)://...]`` links.

    Args:
        target: Raw field value containing MediaWiki markup.

    Returns:
        The value with the markup listed above removed.
    """
    for pattern, replacement in _MARKUP_RULES:
        target = pattern.sub(replacement, target)
    return target


# ans20: pull the target article's text out of the dump
uk_text = get_uk_text("jawiki-country.json")  # See uk_text.txt

# ans25: isolate the basic-information template, then split it into fields
basic_info = get_basic_info(uk_text)
fields = get_content(basic_info)  # See 25_en_basic_info.json

# ans26, ans27, ans28: strip the markup from every field value and save it
result = {name: remove_markup(raw) for name, raw in fields.items()}  # See 26_no_markup.json
utils.save_json(result, "28_no_markup.json")


# Hand-written checks for remove_markup (ans26-ans28): every entry is an
# (input markup, expected plain text) pair.
# NOTE(review): two expectations look stale; confirm them before re-enabling
# the loop below:
#   - the "{{lang|fr|...}}" entry expects ASCII parentheses although its input
#     uses full-width （ ） characters, which remove_markup does not convert;
#   - the "Établi<br />" entry expects the <br /> tag to vanish, whereas
#     remove_markup substitutes a space for <br> tags.
data = [
    ("[[Londres]]", "Londres"),
    ("[[Premier ministre britannique|premier ministre]]", "premier ministre"),
    ("[[Fichier:Royal Coat of Arms of the United Kingdom.svg|85px|Emblème national britannique]]", "Emblème national britannique"),
    (
        "{{lang|fr|[[Dieu et mon droit]]}}<br />（[[français]]:[[Dieu et mon droit|Dieu et mes droits]]）",
        "Dieu et mon droit (français:Dieu et mes droits)",
    ),
    ("{{Lien temporaire|Feuille de Lindsay|en|Lindsay Hoyle}}", "Lindsay Hoyle"),
    ("Établi<br />(1707 Loi commune)", "Établi(1707 Loi commune)"),
    ('2316,2 milliards<ref name="imf-statistics-gdp" />', "2316,2 milliards"),
    (
        "66 435 600<ref>{{Cite web|url=https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates|title=Population estimates - Office for National Statistics|accessdate=2019-06-26|date=2019-06-26}}</ref>",
        "66 435 600",
    ),
]


# Manual comparison, left disabled: prints the expected value followed by the
# actual remove_markup output for every pair.
# for target, answer in data:
#     print(answer)
#     print(remove_markup(target))
#     print()