"""
28. MediaWiki markup removal
In addition to the processing of problem 27, remove as much MediaWiki markup as possible from the template values and tidy up the basic country information.
"""
import json
import re
import utils
def get_uk_text(path, title="England"):
    """Return the ``text`` field of one article from a JSON Lines file.

    Every non-blank line of the file is parsed as a JSON object; the first
    object whose ``"title"`` equals *title* supplies the result.

    Args:
        path: Path of the JSON Lines file to scan.
        title: Article title to look for. Defaults to ``"England"``.

    Returns:
        The value stored under ``"text"`` in the matching object.

    Raises:
        ValueError: If no object in the file has the requested title.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            article = json.loads(line)
            if article["title"] == title:
                return article["text"]
    raise ValueError("no article titled {!r} found in {!r}".format(title, path))
def get_basic_info(string: str) -> str:
    """Extract the body of the ``{{Basic information ...}}`` template.

    Args:
        string: Wiki text of an article.

    Returns:
        Everything between the end of the template's opening line and the
        line consisting solely of ``}}``. The surrounding newlines are kept,
        so the result normally starts and ends with a newline.

    Raises:
        IndexError: If *string* contains no such template.
    """
    pattern = re.compile(
        r"""
        ^\{\{Basic[ ]information.*?$  # opening line; the space sits in a character
                                      # class because VERBOSE drops bare whitespace
        (.*?)                         # capture: template body, non-greedy
        ^\}\}$                        # a line consisting solely of the closing braces
        """,
        re.MULTILINE | re.DOTALL | re.VERBOSE,
    )
    match = pattern.search(string)
    if match is None:
        # Same exception type as indexing an empty findall() result, but with
        # a message that says what was missing.
        raise IndexError("no '{{Basic information' template found")
    return match.group(1)
def get_content(string: str) -> dict:
    r"""Parse ``|name = value`` template fields into a dict.

    A field starts on a line beginning with ``|``. Its value runs, possibly
    across several lines, until just before the next line that begins with
    ``|``, a newline that is followed by an end of line, or the end of the
    string.

    Regex features used (see
    https://docs.python.org/3/library/re.html#regular-expression-syntax):
        - re.VERBOSE: whitespace and ``#`` comments in the pattern are ignored.
        - re.MULTILINE: ``^`` and ``$`` also match at every line boundary.
        - re.DOTALL: ``.`` also matches ``\n``, so a value may span lines.
        - ``+?``: non-greedy repetition, matching as few characters as possible.
        - ``(?=...)``: lookahead; asserts what follows without consuming it.

    Args:
        string: Body of a template, one or more ``|name = value`` fields.

    Returns:
        Mapping of field name to raw value, in order of appearance. When a
        name occurs more than once the last value wins.

    Example:
        '|National emblem link=([[British coat of arms|National emblem]])'
        -> {'National emblem link': '([[British coat of arms|National emblem]])'}
    """
    pattern = re.compile(
        r"""
        ^\|        # a line starting with '|'
        (.+?)      # capture: field name, non-greedy
        \s*=\s*    # '=' with optional surrounding whitespace
        (.+?)      # capture: value, non-greedy
        (?=        # stop, without consuming, right before ...
            \n\|   #   a newline that starts the next field,
          | \n$    #   a newline followed by an end of line,
          | \Z     #   or the very end of the string (no trailing newline)
        )
        """,
        re.MULTILINE | re.DOTALL | re.VERBOSE,
    )
    # findall() yields (name, value) pairs; dicts keep insertion order (3.7+).
    return dict(pattern.findall(string))
def _external_link_label(match):
    """Return the label of an external link, or '' when it has none."""
    return match.group(1) or ""


# Substitution rules applied, in this order, by remove_markup().
# Each entry is (compiled pattern, replacement).
_MARKUP_RULES = (
    # ans26: emphasis quotes.
    #   '''Great britain''' -> Great britain
    (
        re.compile(
            r"""
            ('{2,5})   # 2-5 apostrophes (start of markup)
            (.*?)      # emphasised text, non-greedy
            \1         # the same run of apostrophes (end of markup)
            """,
            re.VERBOSE,
        ),
        r"\2",
    ),
    # ans27: internal links and files; keep only the last '|'-separated part.
    #   [[London]] -> London
    #   [[British Prime Minister|Prime Minister]] -> Prime Minister
    #   [[File:Royal Coat of Arms of the United Kingdom.svg|85px|British coat of arms]]
    #       -> British coat of arms
    (
        re.compile(
            r"""
            \[\[          # start of markup
            (?:           # zero or more leading 'part|' segments, non-greedy
                [^|]*?
                \|
            )*?
            ([^|]*?)      # capture: the text to display
            \]\]          # end of markup
            """,
            re.VERBOSE,
        ),
        r"\1",
    ),
    # ans28: templates; keep only the last '|'-separated part.
    #   {{lang|fr|Dieu et mon droit}} -> Dieu et mon droit
    #   {{Temporary link|Lindsay Foil|en|Lindsay Hoyle}} -> Lindsay Hoyle
    (
        re.compile(
            r"""
            \{\{          # start of markup
            .*?
            (?:           # zero or more leading 'part|' segments, non-greedy
                [^|]*?
                \|
            )*?
            ([^|]*?)      # capture: the last part
            \}\}          # end of markup
            """,
            re.VERBOSE,
        ),
        r"\1",
    ),
    # Self-closing references.  These must go before the paired rule below;
    # otherwise '<ref name="a" /> text <ref>cite</ref>' would be matched from
    # the first '<ref' to the '</ref>' and the text in between would be lost.
    #   2,316.2 billion<ref name="imf-statistics-gdp" /> -> 2,316.2 billion
    (re.compile(r"<ref[^>]*?/>"), ""),
    # Paired references, including their content.
    #   66,435,600<ref>{{Cite web|...}}</ref> -> 66,435,600
    (re.compile(r"<ref.*?</ref>"), ""),
    # Any ref tag still left over (e.g. a pair split across lines).
    (re.compile(r"</?ref[^>]*?>"), ""),
    # Line breaks become a single space so adjoining text stays readable.
    #   Established<br />(1707 Act) -> Established (1707 Act)
    (re.compile(r"</?br[^>]*?>"), " "),
    # External links; keep the label, drop the URL.
    #   [http://www.example.org] -> ''
    #   [http://www.example.org display character] -> display character
    (
        re.compile(
            r"""
            \[https?://     # start of markup, http or https
            [^\s\]]*        # the URL: up to the first blank or ']'
            (?:
                \s          # one blank separating URL and label
                ([^\]]*?)   # capture: the label
            )?              # the label is optional
            \]              # end of markup
            """,
            re.VERBOSE,
        ),
        _external_link_label,
    ),
)


def remove_markup(target: str) -> str:
    """Strip MediaWiki markup from *target* and return the plain text.

    The rules in ``_MARKUP_RULES`` are applied one after another: emphasis
    quotes, internal links/files, templates, ``<ref>`` tags (self-closing,
    paired, leftover), ``<br>`` tags (replaced by a space) and external links.

    Args:
        target: A raw template value.

    Returns:
        *target* with every occurrence of the markup above replaced.
    """
    for pattern, replacement in _MARKUP_RULES:
        target = pattern.sub(replacement, target)
    return target
# ans20: load the wiki text of the article (see uk_text.txt)
uk_text = get_uk_text("jawiki-country.json")
# ans25: pull out the basic-information template and split it into fields
# (see 25_en_basic_info.json)
basic_info = get_basic_info(uk_text)
fields = get_content(basic_info)
# ans26, ans27, ans28: strip the markup from every field value
# (see 26_no_markup.json)
result = {k: remove_markup(v) for k, v in fields.items()}
utils.save_json(result, "28_no_markup.json")
# Sample (input, expected output) pairs for remove_markup (ans27 and ans28)
data = [
    ("[[London]]", "London"),
    ("[[British Prime Minister|Prime Minister]]", "Prime Minister"),
    ("[[File:Royal Coat of Arms of the United Kingdom.svg|85px|British coat of arms]]", "British coat of arms"),
    (
        "{{lang|fr|[[Dieu et mon droit]]}}<br />([[French]]:[[Dieu et mon droit|God and my rights]])",
        "Dieu et mon droit (French:God and my rights)",
    ),
    ("{{Temporary link|Lindsay Foil|en|Lindsay Hoyle}}", "Lindsay Hoyle"),
    # remove_markup replaces <br /> with a single space, so a space is expected
    ("Established<br />(1707 Act)", "Established (1707 Act)"),
    ('2,316.2 billion<ref name="imf-statistics-gdp" />', "2,316.2 billion"),
    (
        "66,435,600<ref>{{Cite web|url=https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates|title=Population estimates - Office for National Statistics|accessdate=2019-06-26|date=2019-06-26}}</ref>",
        "66,435,600",
    ),
]
# Uncomment to print each expected value next to the actual result:
# for target, answer in data:
#     print(answer)
#     print(remove_markup(target))
#     print()
# Recommended Posts  (web-page residue; not part of the script)