"""
24. Extracting file references
Extract all the media files referenced in the article.
"""
import json
import re
def get_uk_text(path, title="England"):
    """Return the ``text`` field of the first record whose title matches.

    The file at *path* is read line by line; every non-blank line is parsed
    on its own with ``json.loads`` and is expected to be an object holding
    ``"title"`` and ``"text"`` keys.

    Args:
        path: Path of the line-delimited JSON file to scan.
        title: Article title to look for. Defaults to ``"England"``.

    Returns:
        The value stored under ``"text"`` in the first matching record.

    Raises:
        ValueError: If no record in the file carries the requested title.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            record = json.loads(line)
            if record["title"] == title:
                return record["text"]
    # Falling out of the loop means the title was never seen; fail loudly
    # with a message that names both the title and the file.
    raise ValueError(f"no article titled {title!r} found in {path!r}")
# Load the article text once when the script runs; the path is resolved
# relative to the current working directory.
uk_text = get_uk_text("jawiki-country.json")
# See uk_text.txt (presumably a saved copy of this text -- TODO confirm).
# Answer to exercise 24 follows.
# Compiled once at import time instead of on every call.
# re.VERBOSE lets the pattern carry whitespace and '#' comments; whitespace
# inside a character class is still significant.
_MEDIA_FILE_RE = re.compile(
    r"""
    (?:File|ファイル)    # non-capturing namespace prefix: English or Japanese form
    :
    ([^|\]\n]+)         # capture the file name: stops at '|', ']' or end of line
    """,
    re.VERBOSE,
)


def get_file(string: str) -> list:
    """Return the names of all media files referenced in wiki markup.

    A reference is the namespace prefix ``File:`` (or its Japanese form
    ``ファイル:``) followed by the file name. The name ends at the first
    ``|`` (start of display options), at ``]`` (end of the link), or at the
    end of the line, so both ``[[File:x.jpg|150px]]`` and the option-less
    ``[[File:x.jpg]]`` are recognised, as are bare ``File:x.jpg|caption``
    lines that are not wrapped in brackets.

    Args:
        string: Wiki markup to scan.

    Returns:
        A list of file names in order of appearance; empty when the text
        contains no file reference.

    Example:
        >>> get_file('[[File:2019 Greenwich Peninsula & Canary Wharf.jpg|150px]]')
        ['2019 Greenwich Peninsula & Canary Wharf.jpg']
    """
    return _MEDIA_FILE_RE.findall(string)
# Extract the media file names from the article and print one per line.
files = get_file(uk_text)
for media_name in files:
    print(media_name)
# Sample output:
#   Royal Coat of Arms of the United Kingdom.svg
#   Descriptio Prime Tabulae Europae.jpg
#   Lenepveu, Jeanne d'Arc au siège d'Orléans.jpg
# Recommended Posts  (web-page footer picked up with the listing; not part of the program)