Introduction
Installation
pip install "camelot-py[cv]"
pip install pandas
pip install requests
pip install beautifulsoup4
pip install matplotlib
pip install japanize-matplotlib
Program
import csv
import datetime
import pathlib
import re
from urllib.parse import urljoin
import camelot
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text
def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` and save it inside ``dir``.

    The file name is the last segment of the URL *path*, so a query string
    or fragment never becomes part of the name on disk.

    Args:
        url: Address of the file to download.
        dir: Directory to save into; it is created if it does not exist.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        ``pathlib.Path`` of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Local import: the module itself only imports ``urljoin``.
    from urllib.parse import urlsplit

    # Without a timeout an unresponsive server would block the script forever.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # Fall back to the whole URL when the path has no usable last segment.
    name = pathlib.PurePath(urlsplit(url).path).name or pathlib.PurePath(url).name

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)

    return p
# Page whose PDF links are scraped further down.
url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/kenkou_iryou/kenkou/kekkaku-kansenshou01/houdou_00008.html"

# Browser-like User-Agent sent with the request for the page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")
# One Series per processed PDF, named by its press-release date:
# d1 collects the "Number of reports" column, d2 the "Per fixed point" column.
d1 = []
d2 = []

# NOTE(review): the literal words inside both patterns ("Year", "Moon", "Day",
# "week", "Sun ~") look machine-translated - confirm they match the text that
# is really on the page and in the PDFs.
# Raw strings keep "\d" and "\(" from being read as invalid string escapes;
# the patterns are compiled once instead of on every iteration.
release_date_re = re.compile(r"(\d{4})Year(\d{1,2})Moon(\d{1,2})Day")
week_range_re = re.compile(
    r"(\d{4})Year(\d{1,2})week\((\d{1,2})Moon(\d{1,2})Sun ~(\d{1,2})Moon(\d{1,2})Day\)"
)

# Walk the PDF links in the reverse of the order they appear on the page.
for i in soup.select('ul.m-listLink > li > a[href$=".pdf"]')[::-1]:
    text = i.get_text(strip=True)

    # Press release date (falls back to today when the link text has no date)
    t = release_date_re.match(text)
    if t:
        year, month, day = map(int, t.groups())
        dt_date = datetime.date(year, month, day)
    else:
        dt_date = datetime.date.today()

    # PDF file
    link = urljoin(url, i.get("href"))
    p = fetch_file(link)

    # pdfminer page numbers are zero-based, so this reads the second page -
    # the same page camelot parses below with pages="2".
    s = extract_text(p, page_numbers=[1])

    m = week_range_re.search(s)
    if not m:
        # Without the reporting period the year/week/date rows cannot be
        # filled in (and would otherwise be undefined or left over from the
        # previous PDF), so this PDF is skipped.
        print(f"Skipping {link}: reporting period not found on the extracted page")
        continue

    s_year, s_week, s_month, s_day, e_month, e_day = map(int, m.groups())

    dt_start = datetime.date(s_year, s_month, s_day)
    dt_end = datetime.date(s_year, e_month, e_day)

    # An end date before the start date means the period runs into the
    # following year.
    if dt_start > dt_end:
        dt_end = datetime.date(s_year + 1, e_month, e_day)

    # Parse the copy fetch_file already saved instead of downloading the
    # same PDF a second time.
    tables = camelot.read_pdf(
        str(p), pages="2", split_text=True, strip_text=" ,\n", line_scale=40
    )

    # The first two extracted rows are dropped (presumably header rows).
    df_tmp = pd.DataFrame(
        tables[0].data[2:], columns=["Prefectures", "Number of reports", "Per fixed point"]
    ).set_index("Prefectures")

    # Cells holding "-" become NaN so that both columns can be numeric.
    df_tmp = df_tmp.mask(df_tmp == "-").astype(float)
    df_tmp["Number of reports"] = df_tmp["Number of reports"].astype("Int64")

    # Period information is stored as extra rows in both columns.
    df_tmp.loc["Year"] = s_year
    df_tmp.loc["week"] = s_week
    df_tmp.loc["start date"] = dt_start
    df_tmp.loc["End date"] = dt_end

    s1 = df_tmp["Number of reports"]
    s1.name = dt_date
    d1.append(s1)

    s2 = df_tmp["Per fixed point"]
    s2.name = dt_date
    d2.append(s2)
# Suffix that join() appends to df2's columns; every one of them also exists
# in df1, because both come from the same per-PDF table.
per_point_suffix = "(Per fixed point)"

# Rows added for the reporting period rather than taken from the PDF table.
period_columns = ["Year", "week", "start date", "End date"]

# Transpose so that each press-release date is a row and each table row a column.
df1 = pd.concat(d1, axis=1, sort=False).T.astype({"Year": int, "week": int})
df2 = pd.concat(d2, axis=1, sort=False).T.astype({"Year": int, "week": int})

df3 = df1.join(df2, rsuffix=per_point_suffix)

# Output order: the period columns first, then every table column directly
# followed by its per-fixed-point counterpart.  The names are derived from the
# data and from the suffix used in join(), so they always exist in df3; a
# hand-written list that differs by a space or a capital letter would make
# reindex() return all-NaN columns.
value_columns = [c for c in df1.columns if c not in period_columns]
ordered_columns = period_columns + [
    name for c in value_columns for name in (c, f"{c}{per_point_suffix}")
]

df = df3.reindex(columns=ordered_columns)

# utf_8_sig writes a BOM; missing values are written as "-".
df.to_csv(
    "influ_all.csv", index=False, quoting=csv.QUOTE_NONNUMERIC, encoding="utf_8_sig", na_rep="-",
)
Visualization
import japanize_matplotlib
import matplotlib.pyplot as plt
# Resolution
import matplotlib as mpl

mpl.rcParams["figure.dpi"] = 200

# Use the end date of each reporting period as the x-axis labels.
df1.set_index("End date", inplace=True)

# One bar chart per column from "Hokkaido" to "Okinawa Prefecture", laid out
# on a 7x7 grid with a shared y-axis; missing values are drawn as 0.
# NOTE(review): these two labels must match the row labels read from the PDF
# table - confirm.
df1.loc[:, "Hokkaido":"Okinawa Prefecture"].fillna(0).plot.bar(
    subplots=True, layout=(7, 7), figsize=(30, 30), sharey=True
)

# Save graph
# The file name must not end in a space: the extension is what matplotlib
# uses to choose the output format.
plt.savefig("influ.png", dpi=200, bbox_inches="tight")
plt.show()