Introduction

A 112-line conversion program that takes the number of influenza reports by prefecture, as announced by the Ministry of Health, Labour and Welfare, and makes it available as CSV / JSON.

Installation

pip install "camelot-py[cv]"
pip install pandas
pip install requests
pip install beautifulsoup4
pip install matplotlib
pip install japanize-matplotlib

Program

import csv
import datetime
import pathlib
import re
from urllib.parse import urljoin

import camelot
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text

def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` and save it inside ``dir``, returning the saved path.

    The local file name is the last segment of the URL's *path* component,
    so a query string or fragment never ends up in the file name.

    Args:
        url: Address of the file to download.
        dir: Directory to save into; it is created (with parents) if missing.
            The name shadows the ``dir`` builtin but is kept so existing
            keyword callers keep working.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the script forever.

    Returns:
        ``pathlib.Path`` of the file that was written.

    Raises:
        requests.HTTPError: If the response has an error status
            (raised by ``raise_for_status``).
    """
    from urllib.parse import urlsplit

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # Take the name from the path part only; fall back to the previous
    # whole-URL behaviour when the path has no final segment.
    name = pathlib.PurePosixPath(urlsplit(url).path).name or pathlib.PurePath(url).name

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p

# Page on mhlw.go.jp whose PDF links are scraped further down; relative
# hrefs found on it are resolved against this address.
url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/kenkou_iryou/kenkou/kekkaku-kansenshou01/houdou_00008.html"

# Browser-like User-Agent sent with the page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# The timeout keeps the script from hanging forever if the server stops
# responding; requests applies no timeout unless one is given.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# One pandas Series is appended per processed PDF:
# d1 collects the "Number of reports" column, d2 the "Per fixed point" column.
d1 = []
d2 = []

# Compiled once, outside the loop. Raw strings stop "\d" and "\(" from being
# read as (invalid) string escapes; the pattern text itself is unchanged.
# NOTE(review): the literal words in both patterns ("Year", "Moon", "Day",
# "Sun ~") look machine-translated - presumably the page and PDF text use
# Japanese date markers. Confirm against the real text before relying on them.
release_date_re = re.compile(r"(\d{4})Year(\d{1,2})Moon(\d{1,2})Day")
week_range_re = re.compile(
    r"(\d{4})Year(\d{1,2})week\((\d{1,2})Moon(\d{1,2})Sun ~(\d{1,2})Moon(\d{1,2})Day\)"
)

# Walk the matching PDF links in the reverse of their order on the page.
for i in soup.select('ul.m-listLink > li > a[href$=".pdf"]')[::-1]:

    text = i.get_text(strip=True)

    #Press release date

    t = release_date_re.match(text)

    if t:
        year, month, day = map(int, t.groups())
        dt_date = datetime.date(year, month, day)
    else:
        # The link text carries no recognisable date: fall back to today.
        dt_date = datetime.date.today()

    #PDF file

    link = urljoin(url, i.get("href"))

    p = fetch_file(link)
    # pdfminer page numbers are zero-based, so [1] is the PDF's second page,
    # the same page camelot reads below with pages="2".
    s = extract_text(p, page_numbers=[1])

    m = week_range_re.search(s)

    if not m:
        # No reporting period found on that page: nothing is recorded for
        # this PDF.
        continue

    s_year, s_week, s_month, s_day, e_month, e_day = map(int, m.groups())

    dt_start = datetime.date(s_year, s_month, s_day)
    dt_end = datetime.date(s_year, e_month, e_day)

    # Only one year is captured; an end date earlier than the start date
    # means the period crosses into the following year.
    if dt_start > dt_end:
        dt_end = datetime.date(s_year + 1, e_month, e_day)

    # Parse the copy fetch_file() already saved instead of handing camelot
    # the URL, which would download the same PDF a second time.
    tables = camelot.read_pdf(
        str(p), pages="2", split_text=True, strip_text=" 　,\n", line_scale=40
    )

    # The first two rows of the extracted table are skipped.
    df_tmp = pd.DataFrame(
        tables[0].data[2:], columns=["Prefectures", "Number of reports", "Per fixed point"]
    ).set_index("Prefectures")

    # "-" cells become missing values so both columns can be parsed as numbers.
    df_tmp = df_tmp.mask(df_tmp == "-").astype(float)
    df_tmp["Number of reports"] = df_tmp["Number of reports"].astype("Int64")

    # Period metadata is stored as extra rows, so after the later transpose
    # it appears as ordinary columns next to the prefecture values.
    df_tmp.loc["Year"] = s_year
    df_tmp.loc["week"] = s_week
    df_tmp.loc["start date"] = dt_start
    df_tmp.loc["End date"] = dt_end

    # Each Series is named after the press-release date; that name becomes
    # the row label once the Series are concatenated and transposed.
    s1 = df_tmp["Number of reports"]
    s1.name = dt_date
    d1.append(s1)

    s2 = df_tmp["Per fixed point"]
    s2.name = dt_date
    d2.append(s2)

# pd.concat() rejects an empty list with a terse message; say what actually
# went wrong (no PDF yielded a table) using the same exception type.
if not d1:
    raise ValueError("No report tables were collected; nothing to write to influ_all.csv")

# One row per press-release date, one column per table row label.
df1 = pd.concat(d1, axis=1, sort=False).T.astype({"Year": int, "week": int})

df2 = pd.concat(d2, axis=1, sort=False).T.astype({"Year": int, "week": int})

# Suffix that join() appends to df2's overlapping column names. The output
# labels below are built from the same constant, so the two cannot drift
# apart (previously the suffix and the hand-written labels never matched,
# which left every per-fixed-point column empty).
per_point_suffix = " (per fixed point)"

df3 = df1.join(df2, rsuffix=per_point_suffix)

meta_columns = ["Year", "week", "start date", "End date"]

# NOTE(review): these labels must equal the row labels read from the PDF
# table (the "Prefectures" column). The English spellings look
# machine-translated - confirm them against the real table; a label that
# does not match produces an all-missing column.
region_columns = [
    "Hokkaido",
    "Aomori Prefecture",
    "Iwate Prefecture",
    "Miyagi Prefecture",
    "Akita",
    "Yamagata Prefecture",
    "Fukushima Prefecture",
    "Ibaraki Prefecture",
    "Tochigi Prefecture",
    "Gunma Prefecture",
    "Saitama",
    "Chiba",
    "Tokyo",
    "Kanagawa Prefecture",
    "Niigata Prefecture",
    "Toyama Prefecture",
    "Ishikawa Prefecture",
    "Fukui prefecture",
    "Yamanashi Prefecture",
    "Nagano Prefecture",
    "Gifu Prefecture",
    "Shizuoka Prefecture",
    "Aichi prefecture",
    "Mie Prefecture",
    "Shiga Prefecture",
    "Kyoto",
    "Osaka",
    "Hyogo prefecture",
    "Nara Prefecture",
    "Wakayama Prefecture",
    "Tottori prefecture",
    "Shimane Prefecture",
    "Okayama Prefecture",
    "Hiroshima Prefecture",
    "Yamaguchi Prefecture",
    "Tokushima Prefecture",
    "Kagawa Prefecture",
    "Ehime Prefecture",
    "Kochi Prefecture",
    "Fukuoka Prefecture",
    "Saga Prefecture",
    "Nagasaki Prefecture",
    "Kumamoto Prefecture",
    "Oita Prefecture",
    "Miyazaki prefecture",
    "Kagoshima prefecture",
    "Okinawa Prefecture",
    "Total number",
    "Synchronized last year (total)",
]

# Output order: metadata first, then each region's report count immediately
# followed by its per-fixed-point value.
df = df3.reindex(
    columns=meta_columns
    + [
        label
        for region in region_columns
        for label in (region, region + per_point_suffix)
    ]
)

# The index (press-release date) is not written; missing values appear as "-".
df.to_csv(
    "influ_all.csv", index=False, quoting=csv.QUOTE_NONNUMERIC, encoding="utf_8_sig", na_rep="-",
)

Visualization

# Imported only for its import-time effect (the name is never referenced);
# presumably it enables Japanese text in matplotlib figures - TODO confirm.
import japanize_matplotlib
import matplotlib.pyplot as plt

#resolution
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 200

# Use the period end date as the bar-chart category axis.
# Note: this modifies df1 in place, so re-running this section on the same
# df1 fails because "End date" is no longer a column.
df1.set_index("End date", inplace=True)

# One bar subplot per column in the label range, arranged on a 7x7 grid
# with a shared y axis; missing counts are drawn as 0.
df1.loc[:, "Hokkaido": "Okinawa Prefecture"].fillna(0).plot.bar(subplots=True, layout=(7, 7), figsize=(30, 30), sharey=True)

#Save graph
# The file name previously ended in a space ("influ.png "). matplotlib
# infers the output format from the extension, so the extension must be
# exactly "png".
plt.savefig("influ.png", dpi=200, bbox_inches="tight")
plt.show()

[PYTHON] Data wrangling: PDFs on the outbreak of influenza from the Ministry of Health, Labour and Welfare

Introduction

Installation

Program

Visualization