[Python] Data wrangling the influenza outbreak PDF published by the Ministry of Health, Labour and Welfare

Introduction

The Ministry of Health, Labour and Welfare publishes its weekly influenza outbreak report as PDF files linked from a press-release page. This article scrapes that page, downloads each weekly PDF, extracts the per-prefecture table with Camelot, consolidates everything into a single CSV, and finally plots the number of reports per prefecture.

Installation

pip install "camelot-py[cv]"
pip install pdfminer.six
pip install pandas
pip install requests
pip install beautifulsoup4
pip install matplotlib
pip install japanize-matplotlib

Camelot is distributed on PyPI as camelot-py (a plain "pip install camelot" installs an unrelated package), and the lattice mode used below also requires Ghostscript to be installed on the system. pdfminer.six provides the extract_text function used to read the PDF text.
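If the installation worked, the quick check below should print a version for each package (a minimal sketch; __version__ is the attribute these packages normally expose):

# Minimal sanity check: import each package and print its version
import camelot
import pandas as pd
import pdfminer
import requests
import bs4
import matplotlib

for name, mod in [
    ("camelot", camelot),
    ("pandas", pd),
    ("pdfminer.six", pdfminer),
    ("requests", requests),
    ("beautifulsoup4", bs4),
    ("matplotlib", matplotlib),
]:
    print(name, mod.__version__)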

Program

import csv
import datetime
import pathlib
import re
from urllib.parse import urljoin

import camelot
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text

def fetch_file(url, dir="."):
    """Download url into dir and return the local file path."""

    r = requests.get(url)
    r.raise_for_status()

    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p

url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/kenkou_iryou/kenkou/kekkaku-kansenshou01/houdou_00008.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

r = requests.get(url, headers=headers)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

d1 = []
d2 = []

# Iterate over the PDF links on the page in reverse order
for i in soup.select('ul.m-listLink > li > a[href$=".pdf"]')[::-1]:

    text = i.get_text(strip=True)

    # Press release date: the link text begins with a Japanese date (YYYY年M月D日)
    t = re.match(r"(\d{4})年(\d{1,2})月(\d{1,2})日", text)

    if t:
        year, month, day = map(int, t.groups())
        dt_date = datetime.date(year, month, day)
    else:
        dt_date = datetime.date.today()

    # PDF file: download it and read the text of its second page (page_numbers is zero-based)

    link = urljoin(url, i.get("href"))

    p = fetch_file(link)
    s = extract_text(p, page_numbers=[1])

    # The page text contains the reporting year, week number and date range
    m = re.search(r"(\d{4})年(\d{1,2})週\((\d{1,2})月(\d{1,2})日～(\d{1,2})月(\d{1,2})日\)", s)

    if m:
        s_year, s_week, s_month, s_day, e_month, e_day = map(int, m.groups())

        dt_start = datetime.date(s_year, s_month, s_day)
        dt_end = datetime.date(s_year, e_month, e_day)

        if dt_start > dt_end:
            dt_end = datetime.date(s_year + 1, e_month, e_day)

        # Extract the per-prefecture table from page 2 of the already-downloaded PDF
        tables = camelot.read_pdf(
            str(p), pages="2", split_text=True, strip_text="  ,\n", line_scale=40
        )

        # Skip the first two rows of the extracted table (column headers)
        df_tmp = pd.DataFrame(
            tables[0].data[2:], columns=["Prefectures", "Number of reports", "Per fixed point"]
        ).set_index("Prefectures")
        
        # "-" marks missing values; convert counts to nullable integers
        df_tmp = df_tmp.mask(df_tmp == "-").astype(float)
        df_tmp["Number of reports"] = df_tmp["Number of reports"].astype("Int64")

        df_tmp.loc["Year"] = s_year
        df_tmp.loc["week"] = s_week
        df_tmp.loc["start date"] = dt_start
        df_tmp.loc["End date"] = dt_end

        s1 = df_tmp["Number of reports"]
        s1.name = dt_date
        d1.append(s1)

        s2 = df_tmp["Per fixed point"]
        s2.name = dt_date
        d2.append(s2)

df1 = pd.concat(d1, axis=1, sort=False).T.astype({"Year": int, "week": int})

df2 = pd.concat(d2, axis=1, sort=False).T.astype({"Year": int, "week": int})

df3 = df1.join(df2, rsuffix=" (per fixed point)")

df = df3.reindex(
    columns=[
        "Year",
        "week",
        "start date",
        "End date",
        "Hokkaido",
        "Hokkaido (per fixed point)",
        "Aomori Prefecture",
        "Aomori Prefecture (per fixed point)",
        "Iwate Prefecture",
        "Iwate Prefecture (per fixed point)",
        "Miyagi Prefecture",
        "Miyagi Prefecture (per fixed point)",
        "Akita Prefecture",
        "Akita Prefecture (per fixed point)",
        "Yamagata Prefecture",
        "Yamagata Prefecture (per fixed point)",
        "Fukushima Prefecture",
        "Fukushima Prefecture (per fixed point)",
        "Ibaraki Prefecture",
        "Ibaraki Prefecture (per fixed point)",
        "Tochigi Prefecture",
        "Tochigi Prefecture (per fixed point)",
        "Gunma Prefecture",
        "Gunma Prefecture (per fixed point)",
        "Saitama Prefecture",
        "Saitama Prefecture (per fixed point)",
        "Chiba Prefecture",
        "Chiba Prefecture (per fixed point)",
        "Tokyo",
        "Tokyo (per fixed point)",
        "Kanagawa Prefecture",
        "Kanagawa Prefecture (per fixed point)",
        "Niigata Prefecture",
        "Niigata Prefecture (per fixed point)",
        "Toyama Prefecture",
        "Toyama Prefecture (per fixed point)",
        "Ishikawa Prefecture",
        "Ishikawa Prefecture (per fixed point)",
        "Fukui Prefecture",
        "Fukui Prefecture (per fixed point)",
        "Yamanashi Prefecture",
        "Yamanashi Prefecture (per fixed point)",
        "Nagano Prefecture",
        "Nagano Prefecture (per fixed point)",
        "Gifu Prefecture",
        "Gifu Prefecture (per fixed point)",
        "Shizuoka Prefecture",
        "Shizuoka Prefecture (per fixed point)",
        "Aichi Prefecture",
        "Aichi Prefecture (per fixed point)",
        "Mie Prefecture",
        "Mie Prefecture (per fixed point)",
        "Shiga Prefecture",
        "Shiga Prefecture (per fixed point)",
        "Kyoto",
        "Kyoto (per fixed point)",
        "Osaka",
        "Osaka (per fixed point)",
        "Hyogo Prefecture",
        "Hyogo Prefecture (per fixed point)",
        "Nara Prefecture",
        "Nara Prefecture (per fixed point)",
        "Wakayama Prefecture",
        "Wakayama Prefecture (per fixed point)",
        "Tottori Prefecture",
        "Tottori Prefecture (per fixed point)",
        "Shimane Prefecture",
        "Shimane Prefecture (per fixed point)",
        "Okayama Prefecture",
        "Okayama Prefecture (per fixed point)",
        "Hiroshima Prefecture",
        "Hiroshima Prefecture (per fixed point)",
        "Yamaguchi Prefecture",
        "Yamaguchi Prefecture (per fixed point)",
        "Tokushima Prefecture",
        "Tokushima Prefecture (per fixed point)",
        "Kagawa Prefecture",
        "Kagawa Prefecture (per fixed point)",
        "Ehime Prefecture",
        "Ehime Prefecture (per fixed point)",
        "Kochi Prefecture",
        "Kochi Prefecture (per fixed point)",
        "Fukuoka Prefecture",
        "Fukuoka Prefecture (per fixed point)",
        "Saga Prefecture",
        "Saga Prefecture (per fixed point)",
        "Nagasaki Prefecture",
        "Nagasaki Prefecture (per fixed point)",
        "Kumamoto Prefecture",
        "Kumamoto Prefecture (per fixed point)",
        "Oita Prefecture",
        "Oita Prefecture (per fixed point)",
        "Miyazaki Prefecture",
        "Miyazaki Prefecture (per fixed point)",
        "Kagoshima Prefecture",
        "Kagoshima Prefecture (per fixed point)",
        "Okinawa Prefecture",
        "Okinawa Prefecture (per fixed point)",
        "Total number",
        "Total number (per fixed point)",
        "Same period last year (total)",
        "Same period last year (total) (per fixed point)",
    ]
)

df.to_csv(
    "influ_all.csv", index=False, quoting=csv.QUOTE_NONNUMERIC, encoding="utf_8_sig", na_rep="-",
)
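The CSV can later be loaded again without re-downloading every PDF; a minimal sketch, assuming the influ_all.csv and column names produced above:

import pandas as pd

# "-" was written for missing values, so treat it as NaN when reading back
df = pd.read_csv(
    "influ_all.csv",
    na_values="-",
    parse_dates=["start date", "End date"],
    encoding="utf_8_sig",
)
print(df.head())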

Visualization

import japanize_matplotlib  # registers a Japanese font so prefecture labels render correctly
import matplotlib.pyplot as plt

# Resolution
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 200

df1.set_index("End date", inplace=True)

df1.loc[:, "Hokkaido": "Okinawa Prefecture"].fillna(0).plot.bar(subplots=True, layout=(7, 7), figsize=(30, 30), sharey=True)

# Save the graph
plt.savefig("influ.png", dpi=200, bbox_inches="tight")
plt.show()

influ.png (per-prefecture bar charts of the weekly number of reports)
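Instead of the per-prefecture grid, the nationwide trend can also be drawn from df2, which holds the per-fixed-point values; a minimal sketch, assuming the summary row extracted from the PDF is labelled "Total number" as in the column list above:

# Nationwide per-fixed-point trend over the reporting weeks
# ("Total number" is the assumed label of the nationwide summary column)
s_total = df2.set_index("End date")["Total number"].astype(float)
ax = s_total.plot(marker="o", figsize=(8, 4))
ax.set_ylabel("Per fixed point")
plt.savefig("influ_total.png", dpi=200, bbox_inches="tight")
plt.show()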
