[PYTHON] Data wrangling with pdfplumber: the Ministry of Health, Labor and Welfare's PDF on the influenza outbreak situation

I used pdfplumber to wrangle the data in the Ministry of Health, Labor and Welfare's PDF on the influenza outbreak situation.

This is easy because you can check the position of each character with `chars` and then specify the bounding box to crop.

with pdfplumber.open("data.pdf") as pdf:

    # pages is indexed from zero, so index 1 is the second page
    page = pdf.pages[1]

    # Look at page.chars to find where each character sits on the page
    page.chars

    # Crop a full-width horizontal band (y from 90 to 105) and read its text
    band = (0, 90, page.width, 105)
    cropped = page.within_bbox(band)
    text = cropped.extract_text()

Program

import csv
import datetime
import pathlib
import re
from urllib.parse import urljoin

import pdfplumber
import pandas as pd
import requests
from bs4 import BeautifulSoup

def fetch_file(url, dir=".", timeout=30):
    """Download *url* and save the response body under *dir*.

    Args:
        url: Address of the file to download.
        dir: Directory the file is written to; created if it does not exist.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.

    Returns:
        ``pathlib.Path`` of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
    """
    from urllib.parse import urlsplit

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # Take the file name from the URL *path* only, so a query string or
    # fragment never leaks into the name.  Fall back to the previous
    # behaviour when the path has no final segment.
    name = pathlib.PurePosixPath(urlsplit(url).path).name
    if not name:
        name = pathlib.PurePath(url).name

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p

# Page whose link list is scanned for PDF links further down in the script.
url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/kenkou_iryou/kenkou/kekkaku-kansenshou01/houdou_00008.html"

# Browser-like User-Agent sent with the page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# A timeout makes a stalled connection fail instead of hanging indefinitely.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# One Series per PDF: d1 collects the report counts, d2 the per-fixed-point
# values.  Each Series is named after the press release date.
d1 = []
d2 = []

# Compiled once, outside the loop.  Raw strings keep "\d" and "\(" as regex
# escapes instead of (invalid) string escapes.
# NOTE(review): the literal tokens "Year" / "Moon" / "Day" / "week" / "Sun"
# look machine-translated; they must match the wording actually used in the
# link text and in the PDF header -- confirm against a real document.
release_date_re = re.compile(r"(\d{4})Year(\d{1,2})Moon(\d{1,2})Day")
week_range_re = re.compile(
    r"(\d{4})Year(\d{1,2})week\((\d{1,2})Moon(\d{1,2})Sun ~(\d{1,2})Moon(\d{1,2})Day\)"
)

# Walk the PDF links in reverse document order.
for anchor in reversed(soup.select('ul.m-listLink > li > a[href$=".pdf"]')):

    text = anchor.get_text(strip=True)

    # Press release date, parsed from the start of the link text; today's
    # date is used when the text does not match.
    released = release_date_re.match(text)

    if released:
        year, month, day = map(int, released.groups())
        dt_date = datetime.date(year, month, day)
    else:
        dt_date = datetime.date.today()

    # PDF file
    link = urljoin(url, anchor.get("href"))

    pdf_path = fetch_file(link)

    with pdfplumber.open(pdf_path) as pdf:

        # Everything below is read from the second page; skip documents
        # that do not have one instead of failing with IndexError.
        if len(pdf.pages) < 2:
            continue

        page = pdf.pages[1]

        # Get text with crop
        week_crop = page.within_bbox((0, 90, page.width, 105))
        # Guard against a None result for a region without text
        # (NOTE(review): depends on the pdfplumber version -- confirm).
        header_text = week_crop.extract_text() or ""

        period = week_range_re.search(header_text)

        # PDFs whose header does not carry the week range are skipped.
        if not period:
            continue

        s_year, s_week, s_month, s_day, e_month, e_day = map(int, period.groups())

        dt_start = datetime.date(s_year, s_month, s_day)
        dt_end = datetime.date(s_year, e_month, e_day)

        if dt_start > dt_end:
            # The week straddles New Year.  s_year is the year the week
            # number is counted in, so week 1 *starts* in the previous
            # year, while a last week of the year *ends* in the next one.
            if s_week == 1:
                dt_start = datetime.date(s_year - 1, s_month, s_day)
            else:
                dt_end = datetime.date(s_year + 1, e_month, e_day)

        table = page.extract_table()

        # No table detected on the page: nothing to record for this PDF.
        if not table:
            continue

        # The first two extracted rows are dropped (presumably header rows
        # -- confirm against the PDF layout).
        df_tmp = pd.DataFrame(
            table[2:], columns=["Prefectures", "Number of reports", "Per fixed point"]
        ).set_index("Prefectures")

        # Remove every whitespace character from the row labels.
        df_tmp.index = df_tmp.index.map(lambda label: "".join(label.split()))

        # "-" cells become missing values before the numeric conversion.
        df_tmp = df_tmp.mask(df_tmp == "-")
        df_tmp["Number of reports"] = df_tmp["Number of reports"].str.replace(",", "").astype(float).astype("Int64")
        df_tmp["Per fixed point"] = df_tmp["Per fixed point"].astype(float)

        # Append the period metadata as extra rows so that it travels with
        # both columns.
        df_tmp.loc["Year"] = s_year
        df_tmp.loc["week"] = s_week
        df_tmp.loc["start date"] = dt_start
        df_tmp.loc["End date"] = dt_end

        s1 = df_tmp["Number of reports"]
        s1.name = dt_date
        d1.append(s1)

        s2 = df_tmp["Per fixed point"]
        s2.name = dt_date
        d2.append(s2)

# One row per press release (the Series names become the index after the
# transpose); the period metadata columns are cast back to plain integers.
df1 = pd.concat(d1, axis=1, sort=False).T.astype({"Year": int, "week": int})
df2 = pd.concat(d2, axis=1, sort=False).T.astype({"Year": int, "week": int})

# Single source of truth for the suffix that marks per-fixed-point columns.
# The join and the column order below must use the very same string;
# otherwise reindex() finds no matching label and the column comes out empty.
point_suffix = " (per fixed point)"

# Every column name exists in both frames, so each df2 column receives the suffix.
df3 = df1.join(df2, rsuffix=point_suffix)

# Base labels in output order.
# NOTE(review): these must equal the row labels extracted from the PDF
# (after whitespace removal) -- confirm against real data.
region_labels = [
    "Hokkaido",
    "Aomori Prefecture",
    "Iwate Prefecture",
    "Miyagi Prefecture",
    "Akita",
    "Yamagata Prefecture",
    "Fukushima Prefecture",
    "Ibaraki Prefecture",
    "Tochigi Prefecture",
    "Gunma Prefecture",
    "Saitama",
    "Chiba",
    "Tokyo",
    "Kanagawa Prefecture",
    "Niigata Prefecture",
    "Toyama Prefecture",
    "Ishikawa Prefecture",
    "Fukui prefecture",
    "Yamanashi Prefecture",
    "Nagano Prefecture",
    "Gifu Prefecture",
    "Shizuoka Prefecture",
    "Aichi prefecture",
    "Mie Prefecture",
    "Shiga Prefecture",
    "Kyoto",
    "Osaka",
    "Hyogo prefecture",
    "Nara Prefecture",
    "Wakayama Prefecture",
    "Tottori prefecture",
    "Shimane Prefecture",
    "Okayama Prefecture",
    "Hiroshima Prefecture",
    "Yamaguchi Prefecture",
    "Tokushima Prefecture",
    "Kagawa Prefecture",
    "Ehime Prefecture",
    "Kochi Prefecture",
    "Fukuoka Prefecture",
    "Saga Prefecture",
    "Nagasaki Prefecture",
    "Kumamoto Prefecture",
    "Oita Prefecture",
    "Miyazaki prefecture",
    "Kagoshima prefecture",
    "Okinawa Prefecture",
    "Total number",
    "Synchronized last year (total)",
]

# Period metadata first, then each count column followed by its
# per-fixed-point partner.
ordered_columns = ["Year", "week", "start date", "End date"]
for label in region_labels:
    ordered_columns.append(label)
    ordered_columns.append(label + point_suffix)

df = df3.reindex(columns=ordered_columns)

# Options shared by all three exports: no index column, every non-numeric
# field quoted, UTF-8 with a byte-order mark.
csv_options = {
    "index": False,
    "quoting": csv.QUOTE_NONNUMERIC,
    "encoding": "utf_8_sig",
}

# (frame, output file, extra options); only the combined table writes "-"
# in place of missing values.
exports = (
    (df1, "influ_count.csv", {}),
    (df2, "influ_point.csv", {}),
    (df, "influ_all.csv", {"na_rep": "-"}),
)

for frame, filename, extra in exports:
    frame.to_csv(filename, **csv_options, **extra)

Recommended Posts

Data wrangling (pdfplumber) PDF about influenza outbreak situation of Ministry of Health, Labor and Welfare
Data wrangling of the PDF on the outbreak of influenza by the Ministry of Health, Labor and Welfare
Data cleansing of open data of the occurrence situation of the Ministry of Health, Labor and Welfare
Scraping PDF of the status of test positives in each prefecture of the Ministry of Health, Labor and Welfare
Scraping PDF of the national list of minimum wages by region of the Ministry of Health, Labor and Welfare
[Python] Automatically read prefectural information on the new coronavirus from the PDF of the Ministry of Health, Labor and Welfare and write it in a spreadsheet or Excel.
Convert PDF of the situation of people infected in Tokyo with the new coronavirus infection of the Tokyo Metropolitan Health and Welfare Bureau to CSV
Story of image analysis of PDF file and data extraction