[PYTHON] Convert the PDF of Sagamihara City's presentation materials (occurrence status, etc.) on the new coronavirus infection to CSV

import datetime
import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup


def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` and save it under ``dir``, returning the saved path.

    The file name is the last path component of ``url`` as computed by
    ``pathlib.PurePath(url).name``. Missing parent directories are created.

    Args:
        url: Address of the file to download.
        dir: Directory to save into. The name shadows the builtin but is
            kept so existing keyword callers continue to work.
        timeout: Seconds handed to ``requests.get``. requests never times
            out on its own, so without this a stalled server would hang
            the script.

    Returns:
        ``pathlib.Path`` of the file that was written.

    Raises:
        requests.HTTPError: From ``raise_for_status`` on an error response.
        requests.Timeout: When the server does not answer within ``timeout``.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    p.write_bytes(r.content)
    return p


# Page that is searched for the link to the PDF.
url = "https://www.city.sagamihara.kanagawa.jp/shisei/koho/1019191.html"

# Browser-like User-Agent sent with the page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# requests does not time out by default; bound the wait explicitly.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# First <a> whose href ends in ".pdf" (dot escaped so e.g. "xpdf" does not
# match) and whose onclick attribute matches the title text below.
# NOTE(review): the onclick pattern is English text - confirm it matches the
# attribute value actually served by the page.
tag = soup.find(
    "a",
    href=re.compile(r"\.pdf$"),
    onclick=re.compile("Confirmation of new patients due to new coronavirus infection"),
)

# Fail with a readable message instead of an AttributeError on None.
if tag is None:
    raise RuntimeError(f"No matching PDF link found on {url}")

# The href may be relative; resolve it against the page URL.
link = urljoin(url, tag.get("href"))

path_pdf = fetch_file(link)

with pdfplumber.open(path_pdf) as pdf:

    # One DataFrame per accepted table, in page order.
    dfs = []

    for page in pdf.pages:

        if page.page_number == 1:
            # Text inside a fixed box on the first page (x from 400 to the
            # right edge, vertical range 44-60). Its digits are parsed into
            # the update date further down the script.
            crop = page.within_bbox((400, 44, page.width, 60))
            update = crop.extract_text()

        for table in page.extract_tables():
            df_tmp = pd.DataFrame(table)

            # Keep only tables that have exactly 11 columns.
            if df_tmp.shape[1] != 11:
                continue

            # An extracted cell can be None; treat that as empty text so the
            # substring test below cannot raise TypeError.
            first_cell = table[0][0] or ""

            # Skip tables whose top-left cell contains "Less than".
            if "Less than" in first_cell:
                continue

            dfs.append(df_tmp)

# Stack the collected tables, drop the very first row of the stacked result
# (presumably the header row of the first table - confirm), and label the
# 11 columns.
df = (
    pd.concat(dfs)
    .iloc[1:]
    .set_axis(
        ["Case No..", "Age", "sex", "Occupation, etc.", "place", "residence", "Symptoms", "Date of onset", "Positive finding date", "Infection route, etc.", "Remarks"],
        axis=1,
    )
)

# Remove every whitespace character and apply NFKC normalization in each
# object-dtype column. regex=True is required: since pandas 2.0
# Series.str.replace treats the pattern as a literal string by default, which
# would leave all whitespace in place.
for col in df.select_dtypes(include=object).columns:
    df[col] = df[col].str.replace(r"\s", "", regex=True).str.normalize("NFKC")

# Timestamp of this run; str2date reads it to decide which year a date is in.
dt_now = datetime.datetime.now()


def str2date(s: pd.Series, now: "datetime.datetime | None" = None) -> pd.Series:
    """Convert month/day strings to datetimes, inferring the year from ``now``.

    Each value is searched for ``<1-2 digits>Moon<1-2 digits>Day``. The
    current year of ``now`` is assumed first; any resulting date that lies
    after ``now`` is moved to the previous year. Values that do not match,
    or that do not form a valid calendar date, become ``NaT``.

    NOTE(review): the literal ``Moon`` / ``Day`` tokens look like a
    translation of the month/day markers used in the source document -
    confirm the pattern against the real column contents.

    Args:
        s: String Series holding the date text.
        now: Reference timestamp. Defaults to the module-level ``dt_now`` so
            existing one-argument calls behave as before.

    Returns:
        A datetime Series aligned with ``s``.
    """
    if now is None:
        now = dt_now

    parts = (
        s.str.extract(r"(\d{1,2})Moon(\d{1,2})Day")
        .rename(columns={0: "month", 1: "day"})
        # Fill with the *string* "0" so each column stays homogeneous text
        # before the integer cast; month/day 0 is later coerced to NaT.
        .fillna("0")
        .astype(int)
    )

    parts["year"] = now.year

    # First pass: assume every date belongs to the current year.
    this_year = pd.to_datetime(parts, errors="coerce")

    # A date in the future cannot have happened yet, so shift it back a year.
    parts["year"] = parts["year"].mask(this_year > now, parts["year"] - 1)

    return pd.to_datetime(parts, errors="coerce")


# Parsed datetime companions of the two textual date columns.
df["Date of onset YMD"] = str2date(df["Date of onset"])

df["Positive finding date YMD"] = str2date(df["Positive finding date"])

# The text cropped from page 1 must contain exactly three numbers, read in
# order as year, month, day. `update` is None-safe here so a crop with no
# text produces the explicit error below rather than a TypeError.
numbers = re.findall(r"\d+", update or "")
if len(numbers) != 3:
    raise ValueError(f"Expected year, month and day in update text, got: {update!r}")

y, m, d = map(int, numbers)

# NOTE(review): 2018 + y presumably converts a Reiwa-era year (Reiwa 1 = 2019)
# to a Gregorian year - confirm against the PDF header.
dt_update = datetime.datetime(2018 + y, m, d)


# File name carries the update date; utf_8_sig writes a BOM at the start.
df.to_csv(f'sagamihara{dt_update.strftime("%Y%m%d")}.csv', encoding="utf_8_sig")

Recommended Posts

Convert PDF of Sagamihara City presentation materials (occurrence status, etc.) regarding new coronavirus infection to CSV
Convert PDF of product list containing effective surfactants for new coronavirus to CSV
Convert PDF of the situation of people infected in Tokyo with the new coronavirus infection of the Tokyo Metropolitan Health and Welfare Bureau to CSV
Convert PDF of new corona outbreak case in Aichi prefecture to CSV
Convert PDF of Go To Eat Hokkaido campaign dealer list to CSV
Convert PDF of available stores of Go To EAT in Kagoshima prefecture to CSV
Convert PDF of Kumamoto Prefecture Go To EAT member store list to CSV
Convert PDF of Go To EAT member stores in Ishikawa prefecture to CSV
Convert from PDF to CSV with pdfplumber
Convert PDF of Chiba Prefecture Go To EAT member store list to CSV (command)
Convert PDF of list of Go To EAT member stores in Niigata prefecture to CSV