Convert the Tokyo Metropolitan Health and Welfare Bureau's PDF on the status of people infected with the novel coronavirus in Tokyo to CSV
import pathlib
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import pandas as pd
import pdfplumber
import requests
from tqdm.notebook import tqdm
def fetch_file(url, dir=".", *, headers=None, timeout=30):
    """Download ``url`` and save it under ``dir``, returning the saved path.

    The file name is the last component of the URL's *path*, so a query
    string or fragment (``file.pdf?v=2``) does not leak into the name.

    Args:
        url: Address of the file to download.
        dir: Directory to save into; created if it does not exist.
        headers: Optional HTTP headers passed to ``requests.get``.
        timeout: Seconds to wait for the server, so that a stalled
            connection cannot hang the script indefinitely.

    Returns:
        ``pathlib.Path`` of the written file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    from urllib.parse import urlsplit

    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Use only the path part of the URL for the name; fall back to the
    # previous behaviour when the URL has no path component at all.
    name = pathlib.PurePosixPath(urlsplit(url).path).name
    if not name:
        name = pathlib.PurePath(url).name

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p
# Page that carries the link to the PDF.
url = "https://www.fukushihoken.metro.tokyo.lg.jp/iryo/kansen/todokedehcyouseisya.html"
# Browser-like User-Agent sent with the page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}
# The timeout keeps a stalled connection from hanging the script forever.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
tag = soup.select_one("div#main p.filelink > a.pdf")
# select_one() returns None when nothing matches, and urljoin() with a
# missing href would silently yield the page URL itself, so fail loudly
# here instead of downloading the wrong thing.
if tag is None or not tag.get("href"):
    raise RuntimeError(
        f"PDF link not found on {url}; the page layout may have changed"
    )
link = urljoin(url, tag.get("href"))
path_pdf = fetch_file(link)
dfs = []
#Convert PDF: one DataFrame per page, first table row used as the header
with pdfplumber.open(path_pdf) as pdf:
    for page in tqdm(pdf.pages):
        table = page.extract_table()
        # extract_table() gives None for a page without a table; skip such
        # pages instead of crashing on table[1:].
        if not table:
            continue
        df_tmp = pd.DataFrame(table[1:], columns=table[0])
        dfs.append(df_tmp)
if not dfs:
    raise ValueError(f"No table could be extracted from {path_pdf}")
#Combine all pages; ignore_index avoids repeated 0..n labels from each page
df = pd.concat(dfs, ignore_index=True)
df.shape
#Trim surrounding whitespace and apply NFKC normalization to every object column
text_columns = df.select_dtypes(include=object).columns
for col in text_columns:
    stripped = df[col].str.strip()
    df[col] = stripped.str.normalize("NFKC")
#Write next to the PDF under the same name with a .csv suffix
path_csv = path_pdf.with_suffix(".csv")
df.to_csv(path_csv, index=False, encoding="utf_8_sig")
#Independent copy of the cleaned table
df1 = df.copy()
#Data wrangling
import datetime

# Reference moment (naive, from datetime.now()) captured once so that every
# column is converted against the same "now".
dt_now = datetime.datetime.now()


#Complement the date with the current year and convert it to a date; if the result lies in the future, move it one year back
def str2date(s: pd.Series, now: "datetime.datetime | None" = None) -> pd.Series:
    """Convert month/day strings without a year into full dates.

    Each value is searched for ``<month>Moon<day>Day``. The year of the
    reference moment is assumed first; if that date does not exist in the
    reference year (Feb 29 in a non-leap year) or lies after the reference
    moment, the previous year is used instead.

    Args:
        s: Series of strings holding the month/day text.
        now: Reference moment. Defaults to the module-level ``dt_now``.

    Returns:
        A datetime Series with the same index as ``s``. Values that do not
        match the pattern, or that are not valid dates in either year,
        become ``NaT``.
    """
    reference = dt_now if now is None else now

    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    # NOTE(review): "Moon"/"Day" look like machine-translated 月/日 markers;
    # confirm against the date format that actually appears in the data.
    parts = s.str.extract(r"(\d{1,2})Moon(\d{1,2})Day").rename(
        columns={0: "month", 1: "day"}
    )
    # Unmatched rows become month 0 / day 0, which to_datetime coerces to NaT.
    # Filling with the string "0" keeps the column homogeneous before the cast.
    parts = parts.fillna("0").astype(int)

    parts["year"] = reference.year
    this_year = pd.to_datetime(parts, errors="coerce")

    # Fall back to the previous year for future dates and for dates that do
    # not exist in the reference year; rows that are invalid in both years
    # still end up as NaT below.
    use_previous = this_year.isna() | (this_year > reference)
    parts["year"] = parts["year"].mask(use_previous, parts["year"] - 1)
    return pd.to_datetime(parts, errors="coerce")
#Add a parsed date column for each of the three text date columns
for target, source in (
    ("Release date YMD", "Release date"),
    ("Date of onset YMD", "Date of onset"),
    ("Confirmed date YMD", "Fixed date"),
):
    df1[target] = str2date(df1[source])
#Second CSV: same name as the first one, with "_c" inserted before ".csv"
cleaned_name = path_csv.name.replace(".csv", "_c.csv")
p = path_csv.with_name(cleaned_name)
df1.to_csv(p, encoding="utf_8_sig", index=False)
#download
# NOTE(review): google.colab is presumably only importable inside a Google
# Colab runtime — confirm before running this script elsewhere.
from google.colab import files
# Hand the CSV path held in p to Colab's download helper (takes a str path).
files.download(str(p))