[PYTHON] Scraping the Ministry of Health, Labor and Welfare's PDF on the status of people testing positive in each prefecture
apt install python3-tk ghostscript
pip install camelot-py[cv]
pip install requests
pip install beautifulsoup4
import re
from urllib.parse import urljoin
import camelot
import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_link(url, text):
    """Return the absolute URL of the first link on a page whose text matches.

    Args:
        url: Address of the HTML page to download and search.
        text: Regular expression matched against the string of each ``<a>`` tag.

    Returns:
        The ``href`` of the first matching ``<a>`` tag, resolved against ``url``.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        ValueError: If no ``<a>`` tag with an ``href`` matches ``text``.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")
    # ``string=`` is the current name of the deprecated ``text=`` argument;
    # ``href=True`` skips anchors that have no target to resolve.
    tag = soup.find("a", href=True, string=re.compile(text))
    if tag is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            "no link matching {!r} found on {}".format(text, url)
        )
    return urljoin(url, tag.get("href"))
def set_col(df, n=1):
    """Promote the first ``n`` rows of ``df`` to column labels.

    Args:
        df: DataFrame whose leading rows hold the header text.
        n: Number of leading rows that form the header. When greater than 1,
            the cells of each column are concatenated top to bottom, so they
            must be strings.

    Returns:
        A new DataFrame without the header rows, labelled with the header
        text, and with a fresh 0-based index.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # With n == 0 the header row would silently remain in the data.
        raise ValueError("n must be at least 1, got {!r}".format(n))
    if n > 1:
        # zip(*rows) transposes the header rows into per-column tuples.
        columns = ["".join(parts) for parts in zip(*df.head(n).values)]
    else:
        # Use a plain list: passing the row Series itself would carry its
        # name (the row label) over as the name of the columns index.
        columns = df.iloc[0].tolist()
    return df.iloc[n:].set_axis(columns, axis=1).reset_index(drop=True)
# Follow the index page to the report page, then to the attached PDF.
# NOTE(review): both link-text patterns must match the text shown on the
# live pages — confirm they are in the same language as the site.
url = get_link(
    "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000121431_00086.html",
    "^Current status of new coronavirus infection and response by the Ministry of Health, Labor and Welfare",
)
link = get_link(url, "Attachment 1")

tables = camelot.read_pdf(link, pages="all", split_text=True, strip_text="\n")

# The first two rows of the first table are used as its header; the second
# table uses only its first row.
df1 = set_col(tables[0].df, 2)
df2 = set_col(tables[1].df)
df = pd.concat([df1, df2], axis=1)

# Strip whitespace and footnote markers ("※" followed by a digit).
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# literal text by default, which would leave these strings untouched.
df.columns = (
    df.columns.str.replace(r"\s", "", regex=True).str.replace(r"※\d", "", regex=True)
)
# NOTE(review): this label must equal a column name extracted from the PDF —
# confirm against the actual table header.
df["Name of prefectures"] = (
    df["Name of prefectures"]
    .str.replace(r"\s", "", regex=True)
    .str.replace(r"※\d", "", regex=True)
)

# Drop thousands separators, then blank out "-" placeholders as missing values.
df = df.apply(lambda x: x.str.replace(",", "", regex=False))
df = df.mask(df == "-")

# utf_8_sig writes a byte-order mark at the start of the file.
df.to_csv("corona.csv", encoding="utf_8_sig")