apt install ghostscript
pip install "camelot-py[cv]"
pip install pandas
pip install requests
pip install beautifulsoup4
pip install japanmap
pip install tqdm
"BAD open data" cleanup process (salvaging data published only as poorly structured PDFs)
tables = camelot.read_pdf(
link, pages="all", split_text=True, strip_text=" ,%%\n", line_scale=40
)
Unnecessary characters (thousands separators, percent signs, embedded newlines) are removed with `strip_text=" ,%%\n"` when extracting tables from the PDF.
import csv
import datetime
import pathlib
import re
from urllib.parse import urljoin
import camelot
import pandas as pd
import requests
from bs4 import BeautifulSoup
from japanmap import pref_code
from tqdm.notebook import tqdm
# Convert a Japanese-calendar (wareki) date string to a Western datetime.date.
def wareki2date(s):
    """Parse a Heisei/Reiwa era date such as 'H31.4.1' or '令和元年5月1日'.

    Accepts both the romanized era prefixes ('H', 'R') used in the PDFs and
    the kanji era names (平成, 令和). The era year 元 (gannen, "first year")
    is treated as year 1. Raises AttributeError when *s* contains no
    recognizable wareki date (re.search returns None).
    """
    # NOTE: the original file carried machine-translated tokens here
    # ("Heisei|Reiwa", "[0-9 yuan]", i == "Former") which made the gannen
    # branch unreachable; the Japanese characters are restored so the regex
    # and the comparison agree.
    m = re.search(
        r"(H|R|平成|令和)([0-9元]{1,2})[.年]([0-9]{1,2})[.月]([0-9]{1,2})日?", s
    )
    # 元 marks the first year of an era.
    year, month, day = [1 if i == "元" else int(i) for i in m.group(2, 3, 4)]
    if m.group(1) in ["平成", "H"]:
        year += 1988  # Heisei 1 == 1989
    elif m.group(1) in ["令和", "R"]:
        year += 2018  # Reiwa 1 == 2019
    return datetime.date(year, month, day)
def df_conv(df, col_name, population_date, delivery_date, criteria_date):
    """Rename *df*'s columns and attach the three base-date metadata columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table slice extracted from the PDF.
    col_name : list of str
        New column labels, one per existing column.
    population_date, delivery_date, criteria_date : str
        Dates (already formatted as "YYYY/MM/DD") recorded in the output.

    Returns
    -------
    pandas.DataFrame
        The relabeled frame with "Calculation base date" inserted first and
        the two other date columns appended.
    """
    # set_axis(..., inplace=True) was removed in pandas 2.0 — assign the
    # returned frame instead. Callers all use the return value on fresh
    # slices, so not mutating the argument is behavior-compatible here.
    df = df.set_axis(col_name, axis=1)
    df["Population base date"] = population_date
    df["Base date for calculating the number of deliveries"] = delivery_date
    df.insert(0, "Calculation base date", criteria_date)
    return df
# Landing page of the Ministry of Internal Affairs listing the My Number
# Card statistics PDFs.
url = "https://www.soumu.go.jp/kojinbango_card/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}
# NOTE(review): str.maketrans with two string arguments requires both to be
# the same length; these two differ, so this line raises ValueError at
# runtime. The strings look like a machine-translation artifact of a
# kanji-variant normalization table (old-form -> standard-form characters
# used in municipality names) — the original characters need to be restored
# from the source article before this script can run.
cjk = str.maketrans("Heihaizi Heihaizi Heihaizi", "Turtle Naga Kuroto Ryu Nishi Ao Oni")
# Master list of local-government codes (code, prefecture, county, city).
df_code = pd.read_csv(
    "https://docs.google.com/spreadsheets/d/e/2PACX-1vSseDxB5f3nS-YQ1NOkuFKZ7rTNfPLHqTKaSag-qaK25EWLcSL0klbFBZm1b6JDKGtHTk6iMUxsXpxt/pub?gid=0&single=true&output=csv",
    dtype={"Group code": int, "Name of prefectures": str, "County name": str, "City name": str},
)
# Fold the county name into the city name so it matches the PDF tables,
# then drop the now-redundant county column.
df_code["City name"] = df_code["County name"].fillna("") + df_code["City name"]
df_code.drop("County name", axis=1, inplace=True)
r = requests.get(url, headers=headers)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
# Every PDF link inside the page's "ul.normal" list items.
links = soup.select("ul.normal > li a[href$=pdf]")
# Download every statistics PDF, extract its tables with camelot, and write
# one folder of CSVs per publication date. Table indices below are tied to
# the fixed layout of these PDFs.
for i in tqdm(links):
    # The parent <li> text carries the publication date in Japanese-era form.
    creation_date = wareki2date(i.find_parent("li").get_text(strip=True))
    link = urljoin(url, i.get("href"))
    # Create one output folder per publication date (YYYYMMDD).
    path_dir = pathlib.Path(creation_date.strftime("%Y%m%d"))
    path_dir.mkdir(parents=True, exist_ok=True)
    # Extract all tables from the PDF; strip_text removes thousands
    # separators, percent signs, and embedded newlines from cell text.
    tables = camelot.read_pdf(
        link, pages="all", split_text=True, strip_text=" ,%%\n", line_scale=40
    )
    # --- Table 0: summary by group classification ------------------------
    df_tmp = tables[0].df
    # Overwrite header cells that camelot leaves blank/garbled.
    df_tmp.iat[0, 1] = "Classification"
    df_tmp.iat[1, 1] = "Nationwide"
    population_date = wareki2date(df_tmp.iat[0, 2]).strftime("%Y/%m/%d")
    delivery_date = wareki2date(df_tmp.iat[0, 3]).strftime("%Y/%m/%d")
    df0 = df_conv(
        df_tmp.iloc[1:, 1:].reset_index(drop=True),
        ["Classification", "population", "Number of deliveries", "populationに対するNumber of deliveries率"],
        population_date,
        delivery_date,
        creation_date.strftime("%Y/%m/%d"),
    )
    df0 = df0.astype({"population": int, "Number of deliveries": int, "populationに対するNumber of deliveries率": float})
    df0.to_csv(
        pathlib.Path(path_dir, "summary_by_types.csv"),
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
        encoding="utf_8_sig",
    )
    # --- Tables 3-4: list of prefectures ---------------------------------
    dfs = []
    for table in tables[3:5]:
        df_tmp = table.df
        population_date = wareki2date(df_tmp.iat[0, 1]).strftime("%Y/%m/%d")
        # The 2019-04-01 PDF contains a typo ("H31.41") in the delivery-date
        # cell; patch it before parsing.
        if creation_date == datetime.date(2019, 4, 1):
            delivery_date = wareki2date(
                df_tmp.iat[0, 2].replace("H31.41", "H31.4.1")
            ).strftime("%Y/%m/%d")
        else:
            delivery_date = wareki2date(df_tmp.iat[0, 2]).strftime("%Y/%m/%d")
        df = df_conv(
            df_tmp.iloc[1:].reset_index(drop=True),
            ["Name of prefectures", "Total number (population)", "Number of deliveries", "人口に対するNumber of deliveries率"],
            population_date,
            delivery_date,
            creation_date.strftime("%Y/%m/%d"),
        )
        dfs.append(df)
    df3 = pd.concat(dfs)
    # Normalize width/variant characters so names match the code master list.
    df3["Name of prefectures"] = df3["Name of prefectures"].str.normalize("NFKC")
    df3["Name of prefectures"] = df3["Name of prefectures"].apply(lambda s: s.translate(cjk))
    # Sort rows by JIS prefecture number.
    df3.index = df3["Name of prefectures"].apply(lambda s: pref_code(s))
    df3.sort_index(inplace=True)
    df3 = df3.astype({"Total number (population)": int, "Number of deliveries": int, "人口に対するNumber of deliveries率": float})
    df3.to_csv(
        pathlib.Path(path_dir, "all_prefectures.csv"),
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
        encoding="utf_8_sig",
    )
    df3  # notebook display artifact; no effect in a plain script
    # --- Table 5: by gender and age --------------------------------------
    # n is the index of the first per-municipality table: PDFs published
    # after 2017-03-08 have one extra table before that listing.
    n = 5
    if creation_date > datetime.date(2017, 3, 8):
        n = 6
    df_tmp = tables[5].df
    population_date = wareki2date(df_tmp.iat[0, 1]).strftime("%Y/%m/%d")
    delivery_date = wareki2date(df_tmp.iat[0, 4]).strftime("%Y/%m/%d")
    df5 = df_conv(
        df_tmp.iloc[2:].reset_index(drop=True),
        [
            "age",
            "population(Man)",
            "population(woman)",
            "population(Total)",
            "Number of deliveries(Man)",
            "Number of deliveries(woman)",
            "Number of deliveries(Total)",
            "Grant rate(Man)",
            "Grant rate(woman)",
            "Grant rate(Total)",
            "Ratio of the number of grants to the whole(Man)",
            "Ratio of the number of grants to the whole(woman)",
            "Ratio of the number of grants to the whole(Total)",
        ],
        population_date,
        delivery_date,
        creation_date.strftime("%Y/%m/%d"),
    )
    df5 = df5.astype(
        {
            "population(Man)": int,
            "population(woman)": int,
            "population(Total)": int,
            "Number of deliveries(Man)": int,
            "Number of deliveries(woman)": int,
            "Number of deliveries(Total)": int,
            "Grant rate(Man)": float,
            "Grant rate(woman)": float,
            "Grant rate(Total)": float,
            "Ratio of the number of grants to the whole(Man)": float,
            "Ratio of the number of grants to the whole(woman)": float,
            "Ratio of the number of grants to the whole(Total)": float,
        }
    )
    df5.to_csv(
        pathlib.Path(path_dir, "demographics.csv"),
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
        encoding="utf_8_sig",
    )
    df5  # notebook display artifact; no effect in a plain script
    # --- Tables n..end: list by city/municipality ------------------------
    dfs = []
    for table in tables[n:]:
        df_tmp = table.df
        population_date = wareki2date(df_tmp.iat[0, 2]).strftime("%Y/%m/%d")
        delivery_date = wareki2date(df_tmp.iat[0, 3]).strftime("%Y/%m/%d")
        df = df_conv(
            df_tmp.iloc[1:].reset_index(drop=True),
            ["Name of prefectures", "City name", "Total number (population)", "Number of deliveries", "人口に対するNumber of deliveries率"],
            population_date,
            delivery_date,
            creation_date.strftime("%Y/%m/%d"),
        )
        dfs.append(df)
    df6 = pd.concat(dfs)
    df6["Name of prefectures"] = df6["Name of prefectures"].str.normalize("NFKC")
    df6["Name of prefectures"] = df6["Name of prefectures"].apply(lambda s: s.translate(cjk))
    # Drop the nationwide-total row before merging with the code list.
    df6 = df6[df6["Name of prefectures"] != "Nationwide"].copy()
    df6["City name"] = df6["City name"].str.normalize("NFKC")
    df6["City name"] = df6["City name"].apply(lambda s: s.translate(cjk))
    # Patch municipality renames so names match the code master list.
    df6["City name"] = df6["City name"].mask(df6["Name of prefectures"] + df6["City name"] == "Sasayama City, Hyogo Prefecture", "Tamba Sasayama City")
    df6["City name"] = df6["City name"].mask(
        df6["Name of prefectures"] + df6["City name"] == "Yusuhara Town, Takaoka District, Kochi Prefecture", "Yusuhara Town, Takaoka District"
    )
    df6["City name"] = df6["City name"].mask(
        df6["Name of prefectures"] + df6["City name"] == "Sue Town, Kasuya District, Fukuoka Prefecture", "Sue-cho, Kasuya-gun"
    )
    # Nakagawa (Fukuoka) changed from town to city on 2018-10-01; map the
    # name according to which side of that date the PDF was published.
    if creation_date < datetime.date(2018, 10, 1):
        df6["City name"] = df6["City name"].mask(
            df6["Name of prefectures"] + df6["City name"] == "Nakagawa City, Fukuoka Prefecture", "Nakagawa-cho, Chikushi-gun"
        )
    else:
        df6["City name"] = df6["City name"].mask(
            df6["Name of prefectures"] + df6["City name"] == "Nakagawa Town, Chikushi County, Fukuoka Prefecture", "Nakagawa City"
        )
    # Attach the official local-government code; "Int64" (nullable) keeps
    # rows that failed to match as <NA> instead of crashing the int cast.
    df6 = pd.merge(df6, df_code, on=["Name of prefectures", "City name"], how="left")
    df6 = df6.astype(
        {"Total number (population)": int, "Number of deliveries": int, "人口に対するNumber of deliveries率": float, "Group code": "Int64"}
    )
    df6.to_csv(
        pathlib.Path(path_dir, "all_localgovs.csv"),
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
        encoding="utf_8_sig",
    )