[PYTHON] Data wrangling PDF file of My Number card issuance status

reference

Installation

apt install ghostscript
pip install camelot-py[cv]
pip install pandas
pip install requests
pip install beautifulsoup4
pip install japanmap
pip install tqdm

program

BAD Open Data Memorial Process

  1. Create a CSV file from PDF
  2. Convert the character string to numeric data by removing commas and%
tables = camelot.read_pdf(
    link, pages="all", split_text=True, strip_text="  ,%%\n", line_scale=40
)

Delete unnecessary characters with "strip_text =", %% \ n "" when extracting a table from PDF

import csv
import datetime
import pathlib
import re
from urllib.parse import urljoin

import camelot
import pandas as pd
import requests
from bs4 import BeautifulSoup
from japanmap import pref_code
from tqdm.notebook import tqdm

#Convert from Japanese calendar to Western calendar

def wareki2date(s):

    m = re.search("(H|R|Heisei|Reiwa)([0-9 yuan]{1,2})[.Year]([0-9]{1,2})[.Month]([0-9]{1,2})Day?",s)

    year, month, day = [1 if i == "Former" else int(i) for i in m.group(2, 3, 4)]

    if m.group(1) in ["Heisei", "H"]:
        year += 1988
    elif m.group(1) in ["Reiwa", "R"]:
        year += 2018

    return datetime.date(year, month, day)

def df_conv(df, col_name, population_date, delivery_date, criteria_date):

    df.set_axis(col_name, axis=1, inplace=True)

    df["Population base date"] = population_date
    df["Base date for calculating the number of deliveries"] = delivery_date
    df.insert(0, "Calculation base date", criteria_date)

    return df

url = "https://www.soumu.go.jp/kojinbango_card/"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

cjk = str.maketrans("Heihaizi Heihaizi Heihaizi", "Turtle Naga Kuroto Ryu Nishi Ao Oni")

df_code = pd.read_csv(
    "https://docs.google.com/spreadsheets/d/e/2PACX-1vSseDxB5f3nS-YQ1NOkuFKZ7rTNfPLHqTKaSag-qaK25EWLcSL0klbFBZm1b6JDKGtHTk6iMUxsXpxt/pub?gid=0&single=true&output=csv",
    dtype={"Group code": int, "Name of prefectures": str, "County name": str, "City name": str},
)

df_code["City name"] = df_code["County name"].fillna("") + df_code["City name"]
df_code.drop("County name", axis=1, inplace=True)

r = requests.get(url, headers=headers)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

links = soup.select("ul.normal > li a[href$=pdf]")

for i in tqdm(links):

    creation_date = wareki2date(i.find_parent("li").get_text(strip=True))
    link = urljoin(url, i.get("href"))

    #Create folder
    path_dir = pathlib.Path(creation_date.strftime("%Y%m%d"))
    path_dir.mkdir(parents=True, exist_ok=True)

    #Extract table from PDF
    tables = camelot.read_pdf(
        link, pages="all", split_text=True, strip_text="  ,%%\n", line_scale=40
    )

    #By group classification

    df_tmp = tables[0].df

    df_tmp.iat[0, 1] = "Classification"
    df_tmp.iat[1, 1] = "Nationwide"

    population_date = wareki2date(df_tmp.iat[0, 2]).strftime("%Y/%m/%d")
    delivery_date = wareki2date(df_tmp.iat[0, 3]).strftime("%Y/%m/%d")

    df0 = df_conv(
        df_tmp.iloc[1:, 1:].reset_index(drop=True),
        ["Classification", "population", "Number of deliveries", "populationに対するNumber of deliveries率"],
        population_date,
        delivery_date,
        creation_date.strftime("%Y/%m/%d"),
    )

    df0 = df0.astype({"population": int, "Number of deliveries": int, "populationに対するNumber of deliveries率": float})

    df0.to_csv(
        pathlib.Path(path_dir, "summary_by_types.csv"),
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
        encoding="utf_8_sig",
    )

    #List of prefectures

    dfs = []

    for table in tables[3:5]:

        df_tmp = table.df

        population_date = wareki2date(df_tmp.iat[0, 1]).strftime("%Y/%m/%d")

        #Date adjustment of My Number card issuance status (as of April 1, 2019)
        if creation_date == datetime.date(2019, 4, 1):
            delivery_date = wareki2date(
                df_tmp.iat[0, 2].replace("H31.41", "H31.4.1")
            ).strftime("%Y/%m/%d")
        else:
            delivery_date = wareki2date(df_tmp.iat[0, 2]).strftime("%Y/%m/%d")

        df = df_conv(
            df_tmp.iloc[1:].reset_index(drop=True),
            ["Name of prefectures", "Total number (population)", "Number of deliveries", "人口に対するNumber of deliveries率"],
            population_date,
            delivery_date,
            creation_date.strftime("%Y/%m/%d"),
        )

        dfs.append(df)

    df3 = pd.concat(dfs)

    df3["Name of prefectures"] = df3["Name of prefectures"].str.normalize("NFKC")
    df3["Name of prefectures"] = df3["Name of prefectures"].apply(lambda s: s.translate(cjk))

    #Sort by prefecture number
    df3.index = df3["Name of prefectures"].apply(lambda s: pref_code(s))
    df3.sort_index(inplace=True)

    df3 = df3.astype({"Total number (population)": int, "Number of deliveries": int, "人口に対するNumber of deliveries率": float})

    df3.to_csv(
        pathlib.Path(path_dir, "all_prefectures.csv"),
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
        encoding="utf_8_sig",
    )

    df3

    #By gender and age

    n = 5

    if creation_date > datetime.date(2017, 3, 8):

        n = 6

        df_tmp = tables[5].df

        population_date = wareki2date(df_tmp.iat[0, 1]).strftime("%Y/%m/%d")
        delivery_date = wareki2date(df_tmp.iat[0, 4]).strftime("%Y/%m/%d")

        df5 = df_conv(
            df_tmp.iloc[2:].reset_index(drop=True),
            [
                "age",
                "population(Man)",
                "population(woman)",
                "population(Total)",
                "Number of deliveries(Man)",
                "Number of deliveries(woman)",
                "Number of deliveries(Total)",
                "Grant rate(Man)",
                "Grant rate(woman)",
                "Grant rate(Total)",
                "Ratio of the number of grants to the whole(Man)",
                "Ratio of the number of grants to the whole(woman)",
                "Ratio of the number of grants to the whole(Total)",
            ],
            population_date,
            delivery_date,
            creation_date.strftime("%Y/%m/%d"),
        )

        df5 = df5.astype(
            {
                "population(Man)": int,
                "population(woman)": int,
                "population(Total)": int,
                "Number of deliveries(Man)": int,
                "Number of deliveries(woman)": int,
                "Number of deliveries(Total)": int,
                "Grant rate(Man)": float,
                "Grant rate(woman)": float,
                "Grant rate(Total)": float,
                "Ratio of the number of grants to the whole(Man)": float,
                "Ratio of the number of grants to the whole(woman)": float,
                "Ratio of the number of grants to the whole(Total)": float,
            }
        )

        df5.to_csv(
            pathlib.Path(path_dir, "demographics.csv"),
            index=False,
            quoting=csv.QUOTE_NONNUMERIC,
            encoding="utf_8_sig",
        )

        df5

    #List by city

    dfs = []

    for table in tables[n:]:

        df_tmp = table.df

        population_date = wareki2date(df_tmp.iat[0, 2]).strftime("%Y/%m/%d")
        delivery_date = wareki2date(df_tmp.iat[0, 3]).strftime("%Y/%m/%d")

        df = df_conv(
            df_tmp.iloc[1:].reset_index(drop=True),
            ["Name of prefectures", "City name", "Total number (population)", "Number of deliveries", "人口に対するNumber of deliveries率"],
            population_date,
            delivery_date,
            creation_date.strftime("%Y/%m/%d"),
        )

        dfs.append(df)

    df6 = pd.concat(dfs)

    df6["Name of prefectures"] = df6["Name of prefectures"].str.normalize("NFKC")
    df6["Name of prefectures"] = df6["Name of prefectures"].apply(lambda s: s.translate(cjk))

    #Exclude nationwide
    df6 = df6[df6["Name of prefectures"] != "Nationwide"].copy()

    df6["City name"] = df6["City name"].str.normalize("NFKC")
    df6["City name"] = df6["City name"].apply(lambda s: s.translate(cjk))

    df6["City name"] = df6["City name"].mask(df6["Name of prefectures"] + df6["City name"] == "Sasayama City, Hyogo Prefecture", "Tamba Sasayama City")
    df6["City name"] = df6["City name"].mask(
        df6["Name of prefectures"] + df6["City name"] == "Yusuhara Town, Takaoka District, Kochi Prefecture", "Yusuhara Town, Takaoka District"
    )
    df6["City name"] = df6["City name"].mask(
        df6["Name of prefectures"] + df6["City name"] == "Sue Town, Kasuya District, Fukuoka Prefecture", "Sue-cho, Kasuya-gun"
    )

    if creation_date < datetime.date(2018, 10, 1):
        df6["City name"] = df6["City name"].mask(
            df6["Name of prefectures"] + df6["City name"] == "Nakagawa City, Fukuoka Prefecture", "Nakagawa-cho, Chikushi-gun"
        )
    else:
        df6["City name"] = df6["City name"].mask(
            df6["Name of prefectures"] + df6["City name"] == "Nakagawa Town, Chikushi County, Fukuoka Prefecture", "Nakagawa City"
        )

    df6 = pd.merge(df6, df_code, on=["Name of prefectures", "City name"], how="left")

    df6 = df6.astype(
        {"Total number (population)": int, "Number of deliveries": int, "人口に対するNumber of deliveries率": float, "Group code": "Int64"}
    )

    df6.to_csv(
        pathlib.Path(path_dir, "all_localgovs.csv"),
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
        encoding="utf_8_sig",
    )

Recommended Posts

Data wrangling PDF file of My Number card issuance status
Data wrangling of Excel file of My Number card issuance status (August)
Data wrangling of Excel file of My Number card issuance status (September)
Story of image analysis of PDF file and data extraction