[PYTHON] Aggregate by prefecture / city / ward / town / village from the PDF of the weekly Bellmark reception status of the Bellmark Education Grant Foundation

Introduction

This article aggregates the Bellmark Education Grant Foundation's weekly Bellmark reception status, which is published as PDF files, by prefecture and by municipality (city / ward / town / village).

Web Bellmark has now also started: if you visit your favorite shop through the Web Bellmark site, you collect Bellmark points according to what you buy.

Available shops

ウェブベルマーク.png

Jalan and Rakuten Travel are available too, so simply going through the site before applying for Go To Travel lets you give your support without paying anything yourself.

Explanation

This function snaps X or Y coordinates that lie within `limit` of one another to the position that occurs most often. It corrects cells whose text is long enough to wrap onto two lines, and cells whose position deviates slightly.

def snap_adjustment(s, limit=5):
    """Snap nearly-equal coordinates to the most frequent nearby coordinate.

    The distinct values of ``s`` are visited in ascending order.  A value
    that lies less than ``limit`` above the current anchor is merged with
    it, and whichever of the two occurs more often becomes (or stays) the
    anchor.  A value ``limit`` or more above the anchor starts a new group.

    Args:
        s: pandas Series of numeric coordinates.
        limit: Distance below which a value is merged with the current
            anchor.

    Returns:
        A Series in which every merged value has been replaced by the
        anchor of its group.  The input Series is not modified.
    """
    count = s.value_counts().sort_index()

    # There is no anchor until the first value has been seen.  Starting from
    # a real number such as 0 would treat that number as a coordinate and
    # rewrite genuine 0 entries when the smallest value is negative.
    index = None
    value = 0

    for i, v in count.items():

        if index is None or (i - index) >= limit:
            # Too far from the anchor (or nothing seen yet): open a new group.
            index = i
            value = v

        elif v > value:
            # The newcomer is more frequent, so it becomes the anchor.
            # Everything previously snapped to the old anchor moves with it.
            s = s.replace(index, i)
            index = i
            value = v

        else:
            # The anchor is at least as frequent: pull the newcomer onto it.
            s = s.replace(i, index)

    return s

Program

import pathlib
import time
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup


def fetch_file(url, dir=".", timeout=30):
    """Download ``url`` and save it inside ``dir``.

    The file is named after the last component of ``url``.  ``dir`` is
    created, including missing parents, if it does not exist yet.

    Args:
        url: URL of the file to download.
        dir: Directory the file is written to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``pathlib.Path`` of the saved file.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: The server did not respond within ``timeout``.
    """
    # Without a timeout requests waits indefinitely on a stalled connection.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    p.write_bytes(r.content)
    return p


def snap_adjustment(s, limit=5):
    """Snap nearly-equal coordinates to the most frequent nearby coordinate.

    The distinct values of ``s`` are visited in ascending order.  A value
    that lies less than ``limit`` above the current anchor is merged with
    it, and whichever of the two occurs more often becomes (or stays) the
    anchor.  A value ``limit`` or more above the anchor starts a new group.

    Args:
        s: pandas Series of numeric coordinates.
        limit: Distance below which a value is merged with the current
            anchor.

    Returns:
        A Series in which every merged value has been replaced by the
        anchor of its group.  The input Series is not modified.
    """
    count = s.value_counts().sort_index()

    # There is no anchor until the first value has been seen.  Starting from
    # a real number such as 0 would treat that number as a coordinate and
    # rewrite genuine 0 entries when the smallest value is negative.
    index = None
    value = 0

    for i, v in count.items():

        if index is None or (i - index) >= limit:
            # Too far from the anchor (or nothing seen yet): open a new group.
            index = i
            value = v

        elif v > value:
            # The newcomer is more frequent, so it becomes the anchor.
            # Everything previously snapped to the old anchor moves with it.
            s = s.replace(index, i)
            index = i
            value = v

        else:
            # The anchor is at least as frequent: pull the newcomer onto it.
            s = s.replace(i, index)

    return s


# Page that links to the weekly reception-status PDFs.
url = "https://www.bellmark.or.jp/collect/accept.htm"

# The index page is requested with a browser-like User-Agent.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# Collect the links found in the calendar.  Anchors without an href are
# skipped, relative links are resolved against the page URL, duplicates are
# dropped, and the result is sorted so the row order is the same on every run.
hrefs = (
    anchor.get("href")
    for anchor in soup.select("div.cal-process > div.cal-row-date > div > a")
)
links = sorted({urljoin(url, href) for href in hrefs if href})

dfs = []

for n, link in enumerate(links):

    # Pause between downloads (not between pages of a file that has already
    # been downloaded) to avoid hammering the server.
    if n:
        time.sleep(3)

    p = fetch_file(link)

    with pdfplumber.open(p) as pdf:

        for page in pdf.pages:

            # Cut off the top 65 and the bottom 40 units of the page before
            # extracting words.
            crop = page.within_bbox((0, 65, page.width, page.height - 40))

            words = crop.extract_words(keep_blank_chars=True)

            # A page without any words has no coordinate columns to convert.
            if not words:
                continue

            df_tmp = (
                pd.DataFrame(words)
                .astype({"x0": float, "x1": float, "top": float, "bottom": float})
                .sort_values(["top", "x0"])
            )

            # Align words that belong to the same row / column but whose
            # coordinates differ slightly.
            df_tmp["top"] = snap_adjustment(df_tmp["top"], 6)
            df_tmp["x0"] = snap_adjustment(df_tmp["x0"])

            # One table row per snapped "top", one column per snapped "x0";
            # words that land in the same cell are concatenated.
            table = (
                df_tmp.pivot_table(
                    index=["top"],
                    columns="x0",
                    values="text",
                    aggfunc=lambda x: "".join(str(v) for v in x),
                )
            ).values

            # NOTE: this requires the pivot to yield exactly five columns.
            df = pd.DataFrame(table, columns=["Prefectures", "city", "Municipalities", "Participating groups", "Reception date"])
            dfs.append(df)

# ignore_index gives every row a unique label instead of repeating the
# per-page row numbers.
df = pd.concat(dfs, ignore_index=True)

# Bare expression: shows the frame when run in a notebook cell and has no
# effect in a plain script.
df

# Join the two middle columns into one municipality name.
df["Municipality"] = df["city"].fillna("") + df["Municipalities"].fillna("")

df1 = df.reindex(columns=["Prefectures", "Municipality", "Participating groups", "Reception date"])

df1.to_csv("data.csv")

df1

By prefecture

import japanize_matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt

# Render figures at 200 dpi.
mpl.rcParams["figure.dpi"] = 200

# Horizontal bar chart of the number of rows per prefecture; counts are
# sorted ascending.
df1["Prefectures"].value_counts(ascending=True).plot.barh(figsize=(5, 10))

# Save the graph.  The file name must not end in a space: matplotlib infers
# the output format from the extension, and "png " is not a known format.
plt.savefig("01.png", dpi=200, bbox_inches="tight")
plt.show()

01.png

By city

# Number of rows per (prefecture, municipality) pair, sorted ascending.
s = df1.groupby(["Prefectures", "Municipality"])["Municipality"].count().sort_values(ascending=True)

# Plot the 50 pairs with the highest counts as a horizontal bar chart.
s.tail(50).plot.barh(figsize=(5, 10))

# Save the graph.  The file name must not end in a space: matplotlib infers
# the output format from the extension, and "png " is not a known format.
plt.savefig("02.png", dpi=200, bbox_inches="tight")
plt.show()

02.png

Recommended Posts

Aggregate by prefecture / city / ward / town / village from the PDF of the weekly Bellmark reception status of the Bellmark Education Grant Foundation
Let's guess the development status of the city from the satellite image.