Visualize Internet response rate and mail response rate by scraping Excel of response status by prefecture in Census 2020
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
url = "https://www.kokusei2020.go.jp/internet/"
r = requests.get(url)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
links = {}
for i in soup.find_all("span", text="Excel"):
link = urljoin(url, i.find_parent("a").get("href"))
m = re.search("census_answers_(pref|city)_\d{6}.xlsx", link)
if m:
links[m.group(1)] = link
links
import pandas as pd
df_pref = pd.read_excel(
links["pref"],
index_col=[0, 1],
header=None,
skiprows=9,
usecols=[1, 2, 3, 4, 5, 6, 7],
names=["code", "Prefectures", "Number of H27 households", "Net", "By mail", "Net率", "By mail率"],
)
df_pref["Number of responses"] = df_pref["Net"] + df_pref["By mail"]
df_pref["Net rate"] *= 100
df_pref["Mailing rate"] *= 100
df_pref["Response rate"] = df_pref["Net rate"] + df_pref["Mailing rate"]
df_pref.to_csv("pref.csv", encoding="utf_8_sig")
df_city = pd.read_excel(
links["city"],
index_col=[0, 1, 2],
header=None,
skiprows=9,
usecols=[1, 2, 3, 4, 5, 6, 7, 8],
names=["code", "Prefectures", "Municipality", "Number of H27 households", "Net", "By mail", "Net率", "By mail率"],
)
df_city["Number of responses"] = df_city["Net"] + df_city["By mail"]
df_city["Net rate"] *= 100
df_city["Mailing rate"] *= 100
df_city["Response rate"] = df_city["Net rate"] + df_city["Mailing rate"]
df_city.to_csv("city.csv", encoding="utf_8_sig")
df_city
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import japanize_matplotlib
#resolution
import matplotlib as mpl
mpl.rcParams["figure.dpi"] = 200
df1 = df_pref.sort_index(ascending=False).reset_index(level="code", drop=True)
df1.loc[:, ["Net rate", "Mailing rate"]].plot.barh(stacked=True, figsize=(5, 10))
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0, fontsize=8)
plt.savefig("01.png ", dpi=200, bbox_inches="tight")
plt.show()
Recommended Posts