[PYTHON] Scraping and tabulating weather warnings and advisories nationwide from the Japan Meteorological Agency

Scraping the Weather Warnings / Advisories page of the Japan Meteorological Agency and tabulating the warnings and advisories issued nationwide.

import pathlib
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

# Prefecture codes, used later to sort the aggregated table
pref_code = {
    "01": "Hokkaido",
    "02": "Aomori Prefecture",
    "03": "Iwate Prefecture",
    "04": "Miyagi Prefecture",
    "05": "Akita Prefecture",
    "06": "Yamagata Prefecture",
    "07": "Fukushima Prefecture",
    "08": "Ibaraki Prefecture",
    "09": "Tochigi Prefecture",
    "10": "Gunma Prefecture",
    "11": "Saitama Prefecture",
    "12": "Chiba Prefecture",
    "13": "Tokyo",
    "14": "Kanagawa Prefecture",
    "15": "Niigata Prefecture",
    "16": "Toyama Prefecture",
    "17": "Ishikawa Prefecture",
    "18": "Fukui Prefecture",
    "19": "Yamanashi Prefecture",
    "20": "Nagano Prefecture",
    "21": "Gifu Prefecture",
    "22": "Shizuoka Prefecture",
    "23": "Aichi Prefecture",
    "24": "Mie Prefecture",
    "25": "Shiga Prefecture",
    "26": "Kyoto",
    "27": "Osaka",
    "28": "Hyogo Prefecture",
    "29": "Nara Prefecture",
    "30": "Wakayama Prefecture",
    "31": "Tottori Prefecture",
    "32": "Shimane Prefecture",
    "33": "Okayama Prefecture",
    "34": "Hiroshima Prefecture",
    "35": "Yamaguchi Prefecture",
    "36": "Tokushima Prefecture",
    "37": "Kagawa Prefecture",
    "38": "Ehime Prefecture",
    "39": "Kochi Prefecture",
    "40": "Fukuoka Prefecture",
    "41": "Saga Prefecture",
    "42": "Nagasaki Prefecture",
    "43": "Kumamoto Prefecture",
    "44": "Oita Prefecture",
    "45": "Miyazaki Prefecture",
    "46": "Kagoshima Prefecture",
    "47": "Okinawa Prefecture",
}

# Create a list of prefecture names (used to order rows in the final table)
pref = list(pref_code.values())
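As a quick sanity check on the lookup table, the snippet below fetches one name by its two-digit code; the "13" key is just an illustrative choice.

# Look up a prefecture name by its two-digit code
print(pref_code["13"])  # Tokyo
print(pref[:3])         # ['Hokkaido', 'Aomori Prefecture', 'Iwate Prefecture']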

Scraping

url = "https://www.jma.go.jp/jp/warn/"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# Reuse a single session (and its connection) for all requests
with requests.Session() as s:

    r = s.get(url, headers=headers)
    r.raise_for_status()

    base = BeautifulSoup(r.content, "html5lib")

    htmls = []

    # The top page lists one link per area inside a <noscript> fallback table
    for tag in tqdm(base.select("div#title > noscript > table > tbody > tr > td > a")):

        area = tag.get_text(strip=True)
        link = urljoin(url, tag.get("href"))

        r = s.get(link, headers=headers)
        r.raise_for_status()

        soup = BeautifulSoup(r.content, "html5lib")

        # Save each area page under ./html/ so it can be parsed offline
        p = pathlib.Path("html", pathlib.PurePath(link).name)
        p.parent.mkdir(parents=True, exist_ok=True)

        with p.open(mode="w", encoding="utf-8") as fw:
            fw.write(soup.prettify())

        htmls.append({"area": area, "url": link, "path": p})

        # Be polite to the server
        time.sleep(3)
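Before moving on to parsing, it is worth confirming what was collected: each entry of htmls pairs an area name with its source URL and the local file path (the printed values depend on what was fetched at run time).

# Sanity check on the collected pages
print(len(htmls))                          # number of area pages fetched
print(htmls[0]["area"], htmls[0]["path"])  # first area name and its local path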
import pandas as pd

def fetch_warn(p, area):

    # The warnings table on each saved page has the id "WarnTableTable"
    tmp = pd.read_html(p.open(mode="r", encoding="utf-8"), attrs={"id": "WarnTableTable"})[0]

    # Unpivot the alert columns; the first three columns identify the area
    df = tmp.melt(
        id_vars=[
            ("Unnamed: 0_level_0", "Unnamed: 0_level_1"),
            ("Unnamed: 1_level_0", "Unnamed: 1_level_1"),
            ("Unnamed: 2_level_0", "Unnamed: 2_level_1"),
        ]
    ).dropna(thresh=5)

    # set_axis(..., inplace=True) was removed in pandas 2.0; assign columns directly
    df.columns = ["area1", "area2", "city", "level", "alert", "value"]

    df["pref"] = area

    return df
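To spot-check the parser on a single page before processing everything, any entry recorded in htmls can be fed through fetch_warn:

# Spot check: parse the first saved page only
sample = fetch_warn(htmls[0]["path"], htmls[0]["area"])
sample.head()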


dfs = [fetch_warn(html["path"], html["area"]) for html in htmls]

df = pd.concat(dfs).reset_index(drop=True)

# Normalize characters (NFKC) and strip whitespace
for col in df.select_dtypes(include=object).columns:
    df[col] = df[col].str.normalize("NFKC").str.replace(r"\s", "", regex=True)
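NFKC normalization folds full-width digits and letters into their half-width equivalents, which keeps the later string matching simple. A minimal illustration with the standard library:

# NFKC folds full-width characters to half-width, e.g. "１２３ＡＢＣ" -> "123ABC"
import unicodedata
print(unicodedata.normalize("NFKC", "１２３ＡＢＣ"))  # 123ABC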

# Replace Hokkaido and Okinawa sub-region names with their prefecture names
df["pref"] = df["pref"].replace(
    {
        "Soya region": "Hokkaido",
        "Kamikawa / Rumoi region": "Hokkaido",
        "Abashiri / Kitami / Monbetsu region": "Hokkaido",
        "Kushiro / Nemuro / Tokachi region": "Hokkaido",
        "Iburi / Hidaka region": "Hokkaido",
        "Ishikari / Sorachi / Shiribeshi region": "Hokkaido",
        "Oshima / Hiyama region": "Hokkaido",
        "Okinawa main island region": "Okinawa Prefecture",
        "Daito Islands region": "Okinawa Prefecture",
        "Miyakojima region": "Okinawa Prefecture",
        "Yaeyama region": "Okinawa Prefecture",
    }
)

# "●"Converted to 0 and 1
df["value"] = (df["value"] == "●").astype(int)

# Aggregate: count issued alerts per prefecture and level
df_alert = df.pivot_table(
    index="pref", columns="level", values="value", aggfunc="sum"
).reindex(index=pref, columns=["Warning", "Advisory"])

df_alert
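The same table can also be produced with groupby and unstack, which some readers find more explicit; this sketch is equivalent to the pivot_table call above.

# Equivalent aggregation via groupby + unstack (same result as the pivot_table)
df_alert2 = (
    df.groupby(["pref", "level"])["value"]
    .sum()
    .unstack("level")
    .reindex(index=pref, columns=["Warning", "Advisory"])
)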

# Save with a BOM (utf_8_sig) so that Excel opens the CSV as UTF-8
df.to_csv("alert.csv", encoding="utf_8_sig")
