[Python] Automatically read prefectural information on the new coronavirus from the Ministry of Health, Labor and Welfare's PDFs and write it to a spreadsheet or Excel

Introduction

Information on the new coronavirus is posted and updated daily on the Ministry of Health, Labor and Welfare website, under "Press release on new coronavirus infection (outbreak situation, domestic patient outbreak, overseas situation, etc.)". From there, I created a program that collects the detailed data by prefecture (limited to data from 5/10 onward, which share the same PDF format) and records it in Google Spreadsheet or Excel, and I would like to introduce it here.

Because the data is recorded in spreadsheet software, you can later graph it to easily get a bird's-eye view of changes in the number of infected people, the rate of severe cases, and so on.

Preparation

If you use Excel, the program works as is, but a little preparation is needed to access Google Spreadsheet from a Python program. Please refer to the following sites, complete the setup, and fill in the name of your JSON key file and your spreadsheet key at the commented parts of the program (around line 105): "[Don't hesitate anymore] Summary of initial settings for reading and writing spreadsheets with Python" and "Edit Google Sheets in Python".
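As a quick sanity check after the setup, a minimal sketch along these lines should print your sheet's title (the file name my-service-account.json and the key string below are placeholders for your own values):

# Minimal connectivity check for the gspread setup (placeholder names)
import gspread
from oauth2client.service_account import ServiceAccountCredentials

scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
credentials = ServiceAccountCredentials.from_json_keyfile_name(
    'my-service-account.json', scope)  # your downloaded service-account key
gc = gspread.authorize(credentials)
workbook = gc.open_by_key('your-spreadsheet-key')  # the key from the sheet URL
print(workbook.title)  # prints the spreadsheet title if access works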

Program

import requests
import re
import regex
from bs4 import BeautifulSoup
from tika import parser
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import openpyxl
import pickle

# Choose the output destination: 1 = Google Spreadsheet, 2 = Excel
isGss = int(input("1. GoogleSpreadSheet  2. Excel    ")) == 1

# Fetch the index page of MHLW press releases on the new coronavirus
req = requests.get(
    "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000121431_00086.html")
req.encoding = 'utf-8'
soup = BeautifulSoup(req.text, 'html.parser')

urls = {}
try:
    # Load the dates that have already been recorded (created on first run)
    with open("dates.pickle", "rb") as f:
        dates = pickle.load(f)
except FileNotFoundError:
    dates = []
column_cnt = len(dates) + 2  # next free column; column 1 holds the names

for i in soup.select("div[class='m-grid__col1']")[1].find_all("a"):
    url = i["href"]
    if "Current status of new coronavirus infection and response by the Ministry of Health, Labor and Welfare" not in i.text: continue
    b = re.findall("Reiwa\d{1,2}Year\d{1,2}Month\d{1,2}Day", i.text)
    if b: date = b[0]
    else:
        temp_r = requests.get(url)
        temp_r.encoding = 'utf-8'
        date = re.findall("Reiwa\d{1,2}Year\d{1,2}Month\d{1,2}Day", temp_r.text)[0]
    date = date.translate(
        str.maketrans({
            "0": "0",
            "1": "1",
            "2": "2",
            "3": "3",
            "4": "4",
            "5": "5",
            "6": "6",
            "7": "7",
            "8": "8",
            "9": "9"
        }))
    date = "".join([(i.zfill(2) if i.isdigit() else i) for i in re.findall(
        "(Reiwa)(\d{1,2})(Year)(\d{1,2})(Month)(\d{1,2})(Day)", date)[0]])
    if re.findall("Reiwa May 2002 0[1-9]Day|Reiwa 2002 0[1-4]Month\d{1,2}Day", date): continue
    elif date not in dates:
        urls[date] = i["href"]
        dates.append(date)

with open("dates.pickle", "wb") as f:
    pickle.dump(dates, f)

# Visit each new press-release page and find the per-prefecture status PDF
pdfs = {}
for date, url in urls.items():
    temp_r = requests.get(url)
    temp_r.encoding = 'utf-8'
    soup = BeautifulSoup(temp_r.text, 'html.parser')
    for j in soup.find_all('a', href=True):
        if '各都道府県の検査陽性者の状況' in j.text:  # PDF link text on the MHLW page
            pdfs[date] = "https://www.mhlw.go.jp" + j['href']


def makeDict(text):
    # Parse the Tika-extracted PDF text into {prefecture: {column: value}}.
    # Each row is a prefecture name (kanji, possibly space-separated) or
    # "その他" (other), followed by seven numeric columns.
    l = {}
    for i in regex.findall(
            r"(\p{Han}(?:\s+\p{Han}|計){1,2}|その他)([※\d\s]+?[\nG])",
            re.sub(r"※\d{1,2}", "", text).translate(
                str.maketrans({
                    "\u3000": " ",  # full-width space -> ASCII space
                    ",": "",        # strip thousands separators
                    "-": "0"        # a hyphen in the PDF means zero
                }))):
        a = list(map(int, i[1].replace("G", "").split()))
        b = "".join(i[0].split())  # remove spaces inside the name
        l[b] = {}
        l[b]["Number of positives"] = a[0]
        l[b]["Number of people performing PCR tests"] = a[1]
        l[b]["Those who need hospital treatment, etc."] = a[2]
        l[b]["Severe"] = a[3]
        l[b]["Number of people discharged or canceled"] = a[4]
        l[b]["Death (cumulative)"] = a[5]
        l[b]["Checking"] = a[6]
    return l
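# For illustration, a hypothetical parsed line (not real MHLW figures):
#   makeDict("東 京 1000 20000 300 40 600 50 10\n")
#   -> {"東京": {"Number of positives": 1000,
#               "Number of people performing PCR tests": 20000, ...}}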


cnt = 0  # index of the date column currently being written

if len(pdfs) == 0: pass  # nothing new to record
elif isGss:
    scope = [
        'https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive'
    ]

    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        'json file name.json', scope)  # replace with your JSON key file name
    gc = gspread.authorize(credentials)

    SPREADSHEET_KEY = 'Spreadsheet key'  # replace with your spreadsheet key

    workbook = gc.open_by_key(SPREADSHEET_KEY)
    sheet_names = [
        'Number of positives', 'Number of people performing PCR tests',
        'Those who need hospital treatment, etc.', 'Severe',
        'Number of people discharged or canceled', 'Death (cumulative)',
        'Checking'
    ]
    try:
        worksheets = [workbook.worksheet(name) for name in sheet_names]
        isFirst = False
    except gspread.exceptions.WorksheetNotFound:
        # First run: create one worksheet per statistic, drop the default sheet
        for name in sheet_names:
            workbook.add_worksheet(title=name, rows=100, cols=500)
        workbook.del_worksheet(workbook.sheet1)
        worksheets = [workbook.worksheet(name) for name in sheet_names]
        isFirst = True

    # Fetch the target cell range (rows 1-50, one column per new date) for
    # each worksheet in a single call, to keep API requests low
    cas = [
        worksheet.range(1, column_cnt, 50, column_cnt + len(pdfs) - 1)
        for worksheet in worksheets
    ]
    for date, url in sorted(pdfs.items()):
        print(date)
        file_data = parser.from_buffer(requests.get(url).content)  # extract PDF text via Tika
        text = file_data["content"]

        l = makeDict(text)
        # On the first run, write the prefecture names down the first column
        if isFirst:
            for worksheet in worksheets:
                cells = worksheet.range(2, 1, 50, 1)
                for i, key in enumerate(l.keys()):
                    cells[i].value = key
                worksheet.update_cells(cells)
            isFirst = False
        for i, worksheet in enumerate(worksheets):
            # The fetched range is row-major with len(pdfs) columns, so slicing
            # with step len(pdfs) selects the single column for this date
            cells = cas[i][cnt::len(pdfs)]
            cells[0].value = date
            for j, a in enumerate(l.keys()):
                key = list(l[a].keys())[i]
                cells[j + 1].value = l[a][key]
        cnt += 1

    # Push all new date columns to Google Sheets in one batched update per sheet
    for i, worksheet in enumerate(worksheets):
        worksheet.update_cells(cas[i])

else:
    filename = "covid19.xlsx"
    try:
        wb = openpyxl.load_workbook(filename)
        wss = [
            wb['Number of positives'],
            wb['Number of people performing PCR tests'],
            wb['Those who need hospital treatment, etc.'],
            wb['Severe'],
            wb['Number of people discharged or canceled'],
            wb['Death (cumulative)'],
            wb['Checking']
        ]
        isFirst = False
    except (FileNotFoundError, KeyError):
        # First run: create the workbook with one sheet per statistic
        wb = openpyxl.Workbook()
        wss = [
            wb.create_sheet('Number of positives'),
            wb.create_sheet('Number of people performing PCR tests'),
            wb.create_sheet('Those who need hospital treatment, etc.'),
            wb.create_sheet('Severe'),
            wb.create_sheet('Number of people discharged or canceled'),
            wb.create_sheet('Death (cumulative)'),
            wb.create_sheet('Checking')
        ]
        wb.remove(wb['Sheet'])  # drop the default sheet
        isFirst = True

    for date, url in sorted(pdfs.items()):
        print(date)
        file_data = parser.from_buffer(requests.get(url).content)  # extract PDF text via Tika
        text = file_data["content"]

        l = makeDict(text)
        # On the first run, write the prefecture names down the first column
        if isFirst:
            for ws in wss:
                for i, key in enumerate(l.keys()):
                    ws.cell(i + 2, 1, key)
            isFirst = False
        for i, ws in enumerate(wss):
            ws.cell(1, column_cnt + cnt, date)
            for j, a in enumerate(l.keys()):
                key = list(l[a].keys())[i]
                ws.cell(j + 2, column_cnt + cnt, l[a][key])
        cnt += 1

    wb.save(filename)

How to use

Just save the script and run it with Python. A dates.pickle file will be generated to keep track of the dates that have already been recorded; please leave it in the same directory as the script.
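If you ever want to re-record everything from scratch, deleting this cache before running again should be enough, since the date columns are simply rewritten. A sketch:

import os

# Remove the recorded-dates cache so the next run re-fetches every release
if os.path.exists("dates.pickle"):
    os.remove("dates.pickle")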

Result


You can easily visualize changes in the number of infected people by graphing the recorded data.

Summary

Looking at the nationwide changes in the number of infected people, I wonder whether the second wave is settling down ...? If you want to observe infection trends in detail or run statistical analyses, please give it a try. If you run the script regularly with a task scheduler or Google Cloud Functions (GCF), the data can be kept up to date automatically.
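For the GCF route, a minimal sketch of an HTTP-triggered entry point might look like this, assuming the collection logic above has been wrapped in a main() function (both names here are placeholders, not part of the program above):

# Hypothetical Cloud Functions entry point; assumes the collection-and-
# recording logic has been refactored into a main() function
def update_sheet(request):
    main()            # run the collection and recording logic
    return "updated"  # simple response for the HTTP trigger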
