Scraping data wrangling of statistical information on new coronavirus infection in Yamanashi Prefecture
Introduction
procedure
- Divide the period into start and end, divide the breakdown by day, combine
- Convert start and end to date
- Organize by date
- Divide into days and number
- Combine start / end and date / number
- Convert from day to date
- Date and subtotal
Scraping
dfs = pd.read_html(
"https://www.pref.yamanashi.jp/koucho/coronavirus/info_coronavirus_data.html"
)
df0 = df.iloc[1:].set_axis(["period", "number", "Breakdown"], axis=1)
|
period |
number |
Breakdown |
1 |
October 4th (Sunday) -October 10th (Saturday) |
25 |
25 cases on 4 days |
2 |
September 27th (Sunday) -October 3rd (Saturday) |
245 |
27 days 18 cases, 28 days 21 cases, 29 days 56 cases, 30 days 46 cases, October 1 44 cases, 2 days 27 cases, 3 days 33 cases |
3 |
September 20th (Sunday) -September 26th (Saturday) |
230 |
20 days 23 cases, 21 days 23 cases, 22 days 17 cases, 23 days 38 cases, 24 days 44 cases, 25 days 32 cases, 26 days 53 cases |
4 |
September 13th (Sunday) -September 19th (Saturday) |
296 |
13 days 35 cases, 14 days 63 cases, 15 days 51 cases, 16 days 29 cases, 17 days 48 cases, 18 days 31 cases, 19 days 39 cases |
5 |
September 6th (Sunday) -September 12th (Saturday) |
285 |
6 days 33 cases, 7 days 36 cases, 8 days 39 cases, 9 days 37 cases, 10 days 50 cases, 11 days 20 cases, 12 days 40 cases |
6 |
August 30th (Sunday) -September 5th (Saturday) |
280 |
30 days 34 cases, 31 days 23 cases, September 1 57 cases, 2 days 35 cases, 3 days 42 cases, 4 days 49 cases, 5 days 40 cases |
7 |
August 23 (Sunday) -August 29 (Saturday) |
371 |
23 days 26 cases, 24 days 90 cases, 25 days 54 cases, 26 days 58 cases, 27 days 56 cases, 28 days 49 cases, 29 days 38 cases |
8 |
August 16th (Sunday) -August 22nd (Saturday) |
537 |
16 days 83 cases, 17 days 54 cases, 18 days 94 cases, 19 days 108 cases, 20 days 95 cases, 21 days 69 cases, 22 days 34 cases |
9 |
August 9th (Sunday) -August 15th (Saturday) |
500 |
9 days 68 cases, 10 days 75 cases, 11 days 57 cases, 12 days 73 cases, 13 days 77 cases, 14 days 63 cases, 15 days 87 cases |
10 |
August 2nd (Sunday) -August 8th (Saturday) |
711 |
2 days 52 cases, 3 days 43 cases, 4 days 90 cases, 5 days 84 cases, 6 days 126 cases, 7 days 141 cases, 8 days 175 cases |
Data wrangling
Divide the period into start and end, divide the breakdown by day, combine
df1 = pd.concat(
[
df0["period"].str.split("~", expand=True).rename(columns={0: "start", 1: "End"}),
df0["Breakdown"].str.split("、", expand=True),
],
axis=1,
)
|
start |
End |
0 |
1 |
2 |
3 |
4 |
5 |
6 |
1 |
October 4th (Sunday) |
October 10th (Saturday) |
25 cases on 4 days |
|
|
|
|
|
|
2 |
September 27 (Sunday) |
Saturday, October 3 |
18 cases on 27th |
21 cases on 28th |
29 days 56 cases |
46 cases on 30 days |
October 1st 44 cases |
27 cases on 2 days |
33 cases in 3 days |
3 |
September 20th (Sunday) |
Saturday, September 26 |
23 cases on 20th |
21 days 23 cases |
22 days 17 cases |
23 days 38 cases |
44 cases on 24th |
32 cases on 25th |
26 days 53 cases |
4 |
September 13th (Sunday) |
Saturday, September 19 |
13 days 35 cases |
63 cases on 14th |
51 cases on 15th |
29 cases on 16th |
48 cases on 17th |
18 days 31 cases |
39 cases on 19th |
5 |
September 6th (Sunday) |
Saturday, September 12 |
33 cases on 6 days |
36 cases on 7 days |
39 cases on 8th |
37 cases on 9th |
50 cases in 10 days |
20 cases on 11th |
40 cases on 12th |
6 |
August 30 (Sunday) |
Saturday, September 5 |
34 cases in 30 days |
23 cases on 31st |
September 1st 57 cases |
35 cases in 2 days |
42 cases on 3 days |
49 cases on 4 days |
40 cases in 5 days |
7 |
August 23 (Sunday) |
Saturday, August 29 |
26 cases on 23rd |
90 cases on 24th |
54 cases on 25th |
26 days 58 cases |
27 days 56 cases |
28 days 49 cases |
29 days 38 cases |
8 |
August 16th (Sunday) |
Saturday, August 22 |
16 days 83 cases |
17 days 54 cases |
18 days 94 cases |
108 cases on 19th |
20 days 95 cases |
21 days 69 cases |
22 days 34 cases |
9 |
Sunday, August 9 |
Saturday, August 15 |
68 cases on 9th |
75 cases on 10 days |
57 cases on 11th |
73 cases on the 12th |
77 cases on 13th |
63 cases on 14th |
87 cases on 15th |
10 |
August 2nd (Sunday) |
Saturday, August 8 |
52 cases in 2 days |
43 cases in 3 days |
90 cases on 4 days |
84 cases on 5 days |
126 cases on 6 days |
141 cases on 7 days |
8 days 175 cases |
Convert start and end to date
df1["start"] = df1["start"].str.normalize("NFKC").apply(my_parser)
df1["End"] = df1["End"].str.normalize("NFKC").apply(my_parser)
|
start |
End |
0 |
1 |
2 |
3 |
4 |
5 |
6 |
1 |
2020-10-04 00:00:00 |
2020-10-10 00:00:00 |
25 cases on 4 days |
|
|
|
|
|
|
2 |
2020-09-27 00:00:00 |
2020-10-03 00:00:00 |
18 cases on 27th |
21 cases on 28th |
29 days 56 cases |
46 cases on 30 days |
October 1st 44 cases |
27 cases on 2 days |
33 cases in 3 days |
3 |
2020-09-20 00:00:00 |
2020-09-26 00:00:00 |
23 cases on 20th |
21 days 23 cases |
22 days 17 cases |
23 days 38 cases |
44 cases on 24th |
32 cases on 25th |
26 days 53 cases |
4 |
2020-09-13 00:00:00 |
2020-09-19 00:00:00 |
13 days 35 cases |
63 cases on 14th |
51 cases on 15th |
29 cases on 16th |
48 cases on 17th |
18 days 31 cases |
39 cases on 19th |
5 |
2020-09-06 00:00:00 |
2020-09-12 00:00:00 |
33 cases on 6 days |
36 cases on 7 days |
39 cases on 8th |
37 cases on 9th |
50 cases in 10 days |
20 cases on 11th |
40 cases on 12th |
6 |
2020-08-30 00:00:00 |
2020-09-05 00:00:00 |
34 cases in 30 days |
23 cases on 31st |
September 1st 57 cases |
35 cases in 2 days |
42 cases on 3 days |
49 cases on 4 days |
40 cases in 5 days |
7 |
2020-08-23 00:00:00 |
2020-08-29 00:00:00 |
26 cases on 23rd |
90 cases on 24th |
54 cases on 25th |
26 days 58 cases |
27 days 56 cases |
28 days 49 cases |
29 days 38 cases |
8 |
2020-08-16 00:00:00 |
2020-08-22 00:00:00 |
16 days 83 cases |
17 days 54 cases |
18 days 94 cases |
108 cases on 19th |
20 days 95 cases |
21 days 69 cases |
22 days 34 cases |
9 |
2020-08-09 00:00:00 |
2020-08-15 00:00:00 |
68 cases on 9th |
75 cases on 10 days |
57 cases on 11th |
73 cases on the 12th |
77 cases on 13th |
63 cases on 14th |
87 cases on 15th |
10 |
2020-08-02 00:00:00 |
2020-08-08 00:00:00 |
52 cases in 2 days |
43 cases in 3 days |
90 cases on 4 days |
84 cases on 5 days |
126 cases on 6 days |
141 cases on 7 days |
8 days 175 cases |
Organized by date
df2 = df1.melt(id_vars=["start", "End"]).dropna()
|
start |
End |
variable |
value |
0 |
2020-10-04 00:00:00 |
2020-10-10 00:00:00 |
0 |
25 cases on 4 days |
1 |
2020-09-27 00:00:00 |
2020-10-03 00:00:00 |
0 |
18 cases on 27th |
2 |
2020-09-20 00:00:00 |
2020-09-26 00:00:00 |
0 |
23 cases on 20th |
3 |
2020-09-13 00:00:00 |
2020-09-19 00:00:00 |
0 |
13 days 35 cases |
4 |
2020-09-06 00:00:00 |
2020-09-12 00:00:00 |
0 |
33 cases on 6 days |
5 |
2020-08-30 00:00:00 |
2020-09-05 00:00:00 |
0 |
34 cases in 30 days |
6 |
2020-08-23 00:00:00 |
2020-08-29 00:00:00 |
0 |
26 cases on 23rd |
7 |
2020-08-16 00:00:00 |
2020-08-22 00:00:00 |
0 |
16 days 83 cases |
8 |
2020-08-09 00:00:00 |
2020-08-15 00:00:00 |
0 |
68 cases on 9th |
9 |
2020-08-02 00:00:00 |
2020-08-08 00:00:00 |
0 |
52 cases in 2 days |
Divided into days and number
df3 = (
df2["value"]
.str.extract("([0-9,]+)[Daily]([0-9,]+)Case",expand=True)
.rename(columns={0: "Day", 1: "subtotal"})
.astype(int)
)
|
Day |
subtotal |
0 |
4 |
25 |
1 |
27 |
18 |
2 |
20 |
23 |
3 |
13 |
35 |
4 |
6 |
33 |
5 |
30 |
34 |
6 |
23 |
26 |
7 |
16 |
83 |
8 |
9 |
68 |
9 |
2 |
52 |
Combine start / end and date / number
df4 = pd.concat([df2, df3], axis=1)
|
start |
End |
variable |
value |
Day |
subtotal |
0 |
2020-10-04 00:00:00 |
2020-10-10 00:00:00 |
0 |
25 cases on 4 days |
4 |
25 |
1 |
2020-09-27 00:00:00 |
2020-10-03 00:00:00 |
0 |
18 cases on 27th |
27 |
18 |
2 |
2020-09-20 00:00:00 |
2020-09-26 00:00:00 |
0 |
23 cases on 20th |
20 |
23 |
3 |
2020-09-13 00:00:00 |
2020-09-19 00:00:00 |
0 |
13 days 35 cases |
13 |
35 |
4 |
2020-09-06 00:00:00 |
2020-09-12 00:00:00 |
0 |
33 cases on 6 days |
6 |
33 |
5 |
2020-08-30 00:00:00 |
2020-09-05 00:00:00 |
0 |
34 cases in 30 days |
30 |
34 |
6 |
2020-08-23 00:00:00 |
2020-08-29 00:00:00 |
0 |
26 cases on 23rd |
23 |
26 |
7 |
2020-08-16 00:00:00 |
2020-08-22 00:00:00 |
0 |
16 days 83 cases |
16 |
83 |
8 |
2020-08-09 00:00:00 |
2020-08-15 00:00:00 |
0 |
68 cases on 9th |
9 |
68 |
9 |
2020-08-02 00:00:00 |
2020-08-08 00:00:00 |
0 |
52 cases in 2 days |
2 |
52 |
Convert from day to date
- Compare the end date and day, replace the start date if it is larger, replace the end date if it is smaller, and create a date
df4["date"] = df4.apply(
lambda x: x["start"].replace(day=x["Day"])
if x["End"].day < x["Day"]
else x["End"].replace(day=x["Day"]),
axis=1,
)
|
start |
End |
variable |
value |
Day |
subtotal |
date |
0 |
2020-10-04 00:00:00 |
2020-10-10 00:00:00 |
0 |
25 cases on 4 days |
4 |
25 |
2020-10-04 00:00:00 |
1 |
2020-09-27 00:00:00 |
2020-10-03 00:00:00 |
0 |
18 cases on 27th |
27 |
18 |
2020-09-27 00:00:00 |
2 |
2020-09-20 00:00:00 |
2020-09-26 00:00:00 |
0 |
23 cases on 20th |
20 |
23 |
2020-09-20 00:00:00 |
3 |
2020-09-13 00:00:00 |
2020-09-19 00:00:00 |
0 |
13 days 35 cases |
13 |
35 |
2020-09-13 00:00:00 |
4 |
2020-09-06 00:00:00 |
2020-09-12 00:00:00 |
0 |
33 cases on 6 days |
6 |
33 |
2020-09-06 00:00:00 |
5 |
2020-08-30 00:00:00 |
2020-09-05 00:00:00 |
0 |
34 cases in 30 days |
30 |
34 |
2020-08-30 00:00:00 |
6 |
2020-08-23 00:00:00 |
2020-08-29 00:00:00 |
0 |
26 cases on 23rd |
23 |
26 |
2020-08-23 00:00:00 |
7 |
2020-08-16 00:00:00 |
2020-08-22 00:00:00 |
0 |
16 days 83 cases |
16 |
83 |
2020-08-16 00:00:00 |
8 |
2020-08-09 00:00:00 |
2020-08-15 00:00:00 |
0 |
68 cases on 9th |
9 |
68 |
2020-08-09 00:00:00 |
9 |
2020-08-02 00:00:00 |
2020-08-08 00:00:00 |
0 |
52 cases in 2 days |
2 |
52 |
2020-08-02 00:00:00 |
Date and subtotal
df = pd.DataFrame(
{"subtotal": df4.set_index("date")["subtotal"].sort_index().asfreq("D", fill_value=0)}
)
date |
subtotal |
2020-02-02 00:00:00 |
1 |
2020-02-03 00:00:00 |
2 |
2020-02-04 00:00:00 |
1 |
2020-02-05 00:00:00 |
1 |
2020-02-06 00:00:00 |
0 |
2020-02-07 00:00:00 |
0 |
2020-02-08 00:00:00 |
0 |
2020-02-09 00:00:00 |
0 |
2020-02-10 00:00:00 |
0 |
2020-02-11 00:00:00 |
1 |
import datetime
import re
import pandas as pd
def my_parser(s):
y = dt_now.year
m, d = map(int, re.findall("(\d{1,2})", s))
return pd.Timestamp(year=y, month=m, day=d)
def df_conv(df):
df0 = df.iloc[1:].set_axis(["period", "number", "Breakdown"], axis=1)
#Divide the period into start date and end date, divide the breakdown by day, combine
df1 = pd.concat(
[
df0["period"].str.split("~", expand=True).rename(columns={0: "start", 1: "End"}),
df0["Breakdown"].str.split("、", expand=True),
],
axis=1,
)
#Convert to date
df1["start"] = df1["start"].str.normalize("NFKC").apply(my_parser)
df1["End"] = df1["End"].str.normalize("NFKC").apply(my_parser)
#Organized by date
df2 = df1.melt(id_vars=["start", "End"]).dropna()
#Divided into days and number
df3 = (
df2["value"]
.str.extract("([0-9,]+)[Daily]([0-9,]+)Case",expand=True)
.rename(columns={0: "Day", 1: "subtotal"})
.astype(int)
)
#Combine start / end and date / number
df4 = pd.concat([df2, df3], axis=1)
#Convert from day to date
df4["date"] = df4.apply(
lambda x: x["start"].replace(day=x["Day"])
if x["End"].day < x["Day"]
else x["End"].replace(day=x["Day"]),
axis=1,
)
#Date and subtotal
df = pd.DataFrame(
{"subtotal": df4.set_index("date")["subtotal"].sort_index().asfreq("D", fill_value=0)}
)
return df
JST = datetime.timezone(datetime.timedelta(hours=+9))
dt_now = datetime.datetime.now(JST)
dfs = pd.read_html(
"https://www.pref.yamanashi.jp/koucho/coronavirus/info_coronavirus_data.html"
)
len(dfs)
#Inspection status of pseudo cases at the Prefectural Institute of Health and Environment
df1 = df_conv(dfs[1])
df1.to_csv("pcr.csv", encoding="utf_8_sig")
#Returnee / Contact Consultation Center
df2 = df_conv(dfs[2])
df2.to_csv("soudan.csv", encoding="utf_8_sig")
#Consultation dial for new coronavirus infection
df3 = df_conv(dfs[3])
df3.to_csv("dial.csv", encoding="utf_8_sig")