Download the past meteorological data of the Japan Meteorological Agency.
This time we fetch the wind data, parsing the HTML with Python's Beautiful Soup.
First, import the required modules.
Preparation
import requests
import bs4
import pandas as pd
import datetime
import time
import numpy as np
from numpy import NaN
Next is the conversion helper. If the string is a number, it is converted to a float. `///` is the marker used when there is no observation data.
helper
def convert(item_str):
    """Convert the text of one table cell into a value for the data frame.

    Args:
        item_str: Text content of a single table cell.

    Returns:
        ``''`` for an empty (falsy) cell, a ``float`` for a numeric string,
        NaN for the ``'///'`` marker (no observation data), and the
        unchanged string for anything else.
    """
    if not item_str:
        return ''
    if item_str == '///':
        # Built-in NaN: avoids relying on the ``numpy.NaN`` alias, which was
        # removed in NumPy 2.0.
        return float('nan')
    # Cheap pre-filter: only strings made of digits, '.' and '-' are
    # candidates for numeric conversion.
    if item_str.replace('.', '').replace('-', '').isdigit():
        try:
            return float(item_str)
        except ValueError:
            # Strings such as '1-2' or '1.2.3' pass the filter above but are
            # not valid numbers; keep them as text instead of crashing.
            return item_str
    return item_str
Then the main body. Put the prefecture number (prec_no) and the station number (block_no) into the URL. To find them, start from http://www.data.jma.go.jp/obd/stats/etrn/ and select the location step by step, then look at the final URL and copy the values from it.
The date is specified in the URL and the time in each row comes in hh:mm format, so the two are combined with datetime + timedelta to obtain the JST timestamp.
Body
# Column headers of the output sheet: the combined timestamp first, followed
# by the cells of one table row in page order.
columns = ('JST', 'time', 'Precipitation', 'temperature', 'Average wind speed', 'Average wind direction', 'Maximum instantaneous wind speed', 'Maximum instantaneous wind speed時風向', 'Daylight hours')

# Number of <td> cells consumed per table row.
# NOTE(review): assumes the page still has exactly 8 data cells per row --
# confirm against the current page layout.
cells_per_row = 8

# No whitespace after '?': a stray space there turns the first query key
# into ' prec_no'.
url_template = ('http://www.data.jma.go.jp/obd/stats/etrn/view/10min_a1.php'
                '?prec_no=44&block_no=47662&year={}&month={}&day={}&view=')

first_day = datetime.datetime(2015, 1, 1)
n_days = (datetime.datetime(2017, 1, 1) - first_day).days  # 2015 ... 2016

# Plain row lists are collected here and turned into a DataFrame once at the
# end; building and concatenating one DataFrame per row is far slower.
all_rows = []
for day_offset in range(n_days):
    # Stepping with timedelta only ever yields valid calendar dates.
    this_day = first_day + datetime.timedelta(days=day_offset)
    print(this_day)
    url = url_template.format(this_day.year, this_day.month, this_day.day)
    print(url)
    time.sleep(1)  # wait for 1 sec between requests
    try:
        # The timeout keeps a stalled connection from hanging the whole run.
        res = requests.get(url, timeout=30)
        res.raise_for_status()  # check for HTTP error status
    except requests.RequestException as e:
        print('Error: {}'.format(e))
        continue  # go to the next day if this one could not be fetched
    res.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(res.text, "lxml")
    tbl = soup.select("#tablefix1 td")  # all data cells of the table
    n_rows = len(tbl) // cells_per_row
    for r in range(n_rows):
        start = cells_per_row * r
        cells = [td.getText() for td in tbl[start:start + cells_per_row]]
        # JST: '00:10' --> '00', '10'
        hh, mm = cells[0].split(":")
        # Adding a timedelta turns "24:00" into "00:00" of the next day.
        row_time = this_day + datetime.timedelta(hours=int(hh), minutes=int(mm))
        all_rows.append([row_time] + [convert(cell) for cell in cells])

# Building the frame once also works when nothing was scraped (empty frame
# with headers), whereas pd.concat raises on an empty list.
df = pd.DataFrame(all_rows, columns=list(columns))
df.to_excel('wind_data.xlsx')
Recommended Posts