Scrape train operation information from Yahoo! Transit route-information pages and send it as a LINE notification.
import os
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
class NotFoundElementError(Exception):
    """Raised when a required element is missing (unset environment variable or absent HTML node)."""
class Collecter:
    """Collects train operation information from Yahoo! route-information pages.

    URLs to scrape are taken from the TRAIN_URLS environment variable
    (whitespace-separated list).
    """

    def __init__(self):
        """Read the target URLs from the TRAIN_URLS environment variable.

        @raise NotFoundElementError: TRAIN_URLS is not set.
        """
        try:
            # str.split() already returns a list; the extra list() was redundant.
            self.urls = os.environ['TRAIN_URLS'].split()
        except KeyError:
            raise NotFoundElementError('url get failed!') from None

    def format_train_info(self, info, err_trains):
        """Shape collected operation information into a notification message.

        @param info: [[<str:Route>, <str:Details>], ...]
        @param err_trains: [<str:URL whose collection failed>, ...]
        @return: <str:Formatted operation information>
        @raise ValueError: a details string is not the expected 3-line shape.
        """
        train_info = '\n'
        for route, details in info:
            try:
                # Details are expected as "<lead>\n<separator>\n<detail>";
                # the middle line is discarded.
                lead, _, detail = details.strip('\n').split('\n')
            except ValueError as err:
                # Chain the cause so the malformed input is visible in tracebacks.
                raise ValueError('format failed!') from err
            train_info += '{0}\n{1}\n{2}\n\n'.format(route, lead, detail)
        if not err_trains:
            train_info += 'Collect Complete!'
        else:
            train_info += 'This is Error url!'
            for url in err_trains:
                train_info += '\n' + url
        return train_info

    def get_train_info(self, timeout=10):
        """Collect operation information from all configured URLs in parallel.

        @param timeout: per-request timeout in seconds (prevents a hung GET
            from blocking the whole run forever).
        @return: <str:Formatted operation information>
        @raise NotFoundElementError: no URL could be collected successfully.
        """
        def _fetch(url):
            # A network failure for one URL must not abort the whole batch
            # (previously any ConnectionError escaped pool.map uncaught);
            # return the URL itself as a failure sentinel.
            try:
                return requests.get(url, timeout=timeout)
            except requests.exceptions.RequestException:
                return url

        # The context manager guarantees worker threads are shut down
        # (the original executor was never closed).
        with ThreadPoolExecutor() as pool:
            res_list = list(pool.map(_fetch, self.urls))

        train_info, err_trains = [], []
        for res in res_list:
            if isinstance(res, str):
                # _fetch sentinel: the request itself failed.
                err_trains.append(res)
                continue
            try:
                res.raise_for_status()
            except requests.exceptions.RequestException:
                err_trains.append(res.url)
                continue
            bs_obj = BeautifulSoup(res.text, 'lxml')
            try:
                route = bs_obj.h1.text
                detail = bs_obj.find(id='mdServiceStatus').text
            except AttributeError:
                # Page layout changed or the element is missing; record the URL.
                err_trains.append(res.url)
            else:
                train_info.append([route, detail])
        if not train_info:
            raise NotFoundElementError('collect failed!')
        return self.format_train_info(train_info, err_trains)
I implemented parallel task processing using multithreading. By issuing the GET requests for the 6 URLs in parallel, execution became about 1 second faster.
pool = ThreadPoolExecutor()
res_list = pool.map(requests.get, self.urls)
Because of thread overhead, parallelization does not always improve speed, so judge whether to use it on a case-by-case basis.
import os
import requests
class Line:
    """Sends collected operation information via the LINE notification API.

    Endpoint and token are taken from the LINE_API_URL and LINE_API_TOKEN
    environment variables.
    """

    def __init__(self):
        """Read the API endpoint and token from the environment.

        @raise KeyError: LINE_API_URL or LINE_API_TOKEN is not set.
        """
        try:
            self.url = os.environ['LINE_API_URL']
            self.token = os.environ['LINE_API_TOKEN']
        except KeyError:
            # Re-raise unchanged: the original `raise KeyError(err)` wrapped
            # the exception in itself and mangled the missing-key message.
            raise
        self.headers = {'Authorization': 'Bearer ' + self.token}

    def _notify(self, message):
        """POST one notification message (shared by success and error paths).

        @param message: <str:Message body to deliver>
        """
        # A timeout prevents a stalled API call from hanging the process.
        requests.post(self.url,
                      headers=self.headers,
                      params={'message': message},
                      timeout=10)

    def send_success(self, info):
        """Notify the formatted operation information.

        @param info: <str:Operation information>
        """
        self._notify(info)

    def send_error(self, err_msg):
        """Notify that collection failed.

        @param err_msg: <str:Error message>
        """
        self._notify(err_msg)
With Selenium you need to download a WebDriver, whereas Beautiful Soup is easy to use — just install the library. It also runs faster than Selenium because it does not drive a browser. Selenium is more convenient for dynamic sites that rely on JavaScript. As a rule of thumb, use Beautiful Soup by default and reach for Selenium only where needed.
concurrent.futures — Launching parallel tasks (Python documentation)
Recommended Posts