Checking for broken links by hand takes time, and the results are not reliable. External tools, on the other hand, are heavyweight and cannot be run in the development environment, so I wrote a checker myself. It supports both relative links and absolute links.
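To illustrate what "relative" and "absolute" mean here, the minimal sketch below (not part of the test itself; the example URLs are just placeholders) shows how a relative href is turned into a full URL on the same host, which is what the checker does by hand. The standard library's urljoin gives the same normalization.

from urllib.parse import urljoin, urlparse

page = "http://www.disney.co.jp/home.html"
host = urlparse(page).netloc

# a relative link is resolved against the page it appears on
print(urljoin(page, "/news/index.html"))  # -> http://www.disney.co.jp/news/index.html

# an absolute link is only tested when it points at the same domain
absolute = "http://www.disney.co.jp/movie.html"
print(absolute if host in absolute else "IGNORE:" + absolute)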
This version uses Beautiful Soup to parse the page and asyncio/aiohttp for non-blocking HTTP requests, which makes it roughly 60% faster than the Python 2 version at the bottom of the page. With this code, 100 links can be checked in 1-3 seconds.
tests_url.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
import asyncio
import aiohttp


def tests_urls():
    urls = [
        "http://www.disney.co.jp/home.html",
        "http://starwars.disney.co.jp/home.html"
    ]
    for test_url in urls:
        parse_and_request(test_url)


def parse_and_request(url):
    """
    Download the url, parse it with bs4
    and check the status of every link
    """
    # parse url
    o = urlparse(url)
    host = o.netloc

    # GET and parse the specified URL
    response = requests.get(url, timeout=2)
    assert response.status_code == 200
    soup = BeautifulSoup(response.text, "lxml")

    test_urls = []
    for a in soup.find_all("a"):
        href = a.get("href")
        if not href or href[0] == '#':
            # no href attribute or in-page anchor
            pass
        elif href[0] == '/':
            # relative link
            test_url = 'http://{}{}'.format(host, href)
            test_urls.append(test_url)
        elif host in href:
            # absolute link on the same domain
            test_urls.append(href)
        else:
            # do not test external site links
            print('IGNORE:{}'.format(href))

    # deduplication
    test_urls = list(set(test_urls))
    for test_url in test_urls:
        print(test_url)

    # check that every link is alive, running the requests asynchronously
    loop = asyncio.get_event_loop()
    loop.run_until_complete(
        asyncio.gather(*[check_url(test_url) for test_url in test_urls]))


async def check_url(url):
    """
    Fetch the URL asynchronously and check that it responds with HTTP status 200
    :param url: str
    """
    async with aiohttp.request('GET', url) as response:
        status_code = response.status
        assert status_code == 200, '{}:{}'.format(str(status_code), url)
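The asynchronous version fires one request per link at the same time. On a page with many hundreds of links, one possible refinement (not in the original code; the limit of 10 below is an arbitrary assumption) is to cap the number of requests in flight with a semaphore:

import asyncio
import aiohttp


async def check_all(urls, limit=10):
    # limit is an assumed cap on concurrent requests
    semaphore = asyncio.Semaphore(limit)

    async def check_one(url):
        async with semaphore:
            async with aiohttp.request('GET', url) as response:
                assert response.status == 200, '{}:{}'.format(response.status, url)

    await asyncio.gather(*[check_one(url) for url in urls])

# instead of gathering check_url() directly:
# loop.run_until_complete(check_all(test_urls))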
Execution method
>>>py.test tests_url.py
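pytest captures stdout by default, so the printed list of tested and ignored URLs only appears when capture is disabled with -s. When a link is dead, the assert in check_url fails and pytest reports the status code and the offending URL from the assert message.
>>>py.test -s tests_url.py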
Python 2 version
tests_url.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests


def tests_urls():
    urls = [
        "http://www.disney.co.jp/home.html",
        "http://starwars.disney.co.jp/home.html"
    ]
    for test_url in urls:
        parse_and_request(test_url)


def parse_and_request(url):
    """
    Download the url, parse it with bs4
    and check the status of every link
    """
    # parse url
    o = urlparse(url)
    host = o.netloc

    # GET and parse the specified URL
    response = requests.get(url, timeout=2)
    assert response.status_code == 200
    soup = BeautifulSoup(response.text, "lxml")

    test_urls = []
    for a in soup.find_all("a"):
        href = a.get("href")
        if not href or href[0] == '#':
            # no href attribute or in-page anchor
            pass
        elif href[0] == '/':
            # relative link
            test_url = 'http://{}{}'.format(host, href)
            test_urls.append(test_url)
        elif host in href:
            # absolute link on the same domain
            test_urls.append(href)
        else:
            # do not test external site links
            print('IGNORE:{}'.format(href))

    # deduplication
    test_urls = list(set(test_urls))
    for test_url in test_urls:
        print(test_url)

        # check that the link is alive
        response = requests.get(test_url, timeout=2)
        assert response.status_code == 200