Checking for broken links by hand takes time, and the results are not reliable. External tools, on the other hand, are heavyweight and cannot be run in the development environment, so I wrote a checker myself. It supports both relative links and absolute links.
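To illustrate what "relative" and "absolute" mean here, the minimal sketch below (not part of the test itself; the example URLs are just placeholders) shows how a relative href is turned into a full URL on the same host, which is what the checker does by hand. The standard library's urljoin gives the same normalization.

from urllib.parse import urljoin, urlparse

page = "http://www.disney.co.jp/home.html"
host = urlparse(page).netloc

# a relative link is resolved against the page it appears on
print(urljoin(page, "/news/index.html"))  # -> http://www.disney.co.jp/news/index.html

# an absolute link is only tested when it points at the same domain
absolute = "http://www.disney.co.jp/movie.html"
print(absolute if host in absolute else "IGNORE:" + absolute)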
This version uses Beautiful Soup to parse the page and asyncio/aiohttp for non-blocking HTTP requests, which makes it roughly 60% faster than the Python 2 version at the bottom of the page. With this code, 100 links can be checked in 1-3 seconds.
tests_url.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
import asyncio
import aiohttp


def tests_urls():
    urls = [
        "http://www.disney.co.jp/home.html",
        "http://starwars.disney.co.jp/home.html"
    ]
    for test_url in urls:
        parse_and_request(test_url)


def parse_and_request(url):
    """
    Download the url, parse it with bs4
    and check the status of every link
    """
    # parse url
    o = urlparse(url)
    host = o.netloc

    # GET and parse the specified URL
    response = requests.get(url, timeout=2)
    assert response.status_code == 200
    soup = BeautifulSoup(response.text, "lxml")

    test_urls = []
    for a in soup.find_all("a"):
        href = a.get("href")
        if not href or href[0] == '#':
            # no href attribute or in-page anchor
            pass
        elif href[0] == '/':
            # relative link
            test_url = 'http://{}{}'.format(host, href)
            test_urls.append(test_url)
        elif host in href:
            # absolute link on the same domain
            test_urls.append(href)
        else:
            # do not test external site links
            print('IGNORE:{}'.format(href))

    # deduplication
    test_urls = list(set(test_urls))
    for test_url in test_urls:
        print(test_url)

    # check that every link is alive, running the requests asynchronously
    loop = asyncio.get_event_loop()
    loop.run_until_complete(
        asyncio.gather(*[check_url(test_url) for test_url in test_urls]))


async def check_url(url):
    """
    Fetch the URL asynchronously and check that it responds with HTTP status 200
    :param url: str
    """
    async with aiohttp.request('GET', url) as response:
        status_code = response.status
        assert status_code == 200, '{}:{}'.format(str(status_code), url)
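The asynchronous version fires one request per link at the same time. On a page with many hundreds of links, one possible refinement (not in the original code; the limit of 10 below is an arbitrary assumption) is to cap the number of requests in flight with a semaphore:

import asyncio
import aiohttp


async def check_all(urls, limit=10):
    # limit is an assumed cap on concurrent requests
    semaphore = asyncio.Semaphore(limit)

    async def check_one(url):
        async with semaphore:
            async with aiohttp.request('GET', url) as response:
                assert response.status == 200, '{}:{}'.format(response.status, url)

    await asyncio.gather(*[check_one(url) for url in urls])

# instead of gathering check_url() directly:
# loop.run_until_complete(check_all(test_urls))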
Execution method
>>>py.test tests_url.py
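pytest captures stdout by default, so the printed list of tested and ignored URLs only appears when capture is disabled with -s. When a link is dead, the assert in check_url fails and pytest reports the status code and the offending URL from the assert message.
>>>py.test -s tests_url.py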
Python 2 version
tests_url.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests


def tests_urls():
    urls = [
        "http://www.disney.co.jp/home.html",
        "http://starwars.disney.co.jp/home.html"
    ]
    for test_url in urls:
        parse_and_request(test_url)


def parse_and_request(url):
    """
    Download the url, parse it with bs4
    and check the status of every link
    """
    # parse url
    o = urlparse(url)
    host = o.netloc

    # GET and parse the specified URL
    response = requests.get(url, timeout=2)
    assert response.status_code == 200
    soup = BeautifulSoup(response.text, "lxml")

    test_urls = []
    for a in soup.find_all("a"):
        href = a.get("href")
        if not href or href[0] == '#':
            # no href attribute or in-page anchor
            pass
        elif href[0] == '/':
            # relative link
            test_url = 'http://{}{}'.format(host, href)
            test_urls.append(test_url)
        elif host in href:
            # absolute link on the same domain
            test_urls.append(href)
        else:
            # do not test external site links
            print('IGNORE:{}'.format(href))

    # deduplication
    test_urls = list(set(test_urls))
    for test_url in test_urls:
        print(test_url)

        # check that the link is alive
        response = requests.get(test_url, timeout=2)
        assert response.status_code == 200