Overview

This script performs a Google search and, for each hit page, checks whether any of its `title`, `og:description`, or `h1`–`h4` elements contains a specific keyword. If so, the title and URL of that page are written, tab-separated, to a text file. If the URL is already in the text file, the page is skipped.

Google prohibits scraping, so please avoid putting load on its servers and use this script at your own risk. https://support.google.com/webmasters/answer/66357

Python version

3.8.2

Execution procedure

After creating a virtual environment (with venv or similar), run:

pip install -r requirements.txt
python main.py

code

requirements.txt

beautifulsoup4 == 4.9.1
requests == 2.24.0

settings.py

settings = {
    # Keywords used for the Google search (joined with spaces into one query)
    'google_search_keywords': ['Medical', 'corona'],

    # Number of search results to request from Google
    'google_search_num': 10,

    # Keywords to look for within each hit page
    'search_keywords_in_page': ['Medical']
}

main.py

import re
import urllib.parse
from typing import List

import bs4
import requests

from output import OutputText
from settings import settings


def get_ogdesc_from_soup(soup: bs4.BeautifulSoup) -> str:
    """Return the page's Open Graph description.

    Looks for the first ``<meta property="og:description" content="...">``
    element in ``soup`` and returns the value of its ``content`` attribute.
    An empty string is returned when no such element exists.
    """
    # `content: True` restricts the match to tags that carry the attribute.
    wanted_attrs = {'property': 'og:description', 'content': True}
    meta_tag = soup.find('meta', attrs=wanted_attrs)
    return meta_tag['content'] if meta_tag else ''


def get_href_from_soup(soup: 'bs4.element.Tag') -> str:
    """Extract the target URL from a Google result anchor.

    Google result links look like ``/url?q=<target>&sa=...``. The target is
    the text from the first ``http`` up to (not including) the last ``&sa``,
    percent-decoded.

    Args:
        soup: An anchor element (anything exposing ``.get('href')``).

    Returns:
        The decoded target URL, or an empty string when the element has no
        ``href`` or the ``href`` does not have the expected shape.
    """
    raw_href = soup.get('href')
    if not raw_href:
        return ''

    # Greedy `.+` deliberately runs to the LAST `&sa`, which is the delimiter
    # Google appends after the target URL.
    match = re.search(r'(?P<target>http.+)&sa', raw_href)
    if match is None:
        return ''

    return urllib.parse.unquote(match.group('target'))


def do_google_search(keywords: List[str], search_num: int) -> List[str]:
    """Run a Google search and return the URLs of the hits.

    Args:
        keywords: Search terms; they are joined with spaces into one query.
        search_num: Number of results to request (sent as the ``num``
            query parameter).

    Returns:
        The target URLs extracted from the result page, in page order.
        Anchors from which no URL could be extracted are left out.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = 'https://www.google.co.jp/search'
    params = {
        'hl': 'ja',
        'num': search_num,
        'q': ' '.join(keywords),
    }
    # Without a timeout, requests may wait forever on an unresponsive server.
    response = requests.get(url, params=params, timeout=10)

    # NOTE: the `.kCrYT` class is Google-internal markup and may need to be
    # updated whenever Google changes its result page.
    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    anchors = soup.select('.kCrYT > a')

    hrefs = (get_href_from_soup(anchor) for anchor in anchors)
    return [href for href in hrefs if href]


def _contains_any_keyword(texts, keywords):
    """Return True if any keyword occurs as a substring of any text."""
    return any(keyword in text for keyword in keywords for text in texts)


def main():
    """Search Google and record the hit pages that mention a keyword.

    For every URL returned by the search that is not yet recorded in
    ``output.txt``, the page is fetched and its ``<title>``,
    ``og:description`` and ``h1``-``h4`` texts are checked against
    ``settings['search_keywords_in_page']``. Matching pages are appended to
    ``output.txt`` as title/URL pairs; finally the human-readable companion
    file is regenerated.
    """
    output_text = OutputText('output.txt')
    urls = do_google_search(settings['google_search_keywords'], settings['google_search_num'])

    # Load the recorded URLs once instead of re-reading the file per URL;
    # URLs written during this run are added below so duplicates in `urls`
    # are still skipped.
    known_urls = set(output_text.get_urls())

    for url in urls:
        # Skip pages that are already recorded
        if url in known_urls:
            continue

        try:
            # Without a timeout a single unresponsive site stalls the run.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException:
            # Unreachable / failing pages are skipped on purpose; catching
            # only request errors keeps Ctrl-C and real bugs visible.
            continue

        # The raw bytes are handed to BeautifulSoup, so overriding
        # `response.encoding` (which only affects `response.text`) is not
        # needed.
        soup = bs4.BeautifulSoup(response.content, 'html.parser')

        titles = [tag.text for tag in soup.select('title')]
        headings = [tag.text for tag in soup.select('h1, h2, h3, h4')]
        texts = titles + [get_ogdesc_from_soup(soup)] + headings

        # Skip pages that mention none of the keywords
        if not _contains_any_keyword(texts, settings['search_keywords_in_page']):
            continue

        # Fall back to a placeholder when the page has no (or an empty) title
        title = titles[0].strip().replace('\n', '') if titles else ''
        output_text.write(title or '**No title**', url)
        known_urls.add(url)

    # Output a text file in an easy-to-read format
    output_text.output_readable_file()


# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

output.py

import myutil as u
import os


class OutputText:
    """Line-based text store of ``title<TAB>url`` records.

    Each record occupies one line of the UTF-8 file at ``file_path``; the
    title and the URL are separated by a single tab character.
    """

    # Path of the backing text file; set per instance in __init__.
    file_path = None

    # Tabs and line breaks delimit fields/records, so they must not appear
    # inside a field; they are replaced by spaces when writing.
    _FIELD_SEPARATORS = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})

    def __init__(self, file_path):
        """Remember ``file_path`` and create the file if it does not exist."""
        self.file_path = file_path

        # Append mode creates a missing file without truncating an existing one.
        with open(self.file_path, mode='a', encoding='utf-8'):
            pass

    def write(self, title, url):
        """Append one ``title<TAB>url`` record to the file."""
        clean_title = title.translate(self._FIELD_SEPARATORS)
        clean_url = url.translate(self._FIELD_SEPARATORS)
        with open(self.file_path, mode='a', encoding='utf-8') as f:
            u.write_with_tab(f, clean_title, clean_url)
            f.write('\n')

    def get_urls(self):
        """Return the URL field of every record in the file."""
        return [self.get_url(line) for line in self.get_lines()]

    def output_readable_file(self):
        """Write a human-readable copy next to the data file.

        The copy is named like the data file with ``_readable`` inserted
        before the extension (``output.txt`` -> ``output_readable.txt``).
        """
        # Only the final extension is touched, so the result can never be the
        # data file itself (which would be truncated by opening it for writing).
        root, ext = os.path.splitext(self.file_path)
        readable_path = root + '_readable' + ext

        lines = self.get_lines()
        with open(readable_path, mode='w', encoding='utf-8') as f:
            for line in lines:
                f.write(self.get_title(line) + '\n' + self.get_url(line) + '\n')
                f.write('\n------------------------------\n\n')

    def get_lines(self):
        """Return the non-blank lines of the file (empty list for an empty file)."""
        with open(self.file_path, mode='r', encoding='utf-8') as f:
            text = f.read()
        return [line for line in text.split('\n') if line.strip()]

    def get_title(self, line):
        """Return the first tab-separated field of ``line``."""
        return line.split('\t')[0]

    def get_url(self, line):
        """Return the second tab-separated field of ``line``, or '' if absent."""
        fields = line.split('\t')
        return fields[1] if len(fields) >= 2 else ''

myutil.py

def write_with_tab(file, *strings):
    """Write ``strings`` to ``file`` separated by tab characters.

    No trailing tab or newline is written. Nothing is written when no
    strings are given.

    Args:
        file: Object with a ``write(str)`` method.
        *strings: The text fields to write, in order.

    Returns:
        The ``file`` object that was passed in.
    """
    if strings:
        file.write('\t'.join(strings))
    return file

[Python scraping] Output the URL and title of the site containing a specific keyword to a text file

Overview

Python version

Execution procedure

code