Kaggle offers plenty of large datasets, but sometimes the images you are looking for just aren't there. By running the code in this article, you can crawl images for yourself quickly and easily. This is a brief summary of a couple of Qiita articles plus an icrawler example.
I used this article as a reference (copied it almost as-is): https://qiita.com/Yuki-Takatsu/items/3f30727d5b21a83ea4ed
scrape_google.py
import requests
import random
import shutil
import bs4
import ssl
import os.path
ssl._create_default_https_context = ssl._create_unverified_context
def image(keyword):
    # Fetch a Google image search results page and return the src of one randomly chosen <img> tag
    res = requests.get("https://www.google.com/search?hl=jp&q=" + keyword + "&btnG=Google+Search&tbs=0&safe=off&tbm=isch")
    html = res.text
    soup = bs4.BeautifulSoup(html, 'lxml')
    links = soup.find_all("img")
    link = random.choice(links).get("src")
    return link
def download_img(url, file_name, save_dir, img_num):
    # Stream the image and save it as <save_dir>/<file_name>/<img_num>.png
    try:
        r = requests.get(url, stream=True)
        if r.status_code == 200:
            with open(os.path.join(save_dir, file_name, str(img_num) + ".png"), 'wb') as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
    except Exception:
        pass
if __name__ == "__main__":
    num = 10
    keyword = "cat"
    save_dir = 'data'

    data_list = keyword.split(' ')
    data = keyword.lower()
    file_name_dir = '_'.join(data_list)
    file_name_dir = file_name_dir.lower()

    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not os.path.exists(os.path.join(save_dir, file_name_dir)):
        os.mkdir(os.path.join(save_dir, file_name_dir))

    for i in range(int(num)):
        try:
            print(f'{i} th attempt for crawl...')
            link = image(data)
            download_img(link, file_name_dir, save_dir, i)
        except Exception:
            pass
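If you don't want to edit the script every time, the hard-coded num and keyword in the __main__ block could be read from the command line instead. A minimal sketch of that variant (my own addition, not part of the referenced article):

# Hypothetical variant of the __main__ block: read the keyword and image count
# from the command line instead of hard-coding them.
# Usage example: python scrape_google.py --keyword "dog" --num 20
from argparse import ArgumentParser

if __name__ == "__main__":
    parser = ArgumentParser(description='Randomly download Google image search results')
    parser.add_argument('--keyword', default='cat', help='search term')
    parser.add_argument('--num', type=int, default=10, help='number of images to download')
    args = parser.parse_args()
    num = args.num
    keyword = args.keyword
    # ...the rest of the __main__ block above stays the same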
I referred to this article (copied it almost verbatim): https://qiita.com/ishiwara51/items/3979fbc1c69b4f8ee2d3
scrape_yahoo.py
import os
import sys
import traceback
from mimetypes import guess_extension
from time import time, sleep
from urllib.request import urlopen, Request
from urllib.parse import quote
from bs4 import BeautifulSoup
MY_EMAIL_ADDR = ''
class Fetcher:
    def __init__(self, ua=''):
        self.ua = ua

    def fetch_img_direct(self, url):
        """
        Extract the byte content of the images shown on a Yahoo image search page.
        Arguments:
            url: URL of the Yahoo image search page.
        Return values:
            img_b_content: list of image byte strings.
            mime: list of MIME types taken from the Content-Type header.
        """
        req = Request(url, headers={'User-Agent': self.ua})
        try:
            with urlopen(req, timeout=3) as p:
                page_b_content = p.read()
                structured_page = BeautifulSoup(page_b_content.decode('UTF-8'), 'html.parser')
                img_link_elems = structured_page.find_all('img')
                img_urls = [e.get('src') for e in img_link_elems if e.get('src') and e.get('src').startswith('http')]
                img_urls = list(set(img_urls))  # deduplicate URLs
        except Exception:
            sys.stderr.write('Error in fetching {}\n'.format(url))
            sys.stderr.write(traceback.format_exc())
            return None, None

        img_b_content = []
        mime = []
        for i, img_url in enumerate(img_urls):
            req1 = Request(img_url, headers={'User-Agent': self.ua})
            try:
                with urlopen(req1, timeout=3) as p:
                    img_b_content.append(p.read())
                    mime.append(p.getheader('Content-Type'))
            except Exception:
                sys.stderr.write('Error in fetching {}\n'.format(img_url))
                sys.stderr.write(traceback.format_exc())
                continue
        return img_b_content, mime
fetcher = Fetcher(MY_EMAIL_ADDR)
def url_brancher(word):
    """
    Generate Yahoo image search URLs, one for each combination of search filters.
    Arguments:
        word: search term.
    Return value:
        urllist: list of Yahoo image search page URLs.
    """
    constant = "https://search.yahoo.co.jp/image/search?p={}&n=60".format(quote(word))

    # Reduced filter set; the full set from the referenced article is kept below for reference.
    values = [
        ["", "small"],
        ["", "red"],
        ["", "face"]
    ]
    """
    values = [
        ["", "small", "medium", "large", "wallpaper", "widewallpaper"],
        ["", "red", "orange", "yellow", "green", "teal", "blue", "purple", "pink", "white", "gray", "black", "brown"],
        ["", "face", "photo", "clipart", "lineart"]
    ]
    """

    urllist = []
    for i in range(len(values[0])):
        for j in range(len(values[1])):
            for k in range(len(values[2])):
                urllist.append(constant
                               + "&dim={}".format(values[0][i])
                               + "&imc={}".format(values[1][j])
                               + "&ctype={}".format(values[2][k]))
    return urllist
def main(word):
    """
    Save image files based on the information obtained by Fetcher.
    Arguments:
        word: search term.
    Return value:
        None
    """
    data_dir = 'data/'
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    yahoo_url_list = url_brancher(word)

    for i, yahoo_url in enumerate(yahoo_url_list):
        sleep(0.1)
        img, mime = fetcher.fetch_img_direct(yahoo_url)
        if not mime or not img:
            print('Error in fetching {}\n'.format(yahoo_url))
            continue
        for j, img_bytes in enumerate(img):
            ext = guess_extension(mime[j].split(';')[0])
            if ext in ('.jpe', '.jpeg'):
                ext = '.jpg'
            if not ext:
                print('Error in saving image {}{}\n'.format(i, j))
                continue
            result_file = os.path.join(data_dir, str(i) + str(j) + ext)
            with open(result_file, mode='wb') as f:
                f.write(img_bytes)  # img_bytes is the raw image content returned by Fetcher
            print('fetched', str(i) + str(j) + ext)
if __name__ == '__main__':
    word = 'cat'
    main(word)
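Before letting it download anything, it can be useful to print the search URLs that url_brancher generates and check that the dim / imc / ctype combinations look right. A quick sanity check (assuming scrape_yahoo.py is importable from the current directory; importing it does not start a crawl because of the __main__ guard):

# Quick sanity check: list the generated Yahoo image-search URLs for one keyword.
from scrape_yahoo import url_brancher

for url in url_brancher('cat'):
    print(url)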
icrawler is a library that lets you crawl images from Google, Bing, Baidu, and Flickr. Google crawling may not work at the moment, but I was able to crawl Bing, Baidu, and Flickr without any problems. Note that only Flickr requires an API key (don't worry, getting one takes about two minutes).
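Before the full test script below, here is a minimal sketch of how little code is needed (assuming icrawler has been installed with pip install icrawler):

# Minimal icrawler example: download up to 10 "cat" images with Bing.
# Assumes: pip install icrawler
from icrawler.builtin import BingImageCrawler

crawler = BingImageCrawler(storage={'root_dir': 'images/bing_quick'})
crawler.crawl(keyword='cat', max_num=10)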
scrape_icrawler.py
import logging
import os.path as osp
from argparse import ArgumentParser
from icrawler.builtin import (BaiduImageCrawler, BingImageCrawler,
                              FlickrImageCrawler, GoogleImageCrawler,
                              GreedyImageCrawler, UrlListCrawler)
key_word = 'cat'
max_num = 200
def test_google():
    print('start testing GoogleImageCrawler')
    google_crawler = GoogleImageCrawler(
        downloader_threads=4,
        storage={'root_dir': 'images/google'},
        log_level=logging.INFO)
    search_filters = dict()
    # size='large',
    # color='orange',
    # license='commercial,modify',
    # date=(None, (2017, 11, 30)))
    google_crawler.crawl(key_word, filters=search_filters, max_num=max_num)
def test_bing():
    print('start testing BingImageCrawler')
    bing_crawler = BingImageCrawler(
        downloader_threads=2,
        storage={'root_dir': 'images/bing'},
        log_level=logging.INFO)
    search_filters = dict()
    # type='photo',
    # license='commercial',
    # layout='wide',
    # size='large',
    # date='pastmonth')
    bing_crawler.crawl(key_word, max_num=max_num, filters=search_filters)
def test_baidu():
    print('start testing BaiduImageCrawler')
    # search_filters = dict(size='large', color='blue')
    search_filters = dict()
    baidu_crawler = BaiduImageCrawler(
        downloader_threads=4, storage={'root_dir': 'images/baidu'})
    baidu_crawler.crawl(key_word, filters=search_filters, max_num=max_num)
def test_flickr():
    print('start testing FlickrImageCrawler')
    flickr_crawler = FlickrImageCrawler(
        apikey=None,  # set your Flickr API key here
        parser_threads=2,
        downloader_threads=4,
        storage={'root_dir': 'images/flickr'})
    flickr_crawler.crawl(
        max_num=max_num,
        tags=key_word,
        tag_mode='all',
    )
def test_greedy():
    print('start testing GreedyImageCrawler')
    greedy_crawler = GreedyImageCrawler(
        parser_threads=4, storage={'root_dir': 'images/greedy'})
    greedy_crawler.crawl(
        'http://www.bbc.com/news', max_num=max_num, min_size=(100, 100))
def test_urllist():
    print('start testing UrlListCrawler')
    urllist_crawler = UrlListCrawler(
        downloader_threads=3, storage={'root_dir': 'images/urllist'})
    filelist = osp.join(osp.dirname(__file__), 'filelist_demo.txt')
    urllist_crawler.crawl(filelist)
def main():
    parser = ArgumentParser(description='Test built-in crawlers')
    parser.add_argument(
        '--crawler',
        nargs='+',
        # default=['google', 'bing', 'baidu', 'flickr', 'greedy', 'urllist'],
        default=['google', 'bing', 'baidu', 'flickr'],
        help='which crawlers to test')
    args = parser.parse_args()
    for crawler in args.crawler:
        eval('test_{}()'.format(crawler))
        print('\n')


if __name__ == '__main__':
    main()
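Since test_flickr above passes apikey=None, remember to plug in your own Flickr API key before running it. One way to do that is to read the key from an environment variable; in this sketch, FLICKR_API_KEY is just a name I chose, not something icrawler requires:

# Sketch: read the Flickr API key from an environment variable and crawl with it.
# FLICKR_API_KEY is an arbitrary name chosen for this example.
import os
from icrawler.builtin import FlickrImageCrawler

flickr_crawler = FlickrImageCrawler(
    apikey=os.environ.get('FLICKR_API_KEY'),
    downloader_threads=4,
    storage={'root_dir': 'images/flickr'})
flickr_crawler.crawl(max_num=50, tags='cat', tag_mode='all')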
It's handy to be able to quickly crawl and collect images when you don't have a large dataset. There are probably other good approaches and image sources as well, so I'd like to try those too.