Kaggle offers plenty of large datasets, but sometimes the images you are looking for just aren't there. By running the code in this article, you can crawl images for yourself quickly and easily. This is a brief summary of a couple of Qiita articles plus an icrawler example.
I used this article as a reference (copied it almost as-is): https://qiita.com/Yuki-Takatsu/items/3f30727d5b21a83ea4ed
scrape_google.py
import requests
import random
import shutil
import bs4
import ssl
import os.path
ssl._create_default_https_context = ssl._create_unverified_context
def image(keyword):
    # Fetch a Google image search results page and return the src of one randomly chosen <img> tag
    res = requests.get("https://www.google.com/search?hl=jp&q=" + keyword + "&btnG=Google+Search&tbs=0&safe=off&tbm=isch")
    html = res.text
    soup = bs4.BeautifulSoup(html, 'lxml')
    links = soup.find_all("img")
    link = random.choice(links).get("src")
    return link
def download_img(url, file_name, save_dir, img_num):
    # Stream the image and save it as <save_dir>/<file_name>/<img_num>.png
    try:
        r = requests.get(url, stream=True)
        if r.status_code == 200:
            with open(os.path.join(save_dir, file_name, str(img_num) + ".png"), 'wb') as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
    except Exception:
        pass
if __name__ == "__main__":
    num = 10
    keyword = "cat"
    save_dir = 'data'

    data_list = keyword.split(' ')
    data = keyword.lower()
    file_name_dir = '_'.join(data_list)
    file_name_dir = file_name_dir.lower()

    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not os.path.exists(os.path.join(save_dir, file_name_dir)):
        os.mkdir(os.path.join(save_dir, file_name_dir))

    for i in range(int(num)):
        try:
            print(f'{i} th attempt for crawl...')
            link = image(data)
            download_img(link, file_name_dir, save_dir, i)
        except Exception:
            pass
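If you don't want to edit the script every time, the hard-coded num and keyword in the __main__ block could be read from the command line instead. A minimal sketch of that variant (my own addition, not part of the referenced article):

# Hypothetical variant of the __main__ block: read the keyword and image count
# from the command line instead of hard-coding them.
# Usage example: python scrape_google.py --keyword "dog" --num 20
from argparse import ArgumentParser

if __name__ == "__main__":
    parser = ArgumentParser(description='Randomly download Google image search results')
    parser.add_argument('--keyword', default='cat', help='search term')
    parser.add_argument('--num', type=int, default=10, help='number of images to download')
    args = parser.parse_args()
    num = args.num
    keyword = args.keyword
    # ...the rest of the __main__ block above stays the same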
I referred to this article (copied it almost verbatim): https://qiita.com/ishiwara51/items/3979fbc1c69b4f8ee2d3
scrape_yahoo.py
import os
import sys
import traceback
from mimetypes import guess_extension
from time import time, sleep
from urllib.request import urlopen, Request
from urllib.parse import quote
from bs4 import BeautifulSoup
MY_EMAIL_ADDR = ''
class Fetcher:
    def __init__(self, ua=''):
        self.ua = ua

    def fetch_img_direct(self, url):
        """
        Extract the byte content of the images shown on a Yahoo image search page.
        Arguments:
            url: URL of the Yahoo image search page.
        Return values:
            img_b_content: list of image byte strings.
            mime: list of MIME types taken from the Content-Type header.
        """
        req = Request(url, headers={'User-Agent': self.ua})
        try:
            with urlopen(req, timeout=3) as p:
                page_b_content = p.read()
                structured_page = BeautifulSoup(page_b_content.decode('UTF-8'), 'html.parser')
                img_link_elems = structured_page.find_all('img')
                img_urls = [e.get('src') for e in img_link_elems if e.get('src') and e.get('src').startswith('http')]
                img_urls = list(set(img_urls))  # deduplicate URLs
        except Exception:
            sys.stderr.write('Error in fetching {}\n'.format(url))
            sys.stderr.write(traceback.format_exc())
            return None, None

        img_b_content = []
        mime = []
        for i, img_url in enumerate(img_urls):
            req1 = Request(img_url, headers={'User-Agent': self.ua})
            try:
                with urlopen(req1, timeout=3) as p:
                    img_b_content.append(p.read())
                    mime.append(p.getheader('Content-Type'))
            except Exception:
                sys.stderr.write('Error in fetching {}\n'.format(img_url))
                sys.stderr.write(traceback.format_exc())
                continue
        return img_b_content, mime
fetcher = Fetcher(MY_EMAIL_ADDR)
def url_brancher(word):
    """
    Generate Yahoo image search URLs, one for each combination of search filters.
    Arguments:
        word: search term.
    Return value:
        urllist: list of Yahoo image search page URLs.
    """
    constant = "https://search.yahoo.co.jp/image/search?p={}&n=60".format(quote(word))

    # Reduced filter set; the full set from the referenced article is kept below for reference.
    values = [
        ["", "small"],
        ["", "red"],
        ["", "face"]
    ]
    """
    values = [
        ["", "small", "medium", "large", "wallpaper", "widewallpaper"],
        ["", "red", "orange", "yellow", "green", "teal", "blue", "purple", "pink", "white", "gray", "black", "brown"],
        ["", "face", "photo", "clipart", "lineart"]
    ]
    """

    urllist = []
    for i in range(len(values[0])):
        for j in range(len(values[1])):
            for k in range(len(values[2])):
                urllist.append(constant
                               + "&dim={}".format(values[0][i])
                               + "&imc={}".format(values[1][j])
                               + "&ctype={}".format(values[2][k]))
    return urllist
def main(word):
    """
    Save image files based on the information obtained by Fetcher.
    Arguments:
        word: search term.
    Return value:
        None
    """
    data_dir = 'data/'
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    yahoo_url_list = url_brancher(word)

    for i, yahoo_url in enumerate(yahoo_url_list):
        sleep(0.1)
        img, mime = fetcher.fetch_img_direct(yahoo_url)
        if not mime or not img:
            print('Error in fetching {}\n'.format(yahoo_url))
            continue
        for j, img_bytes in enumerate(img):
            ext = guess_extension(mime[j].split(';')[0])
            if ext in ('.jpe', '.jpeg'):
                ext = '.jpg'
            if not ext:
                print('Error in saving image {}{}\n'.format(i, j))
                continue
            result_file = os.path.join(data_dir, str(i) + str(j) + ext)
            with open(result_file, mode='wb') as f:
                f.write(img_bytes)  # img_bytes is the raw image content returned by Fetcher
            print('fetched', str(i) + str(j) + ext)
if __name__ == '__main__':
    word = 'cat'
    main(word)
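Before letting it download anything, it can be useful to print the search URLs that url_brancher generates and check that the dim / imc / ctype combinations look right. A quick sanity check (assuming scrape_yahoo.py is importable from the current directory; importing it does not start a crawl because of the __main__ guard):

# Quick sanity check: list the generated Yahoo image-search URLs for one keyword.
from scrape_yahoo import url_brancher

for url in url_brancher('cat'):
    print(url)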
icrawler is a library that lets you crawl images from Google, Bing, Baidu, and Flickr. Google crawling may not work at the moment, but I was able to crawl Bing, Baidu, and Flickr without any problems. Note that only Flickr requires an API key (don't worry, getting one takes about two minutes).
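Before the full test script below, here is a minimal sketch of how little code is needed (assuming icrawler has been installed with pip install icrawler):

# Minimal icrawler example: download up to 10 "cat" images with Bing.
# Assumes: pip install icrawler
from icrawler.builtin import BingImageCrawler

crawler = BingImageCrawler(storage={'root_dir': 'images/bing_quick'})
crawler.crawl(keyword='cat', max_num=10)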
scrape_icrawler.py
import logging
import os.path as osp
from argparse import ArgumentParser
from icrawler.builtin import (BaiduImageCrawler, BingImageCrawler,
                              FlickrImageCrawler, GoogleImageCrawler,
                              GreedyImageCrawler, UrlListCrawler)
key_word = 'cat'
max_num = 200
def test_google():
    print('start testing GoogleImageCrawler')
    google_crawler = GoogleImageCrawler(
        downloader_threads=4,
        storage={'root_dir': 'images/google'},
        log_level=logging.INFO)
    search_filters = dict()
    # size='large',
    # color='orange',
    # license='commercial,modify',
    # date=(None, (2017, 11, 30)))
    google_crawler.crawl(key_word, filters=search_filters, max_num=max_num)
def test_bing():
    print('start testing BingImageCrawler')
    bing_crawler = BingImageCrawler(
        downloader_threads=2,
        storage={'root_dir': 'images/bing'},
        log_level=logging.INFO)
    search_filters = dict()
    # type='photo',
    # license='commercial',
    # layout='wide',
    # size='large',
    # date='pastmonth')
    bing_crawler.crawl(key_word, max_num=max_num, filters=search_filters)
def test_baidu():
    print('start testing BaiduImageCrawler')
    # search_filters = dict(size='large', color='blue')
    search_filters = dict()
    baidu_crawler = BaiduImageCrawler(
        downloader_threads=4, storage={'root_dir': 'images/baidu'})
    baidu_crawler.crawl(key_word, filters=search_filters, max_num=max_num)
def test_flickr():
    print('start testing FlickrImageCrawler')
    flickr_crawler = FlickrImageCrawler(
        apikey=None,  # set your Flickr API key here
        parser_threads=2,
        downloader_threads=4,
        storage={'root_dir': 'images/flickr'})
    flickr_crawler.crawl(
        max_num=max_num,
        tags=key_word,
        tag_mode='all',
    )
def test_greedy():
    print('start testing GreedyImageCrawler')
    greedy_crawler = GreedyImageCrawler(
        parser_threads=4, storage={'root_dir': 'images/greedy'})
    greedy_crawler.crawl(
        'http://www.bbc.com/news', max_num=max_num, min_size=(100, 100))
def test_urllist():
    print('start testing UrlListCrawler')
    urllist_crawler = UrlListCrawler(
        downloader_threads=3, storage={'root_dir': 'images/urllist'})
    filelist = osp.join(osp.dirname(__file__), 'filelist_demo.txt')
    urllist_crawler.crawl(filelist)
def main():
    parser = ArgumentParser(description='Test built-in crawlers')
    parser.add_argument(
        '--crawler',
        nargs='+',
        # default=['google', 'bing', 'baidu', 'flickr', 'greedy', 'urllist'],
        default=['google', 'bing', 'baidu', 'flickr'],
        help='which crawlers to test')
    args = parser.parse_args()
    for crawler in args.crawler:
        eval('test_{}()'.format(crawler))
        print('\n')


if __name__ == '__main__':
    main()
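Since test_flickr above passes apikey=None, remember to plug in your own Flickr API key before running it. One way to do that is to read the key from an environment variable; in this sketch, FLICKR_API_KEY is just a name I chose, not something icrawler requires:

# Sketch: read the Flickr API key from an environment variable and crawl with it.
# FLICKR_API_KEY is an arbitrary name chosen for this example.
import os
from icrawler.builtin import FlickrImageCrawler

flickr_crawler = FlickrImageCrawler(
    apikey=os.environ.get('FLICKR_API_KEY'),
    downloader_threads=4,
    storage={'root_dir': 'images/flickr'})
flickr_crawler.crawl(max_num=50, tags='cat', tag_mode='all')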
It's handy to be able to quickly crawl and collect images when you don't have a large dataset. There are probably other good approaches and image sources as well, so I'd like to try those too.