Note: for some reason, the plain-HTML approach can only download about 20 images.
As suggested in a comment, I raised the limit to 400 downloads by enabling JavaScript through Selenium. This requires ChromeDriver: https://sites.google.com/a/chromium.org/chromedriver/downloads
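Before running the script, it is worth checking that Selenium can actually launch Chrome through the downloaded driver. A minimal sanity check (assuming, like the script below, that the chromedriver binary sits next to the script as ./chromedriver):

    # Quick check that Selenium can start Chrome via the local chromedriver binary.
    from selenium import webdriver

    driver = webdriver.Chrome("./chromedriver")  # path to the downloaded ChromeDriver
    driver.get("https://www.google.co.in")
    print(driver.title)  # prints the page title if the driver is working
    driver.quit()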
get_img.py
#-*- coding:utf-8 -*-
import os
import urllib2
import re
from bs4 import BeautifulSoup


def get_ulist_o(search_word):
    # Previous version (no JavaScript): only the first ~20 thumbnails are returned.
    # http://stackoverflow.com/questions/20716842/python-download-images-from-google-image-search
    url = "https://www.google.co.in/search?q=" + search_word + "&source=lnms&tbm=isch"
    header = {'User-Agent': 'Mozilla/5.0'}
    soup = BeautifulSoup(urllib2.urlopen(urllib2.Request(url, headers=header)), "lxml")
    ulist = [a['src'] for a in soup.find_all("img", {"src": re.compile("gstatic.com")})]
    return ulist


def get_ulist(search_word, n):
    # New version: drive Chrome with Selenium so that scrolling loads more results.
    from selenium import webdriver
    from ast import literal_eval
    if n > 400:
        print("n should be less than 400")
        exit()
    url = "https://www.google.co.in/search?q=" + search_word + "&source=lnms&tbm=isch"
    chromedriver = "./chromedriver"
    driver = webdriver.Chrome(chromedriver)
    driver.get(url)
    cnt = 0
    while cnt < n:
        # Scroll to the bottom so more thumbnails are loaded, then recount them.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "lxml")
        soup = soup.find_all('div', class_="rg_meta")
        cnt = len(soup)
    driver.quit()
    ulist = []
    for i in soup:
        # Each rg_meta div holds a JSON-like blob; the "ou" key is the original image URL.
        dic = i.text.replace("false", "False").replace("true", "True")
        ulist.append(literal_eval(dic)["ou"])
    return ulist[:n]


def get_img(search_word, n):
    FOLDERNAME = str(search_word)
    if not os.path.exists(FOLDERNAME):
        os.mkdir(FOLDERNAME)
    urls = get_ulist(search_word, n)
    for cntr, img in enumerate(urls):
        print "[%03d]Downloading.. %s" % (cntr, img)
        try:
            raw_img = urllib2.urlopen(img).read()
            f = open('%s/%s_%03d.jpg' % (FOLDERNAME, search_word, cntr), 'wb')
            f.write(raw_img)
            f.close()
        except:
            # Skip images that fail to download.
            pass


get_img("Cute cat", 10)
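For reference, each rg_meta div contains a JSON-like blob whose "ou" field holds the original image URL. literal_eval cannot parse the JavaScript booleans true/false, which is why the script capitalizes them first. A minimal sketch of that step (the blob below is made up for illustration; only the "ou" key is taken from the script above):

    from ast import literal_eval

    # Illustrative blob only; real rg_meta content contains more fields.
    blob = '{"ou":"http://example.com/cat.jpg","oh":600,"ow":800,"rt":false}'
    dic = literal_eval(blob.replace("false", "False").replace("true", "True"))
    print(dic["ou"])  # -> http://example.com/cat.jpg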
Please do not misuse it.