Ich möchte Bilder durch tiefes Lernen klassifizieren, deshalb habe ich es implementiert, während ich das Scraping untersucht habe.

0. Module importieren

Verwenden Sie schöne Suppe.

from bs4 import BeautifulSoup
import urllib
import os
from PIL import Image
import matplotlib.pyplot as plt
from numpy import *
import six
%matplotlib inline

1. Scraping von der Bildsuche

Ich ziehe es aus dem Yahoo und bing Suchmaschinen.

class imageGetter():
    def __init__(self):
        pass

    def set_search_engine(self, key="yahoo"):
        if key == "yahoo":
            self.search_engine = u"https://search.yahoo.co.jp/image/search?p={0}&oq=&ei=UTF-8&b=21"
        elif key == "bing":
            self.search_engine = u"https://www.bing.com/images/search?q={0}"
    def series_process(self, word_list, padir="padir", key="yahoo", rec=1000):
        os.mkdir(padir)
        for word in word_list:
            o = imageGetter()
            o.search(word, False, key)
            
            cnt = 0
            try:
                for i in range(2, rec):
                    print(i)
                    o.next(i)
                    cnt = i
            except:
                pass
            print("Curation of the images from {} pages is Succeed".format(cnt))
            path = padir+"/"+word
            os.mkdir(path)
            print("Images is dumped at {}".format(path))
            o.dump(path)

    def search(self, search_word, is_show=False, key="yahoo"):
        self.set_search_engine(key)
        self.search_word = search_word
        response = urllib.request.urlopen(self.search_engine.format(urllib.parse.quote(search_word)))
        soup = BeautifulSoup(response, "lxml")

        urllst = []
        for obj in soup.find_all("img"):
            line = obj.get("rel")
            try:
                n = line.index("jpg")
                urllst += [line[:n+3]]
            except:
                pass
        self.urllst = urllst
        if is_show:
            self.print()

    def next(self, idx=2, is_show=False):
        url = self.soup.find_all("a",string="%s"%idx)[0].get("href")
        response = urllib.request.urlopen(url)
        self.soup = BeautifulSoup(response, "lxml")
        
        for obj in self.soup.find_all("img"):
            line = obj.get("rel")
            try:
                n = line.index("jpg")
                self.urllst += [line[:n+3]]
            except:
                pass
        if is_show:
            self.print()

    def print(self):
        for line in self.urllst:
            print(line)

    def show(self):
        print("{0} images.".format(len(self.urllst)))
        for line in self.urllst:
            try:
                file = six.BytesIO(urllib.request.urlopen(line).read())
                plt.figure(figsize=(10, 10), dpi=80)
                plt.subplots_adjust(left=0.0, right=1.0, bottom=0.0, top=1.0, hspace=0.0, wspace=0.0)
                plt.axis('off')
                plt.imshow(array(Image.open(file)))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))

        plt.show()

    def dump(self, path):
        for i, line in enumerate(self.urllst):
            try:
                file = six.BytesIO(request.urlopen(line).read())
                img = Image.open(file)
                img.save("{0}/{1}_{2}.jpg ".format(path, self.search_word, i))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))

Sie können es nur in der Größenordnung von 10 Blatt erhalten. .. .. Gibt es einen guten Weg?

1.1. Verwendung

Holen Sie sich eine Liste der Bild-URLs.

o = imageGetter()
o.search("Hund", False) #Bei der Einstellung True wird die URL standardmäßig ausgegeben

Bildvorschau (Ausgabe). Wenn es sich um ein Jupyter-Notebook handelt, werden die Bilder in der Ausgabe aufgelistet.

o.show()

：：：

Sie können Bilder in ein Verzeichnis kopieren.

path = "./tmp"
o.dump(path)

2. Scraping von Flickr

Sie benötigen einen API-Schlüssel (siehe hier oder hier (http://helog.jp/api-2/introduction-6/)).

class Flickr_handler:
    def __init__(self, key):
        self.key = key
    
    def search(self, search_word):
        self.search_word = search_word
        line = "https://api.flickr.com/services/rest/?method=flickr.photos.search&api_key={0}&per_page=500&format=rest&text={1}"
        
        response = urllib.request.urlopen(line.format(self.key, search_word))
        soup = BeautifulSoup(response, "lxml")
        self.soup = soup
        lst = soup.find_all("photo")
        self.lst = lst
        urllst = []
        for tag in lst:
            urllst += ["http://farm{0}.staticflickr.com/{1}/{2}_{3}.jpg ".format(lst[0].get("farm"),
                 tag.get("server"),
                 tag.get("id"),
                 tag.get("secret"))]
        self.urllst = urllst
        #response = urllib.request.urlopen(url.format(urllib.parse.quote(search_word)))
        
    def show(self):
        print("{0} images.".format(len(self.urllst)))
        for line in self.urllst:
            try:
                file = six.BytesIO(urllib.request.urlopen(line).read())
                plt.figure(figsize=(10, 10), dpi=80)
                plt.subplots_adjust(left=0.0, right=1.0, bottom=0.0, top=1.0, hspace=0.0, wspace=0.0)
                plt.axis('off')
                plt.imshow(array(Image.open(file)))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))
                
        plt.show()
    
    def dump(self, path):
        for i, line in enumerate(self.urllst):
            try:
                file = six.BytesIO(urllib.request.urlopen(line).read())
                img = Image.open(file)
                img.save("{0}/{1}_{2}.jpg ".format(path, self.search_word, i))
            except urllib.error.HTTPError:
                print("{0} is not found".format(line))

key = #your key 
o = Flickr_handler(key)
o.search("car")

Andere Verwendung ist die gleiche wie 1. Sie können hier bis zu 500 Blatt erhalten.

[PYTHON] Bildkratzen ②-Holen Sie sich Bilder von Bing, Yahoo, Flickr

0. Module importieren

1. Scraping von der Bildsuche

1.1. Verwendung

2. Scraping von Flickr