[PYTHON] Scraping member images from the official websites of the Sakamichi Group

Target

Save each image file (image path) under the name <member name>.jpg, as shown below.
* This time the save destination is Cloudinary. When you run the code, change it to your own storage or a local save destination.

image

Final code

Since the HTML of each official site is different, the code differs slightly for each group.

Nogizaka

get_ng_images.py


from bs4 import BeautifulSoup
import urllib.request
import cloudinary
import cloudinary.uploader

import os

cloudinary.config(
  cloud_name = os.environ.get("cloud_name"),
  api_key = os.environ.get("api_key"),
  api_secret = os.environ.get("api_secret")
)

def get_mem_list():
  url = "http://www.nogizaka46.com/member/"
  headers = {
          "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
          }

  request = urllib.request.Request(url, headers=headers)
  html = urllib.request.urlopen(request)

  soup = BeautifulSoup(html, 'html.parser')

  li = soup.select('#memberlist div[class="clearfix"] a')
  li = [url + str(l.attrs['href'])[2:] for l in li]

  return li

def get_img(url):

  headers = {
          "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
          }

  request = urllib.request.Request(url=url, headers=headers)
  html = urllib.request.urlopen(request)

  soup = BeautifulSoup(html, 'html.parser')

  img = soup.select('#profile img')[0].attrs['src']
  name = str(soup.select('#profile div[class="txt"] h2')[0].contents[1]).replace(' ', '')

  res = cloudinary.uploader.upload(file=img, public_id="q-u46/member-images/"+name)

  return 'finished {} !!'.format(name)

def main():
  list_ = get_mem_list()

  for url in list_:
    print(get_img(url), end=' | ')

if __name__ == "__main__":
  main()

Keyakizaka

get_keya_images.py


from bs4 import BeautifulSoup
import urllib.request
import cloudinary
import cloudinary.uploader

import os

cloudinary.config(
  cloud_name = os.environ.get("cloud_name"),
  api_key = os.environ.get("api_key"),
  api_secret = os.environ.get("api_secret")
)

def get_mem_list():
  url = "https://www.keyakizaka46.com/s/k46o/search/artist?ima=0000"
  base = "https://www.keyakizaka46.com"
  headers = {
          "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
          }

  request = urllib.request.Request(url, headers=headers)
  html = urllib.request.urlopen(request)

  soup = BeautifulSoup(html, 'html.parser')

  li = soup.select('div[class="sorted sort-default current"] li a')
  li = list(set([base + str(l.attrs['href']) for l in li]))

  return li

def get_img(url):

  headers = {
          "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
          }

  request = urllib.request.Request(url=url, headers=headers)
  html = urllib.request.urlopen(request)

  soup = BeautifulSoup(html, 'html.parser')

  img = soup.select('div[class="box-profile_img"] img')[0].attrs['src']
  name = str(soup.select('div[class="box-profile_text"] p[class="name"]')[0].text).replace(' ', '')

  res = cloudinary.uploader.upload(file=img, public_id='q-u46/member-images/'+''.join(name.splitlines()))

  return 'finished {} !!'.format(name)

def main():
  list_ = get_mem_list()

  for url in list_:
    
    print(get_img(url), end=' | ')

if __name__ == "__main__":
  main()

Hinatazaka

get_hina_images.py


from bs4 import BeautifulSoup
import urllib.request
import cloudinary
import cloudinary.uploader

import os

cloudinary.config(
  cloud_name = os.environ.get("cloud_name"),
  api_key = os.environ.get("api_key"),
  api_secret = os.environ.get("api_secret")
)

def get_mem_list():
  url = "https://www.hinatazaka46.com/s/official/search/artist?ima=0000"
  base = "https://www.hinatazaka46.com"
  headers = {
          "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
          }

  request = urllib.request.Request(url, headers=headers)
  html = urllib.request.urlopen(request)

  soup = BeautifulSoup(html, 'html.parser')

  li = soup.select('ul[class="p-member__list"] li[class="p-member__item"] a')
  li = list(set([base + str(l.attrs['href']) for l in li]))

  return li

def get_img(url):

  headers = {
          "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
          }

  request = urllib.request.Request(url=url, headers=headers)
  html = urllib.request.urlopen(request)

  soup = BeautifulSoup(html, 'html.parser')

  img = soup.select('div[class="c-member__thumb c-member__thumb__large"] img')[0].attrs['src']
  name = str(soup.select('div[class="p-member__info"] div[class="p-member__info__head"] div[class="c-member__name--info"]')[0].text).replace(' ', '')
  #print(img)

  res = cloudinary.uploader.upload(file=img, public_id='q-u46/member-images/'+''.join(name.splitlines()))

  return 'finished {} !!'.format(name)

def main():
  list_ = get_mem_list()

  for url in list_:
    
    print(get_img(url), end=' | ')

if __name__ == "__main__":
  main()

Code description

I will mainly explain the code of Nogizaka.

Preparation

The imports marked with a star (☆) are all you need for the scraping itself. The rest is for saving to storage, so ignore it if you don't need that.

from bs4 import BeautifulSoup #☆
import urllib.request #☆
import cloudinary
import cloudinary.uploader

import os

cloudinary.config(
  cloud_name = os.environ.get("cloud_name"),
  api_key = os.environ.get("api_key"),
  api_secret = os.environ.get("api_secret")
)
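
The Cloudinary credentials are read from environment variables. As a hypothetical illustration of what must be available before running (the values are placeholders; normally you would export these in your shell or a .env file rather than hard-coding them):

import os

# Placeholder values for illustration only; use your own credentials
os.environ.setdefault("cloud_name", "your-cloud-name")
os.environ.setdefault("api_key", "your-api-key")
os.environ.setdefault("api_secret", "your-api-secret")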

List links to member detail pages

Get all the links to the details page from the member introduction Top.

image

First, let's take a look at the html of the member introduction Top.

image

You can see that the child elements of the <div id="memberlist" class="left"> tag are likely to contain the desired image links.

There are three <div class="clearfix"> tags side by side as child elements. The first is the block containing all members other than the 4th generation, the second is empty, and the third is the 4th generation block.

Therefore, narrow down with #memberlist div[class="clearfix"].

Next, take a look at the child elements of <div class="clearfix">.

As shown below, there is a <div class="unit"> tag for each member, and the relative path to the detail page is written in the a tag inside it.

image

Therefore, it seems that you can get the members' a tags with #memberlist div[class="clearfix"] a.
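
For illustration, here is a minimal self-contained sketch of how that selector behaves. The HTML string below is a simplified stand-in I wrote for this example, not the actual markup of the site.

from bs4 import BeautifulSoup

sample_html = """
<div id="memberlist" class="left">
  <div class="clearfix">
    <div class="unit"><a href="./detail/member_a.php">Member A</a></div>
    <div class="unit"><a href="./detail/member_b.php">Member B</a></div>
  </div>
</div>
"""

soup = BeautifulSoup(sample_html, 'html.parser')

# Same selector as in get_mem_list(): every <a> tag under the clearfix blocks
links = soup.select('#memberlist div[class="clearfix"] a')
print([a.attrs['href'] for a in links])
# -> ['./detail/member_a.php', './detail/member_b.php']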

def get_mem_list():
  url = "http://www.nogizaka46.com/member/"
  #If you do not specify the following in headers, an error will occur, so add it.
  headers = {
          "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
          }

  request = urllib.request.Request(url, headers=headers)
  html = urllib.request.urlopen(request)

  soup = BeautifulSoup(html, 'html.parser')

  #Get a list of elements that meet the criteria.
  li = soup.select('#memberlist div[class="clearfix"] a')

  #Strip the leading './' from the relative path. You can access a tag's attributes with .attrs.
  li = [url + str(l.attrs['href'])[2:] for l in li]

  return li

Get image and name

image

It looks like the image can be obtained from the img tag inside the <div id="profile"> tag.

image

img = soup.select('#profile img')[0].attrs['src']

It turned out to be something like this. Since soup.select returns a list, the first element is fetched with [0].

name

You can see that it is inside the <div class="txt"><h2> element within the <div id="profile"> tag.

image

The goal this time is to get the name in kanji, so I want to somehow ignore the furigana reading ("しらいし まい") shown alongside it.

So use .contents[1] of the h2 tag. You get a list like [<span>しらいし まい</span>, "白石麻衣"] (the furigana in a span, then the kanji text), so specifying [1] picks out only the kanji name.

I also want to remove the space between the first and last name, so I will remove it with replace.
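
A minimal sketch of what .contents gives you here, using a hand-written stand-in for the h2 element (the real page's markup may differ slightly):

from bs4 import BeautifulSoup

# Hand-written stand-in for the profile heading (illustrative only)
html = '<div id="profile"><div class="txt"><h2><span>しらいし まい</span>白石 麻衣</h2></div></div>'
soup = BeautifulSoup(html, 'html.parser')

h2 = soup.select('#profile div[class="txt"] h2')[0]
print(h2.contents)   # [<span>しらいし まい</span>, '白石 麻衣']

name = str(h2.contents[1]).replace(' ', '')
print(name)          # 白石麻衣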


#The argument is the member detail page
def get_img(url):

  headers = {
          "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
          }

  request = urllib.request.Request(url=url, headers=headers)
  html = urllib.request.urlopen(request)

  soup = BeautifulSoup(html, 'html.parser')

  img = soup.select('#profile img')[0].attrs['src']
  name = str(soup.select('#profile div[class="txt"] h2')[0].contents[1]).replace(' ', '')

  #The following depends on the save destination.
  res = cloudinary.uploader.upload(file=img, public_id="q-u46/member-images/"+name)

  return 'finished {} !!'.format(name)
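
If you are not using Cloudinary, a minimal local-save alternative for that upload line could look like the sketch below. The images/ directory, the .jpg extension, and the helper name save_locally are my own assumptions, not part of the original code.

import os
import urllib.request

def save_locally(img_url, name, out_dir='images'):
  # Download the image and write it to images/<member name>.jpg
  os.makedirs(out_dir, exist_ok=True)
  headers = {"User-Agent": "Mozilla/5.0"}
  request = urllib.request.Request(url=img_url, headers=headers)
  with urllib.request.urlopen(request) as response:
    with open(os.path.join(out_dir, name + '.jpg'), 'wb') as f:
      f.write(response.read())

In get_img you would then call save_locally(img, name) in place of cloudinary.uploader.upload(...).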

Main processing

def main():
  #List links to member detail pages
  list_ = get_mem_list()
  #Loop over the list with a for statement, calling the image acquisition function for each URL
  for url in list_:
    print(get_img(url), end=' | ')

if __name__ == "__main__":
  main()

Keyakizaka and Hinatazaka

The selectors are slightly different, but the basic approach is the same as for Nogizaka.

For some reason, line breaks were included when getting the name, which caused an error when naming the file, so I strip them with ''.join(name.splitlines()). (Splitting into a list and joining back is a dirty way to do it, but please forgive me.)
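
As a quick illustration of that cleanup (the name value below is a made-up example):

name = "加藤 史帆\n"   # made-up example value containing a stray line break
clean = ''.join(name.splitlines()).replace(' ', '')
print(clean)  # -> 加藤史帆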

In conclusion

Scraping is very convenient, so let's use it!

  • This time only as many requests are sent as there are members, so time.sleep was omitted. When sending a large number of requests, be sure to insert time.sleep between them, as sketched below.
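
A minimal sketch of what that could look like in main() (the 1-second interval is an arbitrary choice, not from the original code):

import time

def main():
  list_ = get_mem_list()

  for url in list_:
    print(get_img(url), end=' | ')
    time.sleep(1)  # pause between requests to avoid putting load on the server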
