Each member's profile image is downloaded and saved as "<member name>.jpg".
* In this article the save destination is Cloudinary. When you run the code yourself,
please change the destination to your own storage service or a local directory.
Since the html of each official HP was different, the code is slightly different.
get_ng_images.py
from bs4 import BeautifulSoup
import urllib
import cloudinary
import cloudinary.uploader
import os
# Read the Cloudinary credentials from environment variables so no secrets
# are hard-coded in the source (all three variables must be set beforehand).
cloudinary.config(
    cloud_name = os.environ.get("cloud_name"),
    api_key = os.environ.get("api_key"),
    api_secret = os.environ.get("api_secret")
)
def get_mem_list():
    """Return absolute URLs of every member's detail page on the Nogizaka46 site.

    Scrapes the member-list top page and converts each relative link
    ('./...') into an absolute URL.
    """
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    url = "http://www.nogizaka46.com/member/"
    # A browser-like User-Agent is required; the site rejects the default one.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    # All member anchors live under the <div class="clearfix"> blocks.
    anchors = soup.select('#memberlist div[class="clearfix"] a')
    # Each href starts with './'; strip it and prepend the base URL.
    return [url + str(a.attrs['href'])[2:] for a in anchors]
def get_img(url):
    """Download one member's profile image and upload it to Cloudinary.

    url: absolute URL of the member's detail page.
    Returns a short status message containing the member's name.
    """
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    # A browser-like User-Agent is required; the site rejects the default one.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url=url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    img = soup.select('#profile img')[0].attrs['src']
    # h2.contents is [<span>romanized name</span>, 'kanji name']; [1] picks the
    # kanji-display name, then the space between family and given name is removed.
    name = str(soup.select('#profile div[class="txt"] h2')[0].contents[1]).replace(' ', '')
    # Cloudinary fetches the image directly from its URL.  Adapt this call
    # to your own storage backend if you are not using Cloudinary.
    cloudinary.uploader.upload(file=img, public_id="q-u46/member-images/"+name)
    return 'finished {} !!'.format(name)
def main():
    """Fetch and upload the profile image of every member, printing progress."""
    for member_url in get_mem_list():
        print(get_img(member_url), end=' | ')


if __name__ == "__main__":
    main()
get_keya_images.py
from bs4 import BeautifulSoup
import urllib
import cloudinary
import cloudinary.uploader
import os
# Read the Cloudinary credentials from environment variables so no secrets
# are hard-coded in the source (all three variables must be set beforehand).
cloudinary.config(
    cloud_name = os.environ.get("cloud_name"),
    api_key = os.environ.get("api_key"),
    api_secret = os.environ.get("api_secret")
)
def get_mem_list():
    """Return absolute URLs of every member's detail page on the Keyakizaka46 site.

    Duplicate links are removed via a set, so the resulting order is arbitrary.
    """
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    url = "https://www.keyakizaka46.com/s/k46o/search/artist?ima=0000"
    base = "https://www.keyakizaka46.com"
    # A browser-like User-Agent is required; the site rejects the default one.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    anchors = soup.select('div[class="sorted sort-default current"] li a')
    # hrefs are site-absolute paths; prepend the host and drop duplicates.
    return list({base + str(a.attrs['href']) for a in anchors})
def get_img(url):
    """Download one member's profile image and upload it to Cloudinary.

    url: absolute URL of the member's detail page.
    Returns a short status message containing the member's name.
    """
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    # A browser-like User-Agent is required; the site rejects the default one.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url=url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    img = soup.select('div[class="box-profile_img"] img')[0].attrs['src']
    name = str(soup.select('div[class="box-profile_text"] p[class="name"]')[0].text).replace(' ', '')
    # The scraped name may contain newlines, which would break the public_id;
    # splitlines() + join strips them out.
    cloudinary.uploader.upload(file=img, public_id='q-u46/member-images/'+''.join(name.splitlines()))
    return 'finished {} !!'.format(name)
def main():
    """Fetch and upload the profile image of every member, printing progress."""
    for member_url in get_mem_list():
        print(get_img(member_url), end=' | ')


if __name__ == "__main__":
    main()
get_hina_images.py
from bs4 import BeautifulSoup
import urllib
import cloudinary
import cloudinary.uploader
import os
# Read the Cloudinary credentials from environment variables so no secrets
# are hard-coded in the source (all three variables must be set beforehand).
cloudinary.config(
    cloud_name = os.environ.get("cloud_name"),
    api_key = os.environ.get("api_key"),
    api_secret = os.environ.get("api_secret")
)
def get_mem_list():
    """Return absolute URLs of every member's detail page on the Hinatazaka46 site.

    Duplicate links are removed via a set, so the resulting order is arbitrary.
    """
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    url = "https://www.hinatazaka46.com/s/official/search/artist?ima=0000"
    base = "https://www.hinatazaka46.com"
    # A browser-like User-Agent is required; the site rejects the default one.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    anchors = soup.select('ul[class="p-member__list"] li[class="p-member__item"] a')
    # hrefs are site-absolute paths; prepend the host and drop duplicates.
    return list({base + str(a.attrs['href']) for a in anchors})
def get_img(url):
    """Download one member's profile image and upload it to Cloudinary.

    url: absolute URL of the member's detail page.
    Returns a short status message containing the member's name.
    """
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    # A browser-like User-Agent is required; the site rejects the default one.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url=url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    img = soup.select('div[class="c-member__thumb c-member__thumb__large"] img')[0].attrs['src']
    name = str(soup.select('div[class="p-member__info"] div[class="p-member__info__head"] div[class="c-member__name--info"]')[0].text).replace(' ', '')
    # The scraped name may contain newlines, which would break the public_id;
    # splitlines() + join strips them out.
    cloudinary.uploader.upload(file=img, public_id='q-u46/member-images/'+''.join(name.splitlines()))
    return 'finished {} !!'.format(name)
def main():
    """Fetch and upload the profile image of every member, printing progress."""
    for member_url in get_mem_list():
        print(get_img(member_url), end=' | ')


if __name__ == "__main__":
    main()
I will mainly explain the code of Nogizaka.
The lines marked with a star (☆) are all you need if you only want to scrape; everything else is for saving to storage, so you can ignore it.
from bs4 import BeautifulSoup #☆
import urllib #☆
import cloudinary
import cloudinary.uploader
import os
# Read the Cloudinary credentials from environment variables so no secrets
# are hard-coded in the source (all three variables must be set beforehand).
cloudinary.config(
    cloud_name = os.environ.get("cloud_name"),
    api_key = os.environ.get("api_key"),
    api_secret = os.environ.get("api_secret")
)
Get all the links to the details page from the member introduction Top.
First, let's take a look at the html of the member introduction Top.
You can see that the child element of the <div id =" memberlist "class =" left ">
tag is likely to contain the desired image link.
And there are three child element <div class =" clearfix ">
tags in parallel.
The first was a block with all members other than the 4th gen, the second was empty, and the third was a 4th gen block.
Therefore, narrow the selection down with the CSS selector
`#memberlist div[class="clearfix"]`.
Next, take a look at the child elements of <div class =" clearfix ">
.
As shown below, there is a <div class =" unit ">
tag for each member, and the relative path is written in the a tag in it.
Therefore, it seems you can get each member's anchor tag with the selector
`#memberlist div[class="clearfix"] a`.
def get_mem_list():
    """Return absolute URLs of every member's detail page."""
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    url = "http://www.nogizaka46.com/member/"
    # If you do not set a browser-like User-Agent in headers, the request fails.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    # Get the list of elements that match the selector.
    anchors = soup.select('#memberlist div[class="clearfix"] a')
    # Each relative href starts with './'; drop it and prepend the base URL.
    # A tag's attributes are accessed with .attrs.
    return [url + str(a.attrs['href'])[2:] for a in anchors]
The image URL can be obtained from the `img` tag inside the `#profile` element:
img = soup.select('#profile img')[0].attrs['src']
It turned out to be something like this. Since the list type is returned by soup.select, it is fetched with [0].
You can see that it is an element of <div class =" txt ">
<h2>
in the <div id =" profile ">
tag.
The goal this time is to get the name as displayed in kanji, so the romanized
version inside the `<span>` should be ignored.
To do that, use `.contents[1]` on the h2 tag.
`h2.contents` is a list like `[<span>MAI SHIRAISHI</span>, 'Mai Shiraishi']`,
so specifying `[1]` picks out only the kanji-display name.
I also want to remove the space between the first and last name, so I will remove it with replace
.
# The argument is the member detail page URL.
def get_img(url):
    """Download one member's profile image and upload it to Cloudinary."""
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    # A browser-like User-Agent is required; the site rejects the default one.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url=url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    img = soup.select('#profile img')[0].attrs['src']
    # contents[1] is the kanji-display name; remove the inner space as well.
    name = str(soup.select('#profile div[class="txt"] h2')[0].contents[1]).replace(' ', '')
    # The following depends on the save destination.
    cloudinary.uploader.upload(file=img, public_id="q-u46/member-images/"+name)
    return 'finished {} !!'.format(name)


def main():
    # List the links to the member detail pages.
    list_ = get_mem_list()
    # Loop the image-fetching function over every member page.
    for url in list_:
        print(get_img(url), end=' | ')


if __name__ == "__main__":
    main()
The selector is a little different, but the basic method is the same as Nogizaka.
For some reason the scraped name contained line breaks, which caused an error when naming the file.
So I stripped the line breaks with `''.join(name.splitlines())`.
(Splitting into a list and joining is a bit crude — please forgive me.)
Scraping is very convenient, so let's use it!
One caution: when sending a large number of requests, be sure to insert a delay
with `time.sleep` between them so that you do not overload the target server.
Recommended Posts