Each member's profile image is downloaded and saved as "<member name>.jpg".
* In this article the save destination is Cloudinary. When you run the code yourself,
please change the destination to your own storage service or a local directory.
Since the html of each official HP was different, the code is slightly different.
get_ng_images.py
from bs4 import BeautifulSoup
import urllib
import cloudinary
import cloudinary.uploader
import os
# Read the Cloudinary credentials from environment variables so no secrets
# are hard-coded in the source (all three variables must be set beforehand).
cloudinary.config(
    cloud_name = os.environ.get("cloud_name"),
    api_key = os.environ.get("api_key"),
    api_secret = os.environ.get("api_secret")
)
def get_mem_list():
    """Return absolute URLs of every member's detail page on the Nogizaka46 site.

    Scrapes the member-list top page and converts each relative link
    ('./...') into an absolute URL.
    """
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    url = "http://www.nogizaka46.com/member/"
    # A browser-like User-Agent is required; the site rejects the default one.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    # All member anchors live under the <div class="clearfix"> blocks.
    anchors = soup.select('#memberlist div[class="clearfix"] a')
    # Each href starts with './'; strip it and prepend the base URL.
    return [url + str(a.attrs['href'])[2:] for a in anchors]
def get_img(url):
    """Download one member's profile image and upload it to Cloudinary.

    url: absolute URL of the member's detail page.
    Returns a short status message containing the member's name.
    """
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    # A browser-like User-Agent is required; the site rejects the default one.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url=url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    img = soup.select('#profile img')[0].attrs['src']
    # h2.contents is [<span>romanized name</span>, 'kanji name']; [1] picks the
    # kanji-display name, then the space between family and given name is removed.
    name = str(soup.select('#profile div[class="txt"] h2')[0].contents[1]).replace(' ', '')
    # Cloudinary fetches the image directly from its URL.  Adapt this call
    # to your own storage backend if you are not using Cloudinary.
    cloudinary.uploader.upload(file=img, public_id="q-u46/member-images/"+name)
    return 'finished {} !!'.format(name)
def main():
    """Fetch and upload the profile image of every member, printing progress."""
    for member_url in get_mem_list():
        print(get_img(member_url), end=' | ')


if __name__ == "__main__":
    main()
get_keya_images.py
from bs4 import BeautifulSoup
import urllib
import cloudinary
import cloudinary.uploader
import os
# Read the Cloudinary credentials from environment variables so no secrets
# are hard-coded in the source (all three variables must be set beforehand).
cloudinary.config(
    cloud_name = os.environ.get("cloud_name"),
    api_key = os.environ.get("api_key"),
    api_secret = os.environ.get("api_secret")
)
def get_mem_list():
    """Return absolute URLs of every member's detail page on the Keyakizaka46 site.

    Duplicate links are removed via a set, so the resulting order is arbitrary.
    """
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    url = "https://www.keyakizaka46.com/s/k46o/search/artist?ima=0000"
    base = "https://www.keyakizaka46.com"
    # A browser-like User-Agent is required; the site rejects the default one.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    anchors = soup.select('div[class="sorted sort-default current"] li a')
    # hrefs are site-absolute paths; prepend the host and drop duplicates.
    return list({base + str(a.attrs['href']) for a in anchors})
def get_img(url):
    """Download one member's profile image and upload it to Cloudinary.

    url: absolute URL of the member's detail page.
    Returns a short status message containing the member's name.
    """
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    # A browser-like User-Agent is required; the site rejects the default one.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url=url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    img = soup.select('div[class="box-profile_img"] img')[0].attrs['src']
    name = str(soup.select('div[class="box-profile_text"] p[class="name"]')[0].text).replace(' ', '')
    # The scraped name may contain newlines, which would break the public_id;
    # splitlines() + join strips them out.
    cloudinary.uploader.upload(file=img, public_id='q-u46/member-images/'+''.join(name.splitlines()))
    return 'finished {} !!'.format(name)
def main():
    """Fetch and upload the profile image of every member, printing progress."""
    for member_url in get_mem_list():
        print(get_img(member_url), end=' | ')


if __name__ == "__main__":
    main()
get_hina_images.py
from bs4 import BeautifulSoup
import urllib
import cloudinary
import cloudinary.uploader
import os
# Read the Cloudinary credentials from environment variables so no secrets
# are hard-coded in the source (all three variables must be set beforehand).
cloudinary.config(
    cloud_name = os.environ.get("cloud_name"),
    api_key = os.environ.get("api_key"),
    api_secret = os.environ.get("api_secret")
)
def get_mem_list():
    """Return absolute URLs of every member's detail page on the Hinatazaka46 site.

    Duplicate links are removed via a set, so the resulting order is arbitrary.
    """
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    url = "https://www.hinatazaka46.com/s/official/search/artist?ima=0000"
    base = "https://www.hinatazaka46.com"
    # A browser-like User-Agent is required; the site rejects the default one.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    anchors = soup.select('ul[class="p-member__list"] li[class="p-member__item"] a')
    # hrefs are site-absolute paths; prepend the host and drop duplicates.
    return list({base + str(a.attrs['href']) for a in anchors})
def get_img(url):
    """Download one member's profile image and upload it to Cloudinary.

    url: absolute URL of the member's detail page.
    Returns a short status message containing the member's name.
    """
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    # A browser-like User-Agent is required; the site rejects the default one.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url=url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    img = soup.select('div[class="c-member__thumb c-member__thumb__large"] img')[0].attrs['src']
    name = str(soup.select('div[class="p-member__info"] div[class="p-member__info__head"] div[class="c-member__name--info"]')[0].text).replace(' ', '')
    # The scraped name may contain newlines, which would break the public_id;
    # splitlines() + join strips them out.
    cloudinary.uploader.upload(file=img, public_id='q-u46/member-images/'+''.join(name.splitlines()))
    return 'finished {} !!'.format(name)
def main():
    """Fetch and upload the profile image of every member, printing progress."""
    for member_url in get_mem_list():
        print(get_img(member_url), end=' | ')


if __name__ == "__main__":
    main()
I will mainly explain the code of Nogizaka.
The lines marked with a star (☆) are all you need if you only want to scrape; everything else is for saving to storage, so you can ignore it.
from bs4 import BeautifulSoup #☆
import urllib #☆
import cloudinary
import cloudinary.uploader
import os
# Read the Cloudinary credentials from environment variables so no secrets
# are hard-coded in the source (all three variables must be set beforehand).
cloudinary.config(
    cloud_name = os.environ.get("cloud_name"),
    api_key = os.environ.get("api_key"),
    api_secret = os.environ.get("api_secret")
)
Get all the links to the details page from the member introduction Top.
First, let's take a look at the html of the member introduction Top.
You can see that the child element of the <div id =" memberlist "class =" left ">
tag is likely to contain the desired image link.
And there are three child element <div class =" clearfix ">
tags in parallel.
The first was a block with all members other than the 4th gen, the second was empty, and the third was a 4th gen block.
Therefore, narrow the selection down with the CSS selector
`#memberlist div[class="clearfix"]`.
Next, take a look at the child elements of <div class =" clearfix ">
.
As shown below, there is a <div class =" unit ">
tag for each member, and the relative path is written in the a tag in it.
Therefore, it seems you can get each member's anchor tag with the selector
`#memberlist div[class="clearfix"] a`.
def get_mem_list():
    """Return absolute URLs of every member's detail page."""
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    url = "http://www.nogizaka46.com/member/"
    # If you do not set a browser-like User-Agent in headers, the request fails.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    # Get the list of elements that match the selector.
    anchors = soup.select('#memberlist div[class="clearfix"] a')
    # Each relative href starts with './'; drop it and prepend the base URL.
    # A tag's attributes are accessed with .attrs.
    return [url + str(a.attrs['href'])[2:] for a in anchors]
The image URL can be obtained from the `img` tag inside the `#profile` element:
img = soup.select('#profile img')[0].attrs['src']
It turned out to be something like this. Since the list type is returned by soup.select, it is fetched with [0].
You can see that it is an element of <div class =" txt ">
<h2>
in the <div id =" profile ">
tag.
The goal this time is to get the name as displayed in kanji, so the romanized
version inside the `<span>` should be ignored.
To do that, use `.contents[1]` on the h2 tag.
`h2.contents` is a list like `[<span>MAI SHIRAISHI</span>, 'Mai Shiraishi']`,
so specifying `[1]` picks out only the kanji-display name.
I also want to remove the space between the first and last name, so I will remove it with replace
.
# The argument is the member detail page URL.
def get_img(url):
    """Download one member's profile image and upload it to Cloudinary."""
    # 'import urllib' alone does not load the request submodule.
    import urllib.request

    # A browser-like User-Agent is required; the site rejects the default one.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url=url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as html:
        soup = BeautifulSoup(html, 'html.parser')
    img = soup.select('#profile img')[0].attrs['src']
    # contents[1] is the kanji-display name; remove the inner space as well.
    name = str(soup.select('#profile div[class="txt"] h2')[0].contents[1]).replace(' ', '')
    # The following depends on the save destination.
    cloudinary.uploader.upload(file=img, public_id="q-u46/member-images/"+name)
    return 'finished {} !!'.format(name)


def main():
    # List the links to the member detail pages.
    list_ = get_mem_list()
    # Loop the image-fetching function over every member page.
    for url in list_:
        print(get_img(url), end=' | ')


if __name__ == "__main__":
    main()
The selector is a little different, but the basic method is the same as Nogizaka.
For some reason the scraped name contained line breaks, which caused an error when naming the file.
So I stripped the line breaks with `''.join(name.splitlines())`.
(Splitting into a list and joining is a bit crude — please forgive me.)
Scraping is very convenient, so let's use it!
One caution: when sending a large number of requests, be sure to insert a delay
with `time.sleep` between them so that you do not overload the target server.
Recommended Posts