Use python to scrape the BicCamera website.
I couldn't find any good articles on scraping this site, so I wrote the code myself and made a YouTube video about it — part of my practice to become a YouTuber who surpasses Hikakin.
[Link] https://www.youtube.com/watch?v=SZuNFDzJndA&list=PLzPCqF-heFHyFb_aoqnXc8GrECL6yMdvZ
Scraping is done at your own risk — please be careful, as incidents like the following can occur. Okazaki Municipal Central Library incident: https://ja.wikipedia.org/wiki/%E5%B2%A1%E5%B4%8E%E5%B8%82%E7%AB%8B%E4%B8%AD%E5%A4%AE%E5%9B%B3%E6%9B%B8%E9%A4%A8%E4%BA%8B%E4%BB%B6
Also, if BicCamera changes its site UI, this code will very likely stop working, so please leave a comment if it breaks and I may fix it.
scraping_biccamera.py
import re
import time
import urllib
import urllib.parse
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
def get_html(url, timeout=30):
    """
    Fetch *url* and return it parsed as a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Page to download.
    timeout : float, optional
        Seconds to wait for the server; without this, requests.get can
        block forever on a stalled connection.

    Raises
    ------
    requests.HTTPError
        On a 4xx/5xx response, instead of silently parsing the error
        page and producing empty product data downstream.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    return bs(res.content, "html.parser")
def next_url(res_bs):
    """
    Return the absolute URL of the next result page, or None.

    Scans the pager elements (class ``bcs_l``) of the parsed page
    *res_bs*; the first one whose text contains "Next" and that holds
    an ``<a>`` tag yields the next page.  Returns None when there is no
    further page.
    """
    domain = "https://www.biccamera.com/"
    # find_all is the non-deprecated spelling of findAll; the local
    # result is no longer named next_url, which shadowed this function.
    for pager in res_bs.find_all(class_="bcs_l"):
        if "Next" in pager.text and pager.find("a"):
            return domain + pager.find("a").get("href")
    return None
def product_df(res_bs):
    """
    Extract every product listing from a parsed BicCamera result page
    and return it as a pandas DataFrame.

    Columns: item_url, title, picture, maker, price, point, stock,
    ratings, terms.  Optional fields fall back to "0" (or
    "no ship info" for the delivery terms) when absent.
    """
    def text_or(node, fallback):
        # Text of an optional element, or the fallback when missing.
        return node.text if node else fallback

    records = []
    # One box per product on the page.
    boxes = res_bs.findAll(class_=re.compile(r"prod_box sku*"))
    for box in boxes:
        # The "cssopa" anchor wraps the whole product card; reuse it
        # instead of looking it up once per field.
        anchor = box.find(class_="cssopa")
        thumb = anchor.find("img")
        star = box.find(class_="bcs_star")
        records.append({
            "item_url": anchor.get("href"),
            "title": thumb.get("alt"),
            "picture": thumb.get("src"),
            "maker": box.find(class_="bcs_maker").text,
            "price": box.find(class_=re.compile(r"bcs_price*")).text,
            "point": text_or(box.find(class_="bcs_point"), "0"),
            "stock": text_or(box.find(class_=re.compile(r"label_*")), "0"),
            "ratings": star.find("a").text if star else "0",
            "terms": text_or(box.find(class_="bcs_ship"), "no ship info"),
        })
    return pd.DataFrame(records)
def get_product_list(url, pages=10):
    """
    Follow "Next" links starting at *url* and collect every product.

    Parameters
    ----------
    url : str
        First search-result page to scrape.
    pages : int, optional
        Maximum number of pages to visit (default 10).

    Returns
    -------
    pandas.DataFrame
        All products from the visited pages, concatenated in order.
    """
    all_df = pd.DataFrame()
    for page in range(pages):
        res_bs = get_html(url)
        df = product_df(res_bs)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to accumulate frames.
        all_df = pd.concat([all_df, df])
        # Progress report so long runs are visibly advancing.
        print("all_df:", len(all_df))
        print(f"sleeping.....{page}Time")
        # Wait 5 seconds between requests to avoid overloading the site.
        time.sleep(5)
        # URL of the next page, or None when we reached the last one.
        url = next_url(res_bs)
        if url is None:
            print("break")
            break
    return all_df
if __name__ == "__main__":
    # Search-result page to scrape (query "laptop", 100 items per page).
    url = "https://www.biccamera.com/bc/category/?q=laptop&rowPerPage=100#bcs_resultTxt"
    # Walk every result page and collect the products.
    df = get_product_list(url)
    # Recover the search keyword from the URL's query string.
    qs = urllib.parse.urlparse(url).query
    kw = urllib.parse.parse_qs(qs)
    query = kw.get("q")[0]
    # Timestamped output name; the .csv extension was missing before,
    # which produced an extensionless file.
    today = datetime.today().strftime("%Y%m%d_%H%M%S")
    df.to_csv(f"{today}_biccamera_{query}.csv", index=False)
    # Pass pages=N to change how many pages are visited, e.g.:
    # df = get_product_list(url, pages=20)
I won't explain the above code line by line here, so if you are interested, please watch the video.
Also, BicCamera is slightly harder to scrape than most sites (it requires re.compile), which makes it good teaching material — I hope you come to understand the principle and enjoy your scraping life.
As a YouTuber, I will keep covering data preprocessing, saving, visualization, and more, working toward "making a living from what you like".
Recommended Posts