Target

Read the following items from the CSV file
Search keyword
Acquisition page
Minimum product price
Maximum product price
Get product information on eBay
Output the following file based on the acquired product information
CSV file
Product name
Product status
Selling price
Shipping
Total amount (sales price + shipping fee)
Dictionary file
Frequency of words appearing in product names
Part-speech classification of words appearing in product names
Graph
Frequency distribution of total amount (histogram)

Source code

from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
import csv
import pandas as pd
import datetime
from collections import Counter
import collections
import numpy as np
import matplotlib as mpl
mpl.use('TkAgg')  # or whatever other backend that you want
import matplotlib.pyplot as plt
import nltk
import time
import random
import re

# # Input Keywordkoda
# print("Input Keyword:")
# keyword = input().strip().replace(" ","+")

# # Input PageNumber
# print("Input PageNumber")
# page_num = int(input())
# # Input MaxPrice
# print("Input MaxPrice")
# x_max = int(input())

# Open Browser
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# options.add_argument('–-disable-dev-shm-usage')
# options.add_argument('--proxy-server="direct://"') #Connect directly instead of via Proxy
# options.add_argument('--disable-web-security')
driver = webdriver.Chrome(executable_path="/Users/micksmith/home/work/eBay/Python/chromedriver", chrome_options=options)

names = []
statuses = []
prices = []
shippings = []
amounts = []
displayed_results = 200


Source_file = "/Users/micksmith/home/work/eBay/Python/movie anime.csv"
CSV_file_path = "/Users/micksmith/home/work/eBay/Python/CSV/"
keyword_file_path = "/Users/micksmith/home/work/eBay/Python/Keyword/"
analysis_file_path = "/Users/micksmith/home/work/eBay/Python/Analysis/"
hist_file_path = "/Users/micksmith/home/work/eBay/Python/Hist/"


with open(Source_file, "r") as f:
    reader = csv.reader(f)
    header = next(reader)
    
    for row in reader:
        keyword = row[0].strip().replace(" ","+")
        page_num = int(row[1])
        x_min = int(row[2])
        x_max = int(row[3])
        #print(page_num)
        #print(x_max)
        #driver.implicitly_wait(random.randint(10,20))
        
        # number_of_search
        url  = 'https://www.ebay.com/sch/i.html?_from=R40&_trksid=m570.l1313&_nkw={}&_sacat=0&_ipg={}'.format(keyword, displayed_results)
        driver.get(url)
        
        number_of_search = driver.find_element_by_css_selector('#mainContent > div.s-answer-region.s-answer-region-center-top > div > div.clearfix.srp-controls__row-2 > div > div.srp-controls__control.srp-controls__count > h1 > span:nth-child(1)').text
        number_of_search = int(re.sub("\\D", "", number_of_search))
        if(page_num * displayed_results > number_of_search):
            page_num = number_of_search // displayed_results + 1
        print("number_of_search:", number_of_search)
        print("page_num:", page_num)
        time.sleep(3)

        print(keyword, page_num, x_max)
        for i in range (1,page_num+1):
            url  = 'https://www.ebay.com/sch/i.html?_from=R40&_nkw={}&_sacat=0&rt=nc&LH_Sold=1&LH_Complete=1&_ipg=200&_pgn={}'.format(keyword, i)
            
            # driver.implicitly_wait(random.randint(1,3))
            
            #driver.implicitly_wait(1)
            driver.get(url)
            #print("debug")
            
            print("Page_num:", i)
            time.sleep(random.randint(1,3))
            
            items = driver.find_elements_by_class_name('s-item__info.clearfix')

            for item in items:
                # get Name
                try:
                    name = item.find_element_by_class_name('s-item__title').text
                    name = name.replace("NEW LISTING", "")
                    names.append(name)
                except:
                    names.append(" ")
                print(name)
                # #get Status
                try:
                    status = item.find_element_by_class_name('SECONDARY_INFO').text
                    statuses.append(status)
                except:
                    statuses.append(" ")
                # #get Price
                try:
                    price = item.find_element_by_class_name('s-item__price').text
                    if( 'to' in price):
                        price = price.replace(price, '0')
                        #print(price)
                    price = int(re.sub("\\D", "", price))
                    prices.append(price)
                except:
                    prices.append(" ")
                print(price)
                
                # #get ShippingCost
                try:
                    shipping = item.find_element_by_class_name('s-item__logisticsCost').text
                    shipping = int(re.sub("\\D", "", shipping))
                    # shipping = int(shipping)
                    shippings.append(shipping)
                except:
                    shipping = 0 
                    shippings.append(shipping)
                print(shipping)            
                
                amounts.append(price + shipping)
                #print(amount)

        #print(len(names))
        # TitleExtraction
        title = names
        titles = ' '.join(title)
        keyword = keyword.replace("+"," ")
        #print(mojiretsu)
        counter = collections.Counter(titles.split())
        d = counter.most_common(200)
        date = datetime.datetime.today().strftime("%Y%m%d")
        keyword_file_name = keyword_file_path +keyword + "_" + str(page_num) + "_" + date + ".txt"
        with open(keyword_file_name, 'w') as f:
            for key, value in d:
                f.write('{0} {1}\n'.format(key, value))

        # MorphologicalAnalysis
        morph = nltk.word_tokenize(titles)
        pos = nltk.pos_tag(morph)
        entitles = nltk.chunk.ne_chunk(pos)
        analysis_file_name = analysis_file_path  + keyword + "_" + str(number_of_search) + "_" + date + ".txt" 
        #print(type(entitles))
        with open(analysis_file_name, mode="w") as f:
            f.write(str(entitles))

        df = pd.DataFrame()
        df['name'] = names
        df['status'] = statuses
        df['price'] = prices
        df['shippingcost'] = shippings
        df['amount'] = amounts

        date = datetime.datetime.today().strftime("%Y%m%d")
        csv_file_name = CSV_file_path + keyword + "_" + str(number_of_search) + "_" + date + ".csv"

        df.to_csv(csv_file_name)


        y_min = 0       # MinimumFrequency
        y_max = number_of_search # MaximumFrequency
        interval = (x_max - x_min) // 20 # Interval
        bin_num =  (x_max - x_min) // interval # bins

        # Hists
        # plt.xlim(x_min, x_max)
        # plt.ylim(y_min, y_max)
        plt.title(keyword + "_" + str(number_of_search) + "_" + date)
        plt.xlabel("Price")
        plt.ylabel("Frequency")
        plt.xticks(np.arange(x_min, x_max, interval), rotation = 90)
        plt.hist(prices, range=(x_min, x_max), bins = bin_num)
        plt.savefig(hist_file_path + keyword + "_" + str(number_of_search) + "_" + date + ".png ")
        #plt.show()

    driver.quit()

result

analysis

The following error message occurs frequently

 Traceback (most recent call last):
  File "/Users/UserName/home/work/eBay/Python/eBay_Scraping.py", line 89, in <module>
    driver.get(url)
  File "/Users/UserName/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/selenium/webdriver/remote/webdriver.py", line 333, in get
    self.execute(Command.GET, {'url': url})
  File "/Users/UserName/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "/Users/UserName/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.InvalidSessionIdException: Message: invalid session id

Possible causes are as follows
Consecutive access with the same session ID will be blocked on the server
Considering how to deal with it

Python Scraping eBay

Target

Source code

result

analysis