Python Scraping eBay

Ziel

Quellcode

from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
import csv
import pandas as pd
import datetime
from collections import Counter
import collections
import numpy as np
import matplotlib as mpl
mpl.use('TkAgg')  # or whatever other backend that you want
import matplotlib.pyplot as plt
import nltk
import time
import random
import re

# # Input Keyword
# print("Input Keyword:")
# keyword = input().strip().replace(" ","+")

# # Input PageNumber
# print("Input PageNumber")
# page_num = int(input())
# # Input MaxPrice
# print("Input MaxPrice")
# x_max = int(input())

# Launch a headless Chrome session driven by the local chromedriver binary.
options = Options()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)
# Flags that the script keeps disabled:
#   --proxy-server="direct://"   (connect directly instead of via a proxy)
#   --disable-web-security
driver = webdriver.Chrome(
    executable_path="/Users/micksmith/home/work/eBay/Python/chromedriver",
    chrome_options=options,
)

# Result columns filled by the scraping loop: one entry is appended to each
# list for every listing element found on a result page.
names = []
statuses = []
prices = []
shippings = []
amounts = []  # price + shipping per listing
# Results requested per page (sent as the `_ipg` URL parameter) and used to
# clamp the number of pages to fetch.
displayed_results = 200


# Input CSV: a header row, then one row per search with
# keyword, page count, minimum price, maximum price (columns 0-3).
Source_file = "/Users/micksmith/home/work/eBay/Python/movie anime.csv"
# Output directories (each value ends with "/" because file names are
# appended by plain string concatenation).
CSV_file_path = "/Users/micksmith/home/work/eBay/Python/CSV/"  # scraped rows as CSV
keyword_file_path = "/Users/micksmith/home/work/eBay/Python/Keyword/"  # word-frequency lists
analysis_file_path = "/Users/micksmith/home/work/eBay/Python/Analysis/"  # NLTK chunk trees
hist_file_path = "/Users/micksmith/home/work/eBay/Python/Hist/"  # price histograms


from selenium.common.exceptions import WebDriverException

# Placeholder recorded when a field cannot be read from a listing. It is the
# same single-space marker the CSV output has always used for missing values.
_MISSING = " "


def _digits_as_int(text):
    """Return the integer spelled by the decimal digits in *text*.

    Every non-digit character is dropped, so "$12.99" becomes 1299.
    NOTE(review): presumably the min/max prices in the source CSV are
    expressed in that same digits-only unit -- confirm.

    Raises:
        ValueError: if *text* contains no digit at all.
    """
    return int(re.sub(r"\D", "", text))


def _read_item(item):
    """Scrape one search-result element.

    Each field is read independently so that one missing sub-element does
    not discard the rest of the listing.

    Returns:
        A ``(name, status, price, shipping)`` tuple. ``name`` and ``status``
        are strings, ``price`` is an int or ``_MISSING``, ``shipping`` is an
        int (0 when no shipping cost could be read).
    """
    try:
        name = item.find_element_by_class_name('s-item__title').text
        name = name.replace("NEW LISTING", "")
    except WebDriverException:
        name = _MISSING

    try:
        status = item.find_element_by_class_name('SECONDARY_INFO').text
    except WebDriverException:
        status = _MISSING

    try:
        price_text = item.find_element_by_class_name('s-item__price').text
        # A price range ("... to ...") has no single value; it is recorded as 0.
        price = 0 if 'to' in price_text else _digits_as_int(price_text)
    except (WebDriverException, ValueError):
        price = _MISSING

    try:
        shipping_text = item.find_element_by_class_name('s-item__logisticsCost').text
        shipping = _digits_as_int(shipping_text)
    except (WebDriverException, ValueError):
        # Element absent, or text without digits: count shipping as 0.
        shipping = 0

    return name, status, price, shipping


def _save_histogram(prices, x_min, x_max, title, out_path):
    """Write a price histogram over ``[x_min, x_max]`` to *out_path*.

    Non-integer entries (the ``_MISSING`` placeholder) are ignored. The range
    is split into roughly 20 bins; nothing is written when the range is empty.
    """
    if x_max <= x_min:
        print("skip histogram, empty price range:", x_min, x_max)
        return

    # max(1, ...) keeps ranges narrower than 20 from producing a zero step.
    interval = max(1, (x_max - x_min) // 20)
    bin_num = max(1, (x_max - x_min) // interval)
    numeric_prices = [p for p in prices if isinstance(p, int)]

    # Use a fresh figure per keyword so earlier histograms are not drawn into
    # this one, and close it afterwards to release it.
    plt.figure()
    plt.title(title)
    plt.xlabel("Price")
    plt.ylabel("Frequency")
    plt.xticks(np.arange(x_min, x_max, interval), rotation=90)
    plt.hist(numeric_prices, range=(x_min, x_max), bins=bin_num)
    plt.savefig(out_path)
    plt.close()


# For every row of the source CSV: scrape the sold/completed listings for the
# keyword, then write a word-frequency file, an NLTK chunk tree, a CSV of the
# scraped rows and a price histogram. The browser is always shut down, even
# when a row fails.
try:
    with open(Source_file, "r", newline="") as source:
        reader = csv.reader(source)
        next(reader, None)  # skip the header row

        for row in reader:
            if not row:
                continue  # tolerate blank lines in the source CSV

            keyword = row[0].strip().replace(" ", "+")
            page_num = int(row[1])
            x_min = int(row[2])
            x_max = int(row[3])

            # Start from empty columns for each keyword so one keyword's
            # output files never contain another keyword's listings.
            names = []
            statuses = []
            prices = []
            shippings = []
            amounts = []

            # Read the total result count to limit the number of pages.
            # NOTE(review): this count comes from the plain search URL, while
            # the pages scraped below add LH_Sold/LH_Complete filters.
            url = 'https://www.ebay.com/sch/i.html?_from=R40&_trksid=m570.l1313&_nkw={}&_sacat=0&_ipg={}'.format(keyword, displayed_results)
            driver.get(url)

            number_of_search = driver.find_element_by_css_selector('#mainContent > div.s-answer-region.s-answer-region-center-top > div > div.clearfix.srp-controls__row-2 > div > div.srp-controls__control.srp-controls__count > h1 > span:nth-child(1)').text
            number_of_search = int(re.sub(r"\D", "", number_of_search))
            # Ceiling division: an exact multiple must not add an extra page.
            available_pages = max(1, -(-number_of_search // displayed_results))
            page_num = min(page_num, available_pages)
            print("number_of_search:", number_of_search)
            print("page_num:", page_num)
            time.sleep(3)

            print(keyword, page_num, x_max)
            for i in range(1, page_num + 1):
                url = 'https://www.ebay.com/sch/i.html?_from=R40&_nkw={}&_sacat=0&rt=nc&LH_Sold=1&LH_Complete=1&_ipg={}&_pgn={}'.format(keyword, displayed_results, i)
                driver.get(url)

                print("Page_num:", i)
                # Random pause between page loads.
                time.sleep(random.randint(1, 3))

                items = driver.find_elements_by_class_name('s-item__info.clearfix')

                for item in items:
                    name, status, price, shipping = _read_item(item)
                    print(name)
                    print(price)
                    print(shipping)

                    names.append(name)
                    statuses.append(status)
                    prices.append(price)
                    shippings.append(shipping)
                    # No total can be computed without a price.
                    if isinstance(price, int):
                        amounts.append(price + shipping)
                    else:
                        amounts.append(_MISSING)

            keyword = keyword.replace("+", " ")
            date = datetime.datetime.today().strftime("%Y%m%d")
            result_tag = keyword + "_" + str(number_of_search) + "_" + date

            # Title extraction: the 200 most frequent words over all titles.
            titles = ' '.join(names)
            counter = collections.Counter(titles.split())
            keyword_file_name = keyword_file_path + keyword + "_" + str(page_num) + "_" + date + ".txt"
            with open(keyword_file_name, 'w') as keyword_file:
                for key, value in counter.most_common(200):
                    keyword_file.write('{0} {1}\n'.format(key, value))

            # Morphological analysis: tokenize, POS-tag and chunk the titles.
            morph = nltk.word_tokenize(titles)
            pos = nltk.pos_tag(morph)
            entitles = nltk.chunk.ne_chunk(pos)
            analysis_file_name = analysis_file_path + result_tag + ".txt"
            with open(analysis_file_name, mode="w") as analysis_file:
                analysis_file.write(str(entitles))

            # Scraped rows as CSV.
            df = pd.DataFrame()
            df['name'] = names
            df['status'] = statuses
            df['price'] = prices
            df['shippingcost'] = shippings
            df['amount'] = amounts
            df.to_csv(CSV_file_path + result_tag + ".csv")

            # Price histogram.
            _save_histogram(prices, x_min, x_max, result_tag,
                            hist_file_path + result_tag + ".png")
finally:
    driver.quit()

Ergebnis

Screen Shot 2020-04-13 at 15.41.42.png Screen Shot 2020-04-13 at 15.44.19.png Screen Shot 2020-04-13 at 15.45.06.png Screen Shot 2020-04-13 at 15.42.42.png

Analyse

 Traceback (most recent call last):
  File "/Users/UserName/home/work/eBay/Python/eBay_Scraping.py", line 89, in <module>
    driver.get(url)
  File "/Users/UserName/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/selenium/webdriver/remote/webdriver.py", line 333, in get
    self.execute(Command.GET, {'url': url})
  File "/Users/UserName/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "/Users/UserName/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.InvalidSessionIdException: Message: invalid session id

Recommended Posts

Python Scraping eBay
[Scraping] Python-Scraping
Python-Scraping-Memo
Python Scraping get_ranker_categories
Scraping mit Python
Scraping mit Python
Python Scraping get_title
Python: Scraping Teil 1
Scraping mit Python
Python: Scraping Teil 2
Scraping in Python (Vorbereitung)
UnicodeEncodeError: 'cp932' während des Python-Scrapings
Grundlagen der Python-Scraping-Grundlagen
Scraping mit Python + PhantomJS
Scraping mit Selenium [Python]
Scraping mit Python + PyQuery
Scraping von RSS mit Python
Scraping mit Python 3.5 async / await
Ich habe versucht, mit Python zu scrapen
Web Scraping mit Python + JupyterLab
Scraping mit Selenium in Python
Scraping mit Selenium + Python Teil 1
[Python] Scraping in AWS Lambda
Python-Superanfänger versucht zu scrapen
Python
Scraping mit Chromedriver in Python
Festliches Scraping mit Python, Scrapy
Scraping mit Python 3.5 Async-Syntax
Scraping mit Selenium in Python
Scraping mit Tor in Python
Web Scraping mit Selenium (Python)
Scraping der Wettervorhersage mit Python
Scraping mit Selenium + Python Teil 2
[Python + Selenium] Tipps zum Scraping
Ich habe versucht, mit Python zu scrapen
Web Scraping Anfänger mit Python
Scraping 1
Python Crawling & Scraping Kapitel 4 Zusammenfassung
Scraping mit Node, Ruby und Python
Scraping mit Selenium in Python (Basic)
Scraping mit Python, Selenium und Chromedriver
Web Scraping mit Python Erster Schritt
Ich habe versucht, WebScraping mit Python.
Scraping mit Python und Beautiful Soup
Lassen Sie uns mit Python Image Scraping durchführen
Holen Sie sich Qiita-Trends mit Python-Scraping
[Python] Memo zum Erstellen von Scraping-Tools
Web Scraping für Anfänger in Python (1)
Web Scraping für Anfänger in Python (4) -1
"Scraping & maschinelles Lernen mit Python" Lernnotiz
Holen Sie sich Wetterinformationen mit Python & Scraping
[Python] Informationen zu Scraping-Objektiven von price.com
Kafka Python
Abrufen von Eigenschaftsinformationen durch Scraping mit Python
Python-Grundlagen ⑤
Python-Zusammenfassung