Python Scraping get_ranker_categories

Ziel

Quellcodedatei

get_ranker_categories.py


import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
import time
import csv

# Open Browser
# NOTE(review): `executable_path` and `chrome_options` are Selenium 3 keyword
# arguments; both were removed in Selenium 4 (which uses Service(...) and
# `options=`). The rest of this file also uses the Selenium 3 API
# (find_elements_by_class_name), so confirm the pinned selenium version
# before upgrading either.
options = Options()
# options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage') 
driver = webdriver.Chrome(executable_path="/Users/micksmith/home/work/eBay/Python/chromedriver", chrome_options=options)

# Absolute paths to the two CSV lookup tables used below:
#   word_dictionary.csv — before/after pairs consumed by exchange_words()
#   url_exchange.csv    — classifies category names into the /list-of/ vs
#                         /lists/ URL groups in the __main__ section
dictionary = "/Users/micksmith/home/work/eBay/Python/word_dictionary.csv"
url_exchange = "/Users/micksmith/home/work/eBay/Python/url_exchange.csv"

def get_categories():
    """Scrape category names from the ranker.com page currently loaded.

    Uses the module-level Selenium ``driver``; the caller must already have
    navigated to the page with ``driver.get``.

    Returns:
        tuple[list[str], list[str]]: ``(categories_entire, categories_channel)``
        where the first list holds the textContent of every ``site__subItem``
        element on the page and the second holds the ones nested inside
        ``-channels`` containers. The caller subtracts the second from the
        first to keep only non-channel categories.

    Note:
        The original version also scraped ``-entertainment`` and ``-nerdy``
        containers into lists that were never returned or used; that dead
        work has been removed.
    """
    def _sub_item_texts(container):
        # textContent of every 'site__subItem' node inside `container`.
        return [el.get_attribute("textContent")
                for el in container.find_elements_by_class_name('site__subItem')]

    # Every category entry on the page, regardless of section.
    categories_entire = [item.get_attribute("textContent")
                         for item in driver.find_elements_by_class_name('site__subItem')]

    # Entries nested under '-channels' sections; excluded by the caller.
    categories_channel = []
    for channel_container in driver.find_elements_by_class_name('-channels'):
        categories_channel.extend(_sub_item_texts(channel_container))

    return categories_entire, categories_channel

def exchange_words(word_dictionary, csv_path=None):
    """Translate words using a two-column before/after CSV mapping.

    Each CSV row is ``before,after``; a word equal to a ``before`` value is
    rewritten to the matching ``after`` value. Rows are applied in file
    order with a full sequential scan per word, so a replacement can itself
    be replaced by a later row — this preserves the original semantics.

    Args:
        word_dictionary: iterable of words to translate.
        csv_path: path of the mapping CSV. Defaults to the module-level
            ``dictionary`` path, keeping the original call signature working.

    Returns:
        list[str]: the translated words, in input order.
    """
    print("word_dictionary:", word_dictionary)
    if csv_path is None:
        csv_path = dictionary  # backward-compatible module-level default

    with open(csv_path, 'r') as f:
        pairs = [(row[0], row[1]) for row in csv.reader(f)]

    word_results = []
    for word in word_dictionary:
        for before, after in pairs:
            if word == before:
                word = after
                print(word)  # trace each substitution, as the original did
        word_results.append(word)

    return word_results


if __name__ == "__main__":

    # Load the ranker.com front page and pull the navigation categories.
    url = "https://www.ranker.com/"
    driver.get(url)
    entire, channel = get_categories()

    # Drop the channel entries, then normalize the remaining category names
    # through the word_dictionary.csv mapping. The set() difference also
    # deduplicates, so the order of `items` is arbitrary.
    items = exchange_words(list(set(entire) - set(channel)))

    # url_exchange.csv classifies categories into two URL groups:
    # column 0 -> https://www.ranker.com/list-of/<name>
    # column 1 -> https://www.ranker.com/lists/<name>
    word_list_of = []
    word_lists = []
    with open(url_exchange, 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            word_list_of.append(row[0])
            word_lists.append(row[1])

    list_of = []
    lists = []
    for item in items:
        for col_list_of, col_lists in zip(word_list_of, word_lists):
            if item == col_list_of:
                list_of.append(col_list_of)
            elif item == col_lists:
                lists.append(col_lists)

    # Anything not classified by url_exchange.csv falls into the /tags/ group.
    tags = set(items) - set(list_of + lists)

    def visit_all(base, names):
        # Visit <base><name>?ref=mainnav for each name (spaces become dashes).
        for name in names:
            name = name.replace(" ", "-")
            url = base + name + "?ref=mainnav"
            driver.get(url)
            print("URL:", url)
            time.sleep(5)  # crude rate limit / wait for the page to load

    visit_all("https://www.ranker.com/list-of/", list_of)
    visit_all("https://www.ranker.com/lists/", lists)
    visit_all("https://www.ranker.com/tags/", tags)

    driver.quit()

url_exchange.csv


"film","albums"
"tv","beverages"
"comics",""
"tech",""
"science",""
"cars",""
"arts",""
"books",""

word_dictionary.csv


"movies","film"
"celebrity","celebrities"
"watchworthy","what to watch"
"anime","anime underground"
"cartoons","animated"
"athletes","best athletes"
"family","parenting"
"career","jobs"
"automotive","cars"
"art","arts"
"deep thoughts","thought provoking"
"libations","alcohol"
"healthy eating","dieting"

Ergebnis

URL: https://www.ranker.com/list-of/arts?ref=mainnav
URL: https://www.ranker.com/list-of/tech?ref=mainnav
URL: https://www.ranker.com/list-of/tv?ref=mainnav
…
URL: https://www.ranker.com/tags/college-sports?ref=mainnav

Analyse

Aufgabe

Recommended Posts

Python Scraping get_ranker_categories
[Scraping] Python-Scraping
Python-Scraping-Memo
Scraping mit Python
Scraping mit Python
Python Scraping eBay
Python Scraping get_title
Python: Scraping Teil 1
Scraping mit Python
Python: Scraping Teil 2
Scraping in Python (Vorbereitung)
Versuchen Sie es mit Python.
UnicodeEncodeError: 'cp932' während des Python-Scrapings
Grundlagen der Python-Scraping-Grundlagen
Scraping mit Python + PhantomJS
Schaben mit Selen [Python]
Scraping mit Python + PyQuery
Scraping von RSS mit Python
Scraping mit Python 3.5 async / await
Ich habe versucht, mit Python zu kratzen
Web Scraping mit Python + JupyterLab
Schaben mit Selen in Python
Schaben mit Selen + Python Teil 1
[Python] Scraping in AWS Lambda
Python Super Anfänger versucht zu kratzen
Python
Schaben mit Chromedriver in Python
Festliches Scraping mit Python, Scrapy
Scraping mit Python 3.5 Async-Syntax
Scraping mit Selen in Python
Scraping mit Tor in Python
Web Scraping mit Selenium (Python)
Kratzwettervorhersage mit Python
Schaben mit Selen + Python Teil 2
[Python + Selen] Tipps zum Scraping
Ich habe versucht, mit Python zu kratzen
Web Scraping Anfänger mit Python
Schaben 1
Python Crawling & Scraping Kapitel 4 Zusammenfassung
Versuchen Sie es mit Python + Beautiful Soup
Scraping mit Node, Ruby und Python
Scraping mit Selen in Python (Basic)
Scraping mit Python, Selen und Chromedriver
Web Scraping mit Python Erster Schritt
Ich habe versucht, WebScraping mit Python.
Kratzen mit Python und schöner Suppe
Lassen Sie uns mit Python Image Scraping durchführen
Holen Sie sich Qiita-Trends mit Python-Scraping
[Python] Memo zum Erstellen von Scraping-Tools
Web Scraping für Anfänger in Python (1)
Web Scraping für Anfänger in Python (4) -1
"Scraping & maschinelles Lernen mit Python" Lernnotiz
Holen Sie sich Wetterinformationen mit Python & Scraping
[Python] Informationen zu Scraping-Objektiven von price.com
Kafka Python
Abrufen von Eigenschaftsinformationen durch Scraping mit Python
Python-Grundlagen ⑤