Python Scraping get_ranker_categories

Target

Source code file

get_ranker_categories.py


import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
import time
import csv

# Open Browser
# Chrome instance shared, through the module-level ``driver``, by every
# scraping step in this script.
options = Options()
# options.add_argument('--headless')  # uncomment to pass Chrome the headless flag
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# ``options=`` is the supported keyword; ``chrome_options=`` is deprecated.
driver = webdriver.Chrome(
    executable_path="/Users/micksmith/home/work/eBay/Python/chromedriver",
    options=options,
)

# CSV of (word as scraped, replacement word) pairs, read by exchange_words().
dictionary = "/Users/micksmith/home/work/eBay/Python/word_dictionary.csv"
# CSV whose first column lists words served under /list-of/ and whose second
# column lists words served under /lists/ (read by the __main__ section).
url_exchange = "/Users/micksmith/home/work/eBay/Python/url_exchange.csv"

def get_categories():
    """Collect category names from the page currently loaded in ``driver``.

    Two groups are read from the page, each as the ``textContent`` of the
    matching elements:

    * every element with class ``site__subItem`` on the page, and
    * the ``site__subItem`` elements nested inside elements with class
      ``-channels``.

    The ``-entertainment`` and ``-nerdy`` groups that used to be scraped here
    were never returned or used, so those DOM queries have been dropped.

    Returns:
        tuple[list, list]: ``(categories_entire, categories_channel)``.
    """
    categories_entire = [
        item.get_attribute("textContent")
        for item in driver.find_elements_by_class_name('site__subItem')
    ]

    categories_channel = [
        content.get_attribute("textContent")
        for section in driver.find_elements_by_class_name('-channels')
        for content in section.find_elements_by_class_name('site__subItem')
    ]

    return categories_entire, categories_channel

def exchange_words(word_dictionary, dictionary_path=None):
    """Replace words according to a two-column CSV of (before, after) pairs.

    Each word is compared against the CSV rows in file order; on an exact
    match it is replaced by the second column and the scan continues with the
    replaced value, so a later row may replace it again. Words that match no
    row are returned unchanged.

    Args:
        word_dictionary: Iterable of words to translate.
        dictionary_path: Path of the CSV file to read. Defaults to the
            module-level ``dictionary`` path when omitted.

    Returns:
        list: The translated words, in the order they were given.
    """
    print("word_dictionary:", word_dictionary)
    if dictionary_path is None:
        dictionary_path = dictionary

    replacements = []
    # newline='' lets the csv module do its own newline handling.
    with open(dictionary_path, 'r', newline='') as f:
        for row in csv.reader(f):
            # Blank lines and one-column rows carry no mapping; skipping them
            # avoids an IndexError on row[0] / row[1].
            if len(row) < 2:
                continue
            replacements.append((row[0], row[1]))

    word_results = []
    for word in word_dictionary:
        for before, after in replacements:
            if word == before:
                word = after
                print(word)
        word_results.append(word)

    return word_results


if __name__ == "__main__":
    # Script entry point: read the category names from the Ranker home page,
    # translate them with exchange_words(), sort them into the three URL
    # sections (/list-of/, /lists/, /tags/) and visit each resulting page.
    try:
        url = "https://www.ranker.com/"
        driver.get(url)
        entire, channel = get_categories()

        # Keep only the categories that are not channel entries, then apply
        # the word replacements.
        items = exchange_words(list(set(entire) - set(channel)))

        # Each usable row is (word served under /list-of/, word served under
        # /lists/); rows with fewer than two columns are skipped instead of
        # raising IndexError.
        url_words = []
        with open(url_exchange, 'r', newline='') as f:
            for row in csv.reader(f):
                if len(row) >= 2:
                    url_words.append((row[0], row[1]))

        list_of = []
        lists = []
        for item in items:
            for list_of_word, lists_word in url_words:
                if item == list_of_word:
                    list_of.append(list_of_word)
                elif item == lists_word:
                    lists.append(lists_word)

        # Everything that matched neither column is treated as a tag.
        tags = set(items) - set(list_of + lists)

        sections = (("list-of", list_of), ("lists", lists), ("tags", tags))
        for section, names in sections:
            for item in names:
                item = item.replace(" ", "-")
                url = "https://www.ranker.com/" + section + "/" + item + "?ref=mainnav"
                driver.get(url)
                print("URL:", url)
                # Pause between page loads.
                time.sleep(5)
    finally:
        # Always close the browser, even if a step above raised.
        driver.quit()

url_exchange.csv


"film","albums"
"tv","beverages"
"comics",""
"tech",""
"science",""
"cars",""
"arts",""
"books",""

word_dictionary.csv


"movies","film"
"celebrity","celebrities"
"watchworthy","what to watch"
"anime","anime underground"
"cartoons","animated"
"athletes","best athletes"
"family","parenting"
"career","jobs"
"automotive","cars"
"art","arts"
"deep thoughts","thought provoking"
"libations","alcohol"
"healthy eating","dieting"

result

URL: https://www.ranker.com/list-of/arts?ref=mainnav
URL: https://www.ranker.com/list-of/tech?ref=mainnav
URL: https://www.ranker.com/list-of/tv?ref=mainnav
…
URL: https://www.ranker.com/tags/college-sports?ref=mainnav

analysis

Task

Recommended Posts

Python Scraping get_ranker_categories
[Scraping] Python scraping
Python scraping notes
Scraping with Python
Scraping with Python
Python Scraping eBay
Python Scraping get_title
Python: Scraping Part 1
Scraping using Python
Python: Scraping Part 2
Scraping with Python (preparation)
Summary about Python scraping
Try scraping with Python.
UnicodeEncodeError:'cp932' during python scraping
Basics of Python scraping
Scraping with Python + PhantomJS
Scraping with Selenium [Python]
Python web scraping selenium
Scraping with Python + PyQuery
Scraping RSS with Python
Scraping using Python 3.5 async / await
I tried scraping with Python
Web scraping with python + JupyterLab
Scraping with selenium in Python
Scraping with Selenium + Python Part 1
[Python] Scraping in AWS Lambda
python super beginner tries scraping
Web scraping notes in python3
Python
Scraping with chromedriver in python
Festive scraping with Python, scrapy
Scraping using Python 3.5 Async syntax
Scraping with Selenium in Python
Scraping with Tor in Python
Web scraping using Selenium (Python)
Scraping weather forecast with python
Scraping with Selenium + Python Part 2
[Python + Selenium] Tips for scraping
I tried scraping with python
Web scraping beginner with python
Scraping 1
Python Crawling & Scraping Chapter 4 Summary
Try scraping with Python + Beautiful Soup
Scraping with Node, Ruby and Python
Web scraping with Python ① (Scraping prior knowledge)
Scraping with Selenium in Python (Basic)
Scraping with Python, Selenium and Chromedriver
Web scraping with Python First step
I tried web scraping with python.
Scraping with Python and Beautiful Soup
Let's do image scraping with Python
Get Qiita trends with Python scraping
[Python] Creating a scraping tool Memo
Beginners use Python for web scraping (1)
Beginners use Python for web scraping (4) ―― 1
"Scraping & machine learning with Python" Learning memo
Get weather information with Python & scraping
[Python] Scraping lens information from Kakaku.com
kafka python
Get property information by scraping with python
Python basics ⑤