from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
from bs4 import BeautifulSoup as BS
import re
import requests
# Scrape a slow-loading page with headless Chrome.
#
# Rather than relying on the full page load to finish, the script waits
# explicitly (with a long timeout) for one marker element and then hands the
# rendered HTML to BeautifulSoup.

# Chrome command-line switches, applied in order. Each switch is listed once
# and uses the explicit "--" prefix for consistency.
options = Options()
for chrome_flag in (
    "--no-sandbox",
    "--disable-extensions",
    "--headless",
    "--disable-gpu",
    "--ignore-certificate-errors",
    "--allow-running-insecure-content",
    "--disable-web-security",
    "--disable-desktop-notifications",
    "--lang=ja",
    "--blink-settings=imagesEnabled=false",
    "--disable-dev-shm-usage",
    # The arguments are not passed through a shell, so the value must not be
    # wrapped in quote characters of its own.
    "--proxy-server=direct://",
    "--proxy-bypass-list=*",
    "--start-maximized",
):
    options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=options)
try:
    # Placeholder URL from the original write-up; replace it with the page
    # that should be scraped.
    driver.get("http:///~~~")

    # To limit how much of the page has to be loaded, wait only until the
    # element with the class located just before the scraping target is
    # present. The timeout (in seconds) is deliberately set very high.
    wait = WebDriverWait(driver, 300)
    element = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "fluid"))
    )

    # From here on, read the rendered page and process it with BeautifulSoup.
    res = driver.page_source.encode('utf-8')
    print("loading")
finally:
    # Always shut the browser down, including when the wait times out, so no
    # headless Chrome process is left behind.
    driver.quit()

soup = BS(res, "html.parser")
Mit diesem expliziten Warten auf das Element konnte ich das Timeout vermeiden.
Recommended Posts