Siehe zum npm-Installationsverfahren den folgenden Artikel:
$ sudo yum install -y bzip2
$ npm install --save phantomjs
selenium
$ pip install selenium
nose
$ pip install nose
sample
# -*- coding:utf-8 -*-
"""Log in to a local site with PhantomJS/Selenium and verify the session."""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import nose.tools as nose

# Account credentials used for the login form.
email = 'user'
password = 'password'

#############
# phantomjs
#############
# User agent: pretend to be desktop Chrome so the site serves its normal page.
user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36'
# Path to the PhantomJS binary installed locally via npm.
pjs_path = 'node_modules/phantomjs/bin/phantomjs'
dcap = {
    "phantomjs.page.settings.userAgent": user_agent,
    'marionette': True,
}
driver = webdriver.PhantomJS(executable_path=pjs_path, desired_capabilities=dcap)
# Explicit waits time out after 5 seconds.
wait = WebDriverWait(driver, 5)

try:
    #############
    # get html
    #############
    # login page
    login_page_url = 'http://127.0.0.1/sign_in'
    driver.get(login_page_url)
    # Wait until the page has loaded.
    wait.until(ec.presence_of_all_elements_located)
    # Check the current URL (NOTE(review): the asserted URL has port 8080 and
    # path /login while the request used neither — presumably the server
    # redirects; confirm against the deployment).
    nose.eq_('http://127.0.0.1:8080/login', driver.current_url)

    #############
    # login
    #############
    # Reveal the sign-in form.
    # find_element(By.…) replaces find_element_by_*, which was removed in Selenium 4.
    show_signin = driver.find_element(By.ID, 'showSignIn')
    show_signin.click()

    # email
    login_xpath = '//*[@id="user_email"]'
    # Wait until the target element is visible.
    wait.until(ec.visibility_of_element_located((By.XPATH, login_xpath)))
    # Fill in the e-mail field.
    login_id_form = driver.find_element(By.XPATH, login_xpath)
    login_id_form.clear()
    login_id_form.send_keys(email)

    # password
    password_xpath = '//*[@id="user_password"]'
    # Wait until the target element is visible.
    wait.until(ec.visibility_of_element_located((By.XPATH, password_xpath)))
    # Fill in the password field.
    password_form = driver.find_element(By.XPATH, password_xpath)
    password_form.clear()
    password_form.send_keys(password)

    # submit
    submit_xpath = '//*[@id="new_user"]/div[4]/input'
    driver.find_element(By.XPATH, submit_xpath).click()

    #############
    # result
    #############
    driver.get('http://127.0.0.1/users/edit')
    # Wait until the page has loaded.
    wait.until(ec.presence_of_all_elements_located)
    # Check the current URL.
    nose.eq_('http://127.0.0.1:8080/users/edit', driver.current_url)
    # The e-mail prefilled on the edit page confirms that we are logged in.
    user_email = driver.find_element(By.XPATH, '//*[@id="user_email"]').get_attribute("value")
    nose.eq_(email, user_email)
finally:
    # Always shut down PhantomJS, even when an assertion fails,
    # so no orphaned phantomjs processes are left behind.
    driver.quit()
BeautifulSoup — in Kombination mit BeautifulSoup können Sie das HTML problemlos analysieren.
$ pip install beautifulsoup4
Es ist ratsam, zusätzlich die folgenden Parser zu installieren. Referenz: Geben Sie den Parser in Beautiful Soup 4.x explizit an.
$ pip install html5lib
$ pip install lxml
# -*- coding:utf-8 -*-
"""Fetch a page with PhantomJS/Selenium and inspect it with BeautifulSoup."""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import nose.tools as nose
from bs4 import BeautifulSoup

#############
# phantomjs
#############
# User agent: pretend to be desktop Chrome so the site serves its normal page.
user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36'
# Path to the PhantomJS binary installed locally via npm.
pjs_path = 'node_modules/phantomjs/bin/phantomjs'
dcap = {
    "phantomjs.page.settings.userAgent": user_agent,
    'marionette': True,
}
driver = webdriver.PhantomJS(executable_path=pjs_path, desired_capabilities=dcap)
# Explicit waits time out after 5 seconds.
wait = WebDriverWait(driver, 5)

#############
# load page
#############
try:
    driver.get('http://127.0.0.1/users/edit')
    data = driver.page_source.encode('utf-8')
finally:
    # Shut PhantomJS down as soon as the HTML is captured; parsing
    # below no longer needs the browser.
    driver.quit()

#############
# parse html
#############
# Name the parser explicitly: relying on the default emits a warning and
# can pick a different parser per environment (see the article's note on
# html5lib/lxml).
html = BeautifulSoup(data, 'lxml')
print(html)
print(html.title)
print(html.title.string)
print(html.find('h1'))
print(html.find('select', {'id': 'hoge'}))
# -*- coding:utf-8 -*-
"""Scrape the tables of a Yahoo! Finance Japan price-history page with pandas."""
import pandas as pd

url = 'http://stocks.finance.yahoo.co.jp/stocks/history/?code=998407.O'
# pd.read_html is the public API; pd.io.html.read_html is a private
# accessor path and may break between pandas releases.
# flavor='bs4' selects BeautifulSoup as the underlying parser.
tables = pd.read_html(url, flavor='bs4')
print(tables[1])
Sie können nicht nur eine URL, sondern auch direkt den HTML-Quelltext analysieren.
# -*- coding:utf-8 -*-
"""Parse an in-memory HTML table with pandas instead of fetching a URL."""
from io import StringIO

import pandas as pd

html = '''
<html>
<body>
<table>
<tr><td>sample1</td></tr>
<tr><td>sample2</td></tr>
<tr><td>sample3</td></tr>
<tr><td>sample4</td></tr>
</table>
</body>
</html>
'''
# pd.read_html is the public API (pd.io.html is private). Wrap the literal
# in StringIO: passing raw HTML strings directly is deprecated in newer
# pandas versions.
tables = pd.read_html(StringIO(html), flavor='bs4')
print(tables[0])
PhantomJS with pandas
# -*- coding:utf-8 -*-
"""Fetch a page with PhantomJS, isolate a <table> with BeautifulSoup, load it into pandas."""
from io import StringIO

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from bs4 import BeautifulSoup
import pandas as pd

#############
# phantomjs
#############
# User agent: pretend to be desktop Chrome so the site serves its normal page.
user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36'
# Path to the PhantomJS binary installed locally via npm.
pjs_path = 'node_modules/phantomjs/bin/phantomjs'
dcap = {
    "phantomjs.page.settings.userAgent": user_agent,
    'marionette': True,
}
driver = webdriver.PhantomJS(executable_path=pjs_path, desired_capabilities=dcap)
# Explicit waits time out after 5 seconds.
wait = WebDriverWait(driver, 5)

#############
# load page
#############
try:
    driver.get('http://127.0.0.1/users/edit')
    data = driver.page_source.encode('utf-8')
finally:
    # Shut PhantomJS down once the HTML is captured so no orphaned
    # phantomjs process is left behind.
    driver.quit()

# parse: take the first <table> from the page and hand it to pandas.
soup = BeautifulSoup(data, 'lxml')
table = soup.find_all('table')[0]
# StringIO wrapper: passing a raw HTML string to read_html is deprecated
# in newer pandas versions.
df = pd.read_html(StringIO(str(table)))
print(df[0])
Recommended Posts