from urllib.request import urlopen
from bs4 import BeautifulSoup
from pprint import pprint

URL = 'http://news.yahoo.co.jp/'
with urlopen(URL) as res:
    html = res.read().decode("utf-8")

soup = BeautifulSoup(html, 'html.parser')
titles = soup.select('.ttl a')  # get the matching DOM elements
titles = [t.contents[0] for t in titles]  # extract the text
pprint(titles)
Output result
>>>
['Mr. Trump "Judiciary too much"',
'PKO Daily Report Special Defense Inspection Instructed',
'Administration mistake at the hospital Temporary cardiopulmonary arrest',
'200 junior high school students in special clothing Fukuoka',
'Dutch ruling party to maintain first party',
'Rare with a probability of 1 / 320,000 at WBC',
"Nakai's passionate office without denying",
"Mr. Watase's strongest legend and the existence of his brother"]
BeautifulSoup4 element selection methods
There are also find and find_all, but select is the one used most often here.
If None comes back when retrieving the text, try .string, .text, .contents[0], etc.:
http://stackoverflow.com/questions/20750852/beautifulsoup-4-python-string-returns-none
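A minimal self-contained sketch of the three lookup styles and the text-extraction fallbacks (the markup is made up for illustration):

from bs4 import BeautifulSoup

html = '<ul><li class="ttl"><a href="/a">First article</a></li></ul>'
soup = BeautifulSoup(html, 'html.parser')

print(soup.find('a'))          # first matching tag only
print(soup.find_all('a'))      # list of every matching tag
links = soup.select('.ttl a')  # CSS selector syntax; also returns a list

a = links[0]
print(a.string)       # 'First article' -- becomes None when the tag has nested tags
print(a.text)         # concatenation of all descendant text; never None
print(a.contents[0])  # first child node of the tag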
from urllib.request import urlopen
from bs4 import BeautifulSoup
from pprint import pprint
import csv
import re

tenki = []
URL = "http://weather.livedoor.com/forecast/rss/area/130010.xml"
with urlopen(URL) as res:
    html = res.read().decode("utf-8")

soup = BeautifulSoup(html, "html.parser")
for item in soup.find_all("item"):
    title = item.find("title").string
    if title.find("[ PR ]") == -1:  # skip the ad entries
        text = re.sub(r'\[?.+\]\s', '', title)  # drop the bracketed [ ... ] prefix
        result = text.split(' - ')
        tenki.append(result)
pprint(tenki)
Output result
>>>
[['Tokyo', 'Cloudy', 'Maximum temperature ℃', 'July 23 (Sun)'],
 ['Tokyo', 'Cloudy', 'Maximum temperature 32℃', 'July 24 (Mon)'],
 ['Tokyo', 'Cloudy', 'Maximum temperature 34℃', 'July 25 (Tue)'],
 ['Tokyo', 'Cloudy and sometimes sunny', 'Maximum temperature 33℃', 'July 26 (Wed)'],
 ['Tokyo', 'Cloudy and sometimes sunny', 'Maximum temperature 31℃', 'July 27 (Thu)'],
 ['Tokyo', 'Cloudy and sometimes sunny', 'Maximum temperature 32℃', 'July 28 (Fri)'],
 ['Tokyo', 'Cloudy and sometimes sunny', 'Maximum temperature 32℃', 'July 29 (Sat)'],
 ['Tokyo', 'Cloudy and sometimes sunny', 'Maximum temperature 32℃', 'July 30 (Sun)']]
Use html.parser as the standard parser; the lxml parser seems to throw errors occasionally. https://www.crummy.com/software/BeautifulSoup/bs4/doc/
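If you still want to try lxml (it is faster but must be installed separately), you can fall back defensively. A small sketch, where make_soup is a hypothetical helper rather than anything from this article:

from bs4 import BeautifulSoup, FeatureNotFound

def make_soup(html):
    # Try lxml first; fall back to the built-in html.parser
    # if lxml is not installed.
    try:
        return BeautifulSoup(html, "lxml")
    except FeatureNotFound:
        return BeautifulSoup(html, "html.parser")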
with open('weather.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['city', 'status', 'max', 'date'])
    writer.writerows(tenki)
weather.csv
city,status,max,date
Tokyo,Cloudy,Maximum temperature ℃,July 23 (Sun)
Tokyo,Cloudy,Maximum temperature 32℃,July 24 (Mon)
Tokyo,Cloudy,Maximum temperature 34℃,July 25 (Tue)
Tokyo,Cloudy and sometimes sunny,Maximum temperature 33℃,July 26 (Wed)
Tokyo,Cloudy and sometimes sunny,Maximum temperature 31℃,July 27 (Thu)
Tokyo,Cloudy and sometimes sunny,Maximum temperature 32℃,July 28 (Fri)
Tokyo,Cloudy and sometimes sunny,Maximum temperature 32℃,July 29 (Sat)
Tokyo,Cloudy and sometimes sunny,Maximum temperature 32℃,July 30 (Sun)
CSV->JSON
import csv
import json

rows = []
with open('weather.csv', 'r') as csvfile, open('weather.json', 'w') as jsonfile:
    fieldnames = ('city', 'status', 'max', 'date')
    reader = csv.DictReader(csvfile, fieldnames)
    for index, row in enumerate(reader):
        if index == 0:
            continue  # skip the header row ({"city": "city", ...})
        rows.append(row)
    json.dump(rows, jsonfile, ensure_ascii=False, indent=2)
weather.json (note: with ensure_ascii=False the Japanese text would be written as raw UTF-8; the \uXXXX escapes shown below are what the default ensure_ascii=True produces)
[
{
"date": "7\u670823\u65e5(\u65e5)",
"status": "\u66c7\u308a",
"city": "\u6771\u4eac",
"max": "\u6700\u9ad8\u6c17\u6e29\u2103"
},
{
"date": "7\u670824\u65e5(\u6708)",
"status": "\u66c7\u308a",
"city": "\u6771\u4eac",
"max": "\u6700\u9ad8\u6c17\u6e2932\u2103"
},
...
]
# Scraping ics.media
import os
from urllib.request import urlopen
from bs4 import BeautifulSoup

URL = "https://ics.media/"
with urlopen(URL) as res:
    html = res.read().decode("utf-8")

soup = BeautifulSoup(html, "html.parser")

# Get the images (and titles) of the new articles
topics = soup.select(".topicsContainer")[0].nextSibling
topics_urls = topics.select(".thumb img")
topics_ttls = topics.select(".entryTitle a")
img_urls = [e["src"] for e in topics_urls]
img_ttls = [e.string for e in topics_ttls]

"""
# If src is a relative path, convert it to an absolute path
# (list comprehension + ternary operator; see the urljoin sketch
# after the reference links below)
img_urls = [u if u.find("http") == 0 else URL + u for u in img_urls]
"""

# Save the images
img_dir = "images"
if not os.path.exists(img_dir):
    os.mkdir(img_dir)
for i, url in enumerate(img_urls):
    print("article" + str(i + 1), img_ttls[i])
    print(url)
    with urlopen(url) as res:
        img = res.read()
        with open(img_dir + "/entry_image%d.png" % (i + 1), "wb") as f:
            f.write(img)
Output result
>>>
Article 1 Basic knowledge of CSS design that web creators should be aware of
https://ics.media/wp-content/uploads/2017/03/170315_eyecatch-640x256.jpg
Article 2 Expressed only with CSS3! I made a micro-interaction that can be used with copy
https://ics.media/wp-content/uploads/2017/03/main-640x256.png
Article 3 The quality of advertising websites is amazing! 5 domestic sites that have become a hot topic recently
https://ics.media/wp-content/uploads/2017/03/170227_web_trend_1611_1702_eye-640x256.jpg
... (Omitted) ...
Getting sibling elements: http://tdoc.info/beautifulsoup/
os module: https://docs.python.jp/3/library/os.html
enumerate function (looping with an index): http://python.civic-apps.com/zip-enumerate/
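On the commented-out relative-to-absolute conversion above: the standard library's urllib.parse.urljoin handles cases the naive string check misses (paths starting with "/", "../" segments, protocol-relative "//" URLs). A minimal sketch with made-up example paths:

from urllib.parse import urljoin

URL = "https://ics.media/"
img_urls = ["/wp-content/a.png", "https://cdn.example.com/b.jpg"]

# urljoin resolves relative paths against the base URL and leaves
# absolute URLs untouched.
img_urls = [urljoin(URL, u) for u in img_urls]
print(img_urls)
# ['https://ics.media/wp-content/a.png', 'https://cdn.example.com/b.jpg']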
Chrome DevTools: Settings > Preferences > Debugger > Disable JavaScript
If you disable JS, reload, and the elements disappear, the site's DOM is generated dynamically by JavaScript.
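The same check can be scripted: fetch the page with plain urlopen (which executes no JavaScript) and count the selector matches. A small sketch using the URL and selector from the Selenium example below:

from urllib.request import urlopen
from bs4 import BeautifulSoup

URL = "https://dokusho-ojikan.jp/original/#!top"
with urlopen(URL) as res:
    html = res.read().decode("utf-8")

# No JS has run here, so if the selector matches nothing but the
# element is visible in the browser, the DOM is built client-side.
static_hits = BeautifulSoup(html, "html.parser").select("#unique-pickup img")
print("matches without JS:", len(static_hits))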
This time I will use PhantomJS and Selenium.
$ brew install phantomjs
$ pip3 install --upgrade selenium
from selenium import webdriver
from bs4 import BeautifulSoup

URL = "https://dokusho-ojikan.jp/original/#!top"
driver = webdriver.PhantomJS()
driver.get(URL)
html = driver.page_source
driver.quit()  # release the PhantomJS process

bs = BeautifulSoup(html, "html.parser")
img_urls = [img.get("src") for img in bs.select("#unique-pickup img")]
for u in img_urls:
    print(u)
Output result
>>>
https://cdn.om.dokusho-ojikan.jp/img/1f387c10-a8f8-11e6-8a10-525431b7cd60.jpg
https://cdn.om.dokusho-ojikan.jp/img/98c2e066-9a44-11e5-9ae2-52540e2de543.png
https://cdn.om.dokusho-ojikan.jp/img/7adbba1b-344b-11e5-be53-5254f877f35f.jpg
... (Omitted) ...
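Note: PhantomJS development has since been discontinued, and newer Selenium releases have removed support for it. A hedged sketch of the same scrape with headless Chrome instead (assumes Chrome and a matching chromedriver are installed):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

URL = "https://dokusho-ojikan.jp/original/#!top"

options = Options()
options.add_argument("--headless")  # run Chrome without a visible window

driver = webdriver.Chrome(options=options)
try:
    driver.get(URL)
    html = driver.page_source
finally:
    driver.quit()  # always release the browser process

bs = BeautifulSoup(html, "html.parser")
for img in bs.select("#unique-pickup img"):
    print(img.get("src"))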