[PYTHON] Scraping sample

Get Yahoo! News titles

from urllib.request import urlopen
from bs4 import BeautifulSoup
from pprint import pprint

URL = 'http://news.yahoo.co.jp/'
with urlopen(URL) as res:
    html = res.read().decode("utf-8")

soup = BeautifulSoup(html, 'html.parser')

titles = soup.select('.ttl a')  # get the title link elements
titles = [t.contents[0] for t in titles]  # extract the text
pprint(titles)

Output result


>>>
['Mr. Trump "Judiciary too much"',
 'PKO Daily Report Special Defense Inspection Instructed',
 'Administration mistake at the hospital Temporary cardiopulmonary arrest',
 '200 junior high school students in special clothing Fukuoka',
 'Dutch ruling party to maintain first party',
 'Rare with a probability of 1 / 320,000 at WBC',
 'Nakai's passionate office without denying',
 'Mr. Watase's strongest legend and the existence of his brother']

Selecting elements in BeautifulSoup4: find and find_all are also available, but select (CSS selector syntax) is the one most often used.

If None is returned when retrieving text, try .string, .text, .contents[0], etc.: http://stackoverflow.com/questions/20750852/beautifulsoup-4-python-string-returns-none
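
A quick illustration of how the three differ when a tag has more than one child node (a standalone snippet, not tied to the Yahoo! page):

from bs4 import BeautifulSoup

a = BeautifulSoup('<a href="#">Title<span>NEW</span></a>', 'html.parser').a
print(a.string)       # None -- .string is None when the tag has multiple children
print(a.text)         # 'TitleNEW' -- concatenation of all descendant strings
print(a.contents[0])  # 'Title' -- the first child node only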

Get the weather forecast and save it as CSV / JSON

Get the weather

from urllib.request import urlopen
from bs4 import BeautifulSoup
from pprint import pprint
import csv
import re

tenki = []
URL = "http://weather.livedoor.com/forecast/rss/area/130010.xml"

with urlopen(URL) as res:
    html = res.read().decode("utf-8")

soup = BeautifulSoup(html, "html.parser")
for item in soup.find_all("item"):
    title = item.find("title").string
    if title.find("[ PR ]") == -1:  # skip ad items
        text = re.sub(r'\[?.+\]\s', '', title)  # strip the bracketed [ ... ] prefix
        result = text.split(' - ')
        tenki.append(result)
pprint(tenki)

Output result


>>>
[['Tokyo', 'Cloudy', 'Maximum temperature ℃', 'July 23 (Sun)'],
 ['Tokyo', 'Cloudy', 'Maximum temperature 32 ℃', 'July 24 (Mon)'],
 ['Tokyo', 'Cloudy', 'Maximum temperature 34 ℃', 'July 25 (Tue)'],
 ['Tokyo', 'Cloudy and sometimes sunny', 'Maximum temperature 33 ℃', 'July 26 (Wed)'],
 ['Tokyo', 'Cloudy and sometimes sunny', 'Maximum temperature 31 ℃', 'July 27 (Thu)'],
 ['Tokyo', 'Cloudy and sometimes sunny', 'Maximum temperature 32 ℃', 'July 28 (Fri)'],
 ['Tokyo', 'Cloudy and sometimes sunny', 'Maximum temperature 32 ℃', 'July 29 (Sat)'],
 ['Tokyo', 'Cloudy and sometimes sunny', 'Maximum temperature 32 ℃', 'July 30 (Sun)']]

Use html.parser as the basic choice; lxml seems to throw errors occasionally: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
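
If you do want lxml's speed, one option is to fall back to the built-in parser when lxml is not installed; a minimal sketch (FeatureNotFound is the exception bs4 raises when the requested parser is missing):

from bs4 import BeautifulSoup, FeatureNotFound

try:
    soup = BeautifulSoup(html, "lxml")         # faster, but needs the third-party lxml package
except FeatureNotFound:
    soup = BeautifulSoup(html, "html.parser")  # always available in the standard library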

Save as CSV

with open('weather.csv','w',newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['city','status','max','date'])
    writer.writerows(tenki)

weather.csv


city,status,max,date
Tokyo,Cloudy,Maximum temperature ℃,July 23 (Sun)
Tokyo,Cloudy,Maximum temperature 32 ℃,July 24 (Mon)
Tokyo,Cloudy,Maximum temperature 34 ℃,July 25 (Tue)
Tokyo,Cloudy and sometimes sunny,Maximum temperature 33 ℃,July 26 (Wed)
Tokyo,Cloudy and sometimes sunny,Maximum temperature 31 ℃,July 27 (Thu)
Tokyo,Cloudy and sometimes sunny,Maximum temperature 32 ℃,July 28 (Fri)
Tokyo,Cloudy and sometimes sunny,Maximum temperature 32 ℃,July 29 (Sat)
Tokyo,Cloudy and sometimes sunny,Maximum temperature 32 ℃,July 30 (Sun)

CSV->JSON

import csv
import json

rows = []
fieldnames = ('city', 'status', 'max', 'date')
with open('weather.csv', 'r') as csvfile:
    reader = csv.DictReader(csvfile, fieldnames)
    for index, row in enumerate(reader):
        if index == 0:
            continue  # skip the header row ({"city": "city", ...} is not needed)
        rows.append(row)

with open('weather.json', 'w') as jsonfile:
    json.dump(rows, jsonfile, ensure_ascii=False, indent=2)

weather.json


[
  {
    "date": "July 23 (Sun)",
    "status": "Cloudy",
    "city": "Tokyo",
    "max": "Maximum temperature ℃"
  },
  {
    "date": "July 24 (Mon)",
    "status": "Cloudy",
    "city": "Tokyo",
    "max": "Maximum temperature 32 ℃"
  },
  ...
]
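
The CSV round trip is not actually required; the tenki list built above can be dumped straight to JSON. A minimal sketch:

import json

keys = ('city', 'status', 'max', 'date')
with open('weather.json', 'w') as f:
    json.dump([dict(zip(keys, row)) for row in tenki], f, ensure_ascii=False, indent=2)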

Get images of new articles on ics.media and save locally

# scraping ics.media
import os
from urllib.request import urlopen
from bs4 import BeautifulSoup

URL = "https://ics.media/"
with urlopen(URL) as res:
    html = res.read().decode("utf-8")
    
soup = BeautifulSoup(html, "html.parser")

# get the images (and titles) of the new articles
topics = soup.select(".topicsContainer")[0].nextSibling
topics_urls = topics.select(".thumb img")
topics_ttls = topics.select(".entryTitle a")
img_urls = [e["src"] for e in topics_urls]
img_ttls = [e.string for e in topics_ttls]

"""
#If it is a relative path, convert it to an absolute path
#List comprehension, ternary operator
img_urls = [u if u.find("http") == 0 else URL + u for u in img_urls]
"""

# save the images
img_dir = "images"
if not os.path.exists(img_dir):
    os.mkdir(img_dir)

for i, url in enumerate(img_urls):
    print("Article %d" % (i + 1), img_ttls[i])
    print(url)
    with urlopen(url) as res:
        img = res.read()
        with open(img_dir + "/entry_image%d.png" % (i + 1), "wb") as f:
            f.write(img)

Output result


>>>
Article 1 Basic knowledge of CSS design that web creators should be aware of
https://ics.media/wp-content/uploads/2017/03/170315_eyecatch-640x256.jpg
Article 2 Expressed only with CSS3! I made a micro-interaction that can be used with copy
https://ics.media/wp-content/uploads/2017/03/main-640x256.png
Article 3 The quality of advertising websites is amazing! 5 domestic sites that have become a hot topic recently
https://ics.media/wp-content/uploads/2017/03/170227_web_trend_1611_1702_eye-640x256.jpg
... (Omitted) ...
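
Incidentally, the commented-out relative-to-absolute conversion above is handled more robustly by urllib.parse.urljoin from the standard library, which also resolves forms like //cdn.example.com/a.png and ../a.png correctly; a minimal sketch:

from urllib.parse import urljoin

img_urls = [urljoin(URL, u) for u in img_urls]  # absolute URLs pass through unchanged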

Getting sibling elements: http://tdoc.info/beautifulsoup/

os module: https://docs.python.jp/3/library/os.html

enumerate function (loop with index): http://python.civic-apps.com/zip-enumerate/

Scraping a site that renders the DOM with JavaScript

Chrome DevTools: Settings > Preferences > Debugger > Disable JavaScript

If an element disappears when you disable JavaScript and reload, the site generates its DOM dynamically with JS.

This time I will use PhantomJS and Selenium.

$ brew install phantomjs
$ pip3 install --upgrade selenium

from selenium import webdriver
from bs4 import BeautifulSoup
from pprint import pprint

URL = "https://dokusho-ojikan.jp/original/#!top"

driver = webdriver.PhantomJS()
driver.get(URL)
html = driver.page_source

bs = BeautifulSoup(html, "html.parser")
img_urls = [img.get("src") for img in bs.select("#unique-pickup img")]
for u in img_urls:
    print(u)

Output result


>>>
https://cdn.om.dokusho-ojikan.jp/img/1f387c10-a8f8-11e6-8a10-525431b7cd60.jpg
https://cdn.om.dokusho-ojikan.jp/img/98c2e066-9a44-11e5-9ae2-52540e2de543.png
https://cdn.om.dokusho-ojikan.jp/img/7adbba1b-344b-11e5-be53-5254f877f35f.jpg
... (Omitted) ...
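
Note that PhantomJS development has since been suspended and recent Selenium releases no longer support it; the same flow works with headless Chrome instead. A minimal sketch, assuming Chrome and a matching chromedriver are installed:

from selenium import webdriver
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.add_argument("--headless")         # no visible browser window
driver = webdriver.Chrome(options=options)
driver.get("https://dokusho-ojikan.jp/original/#!top")
html = driver.page_source
driver.quit()

bs = BeautifulSoup(html, "html.parser")
for img in bs.select("#unique-pickup img"):
    print(img.get("src"))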
