Python
https://github.com/yuzuru-program/scraping-python-yahoo
index.py
import urllib.request as request
from bs4 import BeautifulSoup
# Fetch the Yahoo! JAPAN top page and print every entry of the first list
# matched by the selector below: the entry text, its link target, then a
# blank line.
req = request.Request(
    "https://www.yahoo.co.jp",
    None,
    {}
)
# Use the response as a context manager so the HTTP connection is released
# as soon as the document has been parsed.
with request.urlopen(req) as instance:
    soup = BeautifulSoup(instance, "html.parser")

# The selector depends on the page markup; fail with a readable message
# instead of an IndexError when nothing matches.
lists = soup.select('main article section ul')
if not lists:
    raise SystemExit("no element matched 'main article section ul'")

li = lists[0].select('li')
for m in li:
    print(m.text)
    # An <li> is not guaranteed to contain a link; skip the URL line if absent.
    anchor = m.select_one("a")
    if anchor is not None:
        print(anchor.get("href"))
    print()
Node.js
https://github.com/yuzuru-program/scraping-node-yahoo
package.json
{
  "dependencies": {
    "cheerio": "^1.0.0-rc.3",
    "node-fetch": "^2.6.0"
  }
}
index.js
const fetch = require('node-fetch');
const cheerio = require('cheerio');
/**
 * Fetch the Yahoo! JAPAN top page and print, for every <li> of the first
 * list matched by 'main article section ul', the item text, its link target
 * (when it has one) and a blank line.
 *
 * Logs and returns early when the request fails or the status is not 200.
 */
const main = async () => {
  // Send a GET request to https://www.yahoo.co.jp/
  let _ret;
  try {
    _ret = await fetch('https://www.yahoo.co.jp/', {
      method: 'get',
      headers: {
        'User-Agent':
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
      },
      referrer: '',
    });
  } catch (err) {
    // Network-level failure: there is no response object to inspect, so stop
    // here instead of dereferencing undefined below.
    console.log(err);
    return;
  }
  if (_ret.status !== 200) {
    console.log(`error status:${_ret.status}`);
    return;
  }
  // Load the HTML so it can be queried with a jQuery-like API
  const $ = cheerio.load(await _ret.text());
  const _li = $('main article section ul').eq(0).find('li');
  // Print the Yahoo top news
  _li.each((i, el) => {
    const item = $(el);
    console.log(item.text());
    // attr('href') yields undefined when the item has no anchor / no href,
    // whereas attr() with no argument would throw on an empty selection.
    const href = item.find('a').attr('href');
    if (href !== undefined) {
      console.log(href);
    }
    console.log();
  });
};
// Report any late failure (e.g. while reading the body) instead of leaving
// an unhandled promise rejection.
main().catch((err) => {
  console.log(err);
});
PHP
https://github.com/yuzuru-program/scraping-php-yahoo
index.php
<?php
require_once './phpQuery-onefile.php';
/**
 * Fetch a URL with cURL and return the response body.
 *
 * @param string $url URL to request.
 * @return string|false Response body, or false when the transfer fails
 *                      (the cURL error text is written to the error log).
 */
function my_curl($url)
{
  $cp = curl_init();
  /*option:Return the response from curl_exec() as a string instead of printing it*/
  curl_setopt($cp, CURLOPT_RETURNTRANSFER, 1);
  /*option:Get the redirected page when redirected*/
  curl_setopt($cp, CURLOPT_FOLLOWLOCATION, 1);
  /*option:Specify the URL*/
  curl_setopt($cp, CURLOPT_URL, $url);
  /*option:Specify the timeout time (seconds)*/
  curl_setopt($cp, CURLOPT_TIMEOUT, 30);
  /*option:Specify a user agent*/
  curl_setopt($cp, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36');
  $data = curl_exec($cp);
  if ($data === false) {
    /* Keep returning false for callers, but leave a trace of why it failed. */
    error_log('my_curl: ' . curl_error($cp));
  }
  curl_close($cp);
  return $data;
}
// Download the Yahoo! JAPAN top page with my_curl() and hand the result to phpQuery.
// NOTE(review): my_curl() returns whatever curl_exec() returned, which may be
// false on failure; that value is passed to newDocument() unchecked.
$url = 'https://www.yahoo.co.jp';
$doc = phpQuery::newDocument(my_curl($url));
// Narrow the document to 'main article section', then to "ul:eq(0)"
// (presumably the first <ul>; confirm against phpQuery's :eq semantics).
$ul = $doc->find('main article section')->find("ul:eq(0)");
// Walk the <li> items by index. The "li" query and its count are re-evaluated
// on every iteration, and each item is looked up again with "li:eq($i)".
for ($i = 0; $i < count($ul->find("li")); ++$i) {
  $li = $ul->find("li:eq($i)");
  // Print the item text followed by a newline.
  // NOTE(review): relies on phpQuery's array access ($li[0]); verify what it returns.
  echo  $li[0]->text();
  echo "\n";
  // Print the href attribute of the anchor inside the item, then a blank line.
  echo $li[0]->find("a")->attr('href').PHP_EOL;
  echo "\n";
}
?>
phpQuery-onefile.php https://github.com/yuzuru-program/scraping-php-yahoo/blob/master/phpQuery-onefile.php
Ruby
https://github.com/yuzuru-program/scraping-ruby-yahoo
index.rb
require "nokogiri"
require "open-uri"
# Fetch the Yahoo! JAPAN top page and print, for every <li> of the first list
# matched by the selector, the item text, its link target and a blank line.
#
# URI.open is used because open-uri's Kernel#open override for URLs was
# deprecated in Ruby 2.7 and removed in Ruby 3.0. The block form closes the
# stream once the document has been parsed.
doc = URI.open("https://www.yahoo.co.jp") { |io| Nokogiri::HTML(io) }

# The selector depends on the page markup; stop with a readable message
# instead of a NoMethodError on nil when nothing matches.
news_list = doc.css("main article section ul")[0]
abort("no element matched 'main article section ul'") if news_list.nil?

news_list.css("li").each do |li|
  puts li.content
  # An <li> is not guaranteed to contain a link; skip the URL line if absent.
  anchor = li.at_css("a")
  puts anchor[:href] if anchor
  puts
end
Go
https://github.com/yuzuru-program/scraping-go-yahoo
index.go
package main
import (
  "fmt"
  "log"
  "net/http"
  "github.com/PuerkitoBio/goquery"
)
// main fetches the Yahoo! JAPAN top page and prints, for every <li> of the
// first list matched by "main article section ul", the item text followed by
// its link target (when it has one) and a blank line.
//
// Any request, status or parse failure terminates the program via log.Fatal.
func main() {
  req, err := http.NewRequest("GET", "https://www.yahoo.co.jp", nil)
  if err != nil {
    log.Fatal(err)
  }
  req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36")

  // res is nil when Do fails, so the error must be checked before use.
  res, err := http.DefaultClient.Do(req)
  if err != nil {
    log.Fatal(err)
  }
  if res.StatusCode != http.StatusOK {
    // log.Fatalf exits without running deferred calls, so close explicitly.
    res.Body.Close()
    log.Fatalf("status code error: %d %s", res.StatusCode, res.Status)
  }

  // The body is closed right after parsing, before any Fatal below can exit.
  doc, err := goquery.NewDocumentFromReader(res.Body)
  res.Body.Close()
  if err != nil {
    // doc is unusable on error; continuing would dereference nil.
    log.Fatal(err)
  }

  li := doc.Find("main article section ul").Eq(0).Find("li")
  li.Each(func(index int, s *goquery.Selection) {
    fmt.Println(s.Text())
    // Attr's second result reports whether the attribute exists; it is not
    // an error value.
    if href, ok := s.Find("a").Attr("href"); ok {
      fmt.Println(href)
    }
    fmt.Println()
  })
}
VBA
'Microsoft HTML Object Library
'Microsoft Internet Controls
'Force-terminates iexplore.exe through taskkill, then waits two seconds
'(presumably to give the processes time to exit before the caller continues).
Function IeProcessKill()
    Dim wsh As Object

    Set wsh = CreateObject("WScript.Shell")
    wsh.Exec "taskkill.exe /F /IM iexplore.exe"

    Application.Wait Now + TimeValue("0:00:2")
End Function
'Yahoo top scraping
'Opens https://www.yahoo.co.jp/ in a hidden Internet Explorer instance and
'prints, for every <li> of the first element matched by
'"main article section ul", the trimmed item text, its link target (when it
'has one) and a blank line to the Immediate window.
Sub main()
    Dim ie As InternetExplorer
    Dim newsList As Object
    Dim tmp As Object
    Dim anchors As Object

    'Delete IE process
    Call IeProcessKill

    'IE startup
    Set ie = New InternetExplorer

    'Hide site
    ie.Visible = False

    Debug.Print "Loading..."
    Debug.Print

    'Yahoo
    ie.Navigate "https://www.yahoo.co.jp/"

    'Wait for the page to finish loading; DoEvents yields to the host so the
    'application stays responsive instead of spinning in a tight loop.
    Do While ie.Busy = True Or ie.readyState < READYSTATE_COMPLETE
        DoEvents
    Loop

    'querySelector returns Nothing when the markup does not match
    Set newsList = ie.document.querySelector("main article section ul")
    If Not newsList Is Nothing Then
        For Each tmp In newsList.getElementsByTagName("li")
            Debug.Print Trim(tmp.textContent)
            'An item is not guaranteed to contain a link
            Set anchors = tmp.getElementsByTagName("a")
            If anchors.Length > 0 Then
                Debug.Print anchors(0).href
            End If
            Debug.Print
        Next tmp
    End If

    'Browser close
    ie.Quit
    Set ie = Nothing
End Sub
Recommended Posts