This time I'm going to scrape the box-office revenue of movies from Wikipedia. For how to set up the scraping environment, see this article (https://qiita.com/h1r0_1126/items/0989a2daf169c19adada).
For the time being, the source code and results are as follows.
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Page to get: the Japanese Wikipedia list of the highest-grossing films in Japan.
# The context manager closes the HTTP response as soon as parsing is done.
with urlopen('https://ja.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC%E6%AD%B4%E4%BB%A3%E8%88%88%E8%A1%8C%E5%8F%8E%E5%85%A5%E4%B8%8A%E4%BD%8D%E3%81%AE%E6%98%A0%E7%94%BB%E4%B8%80%E8%A6%A7') as html:
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed (and bs4 emits no warning).
    bs = BeautifulSoup(html, 'html.parser')

# Every <tr> inside the <tbody> of each table carrying class="wikitable".
get_info = bs.select('table.wikitable tbody tr')

movies = []
for index, info in enumerate(get_info):
    # Only rows with index 0..100 are collected. The limit is checked on the
    # row loop so that `break` really stops the scan instead of merely
    # leaving the cell loop and moving on to the next row.
    if index > 100:
        break
    # Rows without <td> cells contribute nothing to the list.
    for info_td in info.select('td'):
        # Unstripped cell text, so trailing newlines are kept.
        movies.append(info_td.text)

print(movies)
['Spirited Away\n', 'Toho\n', '2001\n', '\n', '308.0\n', 'Titanic\n', '20th Century FOX\n', '1997\n', '160.0\n', '262.0\n', 'Frozen[† 1]\n', 'Disney\n', '2014\n', '\n', '255.0\n', 'Your name is.\n', 'Toho\n', '2016\n', '\n', '250.3\n', 'Harry Potter and the Philosopher's Stone\n', 'Warner\n', '2001\n', '\n', '203.0\n', 'Howl's Moving Castle\n', 'Toho\n', '2004\n', '\n', '196.0\n', 'Princess Mononoke\n', 'Toho\n', '1997\n', '113.0\n', '193.0\n', 'Bayside Shakedown THE MOVIE 2 Block the Rainbow Bridge![† 2]\n', 'Toho\n', '2003\n', '\n', '173.5\n', 'Harry Potter and the Chamber of Secrets[† 3]\n', 'Warner\n', '2002\n', '\n', '173.0\n', 'Avatar\n', '20th Century FOX\n', '2009\n', '\n', '156.0\n', 'Ponyo on the cliff\n', 'Toho\n', '2008\n', '\n', '155.0\n', 'The Last Samurai\n', 'Warner\n', '2003\n', '\n', '137.0\n', 'E.T.[† 4][ † 5]\n', 'CIC\n', '1982\n', '96.2\n', '135.0\n', 'Armageddon\n', 'Disney\n', '1998\n', '83.5\n', '135.0\n', 'Harry Potter and the Prisoner of Azkaban\n', 'Warner\n', '2004\n', '\n', '135.0\n', 'Anna and the Snow Queen 2\n', 'Disney\n', '2019\n', '\n', '133.5\n', 'Bohemian Rhapsody\n', '20th Century FOX\n', '2018\n', '\n', '131.0\n', 'Jurassic Park[† 6]\n', 'UIP\n', '1993\n', '83.0\n', '128.5\n', 'Star Wars Episode 1/Phantom Menace\n', '20th Century FOX\n', '1999\n', '78.0\n', '127.0\n', 'beauty and the Beast\n', 'Disney\n', '2017\n', '\n', '124.0\n', 'Aladdin\n', 'Disney\n', '2019\n', '\n', '121.6\n', 'The Wind Rises\n', 'Toho\n', '2013\n', '\n', '120.2\n', 'Alice in Wonderland\n', 'Disney\n', '2010\n', '\n', '118.0\n', 'Star Wars/Force Awakening\n', 'Disney\n', '2015\n', '\n', '116.3\n', 'Antarctica\n', 'ヘラルド・Toho\n', '1983\n', '59.0\n', '110.0\n', 'Matrix Reloaded\n', 'Warner\n', '2003\n', '\n', '110.0\n', 'Finding Nemo\n', 'Disney\n', '2003\n', '\n', '110.0\n', 'Harry Potter and the Goblet of Fire\n', 'Warner\n', '2005\n', '\n', '110.0\n', 'Pirates of the Caribbean/World end\n', 'Disney\n', '2007\n', '\n', '109.0\n', 'Toy Story 3\n', 
'Disney\n', '2010\n', '\n', '108.0\n', 'Independence Day\n', '20th Century FOX\n', '1996\n', '66.5\n', '106.5\n', 'Lord of the Ring/The return of the king\n', 'Herald Shochiku\n', '2004\n', '\n', '103.2\n', 'Bayside Shakedown THE MOVIE\n', 'Toho\n', '1998\n', '53.0\n', '101.0\n', 'Toy story 4\n', 'Disney\n', '2019\n', '\n', '100.9\n', 'Pirates of the Caribbean/Deadman's chest\n', 'Disney\n', '2006\n', '\n', '100.2\n', 'The kitten story\n', 'Toho\n', '1986\n', '54.0\n', '98.0\n', 'Mission:Impossible 2\n', 'UIP\n', '2000\n', '\n', '97.0\n', 'Harry Potter and the Deathly Hallows PART2\n', 'Warner\n', '2011\n', '\n', '96.7\n', 'A.I.[† 7]\n', 'Warner\n', '2001\n', '\n', '96.6\n', 'Jurassic World\n', 'Toho東和\n', '2015\n', '\n', '95.3\n', 'Back to the Future PART2\n', 'UIP\n', '1989\n', '55.3\n', '95.0\n', 'Lost World/Jurassic Park\n', 'UIP\n', '1997\n', '58.0\n', '95.0\n', 'Harry Potter and the Knights of the Immortal Birds\n', 'Warner\n', '2007\n', '\n', '94.0\n', 'Monsters Inc\n', 'Disney\n', '2002\n', '\n', '93.7\n', 'Detective Conan Navy Blue Fist\n', 'Toho\n', '2019\n', '\n', '93.7\n', 'Star Wars エピソード2/Clone attack\n', '20th Century FOX\n', '2002\n', '\n', '93.5\n', 'Theatrical version Code Blue-Doctor Heli Emergency Lifesaving-\n', 'Toho\n', '2018\n', '\n', '93.2\n', 'The Borrower Arrietty\n', 'Toho\n', '2010\n', '\n', '92.5\n', 'Heaven and earth[† 8]\n', 'Toei\n', '1990\n', '50.5\n', '92.0\n', 'Big hero 6\n', 'Disney\n', '2014\n', '\n', '91.8\n', 'Detective Conan Zero Enforcer\n', 'Toho\n', '2018\n', '\n', '91.8\n', 'Star Wars エピソード3/Revenge of Sith\n', '20th Century FOX\n', '2005\n', '\n', '91.7\n', 'Lord of the Ring\n', 'Herald Shochiku\n', '2002\n', '\n', '90.7\n', 'Da Vinci Code\n', 'Sony PE\n', '2006\n', '\n', '90.5\n', 'Jose[† 9]\n', 'CIC\n', '1975\n', '50.2\n', '90.0\n', 'Monsters University\n', 'Disney\n', '2013\n', '\n', '89.6\n', 'Pirates of the Caribbean/Fountain of life\n', 'Disney\n', '2011\n', '\n', '88.7\n', 'Terminator 2\n', 'Toho東和\n', '1991\n', 
'57.5\n', '87.9\n', 'Eternal zero\n', 'Toho\n', '2013\n', '\n', '87.6\n', 'matrix\n', 'Warner\n', '1999\n', '50.0\n', '87.0\n', 'ROOKIES -graduate-\n', 'Toho\n', '2009\n', '\n', '85.5\n', 'Avoid love in the center of the world\n', 'Toho\n', '2004\n', '\n', '85.0\n', 'STAND BY ME Doraemon\n', 'Toho\n', '2014\n', '\n', '83.8\n', 'Shin Godzilla\n', 'Toho\n', '2016\n', '\n', '82.5\n', 'Dunhuang\n', 'Toho\n', '1988\n', '45.0\n', '82.0\n', 'Back to the Future PART3\n', 'UIP\n', '1990\n', '47.5\n', '82.0\n', 'Terminator 3\n', 'Toho東和\n', '2003\n', '\n', '82.0\n', 'HERO\n', 'Toho\n', '2007\n', '\n', '81.5\n', 'deep Impact\n', 'UIP\n', '1998\n', '47.2\n', '81.0\n', 'Jurassic World/Kingdom of Fire\n', 'Toho東和\n', '2018\n', '\n', '80.6\n', 'THE LAST MESSAGE Umizaru\n', 'Toho\n', '2010\n', '\n', '80.4\n', 'Harry Potter and the Half-Blood Prince\n', 'Warner\n', '2009\n', '\n', '80.0\n', 'Lord of the Ring/Two towers\n', 'Herald Shochiku\n', '2003\n', '\n', '79.0\n', 'The secret of the birth of the movie Yo-Kai Watch Nyan!\n', 'Toho\n', '2014\n', '\n', '78.0\n', 'Boys Over Flowers F\n', 'Toho\n', '2008\n', '\n', '77.5\n', 'Tales from Earthsea[† 10]\n', 'Toho\n', '2006\n', '\n', '76.9\n', 'Sixth Sense\n', 'Toho東和\n', '1999\n', '45.0\n', '76.8\n', 'Zootopia\n', 'Disney\n', '2016\n', '\n', '76.3\n', 'Spiderman\n', 'Sony PE\n', '2002\n', '\n', '75.0\n', 'Star Wars/The Last Jedi\n', 'Disney\n', '2017\n', '\n', '75.0\n', 'Indiana Jones/Last holy war\n', 'UIP\n', '1989\n', '44.0\n', '74.0\n', 'BRAVE HEARTS Umizaru\n', 'Toho\n', '2012\n', '\n', '73.3\n', 'Bayside Shakedown THE MOVIE3 ヤツらを解放せよ!\n', 'Toho\n', '2010\n', '\n', '73.1\n', 'Fantastic Beasts and the Wizard's Journey[† 11]\n', 'Warner\n', '2016\n', '\n', '73.1\n', 'Despicable Me Minion Escape\n', 'Toho東和\n', '2017\n', '\n', '73.1\n', 'Star Wars/Skywalker dawn\n', 'Disney\n', '2019\n', '\n', '72.7\n', 'Pocket Monsters Miu Two's Counterattack\n', 'Toho\n', '1998\n', '41.5\n', '72.4\n', 'Die Hard 3\n', '20th Century FOX\n', 
'1995\n', '48.0\n', '72.0\n', 'Spider-Man 3\n', 'Sony PE\n', '2007\n', '\n', '71.2\n', 'LIMIT OF LOVE Umizaru\n', 'Toho\n', '2006\n', '\n', '71.0\n', 'speed\n', '20th Century FOX\n', '1994\n', '45.0\n', '70.3\n', 'Ghost Busters\n', 'Columbia\n', '1984\n', '41.0\n', '70.0\n', 'Oceans 11\n', 'Warner\n', '2002\n', '\n', '69.0\n', 'Crimson love song from Detective Conan\n', 'Toho\n', '2017\n', '\n', '68.9\n', 'Pearl Harbor\n', 'Disney\n', '2001\n', '\n', '68.8\n', 'ONE PIECE FILM Z\n', 'Toei\n', '2012\n', '\n', '68.7\n', 'Narnia story/Chapter 1:Lion and witch\n', 'Disney\n', '2006\n', '\n', '68.6\n', 'Harry Potter and the Deathly Hallows PART1\n', 'Warner\n', '2010\n', '\n', '68.6\n', 'Finding Dory\n', 'Disney\n', '2016\n', '\n', '68.3\n', 'Pirates of the Caribbean/Cursed pirates\n', 'Disney\n', '2003\n', '\n', '68.0\n']
html = urlopen('https://ja.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC%E6%AD%B4%E4%BB%A3%E8%88%88%E8%A1%8C%E5%8F%8E%E5%85%A5%E4%B8%8A%E4%BD%8D%E3%81%AE%E6%98%A0%E7%94%BB%E4%B8%80%E8%A6%A7')
The line above specifies the page to fetch the data from. This time I am using Wikipedia.
get_info = bs.select('table.wikitable tbody tr')
Here we select every tr row inside the tbody of each table that has the class="wikitable" attribute.
This time I specified the following part of wikipedia
for index, info in enumerate(get_info):
    # Only rows with index 0..100 are collected; checking the limit on the
    # row loop lets `break` stop the scan instead of just skipping cells.
    if index > 100:
        break
    for info_td in info.select('td'):
        # Append the text of each td cell in the row.
        movies.append(info_td.text)
Loop over the retrieved tr rows and append only the text of each td cell to the list. Printing the list at the end shows the result.
Next time, I will create a graph from the acquired data, probably using d3.js.
Recommended Posts