For the past few days I've been hooked on pulling data from every news site I could find. I cleaned up the code I wrote into a reasonably handy class, so I'm publishing it here. Of course, some sites don't allow bots to crawl them, so be careful when using this. I try to fetch from the following sites.
Asahi Shimbun, Nikkei Shimbun, Sankei Shimbun, Yomiuri Shimbun, Mainichi Shimbun, Yahoo! News, CNN, Bloomberg, BBC, Reuters, Wall Street Journal, Forbes Japan, Newsweek, CNN.co.jp, ABC News, Ministry of Foreign Affairs, AFP BB, NHK News, Nikkan Kogyo Shimbun, EUROPA NEWSWIRE, United Nations Information Centre, OPCW News, HAARETZ, THE DAILY STAR, INDEPENDENT, JETRO, Yukan Fuji
Pages are fetched with the requests library and parsed with BS4; the date is extracted with regular expressions and datetime's strptime and turned into a datetime object. The granularity varies: some sites can only be resolved to the day, others down to the minute. Strictly speaking, anything that can only be resolved to the day should be a date object, but I haven't gone that far. The inconsistent variable naming is also terrible. It's a script in the spirit of "give me a rough update date with the highest possible hit rate." I also wonder whether there is any way to get the update date without something like this, so I'd love to hear comments from anyone familiar with the topic.
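Before the class itself, here is the core pattern every parser below follows, as a minimal sketch. The selector and format string are placeholders for illustration, not taken from any specific site:

import datetime
import re

import bs4
import requests

def fetch_timestamp(url):
    # Fetch the page and parse it with BeautifulSoup
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = bs4.BeautifulSoup(resp.content, "lxml")
    # Pull the date text out of the DOM (selector is just an example)
    text = soup.select("time")[0].get_text()
    # Regex out something like "2020/02/15 11:30" and convert via strptime
    m = re.search(r"\d{4}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}", text)
    if m is None:
        return None
    return datetime.datetime.strptime(m.group(), "%Y/%m/%d %H:%M")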
news_timestamp.py
import bs4
import requests
import datetime
import re
from jeraconv import jeraconv


class ScrapeNewsTimestamp:
    def __init__(self):
        # Browser-like headers so sites don't reject the request outright
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        }
    def scrape_return_timestamp_1(self, bs4Obj):
        try:
            # Asahi Shimbun
            news_timestamp = datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y年%m月%d日%H時%M分")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_2(self, bs4Obj):
        try:
            # Nikkei Shimbun "Time is Money" column
            news_timestamp = datetime.datetime.strptime(bs4Obj.select('time')[1].string, "%Y年%m月%d日%H:%M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_3(self, bs4Obj):
        try:
            # Nikkei Shimbun overseas financial news
            news_timestamp = datetime.datetime.strptime(bs4Obj.select('.cmnc-publish')[0].string, "%Y/%m/%d %H:%M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_4(self, bs4Obj):
        try:
            # Nikkei Shimbun "Shunju" (Spring and Autumn) column
            news_timestamp = datetime.datetime.strptime(bs4Obj.select('.cmnc-publish')[0].string, "%Y/%m/%d付")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_5(self, bs4Obj):
        try:
            # Sankei Shimbun International
            news_timestamp = datetime.datetime.strptime(bs4Obj.select('#__r_publish_date__')[0].string, "%Y.%m.%d %H:%M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_6(self, bs4Obj):
        try:
            # Yomiuri Shimbun domestic
            news_timestamp = datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y/%m/%d %H:%M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_7(self, bs4Obj):
        try:
            # Mainichi Shimbun Tokyo morning edition
            news_timestamp = datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y年%m月%d日 東京朝刊")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_8(self, bs4Obj):
        try:
            # Mainichi Shimbun Tokyo evening edition
            news_timestamp = datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y年%m月%d日 東京夕刊")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_9(self, bs4Obj):
        try:
            # Mainichi Shimbun breaking news
            news_timestamp = datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y年%m月%d日%H時%M分")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_10(self, bs4Obj):
        try:
            # Mainichi Shimbun Premier
            news_timestamp = datetime.datetime.strptime(bs4Obj.select('time')[0].string, "%Y年%m月%d日")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_11(self, bs4Obj):
        try:
            # Yahoo! News
            m1 = re.match(r'\d{1,2}/\d{1,2}', str(bs4Obj.select('p.source')[0].string))
            tmp1 = m1.group()
            m2 = re.search(r'\d{1,2}:\d{1,2}', str(bs4Obj.select('p.source')[0].string))
            tmp2 = m2.group()
            # The page only shows month/day, so assume the current year
            news_timestamp = datetime.datetime.strptime(str(datetime.datetime.now().year) + tmp1 + ' ' + tmp2, "%Y%m/%d %H:%M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_12(self, bs4Obj):
        try:
            # CNN
            m1 = re.search(r'Updated (\d{4}) GMT', str(bs4Obj.select('.update-time')[0].getText()))
            m2 = re.search(r'(January|February|March|April|May|June|July|August|September|October|November|December) (\d{1,2}), (\d{4})', str(bs4Obj.select('.update-time')[0].getText()))
            news_timestamp_tmp = m2.groups()[2] + m2.groups()[1] + m2.groups()[0] + m1.groups()[0]
            news_timestamp = datetime.datetime.strptime(news_timestamp_tmp, "%Y%d%B%H%M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_13(self, bs4Obj):
        try:
            # Bloomberg
            timestamp_tmp = re.sub(' ', '', str(bs4Obj.select('time')[0].string))
            timestamp_tmp = re.sub('\n', '', timestamp_tmp)
            news_timestamp = datetime.datetime.strptime(timestamp_tmp, "%Y年%m月%d日%H:%MJST")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_14(self, bs4Obj):
        try:
            # BBC
            news_timestamp = datetime.datetime.strptime(bs4Obj.select("div.date.date--v2")[0].string, "%d %B %Y")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_15(self, bs4Obj):
        try:
            # Reuters
            m1 = re.match(r'(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}', str(bs4Obj.select(".ArticleHeader_date")[0].string))
            m2 = re.search(r'\d{1,2}:\d{1,2}', str(bs4Obj.select(".ArticleHeader_date")[0].string))
            news_timestamp = datetime.datetime.strptime(m1.group() + ' ' + m2.group(), "%B %d, %Y %H:%M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_16(self, bs4Obj):
        try:
            # Wall Street Journal
            m = re.sub(' ', '', str(bs4Obj.select(".timestamp.article__timestamp")[0].string))
            m = re.sub('\n', '', m)
            m = re.match(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec).(\d{1,2}),(\d{4})(\d{1,2}):(\d{1,2})', str(m))
            tmp = m.groups()
            timestamp_tmp = tmp[0] + ' ' + tmp[1].zfill(2) + ' ' + tmp[2] + ' ' + tmp[3].zfill(2) + ' ' + tmp[4].zfill(2)
            news_timestamp = datetime.datetime.strptime(timestamp_tmp, "%b %d %Y %H %M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_17(self, bs4Obj):
        try:
            # Forbes Japan
            news_timestamp = datetime.datetime.strptime(bs4Obj.select("time")[0].string, "%Y/%m/%d %H:%M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_18(self, bs4Obj):
        try:
            # Newsweek
            m = re.search(r'(\d{1,2})/(\d{1,2})/(\d{1,2}) at (\d{1,2}:\d{1,2}) ', str(bs4Obj.select('time')[0].string))
            tmp = m.groups()
            # Two-digit year, so prepend the century
            timestamp_tmp = tmp[0].zfill(2) + ' ' + tmp[1].zfill(2) + ' ' + '20' + tmp[2].zfill(2) + ' ' + tmp[3]
            news_timestamp = datetime.datetime.strptime(timestamp_tmp, "%m %d %Y %H:%M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_19(self, bs4Obj):
        try:
            # CNN.co.jp
            m1 = re.search(r'\d{4}.\d{2}.\d{2}', str(bs4Obj.select("div .metadata-updatetime")[0]))
            m2 = re.search(r'\d{1,2}:\d{1,2}', str(bs4Obj.select("div .metadata-updatetime")[0]))
            news_timestamp = datetime.datetime.strptime(m1.group() + ' ' + m2.group(), "%Y.%m.%d %H:%M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_20(self, bs4Obj):
        try:
            # BBC 2
            news_timestamp = datetime.datetime.strptime(bs4Obj.select("div.date.date--v2")[0].string, "%Y年%m月%d日")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_21(self, bs4Obj):
        try:
            # ABC News
            tmp = re.match(r'(January|February|March|April|May|June|July|August|September|October|November|December) (\d{1,2}), (\d{4}), (\d{1,2}:\d{1,2}) (AM|PM)', bs4Obj.select(".Byline__Meta.Byline__Meta--publishDate")[0].string)
            # %I/%p handles the 12-hour clock, including the 12 AM/PM edge cases
            news_timestamp = datetime.datetime.strptime(' '.join(tmp.groups()), "%B %d %Y %I:%M %p")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_22(self, bs4Obj):
        try:
            # Ministry of Foreign Affairs (MOFA)
            j2w = jeraconv.J2W()
            m = bs4Obj.select('.rightalign')[0].string
            y = m.split('年')[0]
            md = m.split('年')[1]
            # Convert the Japanese era year (e.g. 令和2) to a Western year first
            news_timestamp = datetime.datetime.strptime(str(j2w.convert(str(y) + '年')) + '年' + md, "%Y年%m月%d日")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_23(self, bs4Obj):
        try:
            # AFP BB
            for meta_tag in bs4Obj.find_all('meta', attrs={'property': "article:modified_time"}):
                m = re.match(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}', meta_tag.get('content'))
                news_timestamp = datetime.datetime.strptime(m.group(), "%Y-%m-%dT%H:%M:%S")
                return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_24(self, bs4Obj):
        try:
            # NHK News
            for meta_tag in bs4Obj.find_all('time'):
                news_timestamp = datetime.datetime.strptime(meta_tag.get('datetime'), '%Y-%m-%dT%H:%M')
                return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_25(self, bs4Obj):
        try:
            # Nikkei Shimbun
            m = re.search(r'\d{4}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}', bs4Obj.select('.cmnc-publish')[0].string)
            news_timestamp = datetime.datetime.strptime(m.group(), "%Y/%m/%d %H:%M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_26(self, bs4Obj):
        try:
            # Nikkan Kogyo Shimbun (Daily Industrial News)
            m = re.search(r'\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{1,2}', str(bs4Obj.select('.date')[1]))
            news_timestamp = datetime.datetime.strptime(m.group(), "%Y/%m/%d %H:%M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_27(self, bs4Obj):
        try:
            # Asahi Shimbun
            news_timestamp = datetime.datetime.strptime(bs4Obj.select('.date')[0].string, "%Y年%m月%d日")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_28(self, bs4Obj):
        try:
            # Asahi Shimbun junior and senior high school newspaper
            news_timestamp = datetime.datetime.strptime(bs4Obj.select('.date')[0].string, "%Y年%m月%d日")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_29(self, bs4Obj):
        try:
            # EUROPA NEWSWIRE
            m = re.search(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (\d{1,2}), (\d{4})', bs4Obj.select(".icon-cal")[0].string)
            news_timestamp = datetime.datetime.strptime(''.join(m.groups()), "%b%d%Y")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_30(self, bs4Obj):
        try:
            # United Nations Information Centre
            news_timestamp = datetime.datetime.strptime(bs4Obj.select("#cm_header_text")[0].string, "%Y年%m月%d日")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_31(self, bs4Obj):
        try:
            # OPCW News
            m = re.search(r'(\d{1,2}) (January|February|March|April|May|June|July|August|September|October|November|December) \d{4}', bs4Obj.select(".news__date")[0].get_text())
            news_timestamp = datetime.datetime.strptime(m.group(), "%d %B %Y")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_32(self, bs4Obj):
        try:
            # HAARETZ
            m = re.search(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{1,2}, \d{4} \d{1,2}:\d{1,2}', bs4Obj.select("time")[1].get_text())
            news_timestamp = datetime.datetime.strptime(m.group(), "%b %d, %Y %H:%M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_33(self, bs4Obj):
        try:
            # THE DAILY STAR
            m = re.match(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec). (\d{1,2}), (\d{4}) \| (\d{1,2}):(\d{1,2})', bs4Obj.select("time")[0].get_text())
            news_timestamp = datetime.datetime.strptime(''.join(m.groups()), "%b%d%Y%H%M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_34(self, bs4Obj):
        try:
            # INDEPENDENT
            m = re.search(r'\d{4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{1,2}', str(bs4Obj.select("amp-timeago")[0]))
            news_timestamp = datetime.datetime.strptime(m.group(), "%Y-%m-%dT%H:%M")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_35(self, bs4Obj):
        try:
            # JETRO
            m = re.search(r'\d{4}年\d{1,2}月\d{1,2}日', str(bs4Obj.select('p')))
            news_timestamp = datetime.datetime.strptime(m.group(), "%Y年%m月%d日")
            return news_timestamp
        except Exception:
            return None
    def scrape_return_timestamp_36(self, bs4Obj):
        try:
            # Yukan Fuji
            m = re.search(r'\d{4}.\d{1,2}.\d{1,2}', str(bs4Obj.select('#__r_publish_date__')[0]))
            news_timestamp = datetime.datetime.strptime(m.group(), "%Y.%m.%d")
            return news_timestamp
        except Exception:
            return None
    def main(self, URL):
        self.URL = URL
        try:
            get_url_info = requests.get(URL, headers=self.headers)
            bs4Obj = bs4.BeautifulSoup(get_url_info.content, 'lxml')
        except Exception as e:
            print(e)
            return 'Could not access the URL'
        # Try every site-specific parser in turn and return the first hit
        for i in range(1, 37):
            ts_temp = getattr(self, 'scrape_return_timestamp_' + str(i))(bs4Obj)
            if ts_temp:
                return ts_temp
        return None
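A note on the one unusual dependency: the Ministry of Foreign Affairs pages print dates in Japanese era notation (e.g. 令和2年2月15日), which is why jeraconv is imported. As far as I understand the library, J2W converts an era-year string into a Western year, roughly like this (the return value of 2020 is my assumption):

from jeraconv import jeraconv

j2w = jeraconv.J2W()
print(j2w.convert('令和2年'))  # expected: 2020 (Reiwa 2)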
test.py
from news_timestamp import *
TS = ScrapeNewsTimestamp()
news_timestamp = TS.main('https://www.mofa.go.jp/mofaj/press/release/press1_000423.html')
print(news_timestamp)
2020-02-15 00:00:00
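As mentioned at the top, results from day-granularity sites really ought to be date objects rather than midnight datetimes. A possible post-processing step, with the caveat that to_date_if_midnight is a hypothetical helper and the midnight check is only a heuristic (an article genuinely published at exactly 00:00 would be downgraded too):

import datetime

def to_date_if_midnight(ts):
    # Heuristic: a timestamp of exactly 00:00 most likely came from a
    # site that only exposes the date, so downgrade it to a date object.
    if isinstance(ts, datetime.datetime) and ts.hour == 0 and ts.minute == 0:
        return ts.date()
    return ts

print(to_date_if_midnight(datetime.datetime(2020, 2, 15, 0, 0)))  # 2020-02-15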
I'll leave it here as well: https://github.com/KanikaniYou/news_timestamp