TL;DR
Hit the API from Python to collect parliamentary minutes for any keyword.
You can also search through the GUI of the National Diet Library's Minutes Search System, but a proper API manual is provided as well (https://kokkai.ndl.go.jp/api.html).
Here we collect the speeches made during the ten years from 2010 to 2019 that contain the following keywords (artificial intelligence, AI, big data, machine learning).
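Before the full script, here is a minimal sketch of a single request to the speech endpoint, just to show the shape of a call and of the response. The parameter names come from the API manual linked above; the response layout (data → records → record → recordData → speechRecord) is the same one the full script below relies on, and the keyword is only an example.

# Minimal sketch: fetch one small page of results for a single keyword.
import urllib.parse
import untangle

apipath = 'http://kokkai.ndl.go.jp/api/1.0/speech?'
query = urllib.parse.urlencode({
    'maximumRecords': 3,        # up to 100 per request
    'recordPacking': 'xml',
    'from': '2019-01-01',
    'until': '2019-12-31',
    'any': '人工知能',          # example keyword: "artificial intelligence"
})
obj = untangle.parse(apipath + query)
for record in obj.data.records.record:
    sr = record.recordData.speechRecord
    print(sr.date.cdata, sr.speaker.cdata, sr.speech.cdata[:40])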
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 26 15:05:04 2019
@author: boomin
pip install untangle
"""
import urllib.parse
import untangle
import re
import pandas as pd
import os
spt = os.sep
pklDir = "pkl"
os.makedirs(pklDir, exist_ok=True)  # make sure the output directory exists
def getSpeech(keyword: str):
    start = "1"  # serial number of the first speech record to fetch
    apipath = 'http://kokkai.ndl.go.jp/api/1.0/speech?'
    # regular expression that strips the leading "○<speaker>君" prefix from the speech text
    p = re.compile(r'^○([^ ]+)君?\s(.+)')
    startdate = '2010-01-01'
    enddate = '2020-01-01'
    df = pd.DataFrame()
    while start is not None:
        date = []
        speaker = []
        speech = []
        speakerGroup = []
        speakerPosition = []
        # build the request URL; only the keyword needs percent-encoding
        url = apipath + (
            'maximumRecords=100&recordPacking=xml'
            + '&from=' + startdate
            + '&until=' + enddate
            + '&any=' + urllib.parse.quote(keyword)
            + f'&startRecord={start}'
        )
        # send the GET request and parse the search results (XML)
        obj = untangle.parse(url)
        for record in obj.data.records.record:
            speechrecord = record.recordData.speechRecord
            speechdata = speechrecord.speech.cdata.replace("\u3000", " ").replace("\n", " ")
            m = p.search(speechdata)
            if m is not None:
                date.append(speechrecord.date.cdata)
                speaker.append(speechrecord.speaker.cdata)
                speech.append(m.group(2))
                speakerGroup.append(speechrecord.speakerGroup.cdata)
                speakerPosition.append(speechrecord.speakerPosition.cdata)
        offset = int(start) - 1
        index = [offset + n for n in range(len(date))]
        adddf = pd.DataFrame({
            "date": date,
            "speaker": speaker,
            "speech": speech,
            "speakerGroup": speakerGroup,
            "speakerPosition": speakerPosition,
        }, index=index)
        df = pd.concat([df, adddf])
        # only 100 records are returned per request, so advance the start position and send the next GET request
        try:
            start = obj.data.nextRecordPosition.cdata
            print(f"finished: {start}")
        except AttributeError:
            # no nextRecordPosition element means this was the last page
            break
    df["date"] = pd.to_datetime(df["date"])
    return df
if __name__ == '__main__':
    df1 = getSpeech('人工知能')      # artificial intelligence
    df2 = getSpeech('AI')
    df3 = getSpeech('ビッグデータ')  # big data
    df4 = getSpeech('機械学習')      # machine learning
    df = pd.concat([df1, df2, df3, df4])
    # drop duplicate speeches
    df.drop_duplicates(subset=["date", "speaker", "speech"], inplace=True)
    df.sort_values(by=["date", "speaker"], inplace=True)
    df.reset_index(drop=True, inplace=True)
    pd.to_pickle(df, f"{pklDir}{spt}kokkailog.pkl")
    df.to_csv(f"{pklDir}{spt}kokkailog.tsv", sep="\t")
In [4]: df.tail()
Out[4]:
#             date             speaker  ...                                      speakerGroup                                  speakerPosition
# 4288  2019-12-05            Taku Eto  ...  Liberal Democratic Party, Group of Independents  Minister of Agriculture, Forestry and Fisheries
# 4289  2019-12-05    Masayoshi Hamada  ...                                           Komeito
# 4290  2019-12-05       Mitsuko Ishii  ...                           Japan Restoration Party
# 4291  2019-12-05  Takashi Midorikawa  ...  Constitutional Democratic / National / Social Insurance / Independent Forum
# 4292  2019-12-05      Koichi Hagiuda  ...            Liberal Democratic Party, Independent  Minister of Education, Culture, Sports, Science and Technology
#
# [5 rows x 5 columns]
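The saved pickle can be read back in a later session for analysis. As a quick sketch, assuming the script above has already written pkl/kokkailog.pkl, the number of matching speeches per year can be counted like this:

import pandas as pd

# Reload the collected speeches and count how many matched per year.
df = pd.read_pickle("pkl/kokkailog.pkl")
print(df["date"].dt.year.value_counts().sort_index())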