The title is shit long. .. I'm 9uant, who is doing quantitative analysis and data science in the financial area. We also do twitter, so if you are interested, please follow us!
As the title suggests, we will share the code to quickly download the XBRL of financial statements. I would like to write a commentary one by one. Take the following two steps.
--DataFrame the link to XBRL --Download the XBRL zip file from DataFrame
import os
import glob
import shutil
import re
import time
from datetime import date, timedelta, datetime
from dateutil.relativedelta import relativedelta
import requests
from bs4 import BeautifulSoup
import urllib3
from urllib3.exceptions import InsecureRequestWarning
urllib3.disable_warnings(InsecureRequestWarning)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import zipfile
import numpy as np
import pandas as pd
import json
EDINET Easy because EDINET has an API.
def edinet_xbrl_link(annual=True, quarter=True, codes=None, year=0,month=0,day=0):
'''
Create a DataFrame of XBRL links for specific companies or all securities and quarterly reports
Parameters:
annual: bool, default True
Get a securities report if True
quarter: bool, default Ture
Get quarterly report if True
codes: None, int, float, str, or list (codes[code]=int, float,or str), default None
Get data for all companies if None
If you specify a stock code, only the data of those companies will be acquired.:point_up_tone4:
year, month, day: int, default 0
Specify how many days ago the data will be retrieved(Up to 5 years)
Returns:
database: pandas.DataFrame
database['code']: str
5-digit securities code
database['type']: str
'annual' or 'quarter'
database['date']: datetime.date
release date
database['title']: str
title
database['URL']: str
URL to download the XBRL zip file
'''
edinet_url = "https://disclosure.edinet-fsa.go.jp/api/v1/documents.json"
#Unify codes into a character type array.
if codes != None:
if type(codes) in (str, int, float):
codes = [int(codes)]
for code in codes:
#Convert 4-digit securities code to 5-digit
if len(str(int(code)))==4:
code = str(int(code))+'0'
#Turn a for statement with datetime type
def date_range(start, stop, step = timedelta(1)):
current = start
while current < stop:
yield current
current += step
#Prepare a DataFrame to store the result
database = pd.DataFrame(index=[], columns=['code','type','date','title','URL'])
for d in date_range(date.today()-relativedelta(years=year, months=month, days=day)+relativedelta(days=1), date.today()+relativedelta(days=1)):
#Access the EDINET API
d_str = d.strftime('%Y-%m-%d')
params = {'date' : d_str, 'type' : 2}
res = requests.get(edinet_url, params=params, verify=False)
json_res = json.loads(res.text)
time.sleep(5)
#If you cannot access normally
if json_res['metadata']['status']!='200':
print(d_str, 'not accessible')
continue
print(d_str, json_res['metadata']['resultset']['count'])#Display date and number
#In case of 0
if len(json_res['results'])==0:
continue
df = pd.DataFrame(json_res['results'])[['docID', 'secCode', 'ordinanceCode', 'formCode','docDescription']]
df.dropna(subset=['docID'], inplace=True)
df.dropna(subset=['secCode'], inplace=True)
df.rename(columns={'secCode': 'code', 'docDescription': 'title'}, inplace=True)
df['date'] = d
df['URL'] = df['docID']
df['URL'] = "https://disclosure.edinet-fsa.go.jp/api/v1/documents/" + df['URL']
#Extract only the specified securities code
if codes != None:
df = df[df['code'] in codes]
if annual == True:
df1 = df[(df['ordinanceCode']=='010') & (df['formCode']=='030000')]
df1['type'] = 'annual'
database = pd.concat([database, df1[['code', 'type', 'date','title', 'URL']]], axis=0, join='outer').reset_index(drop=True)
if quarter == True:
df2 = df[(df['ordinanceCode']=='010') & (df['formCode']=='043000')]
df2['type'] = 'quarter'
database = pd.concat([database, df2[['code', 'type', 'date','title', 'URL']]], axis=0, join='outer').reset_index(drop=True)
return database
TDNET
Use selenium
to collect data from TDNET.
Since only 200 free word search results are displayed, a function to search by securities code and a function to search by date were created separately.
def tdnet_xbrl_link_by_code(codes):
'''
Create a DataFrame that links the financial statements of the specified company to XBRL
Parameters:
codes: None, int, float, str, or list (codes[code]=int, float,or str), default None
Get data for all companies if None
Returns:
database: pandas.DataFrame
database['code']: str
5-digit securities code
database['type']: str
'annual' or 'quarter'
database['date']: datetime.date
release date
database['title']: str
title
database['URL']: str
URL to download the XBRL zip file
'''
#Unify codes into a character type array.
if type(codes) in (str, int, float):
codes = [int(codes)]
for i, code in enumerate(codes):
#Convert 4-digit securities code to 5-digit
if len(str(int(code)))==4:
codes[i] = str(int(code))+'0'
database = pd.DataFrame(index=[], columns=['code','type','date','title','URL'])
for code in codes:
#Launch the browser
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_argument('--headless') #Hide browser
driver = webdriver.Chrome(options=chromeOptions)
driver.get("https://www.release.tdnet.info/onsf/TDJFSearch/I_head")
#Send search word
duration = driver.find_element_by_name('t0')
select = Select(duration)
select.options[-1].click()
inputElement = driver.find_element_by_id("freewordtxt")
inputElement.send_keys(code)
inputElement.send_keys(Keys.RETURN)
time.sleep(5)
#Move to the frame where the search results are displayed
iframe = driver.find_element_by_name("mainlist")
driver.switch_to.frame(iframe)
#Ends processing when there are 0 search results
if driver.find_element_by_id("contentwrapper").text == 'The relevant timely disclosure information was not found.':
return database
#Read data from each row in the search results table
table = driver.find_element_by_id("maintable")
trs = table.find_elements(By.TAG_NAME, "tr")
for i in range(len(trs)):
title = trs[i].find_elements(By.TAG_NAME, "td")[3].text
#Select the financial statements of the specified company where XBRL exists, not the correction documents
if ('Financial statements' in title) and ('correction' not in title) and (len(trs[i].find_elements(By.TAG_NAME, "td")[4].text)!=0) and (code==trs[i].find_elements(By.TAG_NAME, "td")[1].text):
date = trs[i].find_elements(By.TAG_NAME, "td")[0].text[:10]
date = datetime.strptime(date, '%Y/%m/%d').date()
url = trs[i].find_elements(By.TAG_NAME, "td")[4].find_element_by_tag_name("a").get_attribute("href")
database = database.append(pd.Series([code,'brief',date,title,url], index=database.columns), ignore_index=True)
driver.quit()
return database
def tdnet_xbrl_link_by_date(date=None):
'''
Create a DataFrame with a link to XBRL for the specified date or all financial statements
Parameters:
date: None or str ('yyyy/mm/dd'), default None
Get data for all dates if None
Returns:
database: pandas.DataFrame
database['code']: str
5-digit securities code
database['type']: str
'annual' or 'quarter'
database['date']: datetime.date
release date
database['title']: str
title
database['URL']: str
URL to download the XBRL zip file
'''
database = pd.DataFrame(index=[], columns=['code','type','date','title','URL'])
#Launch the browser
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_argument('--headless') #Hide browser
driver = webdriver.Chrome(options=chromeOptions)
driver.get("https://www.release.tdnet.info/inbs/I_main_00.html")
duration = driver.find_element_by_name('daylist')
select = Select(duration)
for i in range(1, len(select.options)):
driver.get("https://www.release.tdnet.info/inbs/I_main_00.html")
duration = driver.find_element_by_name('daylist')
select = Select(duration)
d = datetime.strptime(select.options[i].text[:10], '%Y/%m/%d').date()
print(select.options[i].text)
if (date == None) or (date == select.options[i].text[:10]):
select.options[i].click()
time.sleep(5)
#Move to the frame where the search results are displayed
iframe = driver.find_element_by_id("main_list")
driver.switch_to.frame(iframe)
#Ends processing when there are 0 search results
if driver.find_element_by_id("kaiji-text-1").text!='Information disclosed in':
continue
#Continue processing until the last page
while True:
#Read data from each row in the search results table
table = driver.find_element_by_id("main-list-table")
trs = table.find_elements(By.TAG_NAME, "tr")
for i in range(len(trs)):
title = trs[i].find_elements(By.TAG_NAME, "td")[3].text
#Select the financial statements of the specified company where XBRL exists, not the correction documents
if ('Financial statements' in title) and ('correction' not in title) and (len(trs[i].find_elements(By.TAG_NAME, "td")[4].text)!=0):
code = trs[i].find_elements(By.TAG_NAME, "td")[1].text
url = trs[i].find_elements(By.TAG_NAME, "td")[4].find_element_by_tag_name("a").get_attribute("href")
database = database.append(pd.Series([code, 'brief', d, title,url], index=database.columns), ignore_index=True)
if len(driver.find_element_by_class_name("pager-R").text)!=0:
driver.find_element_by_class_name("pager-R").click()
time.sleep(5)
else:
#Ends processing if the "Next" character does not exist
break
driver.quit()
return database
def dl_xbrl_zip(codes=None, database):
'''
Download the XBRL zip file by referring to the DataFrame that lists the links to the XBRL.
Parameters:
codes: None, int, float, str, or list (codes[code]=int, float,or str), default None
Get XBRL for all companies if None
database: pandas.DataFrame
database['code']: str
5-digit securities code
database['type']: str
'annual' or 'quarter'
database['date']: datetime.date
release date
database['title']: str
title
database['URL']: str
URL to download the XBRL zip file
Returns:
None
'''
database.dropna(subset=['code'], inplace=True)
database = database.reset_index(drop=True)
#Unify codes into an array of character types
if codes == None:
codes = [None]
else:
if type(codes) in (str, int, float):
codes = [int(codes)]
for i, code in enumerate(codes):
#Convert 4-digit securities code to 5-digit
if len(str(int(code)))==4:
codes[i] = str(int(code))+'0'
for code in codes:
if code == None:
df_company = database
else:
df_company = database[database['code']==code]
df_company = df_company.reset_index(drop=True)
#Use the securities code as the directory name
dir_path = database.loc[i,'code']
if os.path.exists(dir_path)==False:
os.mkdir(dir_path)
#Download the zip file from each line of the extracted list from the link to XBRL
for i in range(df_company.shape[0]):
#When accessing EDINET
if (df_company.loc[i,'type'] == 'annual') or (df_company.loc[i,'type'] == 'quarter'):
params = {"type": 1}
res = requests.get(df_company.loc[i,'URL'], params=params, stream=True)
if df_company.loc[i,'type'] == 'annual':
#The file name of the securities report is"yyyy_0.zip"
filename = dir_path + r'/' + df_company.loc[i,'date'][:4] + r"_0.zip"
elif df_company.loc[i,'type'] == 'quarter':
if re.search('Period', df_company.loc[i,'title']) == None:
#The file name of the quarterly report of unknown period is"yyyy_unknown_docID.zip"
filename = dir_path + r'/' + df_company.loc[i,'date'][:4] + r'_unknown_' + df_company.loc[i,'URL'][-8:] + r'.zip'
else:
#The file name of the quarterly report is"yyyy_quarter.zip"
filename = dir_path + r'/' + df_company.loc[i,'date'][:4] + r'_' + df_company.loc[i,'title'][re.search('Period', df_company.loc[i,'title']).end()] + r'.zip'
#When accessing TDNET
elif df_company.loc[i,'type'] == 'brief':
res = requests.get(df_company.loc[i,'URL'], stream=True)
#Fill in whitespace
s_list = df_company.loc[i,'title'].split()
s = ''
for i in s_list:
s += i
filename = df_company.loc[i,'date'][:4] + r'_' + s[re.search('Period', s).end()] + r'_brief.zip'
#If a zip file with the same name exists, it will not be overwritten.
if os.path.exists(filename):
print(df_company.loc[i,'code'],df_company.loc[i,'date'],'already exists')
continue
#Download the zip file only if you can access it normally
if res.status_code == 200:
with open(filename, 'wb') as file:
for chunk in res.iter_content(chunk_size=1024):
file.write(chunk)
print(df_company.loc[i,'code'],df_company.loc[i,'date'],'saved')
print('done!')
return None
Recommended Posts