The title is shit long. .. I'm 9uant, who is doing quantitative analysis and data science in the financial area. We also do twitter, so if you are interested, please follow us!

As the title suggests, we will share the code to quickly download the XBRL of financial statements. I would like to write a commentary one by one. Take the following two steps.

--DataFrame the link to XBRL --Download the XBRL zip file from DataFrame

import os
import glob
import shutil
import re
import time
from datetime import date, timedelta, datetime
from dateutil.relativedelta import relativedelta

import requests
from bs4 import BeautifulSoup
import urllib3
from urllib3.exceptions import InsecureRequestWarning
urllib3.disable_warnings(InsecureRequestWarning)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import zipfile

import numpy as np
import pandas as pd
import json

Convert the link to XBRL into a DataFrame

EDINET Easy because EDINET has an API.

def edinet_xbrl_link(annual=True, quarter=True, codes=None, year=0,month=0,day=0):
    '''
Create a DataFrame of XBRL links for specific companies or all securities and quarterly reports
    
    Parameters:
        annual: bool, default True
Get a securities report if True
        quarter: bool, default Ture
Get quarterly report if True
        codes: None, int, float, str, or list (codes[code]=int, float,or str), default None
Get data for all companies if None
If you specify a stock code, only the data of those companies will be acquired.:point_up_tone4:
        year, month, day: int, default 0
Specify how many days ago the data will be retrieved(Up to 5 years)
            
    Returns:
        database: pandas.DataFrame
            database['code']: str
5-digit securities code
            database['type']: str
                'annual' or 'quarter'
            database['date']: datetime.date
release date
            database['title']: str
title
            database['URL']: str
URL to download the XBRL zip file
    '''
    edinet_url = "https://disclosure.edinet-fsa.go.jp/api/v1/documents.json"
    
    #Unify codes into a character type array.
    if codes != None:
        if type(codes) in (str, int, float):
            codes = [int(codes)]
        for code in codes:
            #Convert 4-digit securities code to 5-digit
            if len(str(int(code)))==4:
                code = str(int(code))+'0'

    #Turn a for statement with datetime type
    def date_range(start, stop, step = timedelta(1)):
        current = start
        while current < stop:
            yield current
            current += step
            
    #Prepare a DataFrame to store the result
    database = pd.DataFrame(index=[], columns=['code','type','date','title','URL'])
    
    for d in date_range(date.today()-relativedelta(years=year, months=month, days=day)+relativedelta(days=1), date.today()+relativedelta(days=1)):
        #Access the EDINET API
        d_str = d.strftime('%Y-%m-%d')
        params = {'date' : d_str, 'type' : 2}
        res = requests.get(edinet_url, params=params, verify=False)
        json_res = json.loads(res.text)
        time.sleep(5)
        
        #If you cannot access normally
        if json_res['metadata']['status']!='200':
            print(d_str, 'not accessible')
            continue
            
        print(d_str, json_res['metadata']['resultset']['count'])#Display date and number
        
        #In case of 0
        if len(json_res['results'])==0:
            continue
        
        df = pd.DataFrame(json_res['results'])[['docID', 'secCode', 'ordinanceCode', 'formCode','docDescription']]
        df.dropna(subset=['docID'], inplace=True)
        df.dropna(subset=['secCode'], inplace=True)
        df.rename(columns={'secCode': 'code', 'docDescription': 'title'}, inplace=True)
        df['date'] = d
        df['URL'] = df['docID']
        df['URL'] = "https://disclosure.edinet-fsa.go.jp/api/v1/documents/" + df['URL']
        
        #Extract only the specified securities code
        if codes != None:
            df = df[df['code'] in codes]
            
        if annual == True:
            df1 = df[(df['ordinanceCode']=='010') & (df['formCode']=='030000')]
            df1['type'] = 'annual'
            database = pd.concat([database, df1[['code', 'type', 'date','title', 'URL']]], axis=0, join='outer').reset_index(drop=True)
            
        if quarter == True:
            df2 = df[(df['ordinanceCode']=='010') & (df['formCode']=='043000')]
            df2['type'] = 'quarter'
            database = pd.concat([database, df2[['code', 'type', 'date','title', 'URL']]], axis=0, join='outer').reset_index(drop=True)
            
    return database

TDNET Use selenium to collect data from TDNET. Since only 200 free word search results are displayed, a function to search by securities code and a function to search by date were created separately.

def tdnet_xbrl_link_by_code(codes):
    '''
Create a DataFrame that links the financial statements of the specified company to XBRL
    
    Parameters:
        codes: None, int, float, str, or list (codes[code]=int, float,or str), default None
Get data for all companies if None
            
    Returns:
        database: pandas.DataFrame
            database['code']: str
5-digit securities code
            database['type']: str
                'annual' or 'quarter'
            database['date']: datetime.date
release date
            database['title']: str
title
            database['URL']: str
URL to download the XBRL zip file
    '''

    #Unify codes into a character type array.
    if type(codes) in (str, int, float):
        codes = [int(codes)]
    for i, code in enumerate(codes):
        #Convert 4-digit securities code to 5-digit
        if len(str(int(code)))==4:
            codes[i] = str(int(code))+'0'
    
    database = pd.DataFrame(index=[], columns=['code','type','date','title','URL'])
    
    for code in codes:
        #Launch the browser
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_argument('--headless') #Hide browser
        driver = webdriver.Chrome(options=chromeOptions)
        driver.get("https://www.release.tdnet.info/onsf/TDJFSearch/I_head")

        #Send search word
        duration = driver.find_element_by_name('t0')
        select = Select(duration)
        select.options[-1].click()
        inputElement = driver.find_element_by_id("freewordtxt")
        inputElement.send_keys(code)
        inputElement.send_keys(Keys.RETURN)
        time.sleep(5)

        #Move to the frame where the search results are displayed
        iframe = driver.find_element_by_name("mainlist")
        driver.switch_to.frame(iframe)
        
        #Ends processing when there are 0 search results
        if driver.find_element_by_id("contentwrapper").text == 'The relevant timely disclosure information was not found.':
            return database

        #Read data from each row in the search results table
        table = driver.find_element_by_id("maintable")
        trs = table.find_elements(By.TAG_NAME, "tr")
        for i in range(len(trs)):
            title = trs[i].find_elements(By.TAG_NAME, "td")[3].text

            #Select the financial statements of the specified company where XBRL exists, not the correction documents
            if ('Financial statements' in title) and ('correction' not in title) and (len(trs[i].find_elements(By.TAG_NAME, "td")[4].text)!=0) and (code==trs[i].find_elements(By.TAG_NAME, "td")[1].text):
                date = trs[i].find_elements(By.TAG_NAME, "td")[0].text[:10]
                date = datetime.strptime(date, '%Y/%m/%d').date()
                url = trs[i].find_elements(By.TAG_NAME, "td")[4].find_element_by_tag_name("a").get_attribute("href")
                database = database.append(pd.Series([code,'brief',date,title,url], index=database.columns), ignore_index=True)

        driver.quit()

        return database

def tdnet_xbrl_link_by_date(date=None):
    '''
Create a DataFrame with a link to XBRL for the specified date or all financial statements
    
    Parameters:
        date: None or str ('yyyy/mm/dd'), default None
Get data for all dates if None
            
    Returns:
        database: pandas.DataFrame
            database['code']: str
5-digit securities code
            database['type']: str
                'annual' or 'quarter'
            database['date']: datetime.date
release date
            database['title']: str
title
            database['URL']: str
URL to download the XBRL zip file
    '''
    database = pd.DataFrame(index=[], columns=['code','type','date','title','URL'])
    
    #Launch the browser
    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_argument('--headless') #Hide browser
    driver = webdriver.Chrome(options=chromeOptions)
    driver.get("https://www.release.tdnet.info/inbs/I_main_00.html")
    
    duration = driver.find_element_by_name('daylist')
    select = Select(duration)
    for i in range(1, len(select.options)):
        driver.get("https://www.release.tdnet.info/inbs/I_main_00.html")
        duration = driver.find_element_by_name('daylist')
        select = Select(duration)
        d = datetime.strptime(select.options[i].text[:10], '%Y/%m/%d').date()
        print(select.options[i].text)
        
        if (date == None) or (date == select.options[i].text[:10]):
            select.options[i].click()
            time.sleep(5)

            #Move to the frame where the search results are displayed
            iframe = driver.find_element_by_id("main_list")
            driver.switch_to.frame(iframe)

            #Ends processing when there are 0 search results
            if driver.find_element_by_id("kaiji-text-1").text!='Information disclosed in':
                continue

            #Continue processing until the last page
            while True:
                #Read data from each row in the search results table
                table = driver.find_element_by_id("main-list-table")
                trs = table.find_elements(By.TAG_NAME, "tr")
                for i in range(len(trs)):
                    title = trs[i].find_elements(By.TAG_NAME, "td")[3].text

                    #Select the financial statements of the specified company where XBRL exists, not the correction documents
                    if ('Financial statements' in title) and ('correction' not in title) and (len(trs[i].find_elements(By.TAG_NAME, "td")[4].text)!=0):
                        code = trs[i].find_elements(By.TAG_NAME, "td")[1].text
                        url = trs[i].find_elements(By.TAG_NAME, "td")[4].find_element_by_tag_name("a").get_attribute("href")
                        database = database.append(pd.Series([code, 'brief', d, title,url], index=database.columns), ignore_index=True)

                if len(driver.find_element_by_class_name("pager-R").text)!=0:
                    driver.find_element_by_class_name("pager-R").click()
                    time.sleep(5)
                else:
                    #Ends processing if the "Next" character does not exist
                    break
    driver.quit()
    return database

Download XBRL from DataFrame

def dl_xbrl_zip(codes=None, database):
    '''
Download the XBRL zip file by referring to the DataFrame that lists the links to the XBRL.
    
    Parameters:
        codes: None, int, float, str, or list (codes[code]=int, float,or str), default None
Get XBRL for all companies if None
            
        database: pandas.DataFrame
            database['code']: str
5-digit securities code
            database['type']: str
                'annual' or 'quarter'
            database['date']: datetime.date
release date
            database['title']: str
title
            database['URL']: str
URL to download the XBRL zip file
            
    Returns:
        None
    '''
    database.dropna(subset=['code'], inplace=True)
    database = database.reset_index(drop=True)
    
    #Unify codes into an array of character types
    if codes == None:
        codes = [None]
    else:
        if type(codes) in (str, int, float):
            codes = [int(codes)]
        for i, code in enumerate(codes):
            #Convert 4-digit securities code to 5-digit
            if len(str(int(code)))==4:
                codes[i] = str(int(code))+'0'
            
    for code in codes:
        if code == None:
            df_company = database
        else:
            df_company = database[database['code']==code]
            df_company = df_company.reset_index(drop=True)
        
        #Use the securities code as the directory name
        dir_path = database.loc[i,'code']
        if os.path.exists(dir_path)==False:
            os.mkdir(dir_path)
            
        #Download the zip file from each line of the extracted list from the link to XBRL
        for i in range(df_company.shape[0]):
            #When accessing EDINET
            if (df_company.loc[i,'type'] == 'annual') or (df_company.loc[i,'type'] == 'quarter'):
                params = {"type": 1}
                res = requests.get(df_company.loc[i,'URL'], params=params, stream=True)
                
                if df_company.loc[i,'type'] == 'annual':
                    #The file name of the securities report is"yyyy_0.zip"
                    filename = dir_path + r'/' + df_company.loc[i,'date'][:4] + r"_0.zip"
                elif df_company.loc[i,'type'] == 'quarter':
                    if re.search('Period', df_company.loc[i,'title']) == None:
                        #The file name of the quarterly report of unknown period is"yyyy_unknown_docID.zip"
                        filename = dir_path + r'/' + df_company.loc[i,'date'][:4] + r'_unknown_' + df_company.loc[i,'URL'][-8:] + r'.zip'
                    else:
                        #The file name of the quarterly report is"yyyy_quarter.zip"
                        filename = dir_path + r'/' + df_company.loc[i,'date'][:4] + r'_' + df_company.loc[i,'title'][re.search('Period', df_company.loc[i,'title']).end()] + r'.zip'
                            
            #When accessing TDNET
            elif df_company.loc[i,'type'] == 'brief':
                res = requests.get(df_company.loc[i,'URL'], stream=True)
                
                #Fill in whitespace
                s_list = df_company.loc[i,'title'].split()
                s = ''
                for i in s_list:
                    s += i
                
                filename = df_company.loc[i,'date'][:4] + r'_' + s[re.search('Period', s).end()] + r'_brief.zip'
            
            #If a zip file with the same name exists, it will not be overwritten.
            if os.path.exists(filename):
                print(df_company.loc[i,'code'],df_company.loc[i,'date'],'already exists')
                continue
                
            #Download the zip file only if you can access it normally
            if res.status_code == 200:
                with open(filename, 'wb') as file:
                    for chunk in res.iter_content(chunk_size=1024):
                        file.write(chunk)
                    print(df_company.loc[i,'code'],df_company.loc[i,'date'],'saved')
                            
    print('done!')
    return None

Download XBRL of securities report, quarterly report, financial report from EDINET / TDNET with Python

Convert the link to XBRL into a DataFrame

Download XBRL from DataFrame