Extract images and tables from a PDF with Python to reduce the burden of writing reports

background

When writing reports, it was tedious to crop and save images (circuit diagrams, etc.) from documents that were sent to me as PDFs, and to copy their tables by hand. A quick search did not turn up a handy app or piece of code for this, so I decided to write one myself. Table extraction did not work perfectly, but the workload was still reduced because I could at least get the values (; ^ _ ^ A)

The code is on GitHub, if you are interested: https://github.com/kzrn3318/create_img_excel_from_pdf

environment

os：windows10
python: Created with version 3.8

Installation of required libraries


pip install pypdf2
pip install pillow
pip install PyMuPDF
pip install fitz
pip install pandas
pip install camelot-py[cv]

If ghostscript is not installed, you may get an error at runtime; in that case, install ghostscript. The code was written on Windows 10, so the way it builds path strings may not work on other operating systems; in that case, please adapt the paths in the code. Operation on other operating systems has not been verified.

Code body

Below is the code

`main.py`


import PyPDF2
from PIL import Image
import sys,os
import glob
import fitz
import camelot
import pandas as pd

def create_dir(img_dir, pdf_dir, excel_dir):
    """Create the three output directories if they do not exist yet.

    Existing directories are left untouched, and missing parent
    directories are created as well.

    Args:
        img_dir: Directory that will receive the extracted images.
        pdf_dir: Directory that will receive the one-page PDFs.
        excel_dir: Directory that will receive the extracted tables.

    Raises:
        FileExistsError: If one of the paths exists but is not a directory.
    """
    # Creation order (pdf, img, excel) is the same as before.
    # os.makedirs is used instead of a glob-based existence check because
    # glob treats characters such as "[" or "*" in a directory name as a
    # pattern and would misreport whether the directory exists.
    for directory in (pdf_dir, img_dir, excel_dir):
        os.makedirs(str(directory), exist_ok=True)
        

def create_page_pdf(pdf, page_count, pdf_dir):
    """Write one page of ``pdf`` to its own single-page PDF file.

    The file is saved as ``pdf{page_count}.pdf`` inside ``pdf_dir``.

    Args:
        pdf: An opened ``PyPDF2.PdfFileReader``.
        page_count: Index of the page to extract, as passed to
            ``pdf.getPage``; it is also used in the output file name.
        pdf_dir: Existing directory in which the page is saved.
    """
    pdf_writer = PyPDF2.PdfFileWriter()
    pdf_writer.addPage(pdf.getPage(page_count))

    # os.path.join keeps the path valid on every OS and for absolute
    # directories, unlike hand-written "\\" separators.
    out_path = os.path.join(str(pdf_dir), "pdf{}.pdf".format(page_count))
    with open(out_path, "wb") as f:
        pdf_writer.write(f)
    

def create_png(pdf_path, page_count, img_dir):
    """Extract every embedded image of a PDF and save each one as a PNG.

    Images are saved as ``img{page_count}_{n}.png`` inside ``img_dir``,
    where ``n`` starts at 1 and keeps counting across all pages of the
    file, so images are never overwritten. For a single-page PDF, ``n`` is
    simply the image number on that page.

    Args:
        pdf_path: Path of the PDF file to read.
        page_count: Page number used in the output file names.
        img_dir: Existing directory in which the PNG files are saved.
    """
    # NOTE(review): getPageImageList / writePNG are the PyMuPDF method names
    # this script was written against; newer PyMuPDF releases appear to name
    # them get_page_images / Pixmap.save -- confirm for the installed version.
    pdf = fitz.open(pdf_path)
    try:
        num_count = 0
        for num in range(len(pdf)):
            for image in pdf.getPageImageList(num):
                num_count += 1
                xref = image[0]
                pix = fitz.Pixmap(pdf, xref)

                # PNG can only store gray or RGB data (plus optional alpha).
                # Anything with 4+ colour components (e.g. CMYK) must be
                # converted, passing the source pixmap rather than the xref.
                if pix.n - pix.alpha >= 4:
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                out_path = os.path.join(
                    str(img_dir),
                    "img{}_{}.png".format(page_count, num_count),
                )
                pix.writePNG(out_path)

                # Drop the reference so the pixmap memory is released early.
                pix = None
    finally:
        # Close the document even if extracting an image fails.
        pdf.close()
    
    
def create_excel(pdf_path, excel_dir, data_count):
    """Extract the tables of a PDF with camelot and save each as an Excel file.

    Each table is written, without index or header row, to the sheet
    ``sheet1`` of ``from_pdf_{n}.xlsx`` inside ``excel_dir``. ``n`` continues
    from ``data_count`` so file names stay unique across several calls.

    Args:
        pdf_path: Path of the PDF file to read.
        excel_dir: Existing directory in which the Excel files are saved.
        data_count: Number of tables already saved by earlier calls.

    Returns:
        The updated table count (``data_count`` plus the tables found here).
    """
    tables = camelot.read_pdf(pdf_path, split_text=True)
    for table in tables:
        data_count += 1
        # os.path.join keeps the path valid on every OS and for absolute
        # directories, unlike hand-written "\\" separators.
        out_path = os.path.join(
            str(excel_dir), "from_pdf_{}.xlsx".format(data_count)
        )
        with pd.ExcelWriter(out_path) as writer:
            table.df.to_excel(
                writer, sheet_name="sheet1", index=False, header=False
            )
    return data_count


if __name__ == "__main__":
    # Usage: python main.py TARGET.pdf [PDF_DIR IMG_DIR EXCEL_DIR]
    # The three output directories are only taken from the command line when
    # all of them are given; otherwise the defaults below are used.
    args = sys.argv
    print(list(args))
    if len(args) >= 5:
        print("Received an argument.")
        pdf_file, pdf_dir, img_dir, excel_dir = args[1:5]
    elif len(args) >= 2:
        pdf_file = args[1]
        print("Since the argument was not specified, it will be executed with the default value.")
        pdf_dir = "pdf_list"
        img_dir = "img_list"
        excel_dir = "excel_data"
    else:
        raise ValueError("At least one pdf file must be the argument. When specifying the output directory, specify four arguments.")

    pdf = PyPDF2.PdfFileReader(pdf_file)

    print("Image directory:"+str(img_dir))
    print("pdf Directory of each page:"+str(pdf_dir))
    print("Excel data directory:"+str(excel_dir))

    create_dir(img_dir, pdf_dir, excel_dir)

    # Split the document into one PDF per page and remember the files in page
    # order. The name mirrors what create_page_pdf writes ("pdf{index}.pdf").
    # Listing the directory with glob instead would return the files in
    # lexicographic order (pdf0, pdf1, pdf10, ...) and would also pick up
    # stale PDFs left over from an earlier run.
    path_list = []
    for page_index, _page in enumerate(pdf.pages):
        create_page_pdf(pdf, page_index, pdf_dir)
        path_list.append(
            os.path.join(pdf_dir, "pdf{}.pdf".format(page_index))
        )

    # Page numbers used in the image file names are 1-based.
    data_count = 0
    for page_count, path in enumerate(path_list, start=1):
        create_png(path, page_count, img_dir)
        data_count = create_excel(path, excel_dir, data_count)

    print("Processing Exit\n")

Run the script as shown below, from the same directory as the target PDF. When it runs, three directories are created: one for the per-page PDFs, one for the images extracted from the PDF, and one for the tables extracted from the PDF. You can specify their names with command-line arguments.

python main.py (Target.pdf) (pdf paginated directory) (pdf extract image directory) (pdf table extraction directory)

`Example`


python main.py train1.pdf pdf_dir img_dir excel_dir

In the above example, one PDF per page is saved directly under pdf_dir, the extracted images are saved in img_dir, and the extracted tables, converted to Excel files, are saved in excel_dir.

Partial explanation of the code

import PyPDF2
from PIL import Image
import sys,os
import glob
import fitz
import camelot
import pandas as pd

This is a familiar sight for anyone who writes Python regularly: it simply imports each package.

def create_dir(img_dir, pdf_dir, excel_dir):
    """Create the three output directories if they do not exist yet.

    Existing directories are left untouched, and missing parent
    directories are created as well.

    Args:
        img_dir: Directory that will receive the extracted images.
        pdf_dir: Directory that will receive the one-page PDFs.
        excel_dir: Directory that will receive the extracted tables.

    Raises:
        FileExistsError: If one of the paths exists but is not a directory.
    """
    # Creation order (pdf, img, excel) is the same as before.
    # os.makedirs is used instead of a glob-based existence check because
    # glob treats characters such as "[" or "*" in a directory name as a
    # pattern and would misreport whether the directory exists.
    for directory in (pdf_dir, img_dir, excel_dir):
        os.makedirs(str(directory), exist_ok=True)

This function creates the output directories: it checks whether each directory passed as an argument already exists, and creates it if it does not.

def create_page_pdf(pdf, page_count, pdf_dir):
    """Write one page of ``pdf`` to its own single-page PDF file.

    The file is saved as ``pdf{page_count}.pdf`` inside ``pdf_dir``.

    Args:
        pdf: An opened ``PyPDF2.PdfFileReader``.
        page_count: Index of the page to extract, as passed to
            ``pdf.getPage``; it is also used in the output file name.
        pdf_dir: Existing directory in which the page is saved.
    """
    pdf_writer = PyPDF2.PdfFileWriter()
    pdf_writer.addPage(pdf.getPage(page_count))

    # os.path.join keeps the path valid on every OS and for absolute
    # directories, unlike hand-written "\\" separators.
    out_path = os.path.join(str(pdf_dir), "pdf{}.pdf".format(page_count))
    with open(out_path, "wb") as f:
        pdf_writer.write(f)

This function splits the original PDF into pages and saves each page separately, as pdf(page number).pdf directly under pdf_dir.

def create_png(pdf_path, page_count, img_dir):
    """Extract every embedded image of a PDF and save each one as a PNG.

    Images are saved as ``img{page_count}_{n}.png`` inside ``img_dir``,
    where ``n`` starts at 1 and keeps counting across all pages of the
    file, so images are never overwritten. For a single-page PDF, ``n`` is
    simply the image number on that page.

    Args:
        pdf_path: Path of the PDF file to read.
        page_count: Page number used in the output file names.
        img_dir: Existing directory in which the PNG files are saved.
    """
    # NOTE(review): getPageImageList / writePNG are the PyMuPDF method names
    # this script was written against; newer PyMuPDF releases appear to name
    # them get_page_images / Pixmap.save -- confirm for the installed version.
    pdf = fitz.open(pdf_path)
    try:
        num_count = 0
        for num in range(len(pdf)):
            for image in pdf.getPageImageList(num):
                num_count += 1
                xref = image[0]
                pix = fitz.Pixmap(pdf, xref)

                # PNG can only store gray or RGB data (plus optional alpha).
                # Anything with 4+ colour components (e.g. CMYK) must be
                # converted, passing the source pixmap rather than the xref.
                if pix.n - pix.alpha >= 4:
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                out_path = os.path.join(
                    str(img_dir),
                    "img{}_{}.png".format(page_count, num_count),
                )
                pix.writePNG(out_path)

                # Drop the reference so the pixmap memory is released early.
                pix = None
    finally:
        # Close the document even if extracting an image fails.
        pdf.close()

Save the extracted image directly under img_dir in .png format. The file name will be img (page number) _ (image number on the page) .png.

def create_excel(pdf_path, excel_dir, data_count):
    """Extract the tables of a PDF with camelot and save each as an Excel file.

    Each table is written, without index or header row, to the sheet
    ``sheet1`` of ``from_pdf_{n}.xlsx`` inside ``excel_dir``. ``n`` continues
    from ``data_count`` so file names stay unique across several calls.

    Args:
        pdf_path: Path of the PDF file to read.
        excel_dir: Existing directory in which the Excel files are saved.
        data_count: Number of tables already saved by earlier calls.

    Returns:
        The updated table count (``data_count`` plus the tables found here).
    """
    tables = camelot.read_pdf(pdf_path, split_text=True)
    for table in tables:
        data_count += 1
        # os.path.join keeps the path valid on every OS and for absolute
        # directories, unlike hand-written "\\" separators.
        out_path = os.path.join(
            str(excel_dir), "from_pdf_{}.xlsx".format(data_count)
        )
        with pd.ExcelWriter(out_path) as writer:
            table.df.to_excel(
                writer, sheet_name="sheet1", index=False, header=False
            )
    return data_count

Save the converted excel directly under excel_dir.

def create_excel(pdf_path, excel_dir, data_count):
    """Extract the tables of a PDF with camelot and save each as an Excel file.

    Each table is written, without index or header row, to the sheet
    ``sheet1`` of ``from_pdf_{n}.xlsx`` inside ``excel_dir``. ``n`` continues
    from ``data_count`` so file names stay unique across several calls.

    Args:
        pdf_path: Path of the PDF file to read.
        excel_dir: Existing directory in which the Excel files are saved.
        data_count: Number of tables already saved by earlier calls.

    Returns:
        The updated table count (``data_count`` plus the tables found here).
    """
    tables = camelot.read_pdf(pdf_path, split_text=True)
    for table in tables:
        data_count += 1
        # os.path.join keeps the path valid on every OS and for absolute
        # directories, unlike hand-written "\\" separators.
        out_path = os.path.join(
            str(excel_dir), "from_pdf_{}.xlsx".format(data_count)
        )
        with pd.ExcelWriter(out_path) as writer:
            table.df.to_excel(
                writer, sheet_name="sheet1", index=False, header=False
            )
    return data_count


if __name__ == "__main__":
    # Usage: python main.py TARGET.pdf [PDF_DIR IMG_DIR EXCEL_DIR]
    # The three output directories are only taken from the command line when
    # all of them are given; otherwise the defaults below are used.
    args = sys.argv
    print(list(args))
    if len(args) >= 5:
        print("Received an argument.")
        pdf_file, pdf_dir, img_dir, excel_dir = args[1:5]
    elif len(args) >= 2:
        pdf_file = args[1]
        print("Since the argument was not specified, it will be executed with the default value.")
        pdf_dir = "pdf_list"
        img_dir = "img_list"
        excel_dir = "excel_data"
    else:
        raise ValueError("At least one pdf file must be the argument. When specifying the output directory, specify four arguments.")

    pdf = PyPDF2.PdfFileReader(pdf_file)

    print("Image directory:"+str(img_dir))
    print("pdf Directory of each page:"+str(pdf_dir))
    print("Excel data directory:"+str(excel_dir))

    create_dir(img_dir, pdf_dir, excel_dir)

    # Split the document into one PDF per page and remember the files in page
    # order. The name mirrors what create_page_pdf writes ("pdf{index}.pdf").
    # Listing the directory with glob instead would return the files in
    # lexicographic order (pdf0, pdf1, pdf10, ...) and would also pick up
    # stale PDFs left over from an earlier run.
    path_list = []
    for page_index, _page in enumerate(pdf.pages):
        create_page_pdf(pdf, page_index, pdf_dir)
        path_list.append(
            os.path.join(pdf_dir, "pdf{}.pdf".format(page_index))
        )

    # Page numbers used in the image file names are 1-based.
    data_count = 0
    for page_count, path in enumerate(path_list, start=1):
        create_png(path, page_count, img_dir)
        data_count = create_excel(path, excel_dir, data_count)

    print("Processing Exit\n")

This is the execution part of main.py. Each directory name is obtained from the command line argument and executed.

Summary

Until now I cropped images out of PDFs by hand, so I think this has made things much easier. There were very few articles about PyPDF2 and camelot, which made this hard (-_-;) There is still room for improvement in table extraction, but depending on the structure and layout of the PDF it seems difficult to extract tables exactly as they appear. This code assumes PDFs that were exported digitally. Please note that I have not tested whether it works on PDFs of manuals scanned with Adobe Scan and the like.