I wrote some Python code to OCR PDF files, following the example of the excellent people who came before me, and I use it myself. What it does is convert the PDF to JPEG images with poppler and then transcribe them into txt files with Tesseract OCR.
I'm a beginner, so I'd be happy if you could point out anything strange in the code. I also referred to many sites that I have not cited here. Thank you very much.
The folder structure shown below follows this site: "How to convert PDF to image file (JPEG, PNG) with Python".
| Parent folder | Child folder |
|---|---|
| \Current | \image_file |
| | \pdf_file |
| | \poppler |
| | \txt_file |
import os
import pathlib
from pathlib import Path
from pdf2image import convert_from_path
from PIL import Image
import sys
import pyocr
import pyocr.builders
import pathlib
import glob
def cleanup():
    """Delete leftover output files from a previous run.

    Removes every ``*.jpeg`` file found (recursively) under ``./image_file``
    and every ``*.txt`` file found (recursively) under ``./txt_file``.
    Both paths are relative to the current working directory. Files with
    other extensions are left alone, and nothing happens when no matching
    file exists.
    """
    # Collect both listings up front, then delete; an empty listing simply
    # means the loop body never runs.
    stale_images = list(pathlib.Path('./image_file').glob('**/*.jpeg'))
    stale_texts = list(pathlib.Path('./txt_file').glob('**/*.txt'))
    for leftover in stale_images + stale_texts:
        os.remove(leftover)
def pdf_to_image():
    """Render every PDF under ``./pdf_file`` to per-page JPEGs in ``./image_file``.

    Each page is saved as ``<pdf stem>_<NN>.jpeg`` where ``NN`` is the
    1-based, zero-padded page number. All paths are relative to the current
    working directory. When no PDF is found the function does nothing.

    Side effects:
        Appends ``<cwd>/poppler/bin`` to the ``PATH`` environment variable
        (once) so that pdf2image can locate the poppler executables.
    """
    # The original expression was Path("__file__").parent.resolve(): because
    # "__file__" was a quoted string, it always resolved to the current
    # working directory, not the script's folder. That behaviour is kept
    # (every other path here is cwd-relative too) but spelled out honestly.
    poppler_dir = pathlib.Path.cwd() / "poppler/bin"

    # os.pathsep is the separator between PATH entries. Skip the append when
    # the entry is already present so repeated calls do not grow PATH.
    search_path = os.environ["PATH"]
    if str(poppler_dir) not in search_path.split(os.pathsep):
        os.environ["PATH"] = search_path + os.pathsep + str(poppler_dir)

    pdf_dir = pathlib.Path('./pdf_file')
    image_dir = pathlib.Path("./image_file")

    # Process every PDF (sorted for a deterministic order) instead of an
    # arbitrary first glob hit; an empty folder simply produces no output
    # rather than an IndexError.
    # NOTE(review): PDFs in different sub-folders that share a stem would
    # overwrite each other's images -- confirm whether that can occur.
    for pdf_file in sorted(pdf_dir.glob('**/*.pdf')):
        # PDF -> list of page images at pdf2image's default resolution
        # (200 dpi according to the original comment).
        pages = convert_from_path(str(pdf_file))
        for page_number, page in enumerate(pages, start=1):
            # No trailing space after the extension: the later '*.jpeg'
            # globs in this script must be able to match these files.
            file_name = pdf_file.stem + "_{:02d}".format(page_number) + ".jpeg"
            image_path = image_dir / file_name
            page.save(str(image_path), "JPEG")
def image_ocr():
    """OCR every JPEG under ``./image_file`` into a text file in ``./txt_file``.

    Each ``<name>.jpeg`` found (recursively) under ``./image_file`` is read
    with the first OCR tool pyocr reports, using Japanese (``jpn``) and
    Tesseract layout mode 6, and the recognised text is written to
    ``./txt_file/<name>.txt`` as UTF-8. Paths are relative to the current
    working directory.

    Side effects:
        Appends the Tesseract install folder to the ``PATH`` environment
        variable (once). Prints a message and exits the process with
        status 1 when pyocr finds no OCR tool.
    """
    # Raw string so the backslashes of the Windows path are taken literally
    # instead of being parsed as (invalid) escape sequences.
    # NOTE(review): "***" looks like a redacted placeholder -- replace it
    # with the real Tesseract-OCR install location.
    tessera_path = r"C:\***\Tesseract-OCR"

    # os.pathsep is the separator between PATH entries. Skip the append when
    # the entry is already present so repeated calls do not grow PATH.
    search_path = os.environ["PATH"]
    if tessera_path not in search_path.split(os.pathsep):
        os.environ["PATH"] = search_path + os.pathsep + tessera_path

    tools = pyocr.get_available_tools()
    if not tools:
        print("No OCR tool found")
        sys.exit(1)  # exit status 1 signals failure to the caller's shell
    tool = tools[0]

    image_dir = pathlib.Path('./image_file')
    txt_dir = pathlib.Path('./txt_file')

    # Sorted so pages are processed in a deterministic order.
    for image_path in sorted(image_dir.glob('**/*.jpeg')):
        # The context manager closes the image file handle once OCR is done.
        with Image.open(str(image_path)) as page_image:
            txt = tool.image_to_string(
                page_image,
                lang="jpn",
                builder=pyocr.builders.TextBuilder(tesseract_layout=6),
            )
        # Explicit UTF-8: the platform default codec may be unable to encode
        # some recognised characters and would raise UnicodeEncodeError.
        output_path = txt_dir / (image_path.stem + '.txt')
        with open(output_path, mode='wt', encoding='utf-8') as t:
            t.write(txt)
        Recommended Posts