[Python] Automatically translate PDF with DeepL while keeping the original format. [Windows / Word required]

9/9 postscript

Dropped the clipboard-based input to DeepL and switched to setting the text via JavaScript. As a result, the script now also works with Selenium in headless mode.

9/14 postscript

Fixed the layout collapsing around tables. When an image is embedded in the same paragraph as text (for example, a formula rendered as an image), replacing the text makes the image disappear, so such paragraphs are excluded from translation until a solution is found.

Updated version
import win32com.client
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import re
from math import ceil
from threading import Thread

# Path handed to Selenium as the ChromeDriver executable.
DRIVER_PATH = 'chromedriver.exe'
# User-agent string passed to Chrome through the --user-agent switch below.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

# Chrome launch switches used for every driver started by this script.
options = Options()
for _switch in (
        f'--user-agent={user_agent}',
        '--disable-gpu',
        '--disable-extensions',
        '--proxy-server="direct://"',
        '--proxy-bypass-list=*',
        '--start-maximized',
        '--headless',  # Comment out this entry to show the Chrome window
):
    options.add_argument(_switch)
del _switch  # keep the module namespace identical to the flat version


def Deeptrans(t, driver):
    """Translate the ``t``-th slice of paragraphs in one DeepL browser tab.

    Reads the module globals ``unit``, ``length`` and ``sourse_texts`` and
    appends one ``{"index": ..., "text": ...}`` dict per translated paragraph
    to the shared ``translated_texts`` list.

    Args:
        t: Zero-based worker number; this call handles list offsets from
            ``t * unit`` up to (not including) ``min((t + 1) * unit, length)``.
        driver: Selenium WebDriver with the DeepL translator page loaded.
    """
    stextarea = driver.find_element_by_css_selector(
        '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
    ttextarea = driver.find_element_by_css_selector(
        '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')
    for i in range(t * unit, min((t + 1) * unit, length)):
        sourse_text = sourse_texts[i]
        # Entries are None for excluded paragraphs; blank text is skipped too.
        if not sourse_text or not sourse_text.strip():
            continue
        # Hand the text over as a script argument instead of splicing repr()
        # into the script: repr() emits Python literal syntax, which is not
        # guaranteed to be a valid JavaScript string literal (for example
        # \UXXXXXXXX escapes).
        driver.execute_script(
            '$(".lmt__source_textarea").val(arguments[0]);', sourse_text)
        # Presumably a real key press is what makes the page react to the
        # programmatic value change.
        stextarea.send_keys(Keys.RIGHT)
        translated_text = ""
        # Poll once per second until the target textarea holds something.
        # NOTE(review): there is no timeout; a translation that never arrives
        # blocks this worker forever.
        while not translated_text:
            time.sleep(1)
            translated_text = ttextarea.get_property("value")
        # Clear the source textarea for the next paragraph.
        stextarea.send_keys(Keys.CONTROL, "a")
        stextarea.send_keys(Keys.BACKSPACE)
        # Store the 1-based index used with doc.Paragraphs(...).
        translated_texts.append({"index": i + 1, "text": translated_text})


def runDriver(t):
    """Start a Chrome instance, translate slice ``t`` with it, then quit it.

    Args:
        t: Zero-based worker number forwarded to ``Deeptrans``.
    """
    driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)
    try:
        driver.get('https://www.deepl.com/ja/translator')
        Deeptrans(t, driver)
    finally:
        # Always shut the browser down, even when translation raised, so a
        # failed worker does not leave a Chrome process behind.
        driver.quit()


def multiThreadTranslate(file_path, font):
    """Translate a document with nine parallel DeepL sessions and save a PDF.

    Opens ``file_path`` in Word, collects the text of every translatable
    paragraph, lets ``runDriver`` workers translate them, writes the results
    back and saves the document as ``<name>_jp.pdf``.

    Paragraphs are skipped when they, or the paragraph after them, use the
    "TableGrid" style, or when they contain inline shapes (replacing the text
    of such a paragraph makes the image disappear).

    Args:
        file_path: Path passed to Word's ``Documents.Open``; also the basis
            for the output file name.
        font: Font name applied to every translated paragraph.
    """
    global length, unit, sourse_texts, translated_texts
    app = win32com.client.Dispatch("Word.Application")
    #app.Visible = True
    doc = app.Documents.Open(file_path)
    try:
        try:
            doc.Paragraphs(1).Range.Font.Name = font
        except Exception:
            print('The specified font does not exist')
            return
        length = doc.Paragraphs.Count
        n = 9  # number of Chrome instances run in parallel
        unit = ceil(length / n)
        sourse_texts = []
        for i in range(length):
            para_range = doc.Paragraphs(i + 1).Range
            next_range = doc.Paragraphs(min(length, i + 2)).Range
            translatable = (
                str(para_range.Style) != "TableGrid"
                and str(next_range.Style) != "TableGrid"
                # Only paragraphs WITHOUT inline shapes may be replaced.
                and not para_range.InlineShapes.Count)
            sourse_texts.append(para_range.Text if translatable else None)
        translated_texts = []
        threads = []
        for t in range(n):
            thread = Thread(target=runDriver, args=(t, ), daemon=True)
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        # Write back from the last paragraph to the first: if a replacement
        # changes the number of paragraphs, only indices that were already
        # handled can shift.
        for item in sorted(translated_texts, key=lambda d: d["index"],
                           reverse=True):
            doc.Paragraphs(item["index"]).Range.Text = item["text"].replace(
                '\n', '\r')
            doc.Paragraphs(item["index"]).Range.Font.Name = font
        # FileFormat=17 is Word's wdFormatPDF.
        doc.SaveAs2(FileName=re.sub(r"(.+)(\.pdf)", r"\1_jp.pdf", file_path),
                    FileFormat=17)
    finally:
        # Runs on success, on the font-error return and on any exception, so
        # no Word instance is left holding the file.
        doc.Close(SaveChanges=0)
        app.Quit()
    print('Process is completed.')


if __name__ == '__main__':
    # A dragged-and-dropped path may arrive wrapped in double quotes; '"' is
    # never valid inside a Windows path, so stripping it is safe.
    file_path = input(
        'Enter the absolute path of the PDF (drag and drop is also possible):     '
    ).strip().strip('"')
    print('Please select a font')
    fonts = {'1': 'Yu Mincho', '2': 'Meiryo', '3': 'BIZ UDP Mincho Medium', '4': 'Other'}
    menu = '   '.join(", ".join(item) for item in fonts.items()) + ":     "
    choice = input(menu).strip()
    # Ask again instead of crashing with KeyError on an unknown key.
    while choice not in fonts:
        choice = input(menu).strip()
    font = fonts[choice]
    if font == 'Other':
        font = input('Please enter the font name:     ')
    multiThreadTranslate(file_path, font=font)

9/15 postscript

- The above problem has been solved, for the time being, in the single-thread version. - Also fixed the font size being changed automatically (which made the text look uneven) and unwanted indentation being inserted.

Solution version (single thread) for the time being
import win32com.client
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import re
from tqdm import tqdm

# Path handed to Selenium as the ChromeDriver executable.
DRIVER_PATH = 'chromedriver.exe'

# Chrome launch switches for the single browser session used by this script.
options = Options()
for _switch in (
        '--disable-gpu',
        '--disable-extensions',
        '--proxy-server="direct://"',
        '--proxy-bypass-list=*',
        '--start-maximized',
):
    options.add_argument(_switch)
del _switch  # keep the module namespace identical to the flat version


def Deeptrans(file_path, font):
    """Translate a document paragraph by paragraph with DeepL and save a PDF.

    Opens ``file_path`` in Word and the DeepL translator in Chrome, replaces
    the text of each translatable paragraph with DeepL's output while
    restoring the paragraph's font size, alignment and indents, then saves
    the result as ``<name>_jp.pdf``.

    Args:
        file_path: Path passed to Word's ``Documents.Open``; the output name
            is derived from it by replacing ``.pdf`` with ``_jp.pdf``.
        font: Font name applied to the paragraphs that are processed.
    """
    app = win32com.client.Dispatch("Word.Application")
    app.Visible = True
    doc = app.Documents.Open(file_path)
    driver = webdriver.Chrome(executable_path=DRIVER_PATH,
                              chrome_options=options)
    url = 'https://www.deepl.com/ja/translator#en/ja'
    driver.get(url)
    stextarea = driver.find_element_by_css_selector(
        '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
    # NOTE(review): ttextarea is not used below; the wait loops look the
    # target textarea up again on every poll.
    ttextarea = driver.find_element_by_css_selector(
        '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')

    # The paragraph count is read once, before any text is replaced.
    length = doc.Paragraphs.Count
    for i in tqdm(range(length)):
        # Paragraphs whose style stringifies to "TableGrid" are left alone.
        if str(doc.Paragraphs(i + 1).Range.Style) == "TableGrid":
            continue
        sourse_text = doc.Paragraphs(i + 1).Range.Text
        # Capture the formatting that is restored after the text is replaced.
        fs = doc.Paragraphs(i + 1).Range.Font.Size
        alignment = doc.Paragraphs(i + 1).Alignment
        lindent = doc.Paragraphs(i + 1).LeftIndent
        rindent = doc.Paragraphs(i + 1).RightIndent
        # Paragraph with inline shapes: the text is rewritten word by word
        # instead of assigning Range.Text for the whole paragraph.
        # presumably "/" is how an inline shape shows up in the text --
        # TODO confirm against Word's behavior.
        if doc.Paragraphs(i + 1).Range.InlineShapes.Count:
            # Nothing but a single "/" in the paragraph: skip it.
            if sourse_text.strip() == "/": continue
            doc.Paragraphs(i + 1).Range.Font.Name = font
            t = ""    # text collected since the previous "/" word
            te = []   # collected text segments, in document order
            cnt = 0   # number of words containing "/"
            # First pass: gather the words between "/" markers into segments
            # and overwrite each gathered word with the placeholder "'' ".
            for j in range(doc.Paragraphs(i + 1).Range.Words.Count):
                if "/" not in doc.Paragraphs(i + 1).Range.Words(j + 1).Text:
                    t += doc.Paragraphs(i + 1).Range.Words(j + 1).Text
                    doc.Paragraphs(i + 1).Range.Words(j + 1).Text = "'' "
                else:
                    # A "/" word closes the current segment (possibly empty).
                    te.append(t)
                    t = ""
                    cnt += 1
            # Text after the last "/" word becomes the trailing segment.
            if t: te.append(t)

            # Translate every segment longer than 5 characters after
            # stripping; shorter segments stay as they are.
            for j, sourse_text in enumerate(te):
                if len(sourse_text.strip()) > 5:
                    # The text is embedded in the script as a Python repr()
                    # literal. NOTE(review): this relies on that literal also
                    # being valid JavaScript -- confirm for unusual characters.
                    driver.execute_script(
                        f'$(".lmt__source_textarea").val({repr(sourse_text)});'
                    )
                    stextarea.send_keys(Keys.RIGHT)
                    translated_text = ""
                    # Poll once per second until the target textarea is
                    # non-empty; there is no timeout.
                    while not translated_text:
                        time.sleep(1)
                        translated_text = driver.find_element_by_css_selector(
                            '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style'
                        ).get_property("value")
                    # Clear the source textarea for the next segment.
                    stextarea.send_keys(Keys.CONTROL, "a")
                    stextarea.send_keys(Keys.BACKSPACE)
                    te[j] = translated_text

            # Second pass: blank the placeholders and put the segments back
            # around the "/" words, in order.
            g = (j for j in te)
            c = 0
            for j in doc.Paragraphs(i + 1).Range.Words:
                if j.Text == "'' ":
                    j.Text = ""
                elif "/" in j.Text:
                    try:
                        j.InsertBefore(g.__next__())
                        c += 1
                        # After the last "/" word, the next segment (the
                        # trailing one, if any) is inserted after it.
                        if c == cnt:
                            j.InsertAfter(g.__next__())
                    # Bare except: any error, including the generator running
                    # out of segments (StopIteration), is silently ignored.
                    except:
                        pass
            # Restore the formatting captured before the rewrite.
            doc.Paragraphs(i + 1).Alignment = alignment
            doc.Paragraphs(i + 1).Range.Font.Size = fs
            doc.Paragraphs(i + 1).LeftIndent = lindent
            doc.Paragraphs(i + 1).RightIndent = rindent
            continue

        # Plain paragraph: skip it when the stripped text contains a control
        # character or is shorter than 5 characters.
        if re.search(r"[\x00-\x1F\x7F]",
                     sourse_text.strip()) or len(sourse_text.strip()) < 5:
            continue
        driver.execute_script(
            f'$(".lmt__source_textarea").val({repr(sourse_text)});')
        stextarea.send_keys(Keys.RIGHT)
        translated_text = ""
        # Poll once per second until the target textarea is non-empty; there
        # is no timeout.
        while not translated_text:
            time.sleep(1)
            translated_text = driver.find_element_by_css_selector(
                '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style'
            ).get_property("value")
        # Clear the source textarea for the next paragraph.
        stextarea.send_keys(Keys.CONTROL, "a")
        stextarea.send_keys(Keys.BACKSPACE)
        # Replace the paragraph text, then re-apply font and saved formatting.
        doc.Paragraphs(i + 1).Range.Text = translated_text
        doc.Paragraphs(i + 1).Range.Font.Name = font
        doc.Paragraphs(i + 1).Alignment = alignment
        doc.Paragraphs(i + 1).Range.Font.Size = fs
        doc.Paragraphs(i + 1).LeftIndent = lindent
        doc.Paragraphs(i + 1).RightIndent = rindent
    driver.quit()
    # FileFormat=17 is presumably Word's PDF save format -- verify against
    # the WdSaveFormat enumeration.
    doc.SaveAs2(FileName=re.sub("(.+)(\.pdf)", r"\1_jp.pdf", file_path),
                FileFormat=17)
    # Close without saving changes to the opened document itself.
    doc.Close(SaveChanges=0)
    app.Quit()
    print('Process is completed.')


if __name__ == '__main__':
    # A dragged-and-dropped path may arrive wrapped in double quotes; '"' is
    # never valid inside a Windows path, so stripping it is safe.
    file_path = input(
        'Please enter the absolute path of the PDF:     ').strip().strip('"')
    print('Please select a font')
    fonts = {'1': 'Yu Mincho', '2': 'Meiryo', '3': 'BIZ UDP Mincho Medium', '4': 'Other'}
    menu = '   '.join(", ".join(item) for item in fonts.items()) + ":     "
    choice = input(menu).strip()
    # Ask again instead of crashing with KeyError on an unknown key.
    while choice not in fonts:
        choice = input(menu).strip()
    font = fonts[choice]
    if font == 'Other':
        font = input('Please enter the font name:     ')
    Deeptrans(file_path, font)

Introduction

I wrote an article about automatic PDF translation before, but **after all, I wanted to keep the original layout of images, formulas, columns, and so on!** With that regret in mind I searched for a way and arrived at a method that uses Word, so I would like to introduce it. However, depending on how well the PDF converts to Word, the result may not come out very well.

Example of use

I borrowed the PDF from here → https://mirela.net.technion.ac.il/publications/ Positions shift somewhat because the character widths and the number of characters differ after translation. EN→JA

Things necessary

- Windows PC
- Microsoft Word
- ChromeDriver (to run the programs below as-is, save it in the directory you run them from)

flow

*** Program start   ↓ Open the target PDF as docx in Word   ↓ Get sentences for each paragraph   ↓ DeepL Translator with Selenium   ↓ Rewrite via Word   ↓ Save as PDF   ↓ The end of the program***

Implementation

Translating paragraph by paragraph takes time, so I decided to run it in multiple threads. In case the numbers that track the paragraph positions get out of sync for some reason, I also post a version that translates one paragraph at a time; it takes longer, but please try it.

import win32com.client
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import re
import pyperclip as ppc
from math import ceil
from threading import Thread, Lock

# Path handed to Selenium as the ChromeDriver executable.
DRIVER_PATH = 'chromedriver.exe'

# Chrome launch switches collected for the browser sessions.
options = Options()
for _switch in (
        '--disable-gpu',
        '--disable-extensions',
        '--proxy-server="direct://"',
        '--proxy-bypass-list=*',
        '--start-maximized',
):
    options.add_argument(_switch)
del _switch  # keep the module namespace identical to the flat version


def Deeptrans(t, driver):
    """Translate the ``t``-th slice of paragraphs via clipboard paste.

    Reads the module globals ``unit``, ``length``, ``sourse_texts`` and
    ``lock`` and stores each translation in the shared ``translated_texts``
    dict under the paragraph's 1-based index as a string.

    Args:
        t: Zero-based worker number; this call handles list offsets from
            ``t * unit`` up to (not including) ``min((t + 1) * unit, length)``.
        driver: Selenium WebDriver with the DeepL translator page loaded.
    """
    stextarea = driver.find_element_by_css_selector(
        '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
    ttextarea = driver.find_element_by_css_selector(
        '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')
    for i in range(t * unit, min((t + 1) * unit, length)):
        sourse_text = sourse_texts[i]
        stripped = sourse_text.strip()
        # Skip text that contains control characters or is shorter than 5
        # characters once stripped.
        if re.search(r"[\x00-\x1F\x7F]", stripped) or len(stripped) < 5:
            continue
        # All workers share one clipboard, so copy + paste must not
        # interleave. `with` releases the lock even if the paste raises;
        # a manual acquire/release pair would leave every other worker
        # blocked forever in that case.
        with lock:
            ppc.copy(sourse_text)
            stextarea.send_keys(Keys.CONTROL, "v")
        translated_text = ""
        # Poll once per second until the target textarea holds something.
        # NOTE(review): there is no timeout.
        while not translated_text:
            time.sleep(1)
            translated_text = ttextarea.get_property("value")
        # Clear the source textarea for the next paragraph.
        stextarea.send_keys(Keys.CONTROL, "a")
        stextarea.send_keys(Keys.BACKSPACE)
        translated_texts[str(i + 1)] = translated_text


def runDriver(t):
    """Start a Chrome instance, translate slice ``t`` with it, then quit it.

    Args:
        t: Zero-based worker number forwarded to ``Deeptrans``.
    """
    # Pass the module-level `options`; they were configured above but never
    # handed to Chrome in this version of the script.
    driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)
    try:
        driver.get('https://www.deepl.com/ja/translator')
        Deeptrans(t, driver)
    finally:
        # Always shut the browser down, even when translation raised.
        driver.quit()


def multiThreadTranslate(file_path, font):
    """Translate a document with nine parallel DeepL sessions and save a PDF.

    Opens ``file_path`` in Word, hands the text of every paragraph to the
    ``runDriver`` workers (which paste it into DeepL through the clipboard),
    writes the translations back and saves the document as ``<name>_jp.pdf``.
    The clipboard content found at the start is put back afterwards.

    Args:
        file_path: Path passed to Word's ``Documents.Open``; also the basis
            for the output file name.
        font: Font name applied to every translated paragraph.
    """
    global lock, length, unit, sourse_texts, translated_texts
    app = win32com.client.Dispatch("Word.Application")
    app.Visible = True  # Hide Word by commenting out
    doc = app.Documents.Open(file_path)
    try:
        try:
            doc.Paragraphs(1).Range.Font.Name = font
        except Exception:
            print('The specified font does not exist')
            return
        length = doc.Paragraphs.Count
        n = 9  # Open 9 Chrome and run at the same time
        unit = ceil(length / n)
        lock = Lock()
        sourse_texts = [
            doc.Paragraphs(i + 1).Range.Text for i in range(length)
        ]
        translated_texts = {}
        clipboard = ppc.paste()
        try:
            threads = []
            for t in range(n):
                thread = Thread(target=runDriver, args=(t, ), daemon=True)
                thread.start()
                threads.append(thread)
            for thread in threads:
                thread.join()
        finally:
            # Give the user's clipboard back even if a worker start failed.
            ppc.copy(clipboard)
        # The dict is filled in whatever order the workers finish. Write
        # back from the last paragraph to the first so that a replacement
        # which changes the paragraph count cannot shift pending indices.
        for key in sorted(translated_texts, key=int, reverse=True):
            doc.Paragraphs(int(key)).Range.Text = translated_texts[
                key].replace('\n', '\r')
            doc.Paragraphs(int(key)).Range.Font.Name = font
        # FileFormat=17 is Word's wdFormatPDF.
        doc.SaveAs2(FileName=re.sub(r"(.+)(\.pdf)", r"\1_jp.pdf", file_path),
                    FileFormat=17)
    finally:
        # Runs on success, on the font-error return and on any exception.
        doc.Close(SaveChanges=0)
        app.Quit()
    print('Process is completed.')


if __name__ == '__main__':
    # A dragged-and-dropped path may arrive wrapped in double quotes; '"' is
    # never valid inside a Windows path, so stripping it is safe.
    file_path = input(
        'Please enter the absolute path of the PDF:     ').strip().strip('"')
    print('Please select a font')
    fonts = {'1': 'Yu Mincho', '2': 'Meiryo', '3': 'BIZ UDP Mincho Medium', '4': 'Other'}
    menu = '   '.join(", ".join(item) for item in fonts.items()) + ":     "
    choice = input(menu).strip()
    # Ask again instead of crashing with KeyError on an unknown key.
    while choice not in fonts:
        choice = input(menu).strip()
    font = fonts[choice]
    if font == 'Other':
        font = input('Please enter the font name:     ')
    multiThreadTranslate(file_path, font=font)
Paragraph by paragraph ver. (With progress bar bonus)
import win32com.client
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import re
import pyperclip as ppc
from tqdm import tqdm

# Path handed to Selenium as the ChromeDriver executable.
DRIVER_PATH = 'chromedriver.exe'

# Chrome launch switches for the single browser session used by this script.
options = Options()
for _switch in (
        '--disable-gpu',
        '--disable-extensions',
        '--proxy-server="direct://"',
        '--proxy-bypass-list=*',
        '--start-maximized',
):
    options.add_argument(_switch)
del _switch  # keep the module namespace identical to the flat version


def Deeptrans(file_path, font):
    """Translate a document one paragraph at a time and save it as a PDF.

    Opens ``file_path`` in Word and DeepL in Chrome, pastes each paragraph
    into DeepL through the clipboard, replaces the paragraph with the
    translation and saves the document as ``<name>_jp.pdf``. The clipboard
    content found at the start is put back at the end.

    Args:
        file_path: Path passed to Word's ``Documents.Open``; also the basis
            for the output file name.
        font: Font name applied to every translated paragraph.
    """
    clipboard = ppc.paste()
    app = win32com.client.Dispatch("Word.Application")
    app.Visible = True
    doc = app.Documents.Open(file_path)
    try:
        # `options=` replaces the deprecated `chrome_options=` keyword.
        driver = webdriver.Chrome(executable_path=DRIVER_PATH,
                                  options=options)
        try:
            driver.get('https://www.deepl.com/ja/translator#en/ja')
            stextarea = driver.find_element_by_css_selector(
                '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
            ttextarea = driver.find_element_by_css_selector(
                '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')

            for i in tqdm(range(doc.Paragraphs.Count)):
                sourse_text = doc.Paragraphs(i + 1).Range.Text
                stripped = sourse_text.strip()
                # Skip text that contains control characters or is shorter
                # than 5 characters once stripped.
                if re.search(r"[\x00-\x1F\x7F]",
                             stripped) or len(stripped) < 5:
                    continue
                ppc.copy(sourse_text)
                stextarea.send_keys(Keys.CONTROL, "v")
                translated_text = ""
                # Poll once per second until the target textarea holds
                # something. NOTE(review): there is no timeout.
                while not translated_text:
                    time.sleep(1)
                    translated_text = ttextarea.get_property("value")
                # Clear the source textarea for the next paragraph.
                stextarea.send_keys(Keys.CONTROL, "a")
                stextarea.send_keys(Keys.BACKSPACE)
                doc.Paragraphs(i + 1).Range.Text = translated_text
                doc.Paragraphs(i + 1).Range.Font.Name = font
        finally:
            # Quit Chrome even when a paragraph failed.
            driver.quit()
        # FileFormat=17 is Word's wdFormatPDF.
        doc.SaveAs2(FileName=re.sub(r"(.+)(\.pdf)", r"\1_jp.pdf", file_path),
                    FileFormat=17)
    finally:
        # Close Word and give the clipboard back on success and on error.
        doc.Close(SaveChanges=0)
        app.Quit()
        ppc.copy(clipboard)
    print('Process is completed.')


if __name__ == '__main__':
    # A dragged-and-dropped path may arrive wrapped in double quotes; '"' is
    # never valid inside a Windows path, so stripping it is safe.
    file_path = input(
        'Please enter the absolute path of the PDF:     ').strip().strip('"')
    print('Please select a font')
    fonts = {'1': 'Yu Mincho', '2': 'Meiryo', '3': 'BIZ UDP Mincho Medium', '4': 'Other'}
    menu = '   '.join(", ".join(item) for item in fonts.items()) + ":     "
    choice = input(menu).strip()
    # Ask again instead of crashing with KeyError on an unknown key.
    while choice not in fonts:
        choice = input(menu).strip()
    font = fonts[choice]
    if font == 'Other':
        font = input('Please enter the font name:     ')
    Deeptrans(file_path, font)

How to use

To use it, just save the script and run it from the command line (please install the required libraries separately). After a while, a file named "<original name>_jp.pdf" is written to the same directory as the original file.

Task

It is difficult to distinguish mathematical formulas from ordinary body text, so formulas may lose their shape or disappear. I dealt with this using a makeshift fix, but as a side effect some sentences are no longer translated. I am looking for a better way.

Summary

It's Word. If you are interested, please try it.

Recommended Posts

[Python] Automatically translate PDF with DeepL while keeping the original format. [Windows / Word required]
English word book program linked with Google Docs
[Python] Automatically translate PDF with DeepL while keeping the original format. [Windows / Word required]
Try translating with Python while maintaining the PDF layout
Automatically translate DeepL into English with Python and Selenium
[Python] Automatically operate the browser with Selenium
[Python] Let's automatically translate English PDF (but not limited to) with DeepL or Google Translate to make a text file.
[Automation] Extract the table in PDF with Python
Automatically format Python code into PEP8-compliant code with Emacs
Convert the image in .zip to PDF with Python
Get the result in dict format with Python psycopg2
Try to automate pdf format report creation with Python
Continued [Python] Let's automatically translate English PDF (but not limited to) with DeepL or Google Translate to make a text file, no HTML.