Ce que j'ai fait la dernière fois et ce que j'ai fait cette fois

Dans l'article précédent, « [Python] Traduisons automatiquement un PDF anglais (mais pas seulement) avec DeepL ou Google Traduction vers un fichier texte », le résultat de la traduction était écrit dans un fichier texte. Mais ne serait-il pas pratique de pouvoir le comparer côte à côte avec le texte d'origine ? Il reste des points à améliorer, mais c'est ce que j'ai réalisé ici avec HTML. Article utilisé à titre d'exemple

Le code n'est pas très propre, car il a été écrit à la hâte ; veuillez m'en excuser (il l'était déjà au départ).

Post-scriptum 8/7

Ajouts à l'affichage HTML : ・ **une fonction de surbrillance et de saut vers le texte anglais ou japonais correspondant** ・ **un mode sombre**. De plus, **les couleurs du mode non sombre ont elles aussi été adoucies**.

**Prise en charge de la traduction inverse (japonais → anglais)**.

Phrase utilisée dans l'exemple

code

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc

# Location of the chromedriver executable used to launch Chrome.
DRIVER_PATH = 'chromedriver.exe'

# Chrome command-line switches applied to every browser session.
options = Options()
for _switch in (
        '--disable-gpu',
        '--disable-extensions',
        '--proxy-server="direct://"',
        '--proxy-bypass-list=*',
        '--start-maximized',
):
    options.add_argument(_switch)


def parse_merge(text, n=4900, m=1, inv=False):
    """Split *text* into chunks of whole sentences for the translator.

    Line breaks are first replaced by single spaces, then the text is cut
    into sentences on ``". "`` (or on ``"。"`` when *inv* is true).

    Args:
        text: The raw text to split.
        n: Maximum length, in characters, of a multi-sentence chunk. A single
            sentence longer than *n* is still emitted as its own chunk.
        m: Number of sentences to group into one chunk. Values below 1 are
            treated as 1.
        inv: ``False`` for ``". "``-delimited text, ``True`` for
            ``"。"``-delimited text.

    Returns:
        A list of non-empty chunks. Every sentence ends with exactly one
        terminator; grouped sentences are separated by a space for
        ``inv=False`` and by nothing for ``inv=True``.
    """
    if inv:
        delimiter, terminator, joiner = "。", "。", ""
    else:
        delimiter, terminator, joiner = ". ", ".", " "
    group_size = max(1, m)
    flat = " ".join(text.splitlines())

    chunks = []
    current = []      # sentences of the chunk being built
    current_len = 0   # len(joiner.join(current))
    for piece in flat.split(delimiter):
        piece = piece.strip()
        if piece in ("", "."):
            continue
        # split() removed the terminator of every sentence except the last
        # one, so only re-add it where it is actually missing.
        if not piece.endswith(terminator):
            piece += terminator
        if current:
            grown = current_len + len(joiner) + len(piece)
        else:
            grown = len(piece)
        if current and (len(current) >= group_size or grown > n):
            chunks.append(joiner.join(current))
            current = []
            grown = len(piece)
        current.append(piece)
        current_len = grown
    if current:
        chunks.append(joiner.join(current))
    return chunks


def TranslateFromClipboard(tool, write, filename, isPrint, html, title,
                           sentence_cnt, inv):
    """Translate the clipboard text in a Chrome session and export the result.

    The clipboard content is split with ``parse_merge``; each chunk is pasted
    into the translator's input box and the output box is polled once per
    second until it is non-empty.

    Args:
        tool: ``"DeepL"`` or ``"GT"`` (Google Translate).
        write: When true, write the translations to ``filename + ".txt"``.
        filename: Output path without extension.
        isPrint: When true, print every translated chunk.
        html: When true, write a side-by-side page to ``filename + ".html"``.
        title: Heading of the HTML page.
        sentence_cnt: Number of sentences per chunk (``m`` of ``parse_merge``).
        inv: Passed to ``parse_merge``; for Google Translate it also selects
            the target language (``en`` when true, ``ja`` otherwise). The
            DeepL URL does not depend on it.

    Raises:
        ValueError: If *tool* is neither ``"DeepL"`` nor ``"GT"``.
    """
    # Imported by name because the ``html`` parameter shadows the module.
    from html import escape
    from selenium.common.exceptions import WebDriverException

    if tool not in ("DeepL", "GT"):
        raise ValueError(f"unknown translation tool: {tool!r}")

    # NOTE(review): executable_path / chrome_options / find_element_by_* are
    # Selenium 3 style calls - confirm the Selenium version this runs against.
    driver = webdriver.Chrome(executable_path=DRIVER_PATH,
                              chrome_options=options)
    try:
        url = 'https://www.deepl.com/ja/translator' if tool == "DeepL" else f'https://translate.google.co.jp/?hl=ja&tab=TT&authuser=0#view=home&op=translate&sl=auto&tl={"en" if inv else "ja"}'
        driver.get(url)
        if tool == "DeepL":
            textarea = driver.find_element_by_css_selector(
                '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
        else:
            textarea = driver.find_element_by_id('source')
        en = parse_merge(ppc.paste(), m=sentence_cnt, inv=inv)
        ja = []
        for sentence in en:
            if sentence == "":
                ja.append("")
                continue
            # Paste through the clipboard, then put the user's content back.
            cbText = ppc.paste()
            ppc.copy(sentence)
            textarea.send_keys(Keys.CONTROL, "v")
            ppc.copy(cbText)
            transtext = ""
            while transtext == "":
                time.sleep(1)
                if tool == "DeepL":
                    transtext = driver.find_element_by_css_selector(
                        '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style'
                    ).get_property("value")
                else:
                    time.sleep(1)
                    try:
                        transtext = driver.find_element_by_css_selector(
                            '.tlid-translation.translation').text
                    except WebDriverException:
                        # Result element not available yet; poll again.
                        pass
            if isPrint: print(transtext)
            ja.append(transtext)
            # Clear the input box for the next chunk.
            textarea.send_keys(Keys.CONTROL, "a")
            textarea.send_keys(Keys.BACKSPACE)
    finally:
        # Close the browser even when a lookup or the page load fails.
        driver.quit()
    if write:
        with open(filename + ".txt", "w", encoding='UTF-8') as f:
            f.write("\n".join(ja))
    if html:
        eng_rows = []
        jpn_rows = []
        for i, (source, target) in enumerate(zip(en, ja)):
            # The text is escaped so that "<", ">" or "&" in it cannot break
            # the generated markup.
            eng_rows.append(
                f'<br><a id="e{i}" href="#j{i}" onmouseover="over(' + f"'j{i}'" + ')" onmouseout="out(' + f"'j{i}'" + f')">{escape(source)}</a><br>')
            jpn_rows.append(
                f'<br><a id="j{i}" href="#e{i}" onmouseover="over(' + f"'e{i}'" + ')" onmouseout="out(' + f"'e{i}'" + f')">{escape(target)}</a><br>')
        eng = "".join(eng_rows)
        jpn = "".join(jpn_rows)
        with open(filename + ".html", "w", encoding='UTF-8') as f:
            f.write(
                f'<h1 align="center">{escape(title)}</h1>\n<input id="btn-mode" type="checkbox">\n<hr>\n<body>\n<div class="parent">\n<div id="en">\n{eng}\n</div>\n<div id="ja">\n{jpn}\n</div>\n</div>'
                +
                '<style>\n:root {\n--main-text: #452b15;\n--main-bg: #f8f1e2;\n--highlight-text: #db8e3c;\n}\n:root[theme="dark"] {\n--main-text: #b0b0b0;\n--main-bg: #121212;\n--highlight-text: #fd8787;\n}\nh1 {\ncolor: var(--main-text);\n}\ninput {\nposition: absolute;\ntop: 1%;\nright: 1%;\n}\n#en {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: left;\nborder-right:1px solid #ccc;\nmargin: 1%;\noverflow: auto;\n}\n#ja {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: right;\nmargin: 1%;\noverflow: auto;\n}\na,\na:hover,\na:visited,\na:link,\na:active {\ncolor: var(--main-text);\ntext-decoration: none;\n}\nbody {\nbackground-color: var(--main-bg);\n}\n</style>\n<script>\nvar a = document.getElementsByTagName("a");\nfunction over(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--highlight-text");\n}\nfunction out(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--main-text");\n}\nconst btn = document.querySelector("#btn-mode");\nbtn.addEventListener("change", () => {\nif (btn.checked == true) {\ndocument.documentElement.setAttribute("theme", "dark");\n} else {\ndocument.documentElement.setAttribute("theme", "light");\n}\nfor (var i = 0; i < a.length; i++) {\na[i].style.color = getComputedStyle(a[i]).getPropertyValue("--main-text");\n}\n});\n</script>\n</body>'
            )


if __name__ == "__main__":
    # Positional arguments for TranslateFromClipboard, in signature order:
    # tool, write, filename, isPrint, html, title, sentence_cnt, inv.
    # The default filename has no extension because TranslateFromClipboard
    # appends ".txt" / ".html" itself.
    args = [
        "DeepL", False, "translated_text", True, False, "EN　↔　JP", 1, False
    ]
    # The prompts advertise "Oui/n", so accept French and English answers.
    yes_answers = ("y", "yes", "o", "oui")
    no_answers = ("n", "no", "non")
    if input("1.Anglais → japonais 2.Japonais → anglais") == "2": args[7] = True
    if input("1. DeepL 2.GoogleTranslate　　") == "2": args[0] = "GT"
    if input("Voulez-vous exporter le résultat de la traduction? Oui/n　　"
             ).strip().lower() in yes_answers:
        formats = {"1": ".txt", "2": ".html", "3": ".txt/.html"}
        case = input("1. txt 2. HTML 3. both    ")
        # Ask again on an invalid choice instead of failing later on an
        # undefined format.
        while case not in formats:
            case = input("1. txt 2. HTML 3. both    ")
        args[1] = case in ("1", "3")
        args[4] = case in ("2", "3")
        format_ = formats[case]
        filename = input(
            f"Entrez un nom pour le fichier de sortie (la valeur par défaut est'translated_text{format_}'）　　")
        if filename:
            args[2] = filename.replace(" ", "_")
        if args[4]:
            title = input("Veuillez saisir le titre (de l'article)")
            if title:
                args[5] = title
    try:
        sentence_cnt = int(
            input("Combien de phrases souhaitez-vous traduire? (La valeur par défaut est une phrase à la fois. Plus la valeur est petite, plus la sortie est nette et plus la valeur est élevée, plus vite.)"))
    except ValueError:
        # Empty or non-numeric answer: keep the default.
        sentence_cnt = args[6]
    # A group size below 1 is meaningless, so keep the default in that case.
    if sentence_cnt >= 1:
        args[6] = sentence_cnt
    if input("Souhaitez-vous voir la progression de la traduction ici? Oui/n　　"
             ).strip().lower() in no_answers:
        args[3] = False
    input("Appuyez sur Entrée lorsque vous êtes prêt")
    TranslateFromClipboard(*args)

Le résultat est beaucoup plus facile à lire, mais la traduction du texte entier prend beaucoup de temps, car il est traduit phrase par phrase (vous pouvez aussi traduire plusieurs phrases à la fois en choisissant leur nombre au moment de l'exécution).

Post-scriptum 8/9

Cela demande un peu plus de travail, mais il existe un moyen de rendre la traduction plus facile à lire. Ouvrez le PDF dans Word, copiez le texte, puis traduisez-le avec le script suivant, dans lequel la fonction de découpage des phrases a été légèrement modifiée. Comme on peut s'y attendre de la part de Word, les phrases qui s'étendent sur plusieurs lignes sont elles aussi correctement reconstituées. <details>

<summary>Code de réglage fin pour Word</summary>

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc

# Location of the chromedriver executable used to launch Chrome.
DRIVER_PATH = 'chromedriver.exe'

# Chrome command-line switches applied to every browser session.
options = Options()
for _switch in (
        '--disable-gpu',
        '--disable-extensions',
        '--proxy-server="direct://"',
        '--proxy-bypass-list=*',
        '--start-maximized',
):
    options.add_argument(_switch)


def parse_merge(text, n=4900):
    """Return the lines of *text* that carry content, one chunk per line.

    Lines that are empty, contain only whitespace, or consist of a lone
    ``"."`` are dropped, so no blank chunk is ever handed to the translator.

    Args:
        text: The raw text to split.
        n: Unused; kept so existing callers can still pass it.

    Returns:
        The remaining lines, unmodified and in their original order.
    """
    return [line for line in text.splitlines()
            if line.strip() not in ("", ".")]


def TranslateFromClipboard(tool, write, filename, isPrint, html, title,inv):
    """Translate the clipboard text in a Chrome session and export the result.

    The clipboard content is split with ``parse_merge``; each chunk is pasted
    into the translator's input box and the output box is polled once per
    second until it is non-empty.

    Args:
        tool: ``"DeepL"`` or ``"GT"`` (Google Translate).
        write: When true, write the translations to ``filename + ".txt"``.
        filename: Output path without extension.
        isPrint: When true, print every translated chunk.
        html: When true, write a side-by-side page to ``filename + ".html"``.
        title: Heading of the HTML page.
        inv: For Google Translate, selects the target language (``en`` when
            true, ``ja`` otherwise). The DeepL URL does not depend on it.

    Raises:
        ValueError: If *tool* is neither ``"DeepL"`` nor ``"GT"``.
    """
    # Imported by name because the ``html`` parameter shadows the module.
    from html import escape
    from selenium.common.exceptions import WebDriverException

    if tool not in ("DeepL", "GT"):
        raise ValueError(f"unknown translation tool: {tool!r}")

    # NOTE(review): executable_path / chrome_options / find_element_by_* are
    # Selenium 3 style calls - confirm the Selenium version this runs against.
    driver = webdriver.Chrome(executable_path=DRIVER_PATH,
                              chrome_options=options)
    try:
        url = 'https://www.deepl.com/ja/translator' if tool == "DeepL" else f'https://translate.google.co.jp/?hl=ja&tab=TT&authuser=0#view=home&op=translate&sl=auto&tl={"en" if inv else "ja"}'
        driver.get(url)
        if tool == "DeepL":
            textarea = driver.find_element_by_css_selector(
                '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
        else:
            textarea = driver.find_element_by_id('source')
        en = parse_merge(ppc.paste())
        ja = []
        for sentence in en:
            if sentence == "":
                ja.append("")
                continue
            # Paste through the clipboard, then put the user's content back.
            cbText = ppc.paste()
            ppc.copy(sentence)
            textarea.send_keys(Keys.CONTROL, "v")
            ppc.copy(cbText)
            transtext = ""
            while transtext == "":
                time.sleep(1)
                if tool == "DeepL":
                    transtext = driver.find_element_by_css_selector(
                        '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style'
                    ).get_property("value")
                else:
                    time.sleep(1)
                    try:
                        transtext = driver.find_element_by_css_selector(
                            '.tlid-translation.translation').text
                    except WebDriverException:
                        # Result element not available yet; poll again.
                        pass
            if isPrint: print(transtext)
            ja.append(transtext)
            # Clear the input box for the next chunk.
            textarea.send_keys(Keys.CONTROL, "a")
            textarea.send_keys(Keys.BACKSPACE)
    finally:
        # Close the browser even when a lookup or the page load fails.
        driver.quit()
    if write:
        with open(filename + ".txt", "w", encoding='UTF-8') as f:
            f.write("\n".join(ja))
    if html:
        eng_rows = []
        jpn_rows = []
        for i, (source, target) in enumerate(zip(en, ja)):
            # The text is escaped so that "<", ">" or "&" in it cannot break
            # the generated markup.
            eng_rows.append(
                f'<br><a id="e{i}" href="#j{i}" onmouseover="over(' + f"'j{i}'" + ')" onmouseout="out(' + f"'j{i}'" + f')">{escape(source)}</a><br>')
            jpn_rows.append(
                f'<br><a id="j{i}" href="#e{i}" onmouseover="over(' + f"'e{i}'" + ')" onmouseout="out(' + f"'e{i}'" + f')">{escape(target)}</a><br>')
        eng = "".join(eng_rows)
        jpn = "".join(jpn_rows)
        with open(filename + ".html", "w", encoding='UTF-8') as f:
            f.write(
                f'<h1 align="center">{escape(title)}</h1>\n<input id="btn-mode" type="checkbox">\n<hr>\n<body>\n<div class="parent">\n<div id="en">\n{eng}\n</div>\n<div id="ja">\n{jpn}\n</div>\n</div>'
                +
                '<style>\n:root {\n--main-text: #452b15;\n--main-bg: #f8f1e2;\n--highlight-text: #db8e3c;\n}\n:root[theme="dark"] {\n--main-text: #b0b0b0;\n--main-bg: #121212;\n--highlight-text: #fd8787;\n}\nh1 {\ncolor: var(--main-text);\n}\ninput {\nposition: absolute;\ntop: 1%;\nright: 1%;\n}\n#en {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: left;\nborder-right:1px solid #ccc;\nmargin: 1%;\noverflow: auto;\n}\n#ja {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: right;\nmargin: 1%;\noverflow: auto;\n}\na,\na:hover,\na:visited,\na:link,\na:active {\ncolor: var(--main-text);\ntext-decoration: none;\n}\nbody {\nbackground-color: var(--main-bg);\n}\n</style>\n<script>\nvar a = document.getElementsByTagName("a");\nfunction over(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--highlight-text");\n}\nfunction out(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--main-text");\n}\nconst btn = document.querySelector("#btn-mode");\nbtn.addEventListener("change", () => {\nif (btn.checked == true) {\ndocument.documentElement.setAttribute("theme", "dark");\n} else {\ndocument.documentElement.setAttribute("theme", "light");\n}\nfor (var i = 0; i < a.length; i++) {\na[i].style.color = getComputedStyle(a[i]).getPropertyValue("--main-text");\n}\n});\n</script>\n</body>'
            )


if __name__ == "__main__":
    # Positional arguments for TranslateFromClipboard, in signature order:
    # tool, write, filename, isPrint, html, title, inv.
    # The default filename has no extension because TranslateFromClipboard
    # appends ".txt" / ".html" itself.
    args = ["DeepL", False, "translated_text", True, False, "EN　↔　JP",False]
    # The prompts advertise "Oui/n", so accept French and English answers.
    yes_answers = ("y", "yes", "o", "oui")
    no_answers = ("n", "no", "non")
    if input("1.Anglais → japonais 2.Japonais → anglais") == "2": args[6] = True
    if input("1. DeepL 2.GoogleTranslate　　") == "2": args[0] = "GT"
    if input("Voulez-vous exporter le résultat de la traduction? Oui/n　　"
             ).strip().lower() in yes_answers:
        formats = {"1": ".txt", "2": ".html", "3": ".txt/.html"}
        case = input("1. txt 2. HTML 3. both    ")
        # Ask again on an invalid choice instead of failing later on an
        # undefined format.
        while case not in formats:
            case = input("1. txt 2. HTML 3. both    ")
        args[1] = case in ("1", "3")
        args[4] = case in ("2", "3")
        format_ = formats[case]
        filename = input(
            f"Entrez un nom pour le fichier de sortie (la valeur par défaut est'translated_text{format_}'）　　")
        if filename:
            args[2] = filename.replace(" ", "_")
        if args[4]:
            title = input("Veuillez saisir le titre (de l'article)")
            if title:
                args[5] = title
    if input("Souhaitez-vous voir la progression de la traduction ici? Oui/n　　"
             ).strip().lower() in no_answers:
        args[3] = False
    input("Appuyez sur Entrée lorsque vous êtes prêt")
    TranslateFromClipboard(*args)

Post-scriptum 8/11

J'ai rendu possible, dans une certaine mesure, le découpage en paragraphes sans passer par Word. Comme la traduction se fait (approximativement) paragraphe par paragraphe, elle est beaucoup plus rapide que la traduction phrase par phrase. <details>

<summary>Code amélioré de découpage des phrases</summary>

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc
import re

# Location of the chromedriver executable used to launch Chrome.
DRIVER_PATH = 'chromedriver.exe'

# Chrome command-line switches applied to every browser session.
options = Options()
for _switch in (
        '--disable-gpu',
        '--disable-extensions',
        '--proxy-server="direct://"',
        '--proxy-bypass-list=*',
        '--start-maximized',
):
    options.add_argument(_switch)


def textParser(text, n=30, braketDetect=True):
    """Merge hard-wrapped lines of *text* back into paragraph-like chunks.

    Each non-blank line is either emitted on its own (all-caps headings),
    used to close the pending chunk, or appended to the pending chunk,
    based on its ending, its length relative to the next line and how the
    next line starts.

    Args:
        text: The raw text, one physical line per ``splitlines()`` entry.
        n: Two consecutive lines whose lengths differ by less than *n* are
            treated as belonging to the same wrapped paragraph.
        braketDetect: When true, keep merging lines while more opening than
            closing brackets have been seen.

    Returns:
        A ``(count, chunks)`` tuple where ``count == len(chunks)``.
    """
    lines = text.splitlines()
    sentences = []
    pending = ""  # text accumulated for the chunk currently being built
    bra_cnt, ket_cnt = 0, 0
    # NOTE(review): the "?" entries in the bracket classes and in the two
    # identical tuple members below look like lost full-width characters -
    # confirm against the original script. They are kept byte-for-byte.
    sentence_ends = ("?", ".", "?")
    last = len(lines) - 1
    for i in range(len(lines)):
        cur = lines[i]
        if not re.search(r"\S", cur):
            continue
        if braketDetect:
            # NOTE(review): the counters are never reset, so one unbalanced
            # opening bracket keeps every later line merging.
            bra_cnt += len(re.findall(r"[\(?]", cur))
            ket_cnt += len(re.findall(r"[\)?]", cur))
        if i == last:
            # Flush whatever was accumulated together with the final line.
            sentences.append(pending + cur)
            pending = ""
            continue
        nxt = lines[i + 1]
        if re.fullmatch(r"[A-Z\s]+", cur):
            # All-caps line: emit it as a chunk of its own.
            if pending != "":
                sentences.append(pending)
            pending = ""
            sentences.append(cur)
        elif nxt == "" or re.match(
                r"(\d{1,2}[\.,?]\s?|I{1,3}V{0,1}X{0,1}[\.,?]|V{0,1}X{0,1}I{1,3}[\.,?])+\s",
                cur):
            # Followed by an empty line, or starting with an arabic/roman
            # numbering prefix: close the chunk here.
            sentences.append(pending + cur)
            pending = ""
        else:
            # nxt is non-empty in this branch, so indexing it is safe.
            unfinished = cur[-1] not in sentence_ends
            similar_width = abs(len(cur) - len(nxt)) < n
            next_closes = len(pending + cur) > len(nxt) and (
                nxt[-1] in sentence_ends or bool(re.match(r"[A-Z]", nxt[0])))
            continues = ((unfinished and (similar_width or next_closes))
                         or bool(re.match(r"[a-z]|\)", nxt[0]))
                         or bra_cnt > ket_cnt)
            if continues:
                # NOTE(review): lines are concatenated without a separator;
                # verify that the pasted lines keep their trailing space.
                pending += cur
            else:
                sentences.append(pending + cur)
                pending = ""
    # Reached when the trailing lines were blank and therefore skipped.
    if pending != "":
        sentences.append(pending)
    return len(sentences), sentences


def TranslateFromClipboard(tool, write, filename, isPrint, html, title, inv):
    """Translate the clipboard text in a Chrome session and export the result.

    The clipboard content is split with ``textParser``; each chunk is pasted
    into the translator's input box and the output box is polled once per
    second until it is non-empty. Progress is printed after every chunk.

    Args:
        tool: ``"DeepL"`` or ``"GT"`` (Google Translate).
        write: When true, write the translations to ``filename + ".txt"``.
        filename: Output path without extension.
        isPrint: When true, print every chunk and its translation.
        html: When true, write a side-by-side page to ``filename + ".html"``.
        title: Heading of the HTML page.
        inv: For Google Translate, selects the target language (``en`` when
            true, ``ja`` otherwise). The DeepL URL does not depend on it.

    Raises:
        ValueError: If *tool* is neither ``"DeepL"`` nor ``"GT"``.
    """
    # Imported by name because the ``html`` parameter shadows the module.
    from html import escape
    from selenium.common.exceptions import WebDriverException

    if tool not in ("DeepL", "GT"):
        raise ValueError(f"unknown translation tool: {tool!r}")

    # NOTE(review): executable_path / chrome_options / find_element_by_* are
    # Selenium 3 style calls - confirm the Selenium version this runs against.
    driver = webdriver.Chrome(executable_path=DRIVER_PATH,
                              chrome_options=options)
    try:
        url = 'https://www.deepl.com/ja/translator' if tool == "DeepL" else f'https://translate.google.co.jp/?hl=ja&tab=TT&authuser=0#view=home&op=translate&sl=auto&tl={"en" if inv else "ja"}'
        driver.get(url)
        if tool == "DeepL":
            textarea = driver.find_element_by_css_selector(
                '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
        else:
            textarea = driver.find_element_by_id('source')
        length, en = textParser(ppc.paste())
        ja = []
        for i, sentence in enumerate(en):
            if sentence == "":
                ja.append("")
                continue
            # Paste through the clipboard, then put the user's content back.
            cbText = ppc.paste()
            ppc.copy(sentence)
            textarea.send_keys(Keys.CONTROL, "v")
            ppc.copy(cbText)
            transtext = ""
            cnt = 0
            while transtext == "":
                time.sleep(1)
                if tool == "DeepL":
                    transtext = driver.find_element_by_css_selector(
                        '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style'
                    ).get_property("value")
                else:
                    time.sleep(1)
                    try:
                        transtext = driver.find_element_by_css_selector(
                            '.tlid-translation.translation').text
                    except WebDriverException:
                        # Result element not available yet; poll again.
                        pass
                cnt += 1
                # Every tenth empty poll, type a "." into the input box.
                if cnt % 10 == 0: textarea.send_keys(".")
            if isPrint:
                print(sentence)
                print(transtext)
            print(f"\n{i+1}/{length}  {int(100*(i+1)/length)}% done\n")
            ja.append(transtext)
            # Clear the input box for the next chunk.
            textarea.send_keys(Keys.CONTROL, "a")
            textarea.send_keys(Keys.BACKSPACE)
    finally:
        # Close the browser even when a lookup or the page load fails.
        driver.quit()
    if write:
        with open(filename + ".txt", "w", encoding='UTF-8') as f:
            f.write("\n".join(ja))
    if html:
        eng_rows = []
        jpn_rows = []
        for i, (source, target) in enumerate(zip(en, ja)):
            # The text is escaped so that "<", ">" or "&" in it cannot break
            # the generated markup.
            eng_rows.append(
                f'<br><a id="e{i}" href="#j{i}" onmouseover="over(' + f"'j{i}'" + ')" onmouseout="out(' + f"'j{i}'" + f')">{escape(source)}</a><br>')
            jpn_rows.append(
                f'<br><a id="j{i}" href="#e{i}" onmouseover="over(' + f"'e{i}'" + ')" onmouseout="out(' + f"'e{i}'" + f')">{escape(target)}</a><br>')
        eng = "".join(eng_rows)
        jpn = "".join(jpn_rows)
        with open(filename + ".html", "w", encoding='UTF-8') as f:
            f.write(
                f'<h1 align="center">{escape(title)}</h1>\n<input id="btn-mode" type="checkbox">\n<hr>\n<body>\n<div class="parent">\n<div id="en">\n{eng}\n</div>\n<div id="ja">\n{jpn}\n</div>\n</div>'
                +
                '<style>\n:root {\n--main-text: #452b15;\n--main-bg: #f8f1e2;\n--highlight-text: #db8e3c;\n}\n:root[theme="dark"] {\n--main-text: #b0b0b0;\n--main-bg: #121212;\n--highlight-text: #fd8787;\n}\nh1 {\ncolor: var(--main-text);\n}\ninput {\nposition: absolute;\ntop: 1%;\nright: 1%;\n}\n#en {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: left;\nborder-right:1px solid #ccc;\nmargin: 1%;\noverflow: auto;\n}\n#ja {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: right;\nmargin: 1%;\noverflow: auto;\n}\na,\na:hover,\na:visited,\na:link,\na:active {\ncolor: var(--main-text);\ntext-decoration: none;\n}\nbody {\nbackground-color: var(--main-bg);\n}\n</style>\n<script>\nvar a = document.getElementsByTagName("a");\nfunction over(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--highlight-text");\n}\nfunction out(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--main-text");\n}\nconst btn = document.querySelector("#btn-mode");\nbtn.addEventListener("change", () => {\nif (btn.checked == true) {\ndocument.documentElement.setAttribute("theme", "dark");\n} else {\ndocument.documentElement.setAttribute("theme", "light");\n}\nfor (var i = 0; i < a.length; i++) {\na[i].style.color = getComputedStyle(a[i]).getPropertyValue("--main-text");\n}\n});\n</script>\n</body>'
            )


if __name__ == "__main__":
    # Positional arguments for TranslateFromClipboard, in signature order:
    # tool, write, filename, isPrint, html, title, inv.
    # The default filename has no extension because TranslateFromClipboard
    # appends ".txt" / ".html" itself.
    args = [
        "DeepL", False, "translated_text", True, False,
        "ORIGINAL　↔　TRANSLATED", False
    ]
    # The prompts advertise "Oui/n", so accept French and English answers.
    yes_answers = ("y", "yes", "o", "oui")
    no_answers = ("n", "no", "non")
    if input("1.Anglais → japonais 2.Japonais → anglais") == "2": args[6] = True
    if input("1. DeepL 2.GoogleTranslate　　") == "2": args[0] = "GT"
    if input("Voulez-vous exporter le résultat de la traduction? Oui/n　　"
             ).strip().lower() in yes_answers:
        formats = {"1": ".txt", "2": ".html", "3": ".txt/.html"}
        case = input("1. txt 2. HTML 3. both    ")
        # Ask again on an invalid choice instead of failing later on an
        # undefined format.
        while case not in formats:
            case = input("1. txt 2. HTML 3. both    ")
        args[1] = case in ("1", "3")
        args[4] = case in ("2", "3")
        format_ = formats[case]
        filename = input(
            f"Entrez un nom pour le fichier de sortie (la valeur par défaut est'translated_text{format_}'）　　")
        if filename:
            args[2] = filename.replace(" ", "_")
        if args[4]:
            title = input("Veuillez saisir le titre (de l'article)")
            if title:
                args[5] = title
    if input("Souhaitez-vous voir la progression de la traduction ici? Oui/n　　"
             ).strip().lower() in no_answers:
        args[3] = False
    input("Appuyez sur Entrée lorsque vous êtes prêt")
    TranslateFromClipboard(*args)

Résumé

Je suis débutant en HTML et en CSS ; si vous voyez des points à améliorer, le résultat n'en sera que meilleur ! Je vous serais reconnaissant de bien vouloir me les signaler.

Suite : [Python] Traduisons automatiquement un PDF anglais (mais pas seulement) avec DeepL ou Google Traduction vers un fichier texte — ou plutôt vers du HTML.