Merging characters by hand in the box file generated by jTessBoxEditor was very tedious, so I wrote a script to do it. I'm not sure whether merging actually improves recognition accuracy, but I'm posting the script here so the effort doesn't go to waste.
Create the following file. Paste its contents into jTessBoxEditor to generate the tiff file, the box file, etc.
text.txt
letter
test
programming
mgs
This program rewrites the box file so that its per-character entries are merged into a single entry for each line (word) defined in "text.txt". Replace the box file that was generated together with the tiff file with the regenerated one, and specify it when training with jTessBoxEditor.
sample.py
import shutil
def read_words(input_file):
    """Return the lines of *input_file* as a list of words.

    Each line of the file becomes one list element, with its trailing
    newline removed. Lines are returned in file order and are not
    otherwise modified or filtered.

    Args:
        input_file: Path of the text file to read (one word per line).

    Returns:
        A list of strings, one per line of the file.
    """
    with open(input_file, "r") as f:
        # The list must be returned: the caller iterates over the result.
        return [line.rstrip("\n") for line in f]
def marge_box(input_file, words):
    """Merge per-character box entries into one entry per word, in place.

    The box file is read as lines of six space-separated fields, treated
    here as ``symbol left bottom right top page`` (the Tesseract box
    layout). For each word, ``len(word)`` consecutive entries are consumed
    and replaced by a single entry whose fields are:

    * symbol: the word itself
    * left:   the left value of the first consumed entry
    * bottom: the numeric minimum of the consumed bottom values
    * right:  the numeric maximum of the consumed right values
    * top:    the numeric maximum of the consumed top values
    * page:   the page value of the first consumed entry

    Entries left over after the last word are dropped. Before the file is
    overwritten, the original is copied to ``copy_<name>`` in the same
    directory as a backup.

    Args:
        input_file: Path of the box file to rewrite.
        words: Iterable of words, in the order their characters appear in
            the box file. Empty words are skipped.

    Raises:
        ValueError: If the box file has fewer entries than a word needs.
            Raised before the backup or the rewrite happens, so the file
            is left untouched.
    """
    import os  # local import: only needed here to build the backup path

    # Read with the same encoding the file is written back with below.
    with open(input_file, "r", encoding="utf-8") as f:
        # Blank lines carry no box and would break the field indexing.
        data = [line.rstrip("\n").split(" ") for line in f if line.strip()]

    res = []
    start = 0
    for word in words:
        length = len(word)
        if length == 0:
            # An empty word consumes no box entries.
            continue
        end = start + length
        tmp = data[start:end]
        print("word", word)
        print("start", start)
        print("end", end)
        print("length", length)
        if len(tmp) < length:
            raise ValueError(
                "box file has too few entries for word {!r}: "
                "needed {}, found {}".format(word, length, len(tmp))
            )
        a = tmp[0][1]
        # Compare coordinates as integers: comparing the raw strings is
        # lexicographic and would rank "100" below "20".
        b = str(min(int(t[2]) for t in tmp))
        c = str(max(int(t[3]) for t in tmp))
        d = str(max(int(t[4]) for t in tmp))
        e = tmp[0][5]
        print("abcde", [a, b, c, d, e])
        res.append([word, a, b, c, d, e])
        start = end

    # Copy the original file as a backup. The "copy_" prefix goes on the
    # file name only, so paths with a directory component still work.
    directory, filename = os.path.split(input_file)
    shutil.copy(input_file, os.path.join(directory, "copy_" + filename))

    with open(input_file, "w", encoding='utf-8') as f:
        for t in res:
            print(" ".join(t), file=f)
if __name__ == "__main__":
    # Script entry point: load the word list first, then rewrite the box file.
    box_path = "~~.box"  # Specify the box file generated by jTessBoxEditor
    marge_box(box_path, read_words("text.txt"))
Recommended Posts