Created
May 29, 2023 17:39
-
-
Save flaviut/d901be509425098645e4ae527a9e9f3a to your computer and use it in GitHub Desktop.
Helpful tesseract fine-tuning scripts from https://www.statworx.com/en/content-hub/blog/fine-tuning-tesseract-ocr-for-german-invoices/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Script to draw bounding boxes and text from a Tesseract box file.

The script takes an image TIFF_FILE, draws the text and bounding boxes
of the corresponding BOX_FILE and saves the result as OUT_FILE.

Tesseract box file columns:
    <symbol> <left> <bottom> <right> <top> <page>
"""
from csv import QUOTE_NONE, reader
from functools import lru_cache

import pandas as pd
from PIL import Image, ImageDraw, ImageFont
# TrueType font file and point size used to render the recognised text.
FONT = '/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf'
FONT_SIZE = 26

# Input image, the box file describing it, and the annotated output image.
TIFF_FILE = 'val/val_invoice.tiff'
BOX_FILE = 'val/val_invoice.box'
OUT_FILE = 'val/val_invoice_ocr_deu.tiff'
def draw_tesseract_box(image_path, box_path, out_path=None):
    """Draw the boxes and symbols of a Tesseract box file onto an image.

    Args:
        image_path: Path of the image that the box file describes.
        box_path: Path of the Tesseract box file.
        out_path: Where the annotated image is saved. Falls back to the
            module-level ``OUT_FILE`` when omitted, which is what the
            original two-argument call did.
    """
    if out_path is None:
        out_path = OUT_FILE

    # The context manager releases the source file handle once the
    # annotated image has been written.
    with Image.open(image_path) as image:
        image_height = image.height
        annotated = image
        df_text = get_tesseract_coords(box_path)
        for _, row in df_text.iterrows():
            # Tab characters are removed from the symbol before drawing.
            symbol = row['symbol'].replace('\t', '')
            pillow_coords = convert_coords_to_pillow(row, image_height)
            annotated = draw_bb_text(annotated, pillow_coords, symbol)
        annotated.save(out_path)
def get_tesseract_coords(box_path):
    """Read a Tesseract box file into a DataFrame of boxes and their text.

    Symbols that share identical coordinates and page are concatenated
    into a single ``symbol`` string.

    Args:
        box_path: Path of the box file; each line is
            ``<symbol> <left> <bottom> <right> <top> <page>``.

    Returns:
        DataFrame with the columns ``left``, ``bottom``, ``right``, ``top``,
        ``page`` and ``symbol``. All values are strings as read from the
        file.
    """
    names = ['symbol', 'left', 'bottom', 'right', 'top', 'page']
    box_data = []
    # Box files are UTF-8 text; newline='' is what the csv module expects.
    with open(box_path, encoding='utf-8', newline='') as box_file:
        # QUOTE_NONE: a literal '"' symbol must stay a symbol instead of
        # opening a quoted csv field that swallows the following lines.
        for row in reader(box_file, delimiter=' ', quoting=QUOTE_NONE):
            if not row:
                continue  # blank line, nothing to record
            # tesseract uses two empty strings as space
            if row[:2] == ['', '']:
                row[0] = ' '
                del row[1]
            box_data.append(row)

    df = pd.DataFrame.from_records(box_data, columns=names)
    grouped = df.groupby(['left', 'bottom', 'right', 'top', 'page'])
    df_text = grouped['symbol'].apply(''.join).reset_index(drop=False)
    return df_text
@lru_cache(maxsize=None)
def _load_font(font_path, font_size):
    """Load a TrueType font once per (path, size) pair and reuse it."""
    return ImageFont.truetype(font_path, font_size)


def draw_bb_text(image, coords, text):
    """Draw one bounding box and its text onto an image.

    Args:
        image: Image to draw on; it is modified in place.
        coords: Mapping with the keys ``left``, ``top``, ``right`` and
            ``bottom``.
        text: Text written next to the box; it is also printed to stdout.

    Returns:
        The same ``image`` object that was passed in.
    """
    draw = ImageDraw.Draw(image)
    corners = [
        (coords['left'], coords['top']),
        (coords['right'], coords['bottom']),
    ]
    draw.rectangle(xy=corners, outline="green")

    # The label is offset from the top edge by (top - bottom). With the
    # values produced by convert_coords_to_pillow, top is presumably the
    # smaller number, which lifts the label one box height above the box.
    label_y = coords['top'] + (coords['top'] - coords['bottom'])
    # The font is cached so it is not re-read from disk for every box.
    draw.text(
        (coords['left'], label_y),
        text,
        font=_load_font(FONT, FONT_SIZE),
        fill="red",
    )
    print(text)
    return image
def convert_coords_to_pillow(coords_tess, image_height):
    """Convert Tesseract box-file coordinates to Pillow coordinates.

    The horizontal values are only cast to ``int``. The vertical values are
    cast to ``int`` and subtracted from ``image_height``, which flips the
    y axis.

    Args:
        coords_tess: Mapping with the keys ``left``, ``top``, ``right`` and
            ``bottom``; values may be strings or numbers accepted by
            ``int()``.
        image_height: Height of the image in pixels.

    Returns:
        Dict with the integer keys ``left``, ``top``, ``right``, ``bottom``.
    """
    def flip(y_value):
        # Mirror a y value against the image height.
        return image_height - int(y_value)

    return {
        'left': int(coords_tess['left']),
        'top': flip(coords_tess['top']),
        'right': int(coords_tess['right']),
        'bottom': flip(coords_tess['bottom']),
    }
def main():
    """Annotate TIFF_FILE with the boxes and text listed in BOX_FILE."""
    draw_tesseract_box(image_path=TIFF_FILE, box_path=BOX_FILE)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fine-tune the German model: continue training from lstm_model/deu.lstm,
# using the listed training and evaluation files, for at most 400
# iterations. Output files use the base path output/fine_tuned.
/usr/local/bin/lstmtraining \
  --model_output output/fine_tuned \
  --continue_from lstm_model/deu.lstm \
  --traineddata tesseract/tessdata/best/deu.traineddata \
  --train_listfile train/deu.training_files.txt \
  --eval_listfile eval/deu.training_files.txt \
  --max_iterations 400
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment