Last active
August 12, 2024 22:28
-
-
Save pszemraj/3850d3e3e2f0f4cd0fcb85bdc0589d77 to your computer and use it in GitHub Desktop.
Run OCR on a directory of images with an OpenAI vision model, writing one markdown file per image.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64
import mimetypes
import os
from pathlib import Path

import fire
from joblib import Memory
from openai import OpenAI
from tqdm.auto import tqdm
# Set up joblib caching: transcription results are persisted on disk under
# ~/.cache/.joblib so repeated runs skip already-processed images.
cache_dir = Path.home() / ".cache" / ".joblib"
cache_dir.mkdir(parents=True, exist_ok=True)
memory = Memory(cache_dir, verbose=0)

# Instruction text sent alongside every image, asking the model to emit
# markdown only (or the [NO_CONTENT_FOUND] sentinel for blank pages).
PROMPT_TEXT = """Please rewrite the text in the image into well-formatted, clean markdown. You do not need to use a markdown code block, just make sure your output is markdown only.
Try to reproduce the essentials of figures in the image. For very simple figures that can be concisely represented with ASCII symbols, use ASCII. For plots or complicated figures, instead write a description of the figure and its purpose. If the image is blank, simply return [NO_CONTENT_FOUND]"""

# assumes OpenAI API key in your env vars as "OPENAI_API_KEY"
client = OpenAI(max_retries=10, timeout=180)
def encode_image(image_path):
    """Return the contents of *image_path* as a base64 string (ASCII-safe)."""
    raw_bytes = Path(image_path).read_bytes()
    return base64.b64encode(raw_bytes).decode("utf-8")
@memory.cache
def process_image(
    image_path,
    prompt: str = PROMPT_TEXT,
    model_name: str = "gpt-4o-mini",
):
    """Transcribe a single image to markdown with an OpenAI vision model.

    Results are disk-cached by joblib (keyed on the arguments), so repeated
    calls with the same image/prompt/model do not re-hit the API.

    :param str image_path: path to the image file to transcribe
    :param str prompt: instruction text sent together with the image
    :param str model_name: OpenAI model name, defaults to "gpt-4o-mini"
    :return str: the model's markdown transcription of the image
    """
    base64_image = encode_image(image_path)
    # Fix: advertise the file's real MIME type in the data URL. The original
    # hardcoded image/jpeg even though PNG/GIF/WebP files are accepted upstream.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if mime_type is None:
        mime_type = "image/jpeg"  # fallback for unrecognized extensions
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}",
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
        max_tokens=4000,
        temperature=0,
    )
    return response.choices[0].message.content
def process_directory(
    input_dir,
    out_dir=None,
    model_name: str = "gpt-4o-mini",
):
    """Transcribe every image in a directory to markdown.

    Writes one ``<stem>.md`` per image into the output directory; when more
    than one image is processed, also writes an aggregate document joining
    all transcriptions in sorted filename order.

    :param str|Path input_dir: directory containing the images
    :param str|Path out_dir: output directory, defaults to a sibling of
        input_dir named "<input>_markdown-<model_name>"
    :param str model_name: OpenAI model name, defaults to "gpt-4o-mini"
    :raises FileNotFoundError: if input_dir does not exist
    """
    input_path = Path(input_dir)
    # Fix: raise a real exception instead of assert (asserts vanish under -O).
    if not input_path.exists():
        raise FileNotFoundError(f"Input path {input_path} does not exist")
    if out_dir is None:
        out_path = input_path.parent / f"{input_path.name}_markdown-{model_name}"
    else:
        out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    image_extensions = {".png", ".jpg", ".jpeg", ".gif", ".webp"}
    # Sort for a deterministic processing/aggregation order (iterdir() order
    # is filesystem-dependent, which made the aggregate document order random).
    image_files = sorted(
        f for f in input_path.iterdir() if f.suffix.lower() in image_extensions
    )

    all_markdown = []
    for image_file in tqdm(image_files, desc="Processing images"):
        # (redundant per-file suffix re-check removed — list is pre-filtered)
        markdown_content = process_image(str(image_file), model_name=model_name)
        output_file = out_path / f"{image_file.stem}.md"
        output_file.write_text(markdown_content.strip(), encoding="utf-8")
        all_markdown.append(markdown_content)

    # write the overall doc to a file
    if len(all_markdown) > 1:
        aggregate_file = out_path / f"aggregate_document-{input_path.name}.md"
        aggregate_file.write_text("\n\n".join(all_markdown), encoding="utf-8")
    print(f"saved to:\n\t{str(out_path)}")
if __name__ == "__main__":
    # Expose process_directory as a command-line interface via python-fire.
    fire.Fire(process_directory)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment