Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Last active August 12, 2024 22:28
Show Gist options
  • Save pszemraj/3850d3e3e2f0f4cd0fcb85bdc0589d77 to your computer and use it in GitHub Desktop.
Save pszemraj/3850d3e3e2f0f4cd0fcb85bdc0589d77 to your computer and use it in GitHub Desktop.
Run OCR on a directory of images with an OpenAI vision model, writing one markdown file per image.
import base64
import mimetypes
import os
from pathlib import Path

import fire
from joblib import Memory
from openai import OpenAI
from tqdm.auto import tqdm
# Set up joblib caching
# Disk-backed memoization: process_image results are cached under
# ~/.cache/.joblib so re-running on the same image/prompt/model does
# not re-call (and re-bill) the OpenAI API.
cache_dir = Path.home() / ".cache" / ".joblib"
cache_dir.mkdir(parents=True, exist_ok=True)
memory = Memory(cache_dir, verbose=0)
# Instruction text sent to the model alongside each image.
PROMPT_TEXT = """Please rewrite the text in the image into well-formatted, clean markdown. You do not need to use a markdown code block, just make sure your output is markdown only.
Try to reproduce the essentials of figures in the image. For very simple figures that can be concisely represented with ASCII symbols, use ASCII. For plots or complicated figures, instead write a description of the figure and its purpose. If the image is blank, simply return [NO_CONTENT_FOUND]"""
# assumes OpenAI API key in your env vars as "OPENAI_API_KEY"
# Retries/timeout are generous because vision requests can be slow.
client = OpenAI(max_retries=10, timeout=180)
def encode_image(image_path):
    """Return the contents of the file at *image_path* as a base64 string."""
    raw_bytes = Path(image_path).read_bytes()
    return base64.b64encode(raw_bytes).decode("utf-8")
@memory.cache
def process_image(
    image_path,
    prompt: str = PROMPT_TEXT,
    model_name: str = "gpt-4o-mini",
):
    """Transcribe a single image to markdown via the OpenAI chat API.

    Results are memoized on disk by joblib (keyed on all arguments), so
    repeat calls with the same inputs do not hit the API again.

    :param image_path: path to the image file to transcribe
    :param str prompt: instruction text sent alongside the image
    :param str model_name: OpenAI model to use, defaults to "gpt-4o-mini"
    :return: the model's markdown transcription of the image
    """
    base64_image = encode_image(image_path)
    # Derive the real MIME type from the file extension. The previous
    # version hard-coded image/jpeg, mislabeling the png/gif/webp files
    # that process_directory also accepts.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"  # conservative fallback for unknown suffixes
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}",
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
        max_tokens=4000,
        temperature=0,  # deterministic transcription
    )
    return response.choices[0].message.content
def process_directory(
    input_dir,
    out_dir=None,
    model_name: str = "gpt-4o-mini",
):
    """Transcribe all images in a directory to markdown files.

    Writes one ``<image stem>.md`` per image into the output directory;
    when more than one image is processed, also writes an aggregate
    document concatenating every transcription.

    :param input_dir: directory containing the images to transcribe
    :param out_dir: output directory; defaults to a sibling of input_dir
        named ``<input_dir>_markdown-<model_name>``
    :param str model_name: OpenAI model name, defaults to "gpt-4o-mini"
    :raises ValueError: if input_dir does not exist
    """
    input_path = Path(input_dir)
    # Raise instead of assert: asserts are stripped under `python -O`,
    # and the documented contract is ValueError.
    if not input_path.exists():
        raise ValueError(f"Input path {input_path} does not exist")
    if out_dir is None:
        out_path = input_path.parent / f"{input_path.name}_markdown-{model_name}"
    else:
        out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)
    # set membership test; the redundant per-file re-check inside the
    # loop is removed since image_files is already filtered here
    image_extensions = {".png", ".jpg", ".jpeg", ".gif", ".webp"}
    image_files = [
        f for f in input_path.iterdir() if f.suffix.lower() in image_extensions
    ]
    all_markdown = []
    for image_file in tqdm(image_files, desc="Processing images"):
        markdown_content = process_image(str(image_file), model_name=model_name)
        output_file = out_path / f"{image_file.stem}.md"
        output_file.write_text(markdown_content.strip(), encoding="utf-8")
        all_markdown.append(markdown_content)
    # write the overall doc to a file (only worthwhile for >1 page)
    if len(all_markdown) > 1:
        aggregate_file = out_path / f"aggregate_document-{input_path.name}.md"
        aggregate_file.write_text("\n\n".join(all_markdown), encoding="utf-8")
    print(f"saved to:\n\t{str(out_path)}")
# CLI entry point: python-fire turns process_directory's parameters into
# command-line flags, e.g. `python ocr.py --input_dir ./pages`.
if __name__ == "__main__":
    fire.Fire(process_directory)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment