Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Last active August 12, 2024 22:28
Show Gist options
  • Save pszemraj/3850d3e3e2f0f4cd0fcb85bdc0589d77 to your computer and use it in GitHub Desktop.
Save pszemraj/3850d3e3e2f0f4cd0fcb85bdc0589d77 to your computer and use it in GitHub Desktop.
Run OCR on a directory of images with an OpenAI vision model, writing one markdown file per image.
import base64
import mimetypes
import os
from pathlib import Path

import fire
from joblib import Memory
from openai import OpenAI
from tqdm.auto import tqdm
# Set up joblib caching
# Disk-backed memoization: process_image results are cached under
# ~/.cache/.joblib so re-running on the same image/prompt/model does
# not re-call (and re-bill) the OpenAI API.
cache_dir = Path.home() / ".cache" / ".joblib"
cache_dir.mkdir(parents=True, exist_ok=True)
memory = Memory(cache_dir, verbose=0)
# Instruction text sent to the model alongside each image.
PROMPT_TEXT = """Please rewrite the text in the image into well-formatted, clean markdown. You do not need to use a markdown code block, just make sure your output is markdown only.
Try to reproduce the essentials of figures in the image. For very simple figures that can be concisely represented with ASCII symbols, use ASCII. For plots or complicated figures, instead write a description of the figure and its purpose. If the image is blank, simply return [NO_CONTENT_FOUND]"""
# assumes OpenAI API key in your env vars as "OPENAI_API_KEY"
# Retries/timeout are generous because vision requests can be slow.
client = OpenAI(max_retries=10, timeout=180)
def encode_image(image_path):
    """Return the contents of the file at *image_path* as a base64 string."""
    raw_bytes = Path(image_path).read_bytes()
    return base64.b64encode(raw_bytes).decode("utf-8")
@memory.cache
def process_image(
    image_path,
    prompt: str = PROMPT_TEXT,
    model_name: str = "gpt-4o-mini",
):
    """Transcribe a single image to markdown via the OpenAI chat API.

    Results are memoized on disk by joblib (keyed on all arguments), so
    repeat calls with the same inputs do not hit the API again.

    :param image_path: path to the image file to transcribe
    :param str prompt: instruction text sent alongside the image
    :param str model_name: OpenAI model to use, defaults to "gpt-4o-mini"
    :return: the model's markdown transcription of the image
    """
    base64_image = encode_image(image_path)
    # Derive the real MIME type from the file extension. The previous
    # version hard-coded image/jpeg, mislabeling the png/gif/webp files
    # that process_directory also accepts.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"  # conservative fallback for unknown suffixes
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}",
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
        max_tokens=4000,
        temperature=0,  # deterministic transcription
    )
    return response.choices[0].message.content
def process_directory(
    input_dir,
    out_dir=None,
    model_name: str = "gpt-4o-mini",
):
    """Transcribe all images in a directory to markdown files.

    Writes one ``<image stem>.md`` per image into the output directory;
    when more than one image is processed, also writes an aggregate
    document concatenating every transcription.

    :param input_dir: directory containing the images to transcribe
    :param out_dir: output directory; defaults to a sibling of input_dir
        named ``<input_dir>_markdown-<model_name>``
    :param str model_name: OpenAI model name, defaults to "gpt-4o-mini"
    :raises ValueError: if input_dir does not exist
    """
    input_path = Path(input_dir)
    # Raise instead of assert: asserts are stripped under `python -O`,
    # and the documented contract is ValueError.
    if not input_path.exists():
        raise ValueError(f"Input path {input_path} does not exist")
    if out_dir is None:
        out_path = input_path.parent / f"{input_path.name}_markdown-{model_name}"
    else:
        out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)
    # set membership test; the redundant per-file re-check inside the
    # loop is removed since image_files is already filtered here
    image_extensions = {".png", ".jpg", ".jpeg", ".gif", ".webp"}
    image_files = [
        f for f in input_path.iterdir() if f.suffix.lower() in image_extensions
    ]
    all_markdown = []
    for image_file in tqdm(image_files, desc="Processing images"):
        markdown_content = process_image(str(image_file), model_name=model_name)
        output_file = out_path / f"{image_file.stem}.md"
        output_file.write_text(markdown_content.strip(), encoding="utf-8")
        all_markdown.append(markdown_content)
    # write the overall doc to a file (only worthwhile for >1 page)
    if len(all_markdown) > 1:
        aggregate_file = out_path / f"aggregate_document-{input_path.name}.md"
        aggregate_file.write_text("\n\n".join(all_markdown), encoding="utf-8")
    print(f"saved to:\n\t{str(out_path)}")
# CLI entry point: python-fire turns process_directory's parameters into
# command-line flags, e.g. `python ocr.py --input_dir ./pages`.
if __name__ == "__main__":
    fire.Fire(process_directory)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment