Skip to content

Instantly share code, notes, and snippets.

@0scarB
Last active September 12, 2023 13:58
Show Gist options
  • Save 0scarB/ed330536fd098ccdca9b7a8b09d9d4a0 to your computer and use it in GitHub Desktop.
Save 0scarB/ed330536fd098ccdca9b7a8b09d9d4a0 to your computer and use it in GitHub Desktop.
Quick and dirty function to extract image URLs from markdown.
from collections.abc import Iterable
import urllib
def find_image_urls_in_markdown(s: str) -> Iterable[str]:
    """Yield the URL of every inline Markdown image (``![alt](url)``) in *s*.

    The input is scanned once, character by character, with a small state
    machine. Only the inline form is recognised: a ``!`` immediately
    followed by ``[alt text]`` immediately followed by ``(url)``.

    Everything between the parentheses is yielded with surrounding
    whitespace stripped; an optional image title is not separated from the
    URL, and the first ``)`` always terminates the URL.

    To percent-encode special characters in a result, pass it through
    ``urllib.parse.quote(url, safe="/:")``.

    Args:
        s: Markdown source text.

    Yields:
        Image URLs in the order they appear in *s*.
    """
    context = ""  # "" means: not currently inside any image construct
    url_chars = []
    for char in s:
        if context == "":
            if char == "!":
                context = "after_exclamation"
        elif context == "after_exclamation":
            if char == "[":
                context = "alt_text"
            elif char != "!":
                # The "!" was ordinary punctuation. Without this reset a
                # later plain link "[text](url)" would be reported as an
                # image. A repeated "!" keeps the state so "!![a](b)" works.
                context = ""
        elif context == "alt_text":
            # The alt text itself is not needed; just skip to its end.
            if char == "]":
                context = "alt_text_end"
        elif context == "alt_text_end":
            if char == "(":
                context = "url"
            elif char == "!":
                # "![a]![b](c)": the "!" may open a new image.
                context = "after_exclamation"
            else:
                context = ""
        elif context == "url":
            if char == ")":
                yield "".join(url_chars).strip()
                url_chars.clear()
                context = ""
            else:
                url_chars.append(char)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment