Skip to content

Instantly share code, notes, and snippets.

@mufidu
Last active August 30, 2024 22:53
Show Gist options
  • Save mufidu/f7b795f844f1ee4dc78e55123d5a398b to your computer and use it in GitHub Desktop.
Save mufidu/f7b795f844f1ee4dc78e55123d5a398b to your computer and use it in GitHub Desktop.
Download all images in markdown files and rename the links to point to the local files
# Download all images referenced by the markdown files in the FOLDER_NAME folder
# and save them to FOLDER_NAME/_attachments folder.
# Then replace remote image paths with local paths.
# Usage: python download_imgs.py
import os
import urllib
import urllib.parse
import urllib.request
import re
import time
import socket
# Change this to the folder name containing your markdown files
FOLDER_NAME = "articles"
# Sub-folder (inside the markdown folder) that receives the downloaded images
ATTACHMENTS_DIR = "_attachments"
# Seconds to wait on a single request before giving up, to avoid hanging
DOWNLOAD_TIMEOUT = 30
# Browser-like headers: some hosts answer 403 to urllib's default User-Agent
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}
# Markdown image syntax: ![alt](url) or ![alt](url "title").
# The alt text is matched lazily so several images on one line stay separate,
# the URL stops at whitespace (so an optional title is not captured) and may
# contain one level of balanced parentheses.
IMAGE_PATTERN = re.compile(
    r'!\[.*?\]\(\s*((?:[^\s()]|\([^\s()]*\))+)'
    r'(?:\s+(?:"[^"]*"|\'[^\']*\'))?\s*\)'
)


def _has_scheme(url):
    """Return True if *url* carries a scheme (http, https, file, ...).

    Scheme-less targets are relative paths, i.e. images that are already
    local (for instance from a previous run), so there is nothing to fetch.
    """
    try:
        return bool(urllib.parse.urlparse(url).scheme)
    except ValueError:
        # Malformed URL: let the download attempt report the error.
        return True


def _download_image(url, dest_dir, timeout=DOWNLOAD_TIMEOUT):
    """Download *url* into *dest_dir* and return the file name used.

    Raises whatever ``urllib`` or the file system raises; the caller
    decides how to report it.
    """
    # Timestamp prefix keeps images that share a basename from clobbering
    # each other.
    name = str(time.time()) + os.path.basename(urllib.parse.urlparse(url).path)
    request = urllib.request.Request(url, headers=REQUEST_HEADERS)
    # The timeout is passed to the request itself so that it applies to every
    # download, including the very first one.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        data = response.read()
    # Write only after the body was read completely, so a failed download
    # does not leave an empty file behind.
    with open(os.path.join(dest_dir, name), 'wb') as out:
        out.write(data)
    return name


def _localize_file(folder, filename, delay):
    """Download the images of one markdown file and rewrite its links."""
    md_path = os.path.join(folder, filename)
    # newline='' keeps the file's own line endings untouched on rewrite.
    with open(md_path, 'r', encoding='utf-8', newline='') as src:
        original = src.read()

    content = original
    attachments = os.path.join(folder, ATTACHMENTS_DIR)
    # dict.fromkeys de-duplicates while keeping order: str.replace below
    # already rewrites every occurrence of a URL in one go.
    for url in dict.fromkeys(IMAGE_PATTERN.findall(original)):
        if not _has_scheme(url):
            continue
        # Create the attachments folder only once there is something to save
        os.makedirs(attachments, exist_ok=True)
        try:
            name = _download_image(url, attachments)
        except Exception as e:
            # Best effort: report the failure and carry on with the next image
            print('Error downloading ' + url + ': ' + str(e))
        else:
            # Replace image path with local path (relative to the markdown file)
            content = content.replace(url, ATTACHMENTS_DIR + '/' + name)
            print('Downloaded ' + url + f' to {folder}/{ATTACHMENTS_DIR}/' + name)
        # Pause between requests to avoid rate limiting, if any
        if delay > 0:
            time.sleep(delay)

    # Leave files without downloaded images untouched
    if content != original:
        with open(md_path, 'w', encoding='utf-8', newline='') as dst:
            dst.write(content)


def main(folder=FOLDER_NAME, delay=1.0):
    """Download the remote images of every ``.md`` file in *folder*.

    Images are saved to ``<folder>/_attachments`` and the links in the
    markdown files are rewritten to point at the local copies.

    Args:
        folder: Folder containing the markdown files.
        delay: Seconds to sleep after each download attempt.
    """
    for filename in sorted(os.listdir(folder)):
        if filename.endswith('.md'):
            _localize_file(folder, filename, delay)


if __name__ == "__main__":
    main()
@mufidu
Copy link
Author

mufidu commented Dec 15, 2022

The script should pre-process the URLs first to deal with unusual links. For example, Insider's image links look like this: {"https://i.insider.com/5699688ce6183e46258b7648":{"contentType":"image/png","aspectRatioW":1575,"aspectRatioH":1181}}.

But it works for normal links.

@boxydog
Copy link

boxydog commented Dec 4, 2023

This was helpful for me, thanks.

@granite83
Copy link

it works perfectly!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment