Skip to content

Instantly share code, notes, and snippets.

@pmallory
Created November 9, 2016 23:54
Show Gist options
  • Save pmallory/671e7e7398af426404edfd3f485b7fa6 to your computer and use it in GitHub Desktop.
Save pmallory/671e7e7398af426404edfd3f485b7fa6 to your computer and use it in GitHub Desktop.
Search a web page for a representative image (a large, roughly square one). Inspired by the code Reddit uses to pick the images it shows next to headlines.
import io
import sys
import urllib
import urllib.parse

import bs4
import requests
from PIL import Image
def get_image_list(url, timeout=10):
    """Return the absolute URLs of the images referenced by an HTML page.

    The document at ``url`` is downloaded and the ``src`` attribute of every
    ``<img>`` tag is resolved against ``url``.

    Args:
        url: Address of the HTML document to scan.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A list of image URLs in document order. ``<img>`` tags without a
        ``src`` value and images whose path ends in ``.svg`` are left out.
    """
    response = requests.get(url, timeout=timeout)
    soup = bs4.BeautifulSoup(response.content, "html.parser")

    image_urls = []
    for image_tag in soup.find_all('img'):
        image_url = image_tag.get('src')
        if not image_url:
            # urljoin() returns the base URL unchanged for a missing/empty
            # src, which would wrongly add the page itself as an "image".
            continue

        # image_url might be a relative url, urljoin will make a full url if necessary
        full_url = urllib.parse.urljoin(url, image_url)

        # PIL can't handle svgs, so skip them. Look at the path only so that
        # "logo.SVG" and "logo.svg?v=2" are recognised as well.
        path = urllib.parse.urlsplit(full_url).path
        if path.lower().endswith('.svg'):
            continue

        image_urls.append(full_url)
    return image_urls
def image_dimensions(image):
    """Return ``(width, height, area)`` for ``image``.

    ``image`` must expose a ``size`` sequence whose first two items are the
    width and the height; the area is their product.
    """
    size = image.size
    w, h = size[0], size[1]
    return w, h, w * h
def is_square(width, height):
    """Determine if an image is close to square shaped.

    Args:
        width: Image width.
        height: Image height.

    Returns:
        True when the width/height ratio lies strictly between 0.5 and 2.
        Images with a non-positive dimension are never considered square.
    """
    if width <= 0 or height <= 0:
        # Degenerate image; also avoids dividing by a zero height.
        return False
    return 0.5 < width / height < 2
def is_large(area, min_area=5000):
    """Determine if an image is largish.

    Args:
        area: Image area (width * height).
        min_area: Exclusive lower bound an image must exceed to count as
            large. Defaults to 5000.

    Returns:
        True when ``area`` is strictly greater than ``min_area``.
    """
    return area > min_area
def select_best_image(image_urls, timeout=10):
    """Pick the largest roughly-square image from a list of image URLs.

    Each URL is downloaded and opened with PIL to read its dimensions. Only
    images accepted by both ``is_square`` and ``is_large`` are candidates.

    Args:
        image_urls: Iterable of image URLs to inspect.
        timeout: Seconds to wait for each download; passed straight to
            ``requests.get``.

    Returns:
        The URL of the candidate with the greatest area (the first one wins
        on ties), or ``None`` when no image qualifies.
    """
    biggest_image = None
    biggest_size = 0
    for url in image_urls:
        try:
            response = requests.get(url, timeout=timeout)
            # Don't hand an HTTP error page to PIL as if it were image data.
            response.raise_for_status()
            with Image.open(io.BytesIO(response.content)) as im:
                width, height, area = image_dimensions(im)
        except (requests.RequestException, OSError):
            # One unreachable or undecodable image must not abort the whole
            # search; PIL reports unreadable data as an OSError.
            continue

        if is_square(width, height) and is_large(area) and area > biggest_size:
            biggest_image = url
            biggest_size = area
    return biggest_image
if __name__ == '__main__':
    # Command-line entry point: print the best image URL found on the page
    # given as the first argument (prints "None" when nothing qualifies).
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit('usage: {} URL'.format(sys.argv[0]))
    url = sys.argv[1]
    images = get_image_list(url)
    print(select_best_image(images))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment