Created
January 31, 2024 09:26
-
-
Save tm9k1/e5e524869aa7e6d9a442f355dc4f678a to your computer and use it in GitHub Desktop.
Script to scrape bwallpaperhd.com for Microsoft Spotlight wallpapers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
import os | |
from bs4 import BeautifulSoup | |
import requests | |
import concurrent.futures | |
from tqdm import tqdm | |
# change these two to whatever you like. | |
# NOTE: Remember that the next run would again download all wallpapers if you move the wallpapers away from wallpaper_save_path | |
wallpaper_save_path = '/mnt/hdd/baadal/tm9k1/images/wallpapers/horizontal/spotlight' | |
webpages_save_path = '/tmp/webpages/' | |
# This is the link I use to get the wallpapers. This site is updated daily, with latest wallpapers at the top. | |
sitemap_url = 'https://www.bwallpaperhd.com/sitemap.html' | |
sitemap_filename = webpages_save_path + sitemap_url.split('/')[-1] | |
# download a file at download_url to file_name | |
def download_a_file(download_url: str, file_name: str): | |
incomplete_file_name = file_name[:file_name.rfind('/')+1] + '0INCOMPLETE_'+ file_name[file_name.rfind('/')+1:] | |
# remove any stale download of the file | |
if os.path.exists(incomplete_file_name): | |
os.remove(incomplete_file_name) | |
# print(download_url) | |
# download the file | |
response = requests.get(download_url, stream=True) | |
if response.status_code == 200: | |
# write it as 0INCOMPLETE_filename so we can handle any broken files later | |
with open(incomplete_file_name, 'wb') as f: | |
for chunk in response: | |
f.write(chunk) | |
# rename the file to filename if we received the whole file | |
os.rename(incomplete_file_name, file_name) | |
# returns a list of tuples as [(Wallpaper Name, http://wallpaper.webpage/url.html),...] | |
def extract_links_from_sitemap(sitemap_url: str, sitemap_filename: str): | |
# remove any stale download of the sitemap | |
if os.path.exists(sitemap_filename): | |
os.remove(sitemap_filename) | |
# download the sitemap | |
download_a_file(sitemap_url, sitemap_filename) | |
soup = BeautifulSoup(open(sitemap_filename, encoding='utf-8'), "html.parser") | |
# print(soup.prettify()) | |
divs = soup.find_all('div', attrs={'id':'content'}) | |
list_of_wallpprs = [] | |
for div in divs: | |
h3 = div.find_all('h3', string='Latest WallPapers') | |
if h3: | |
anchor_tags = div.find_all('a') | |
for anchor_tag in anchor_tags: | |
wallpaper_name = anchor_tag.contents[0] | |
wallpaper_webpage_url = anchor_tag.get('href') | |
list_of_wallpprs.append((wallpaper_name, wallpaper_webpage_url)) | |
return list_of_wallpprs | |
# expects the tuples from list_of_wallpapers as [(Wallpaper Name, http://wallpaper.webpage/url.html),...] | |
def download_a_wallpaper(tup: tuple): | |
wallpaper_name, wallpaper_webpage_url = tup | |
wallpaper_webpage_filename = wallpaper_webpage_url.split('/')[-1] | |
wallpaper_filename = wallpaper_webpage_filename[:wallpaper_webpage_filename.find('.')] | |
wallpaper_filename = wallpaper_filename + '.jpg' # hopefully they won't ditch jpg anytime soon! | |
if not os.path.exists(wallpaper_save_path + wallpaper_filename): | |
if not os.path.exists(webpages_save_path + wallpaper_webpage_filename): | |
download_a_file(wallpaper_webpage_url, webpages_save_path + wallpaper_webpage_filename) | |
soup = BeautifulSoup(open(webpages_save_path + wallpaper_webpage_filename, encoding='utf-8'), "html.parser") | |
divs = soup.find_all('div', attrs={'class':'download'}) | |
for div in divs: | |
anchors = div.find_all('a') | |
for anchor in anchors: | |
if anchor.contents[0].strip().lower() == 'original': | |
wallpaper_url = anchor.get('href') | |
# print(wallpaper_name + ': ' + wallpaper_url) | |
download_a_file(wallpaper_url, wallpaper_save_path + wallpaper_filename) | |
os.remove(webpages_save_path + wallpaper_webpage_filename) | |
return wallpaper_name | |
## MAIN ACTION IS HERE | |
# make the necessary directories in case they don't exist | |
os.makedirs(wallpaper_save_path, exist_ok=True) | |
os.makedirs(webpages_save_path, exist_ok=True) | |
if not wallpaper_save_path.endswith('/'): | |
wallpaper_save_path += '/' | |
if not webpages_save_path.endswith('/'): | |
webpages_save_path += '/' | |
# extract the list of wallpapers with links to each wallpaper's own webpage | |
list_of_wallpapers = extract_links_from_sitemap(sitemap_url, sitemap_filename) | |
# get the total count of wallpapers we can expect by the end of processing | |
total_wallpapers = len(list_of_wallpapers) | |
# Create a progress bar so user knows what's going on | |
progress_bar = tqdm(range(total_wallpapers), desc="Downloading Wallpapers",colour="blue", bar_format='{l_bar}{bar}| [{n_fmt}/{total_fmt}] [Elapsed: {elapsed}]') | |
# Start distributing work across multiple threads. 20 seems to be a safe count for now. | |
with concurrent.futures.ThreadPoolExecutor(max_workers=10, thread_name_prefix='Wallpaper_Downloader_') as executor: | |
future_to_url = [ executor.submit(download_a_wallpaper, wallpaper_entry) for wallpaper_entry in list_of_wallpapers ] | |
for future in concurrent.futures.as_completed(future_to_url): | |
progress_bar.n += 1 | |
progress_bar.refresh() | |
progress_bar.close() | |
# clean up the downloaded sitemap webpage | |
os.remove(sitemap_filename) | |
print('Done! Enjoy your new collection! =)') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment