Skip to content

Instantly share code, notes, and snippets.

@jricardo27
Created July 29, 2015 15:22
Show Gist options
  • Save jricardo27/3894e09be31069d10ae5 to your computer and use it in GitHub Desktop.
Save jricardo27/3894e09be31069d10ae5 to your computer and use it in GitHub Desktop.
Convert Wikitravel page to epub
import glob
import os.path
import subprocess

from bs4 import BeautifulSoup
# Selectors for the elements stripped from every page by remove_tags().
# A plain string is passed to ``find_all`` as a tag name; a dict is expanded
# into ``find_all`` keyword arguments (``class_`` matches the CSS class).
find_this = [
    # Elements dropped wholesale by tag name.
    'meta', 'script', 'link', 'iframe',
    # Attribute filters -- presumably advertising containers (TODO confirm
    # against the downloaded pages).
    {'class_': 'ads'},
    {'id': 'central_ad'},
    {'bgcolor': 'white'},
    {'class_': 'GoogleActiveViewClass'},
    # Attribute filters -- these look like MediaWiki's category links and
    # per-section edit links (TODO confirm).
    {'class_': 'catlinks'},
    {'class_': 'mw-editsection'},
]
def remove_tags(soup):
    """
    Strip every element matched by ``find_this`` out of ``soup``.

    All selectors are evaluated first and the matches are detached from
    the tree afterwards. ``soup`` is modified in place and the same
    object is returned so the call can be chained.
    """
    matched = []
    for selector in find_this:
        if not isinstance(selector, dict):
            # Bare string: look the elements up by tag name.
            matched.extend(soup.find_all(selector))
        else:
            # Dict: its items become keyword filters for find_all.
            matched.extend(soup.find_all(**selector))

    for element in matched:
        element.extract()

    return soup
def save(soup, filename):
    """
    Save a soup object to an html file

    The document is pretty-printed, encoded as UTF-8 and written to
    ``filename``, replacing any existing content. If ``soup`` is not a
    ``BeautifulSoup`` instance an error line is printed and nothing is
    written.
    """
    if not isinstance(soup, BeautifulSoup):
        print('Error {} is not a soup'.format(soup))
        return

    # Render before opening the file so a failure here cannot leave a
    # truncated file behind.
    payload = soup.prettify().encode('utf8')

    # Binary mode: the payload is already encoded bytes, which a text-mode
    # handle rejects with TypeError on Python 3. The ``with`` block closes
    # the handle even if the write fails.
    with open(filename, 'wb') as outfile:
        outfile.write(payload)
def convert(filename):
    """
    Convert an html file to epub using calibre

    Runs calibre's ``ebook-convert`` with ``filename`` as the input and
    ``.epub`` as the output argument; according to the calibre
    documentation an output of the form ``.EXT`` makes the tool derive
    the output name from the input file name.

    The exit status of the tool is not inspected. If the executable
    cannot be started an error line is printed instead of raising.
    """
    # Pass an argument list and bypass the shell, so paths containing
    # spaces or shell metacharacters are handed over verbatim instead of
    # being split or interpreted as commands.
    try:
        subprocess.call(['ebook-convert', filename, '.epub'])
    except OSError as err:
        # Typically: ebook-convert is not installed or not on PATH.
        print('Error running ebook-convert: {}'.format(err))
# Directory scanned for downloaded pages: every ``*.html`` file directly
# inside it is cleaned and then converted to epub.
mydir = '/media/d/online_storage/dropbox/temp/cities'
htmlfiles = glob.glob(os.path.join(mydir, '*.html'))
for filename in htmlfiles:
    # Read raw bytes so BeautifulSoup detects the document encoding itself
    # rather than relying on the platform default, and close the handle as
    # soon as parsing is done instead of leaking it.
    with open(filename, 'rb') as infile:
        soup = BeautifulSoup(infile, 'lxml')
    # NOTE: the cleaned markup overwrites the original html file.
    save(remove_tags(soup), filename)
    convert(filename)
    print(filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment