jricardo27 · July 29, 2015 15:22
diff --git a/gistfile1.txt b/gistfile1.txt
 import glob
 import os.path
 from bs4 import BeautifulSoup

 # Selectors for the markup removed by ``remove_tags``: plain strings are
 # passed to ``find_all`` positionally, dicts are expanded into keyword
 # arguments for ``find_all``.
 find_this = [
    "meta",
    "script",
    "link",
    "iframe",
    dict(class_="ads"),
    dict(id="central_ad"),
    dict(bgcolor="white"),
    dict(class_="GoogleActiveViewClass"),
    dict(class_="catlinks"),
    dict(class_="mw-editsection"),
 ]

 def remove_tags(soup):
    """
    Strip every element matched by ``find_this`` out of ``soup``.

    Dict entries are expanded into keyword arguments for ``find_all``;
    any other entry is passed to ``find_all`` as its single positional
    argument.  All matches are gathered first and detached afterwards,
    and the same ``soup`` object is returned.
    """
    unwanted = []
    for selector in find_this:
        if not isinstance(selector, dict):
            unwanted.extend(soup.find_all(selector))
        else:
            unwanted.extend(soup.find_all(**selector))

    for element in unwanted:
        element.extract()

    return soup

 def save(soup, filename):
    """
    Save a soup object to an html file

    The prettified markup is encoded as UTF-8 and written to ``filename``,
    replacing any existing content.  If ``soup`` is not a ``BeautifulSoup``
    instance nothing is written and an error message is printed instead.
    """
    if not isinstance(soup, BeautifulSoup):
        print('Error {} is not a soup'.format(soup))
        return

    # The payload is already encoded to bytes, so the file must be opened in
    # binary mode; a text-mode handle rejects bytes on Python 3.  The ``with``
    # block closes the handle even if the write fails.
    with open(filename, 'wb') as outfile:
        outfile.write(soup.prettify().encode('utf8'))

 def convert(filename):
    """
    Convert an html file to epub using calibre

    Runs ``ebook-convert <filename> .epub``.  The arguments are passed as a
    list without going through a shell, so a path containing spaces or shell
    metacharacters reaches the tool as one literal argument.  The exit
    status is not inspected; if the executable cannot be started a message
    is printed and the function returns normally.
    """
    # Imported locally so this function brings its own dependency into scope.
    import subprocess

    try:
        subprocess.call(['ebook-convert', filename, '.epub'])
    except OSError as error:
        # Keep the run going when the tool is missing or cannot be launched.
        print('Error running ebook-convert on {}: {}'.format(filename, error))
    
        
 mydir = '/media/d/online_storage/dropbox/temp/cities'
 htmlfiles = glob.glob(os.path.join(mydir, '*.html'))

 # Clean every html file in place, then convert the cleaned file to epub.
 for filename in htmlfiles:
    # Read as bytes so BeautifulSoup detects the document encoding itself,
    # and close the input before ``save`` overwrites the same path.
    with open(filename, 'rb') as infile:
        soup = BeautifulSoup(infile, 'lxml')
    save(remove_tags(soup), filename)
    convert(filename)
    print(filename)
	import glob
	import os.path
	from bs4 import BeautifulSoup

	# Selectors for the markup removed by ``remove_tags``: plain strings are
	# passed to ``find_all`` positionally, dicts are expanded into keyword
	# arguments for ``find_all``.
	find_this = [
	    "meta",
	    "script",
	    "link",
	    "iframe",
	    dict(class_="ads"),
	    dict(id="central_ad"),
	    dict(bgcolor="white"),
	    dict(class_="GoogleActiveViewClass"),
	    dict(class_="catlinks"),
	    dict(class_="mw-editsection"),
	]

	def remove_tags(soup):
	    """
	    Extract undesired tags

	    Dict entries of ``find_this`` are expanded into keyword arguments
	    for ``find_all``; any other entry is passed to ``find_all`` as its
	    single positional argument.  All matches are gathered first and
	    detached afterwards, and the same ``soup`` object is returned.
	    """
	    all_tags = []
	    for find_expr in find_this:
	        if isinstance(find_expr, dict):
	            tags = soup.find_all(**find_expr)
	        else:
	            tags = soup.find_all(find_expr)
	        all_tags.extend(tags)

	    for tag in all_tags:
	        tag.extract()

	    return soup

	def save(soup, filename):
	    """
	    Save a soup object to an html file

	    The prettified markup is encoded as UTF-8 and written to ``filename``,
	    replacing any existing content.  If ``soup`` is not a ``BeautifulSoup``
	    instance nothing is written and an error message is printed instead.
	    """
	    if not isinstance(soup, BeautifulSoup):
	        print('Error {} is not a soup'.format(soup))
	        return

	    # The payload is already encoded to bytes, so the file must be opened
	    # in binary mode; a text-mode handle rejects bytes on Python 3.  The
	    # ``with`` block closes the handle even if the write fails.
	    with open(filename, 'wb') as outfile:
	        outfile.write(soup.prettify().encode('utf8'))

	def convert(filename):
	    """
	    Convert an html file to epub using calibre

	    Runs ``ebook-convert <filename> .epub``.  The arguments are passed as
	    a list without going through a shell, so a path containing spaces or
	    shell metacharacters reaches the tool as one literal argument.  The
	    exit status is not inspected; if the executable cannot be started a
	    message is printed and the function returns normally.
	    """
	    # Imported locally so this function brings its own dependency into scope.
	    import subprocess

	    try:
	        subprocess.call(['ebook-convert', filename, '.epub'])
	    except OSError as error:
	        # Keep the run going when the tool is missing or cannot be launched.
	        print('Error running ebook-convert on {}: {}'.format(filename, error))


	mydir = '/media/d/online_storage/dropbox/temp/cities'
	htmlfiles = glob.glob(os.path.join(mydir, '*.html'))

	# Clean every html file in place, then convert the cleaned file to epub.
	for filename in htmlfiles:
	    # Read as bytes so BeautifulSoup detects the document encoding itself,
	    # and close the input before ``save`` overwrites the same path.
	    with open(filename, 'rb') as infile:
	        soup = BeautifulSoup(infile, 'lxml')
	    save(remove_tags(soup), filename)
	    convert(filename)
	    print(filename)