Created
July 17, 2018 01:02
-
-
Save Bilio/81074a856f7d771b4d0846a87021fea5 to your computer and use it in GitHub Desktop.
Clean HTML with Python's lxml.html Cleaner
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs | |
import sys | |
from lxml import etree | |
from lxml.html.clean import Cleaner | |
def sanitize(dirty_html):
    """Run *dirty_html* through an lxml ``Cleaner`` and return the result.

    The cleaner is configured with every boolean filter named below switched
    on, with ``safe_attrs_only`` enabled against a custom ``safe_attrs`` set,
    and with ``span``, ``font`` and ``div`` passed as ``remove_tags``.
    """
    # Cleaner switches that are all turned on.
    enabled_filters = (
        'page_structure',
        'meta',
        'embedded',
        'links',
        'style',
        'processing_instructions',
        'inline_style',
        'scripts',
        'javascript',
        'comments',
        'frames',
        'forms',
        'annoying_tags',
        'remove_unknown_tags',
        'safe_attrs_only',
    )
    options = dict.fromkeys(enabled_filters, True)

    # Attribute names handed to the cleaner as its safe set.
    options['safe_attrs'] = frozenset(
        ['src', 'color', 'href', 'title', 'class', 'name', 'id'])
    # Tag names handed to the cleaner as ``remove_tags``.
    options['remove_tags'] = ('span', 'font', 'div')

    scrubber = Cleaner(**options)
    return scrubber.clean_html(dirty_html)
def to_html(element):
    """Serialize *element* via ``etree.tostring``, pretty-printed, as UTF-8."""
    serialized = etree.tostring(
        element,
        encoding='utf-8',
        pretty_print=True,
    )
    return serialized
if __name__ == '__main__':
    # Command-line entry point: read the file named by the first argument,
    # sanitize its contents, and write the result to "<input>.new.html".
    if len(sys.argv) < 2:
        # Exit with a readable usage line (status 1) instead of letting
        # sys.argv[1] raise an IndexError traceback.
        sys.exit('usage: %s INPUT_HTML_FILE' % sys.argv[0])

    in_path = sys.argv[1]
    out_path = in_path + '.new.html'

    # >>>>>>>>>>>>>>>>>>> SET INPUT FILE ENCODING HERE <<<<<<<<<<<<<<<<<<<<<
    with codecs.open(in_path, 'rb', 'cp1252') as fin:
        sys.stderr.write('sanitizing input...')
        cln_html = sanitize(fin.read())
        sys.stderr.write('Done\n')

    # The cleaned markup is always written out as UTF-8.
    with codecs.open(out_path, 'wb', 'utf-8') as fout:
        sys.stderr.write('writing file...')
        fout.write(cln_html)
        sys.stderr.write('Done\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment