Skip to content

Instantly share code, notes, and snippets.

Forked from racitup/
Last active April 18, 2018 13:04
Show Gist options
  • Save jrial/bed3cd6fa2806af3a048a74208963d63 to your computer and use it in GitHub Desktop.
Save jrial/bed3cd6fa2806af3a048a74208963d63 to your computer and use it in GitHub Desktop.
Extract text from HTML in Python using BeautifulSoup4
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
from bs4 import BeautifulSoup, NavigableString, Tag
# Tags whose text content is never displayed (list taken from the W3C specs).
# 'noscript' is included because we behave as if scripting is enabled.
_NON_DISPLAYED_TAGS = frozenset((
    'area', 'base', 'basefont', 'datalist', 'head', 'link',
    'meta', 'noembed', 'noframes', 'param', 'rp', 'script',
    'source', 'style', 'template', 'track', 'title', 'noscript',
))

# h1, h2 etc...
_HEADING_RE = re.compile(r'^h\d+$')

# A link glued directly after one of these is not separated by a space.
_OPENING_PUNCTUATION = """([{"'`"""

# Text starting with one of these is glued directly onto a preceding link.
_CLOSING_PUNCTUATION = """,.!?;:)]}"'`"""


def _is_hidden(element):
    """Return True if *element* has an ancestor tag that is not displayed."""
    for parent_tag in element.parents:
        # Exact type check on purpose: subclasses of Tag are not wanted here.
        # pylint: disable=C0123
        if type(parent_tag) is not Tag:
            continue
        if (parent_tag.name in _NON_DISPLAYED_TAGS or
                parent_tag.has_attr('hidden') or
                (parent_tag.name == 'input' and
                 parent_tag.get('type') == 'hidden')):
            return True
    return False


def html_to_text(html):
    """Create a formatted text email message from a rendered html template.

    Walks every descendant of the document body, keeps the visible text
    nodes, and joins them with newlines. Whitespace inside each text node is
    collapsed to single spaces. Text inside ``<a>`` tags is rewritten as a
    Markdown link ``[text](href)`` and kept on the same line as the text
    around it. The first text emitted after a ``<p>`` or heading tag gets an
    extra leading newline, except at the very beginning of the document.

    Args:
        html: The HTML markup to convert, as a string.

    Returns:
        The extracted text as a single string; empty if nothing is visible.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Ignore anything in head. Fragments without a <body> are walked from the
    # root instead; head-only tags are still filtered out by _is_hidden().
    body = soup.body if soup.body is not None else soup
    text = []
    add_newline = False

    for element in body.descendants:
        # We use type and not isinstance since comments, cdata, etc are
        # subclasses that we don't want
        # pylint: disable=C0123
        if type(element) is Tag:
            if element.name == 'p' or _HEADING_RE.match(element.name):
                # Mark as "requires leading newline": paragraphs, headings
                add_newline = True
            continue
        if type(element) is not NavigableString or _is_hidden(element):
            continue

        # remove any multiple and leading/trailing whitespace
        string = ' '.join(element.string.split())
        if not string:
            continue

        parent = element.parent
        if parent.name == 'a':
            href = parent.get('href')
            if href is not None:
                # replace link text with the Markdown link; the collapsed
                # text keeps multi-line link labels on a single line
                string = u'[{}]({})'.format(string, href)
            # concatenate with any non-empty immediately previous string
            before = parent.previous_sibling
            if (text and type(before) is NavigableString and
                    before.string.strip()):
                if text[-1][-1] in _OPENING_PUNCTUATION:
                    text[-1] += string
                else:
                    text[-1] = text[-1] + ' ' + string
                continue
        elif text and getattr(element.previous_sibling, 'name', None) == 'a':
            # Text right after a link continues the link's line.
            # Don't put spaces before punctuation and similar stuff
            if string[0] in _CLOSING_PUNCTUATION:
                text[-1] += string
            else:
                text[-1] = text[-1] + ' ' + string
            continue

        if add_newline and text:
            # Add extra paragraph/heading formatting newline, except
            # at the very beginning of the document.
            string = '\n' + string
        # The pending paragraph/heading marker is consumed by this entry.
        add_newline = False
        text.append(string)

    return '\n'.join(text)
if __name__ == '__main__':
html = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "">
<html lang="en">
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Hello World!</title>
<body style="margin:0; padding:0; background-color:#F2F2F2;">
<!--[if !mso]><!-- -->
<img style="min-width:640px; display:block; margin:0; padding:0" class="mobileOff" width="640" height="1" src="/static/spacer.gif">
<table width="100%" border="0" cellpadding="0" cellspacing="0" bgcolor="#F2F2F2">
<td align="center" class="mobile" style="font-family:arial, sans-serif; font-size:20px; line-height:26px; font-weight:bold;">
This is some title text.
<script>This is a script</script>
<td align="center" class="mobile" style="font-family:arial, sans-serif; font-size:20px; line-height:26px; font-weight:bold;">
<p> Paragraph without
link <br> But with a
line break </p>
<td align="center" class="mobile" style="font-family:arial, sans-serif; font-size:20px; line-height:26px; font-weight:bold;">
<a href="">This is a button link &gt;</a>
<style type="text/css">
body, table, td, a { -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; }
table, td { mso-table-lspace: 0pt; mso-table-rspace: 0pt; }
img { -ms-interpolation-mode: bicubic; }
<script>This is a longer script with embedded tags:
'<p>Example embedded tag with <i class="fa fa-example">icon</i></p>'
<p hidden>Non-visible paragraph with <i class="fa fa-example">icon</i></p>
<noscript>This is a longer script with embedded tags:
<p>Example embedded text with <i class="fa fa-example">icon</i></p>
<input id="id_wibble" class="form-control" name="wibble" type="hidden" placeholder="Something here">
<input id="id_email" class="form-control" name="email" type="email" placeholder="Your email address">
<td align="center" class="mobile" style="font-family:arial, sans-serif; font-size:20px; line-height:26px; font-weight:bold;">
<p>Paragraph with embedded link <a href="">This is a link &gt;</a>
and this is a continuation of the paragraph with the link.</p>
<td align="center" class="mobile" style="font-family:arial, sans-serif; font-size:20px; line-height:26px; font-weight:bold;">
Some text with link: <a href="">This is a link &gt;</a>
And some text after the link.<br>
Try an empty embedded link<a href="">This is a link &gt;</a>before this text.<br>
Lots of brs:<br><br><br>
after brs
<p><a href="">This is a link that starts a paragraph</a>, and this is the paragraph's continuation.</p>
Let's try some <a href="#punctuation">links</a>, followed by
punctuation {<a href="#braces">or between braces</a>},
(<a href="#brackets">brackets</a>),
&quot;<a href="#quotes">inside quotes</a>&quot;,
`<a href="#backticks">backticks</a>`
and observe correct text flow.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment