Created
July 4, 2019 18:07
-
-
Save thiagodiniz/fe5c1417a93bc640358a3e2322b2fc5f to your computer and use it in GitHub Desktop.
HTML 2 DOCX @ Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import io | |
import requests | |
from bs4 import NavigableString, BeautifulSoup | |
from docx import Document as DocxDocument | |
from docx.enum.text import WD_ALIGN_PARAGRAPH | |
from docx.shared import Cm, Pt, Mm | |
from docx.image.exceptions import UnrecognizedImageError | |
from lxml import etree | |
# Page geometry and picture-scaling constants.
# Presumably millimetres (ISO A4 is 210 x 297 mm) -- TODO confirm the unit.
# A4_PAGE_HEIGHT is not referenced by any function in this file.
A4_PAGE_HEIGHT = 297
# NOTE: the name is misspelled ("WIDHT") but convert_image_size refers to it
# by this exact name, so it is kept as-is.
A4_PAGE_WIDHT = 210
# Margin subtracted twice (once per side) from the page width in
# convert_image_size; 25.4 is presumably one inch in millimetres -- TODO confirm.
DEFAULT_MARGINS = 25.4
# Largest HTML size-attribute value convert_image_size accepts; bigger values
# are clamped to it, and it maps to the full usable page width.
MAX_PICTURE_SIZE = 602
def parser(html, output_path="output/teste.docx"):
    """Convert an HTML string into a .docx file saved on disk.

    Args:
        html: HTML markup to convert. It is parsed by ``parsing``.
        output_path: Destination handed to ``Document.save``. The default is
            the path this function writes to when called with one argument.
    """
    # parsing() builds the BeautifulSoup tree itself, so the markup does not
    # need to be parsed here as well.
    document = DocxDocument()
    parsing(html, document)
    document.save(output_path)
def parsing(html, document):
    """Parse *html* with the lxml parser and render its body into *document*.

    Args:
        html: HTML markup to convert.
        document: Target object passed on to ``process_node``.

    Returns:
        The parsed ``<body>`` tag, or None when the markup produces no body.
    """
    soup = BeautifulSoup(html, features="lxml")
    body = soup.body
    if body is None:
        # Nothing to render (e.g. empty markup); remove_newlines would fail
        # on None, so stop here.
        return None
    remove_newlines(body)
    # Debug dump of the tree that is about to be converted.
    print(body.prettify())
    process_node(body, document)
    return body
def process_node(node, document, parent=None, paragraph=None, style=None):
    """Recursively render one parsed HTML node into *document*.

    Args:
        node: A BeautifulSoup tag or string.
        document: Object receiving paragraphs, tables and pictures.
        parent: Tag that contains *node*; forwarded to ``process_string``.
        paragraph: Paragraph currently being filled, or None.
        style: Style keywords (as produced by ``process_style``) inherited
            from enclosing tags.

    Returns:
        The paragraph that following siblings should continue writing into,
        or None after a list, math, table, image or line break.
    """
    style = [] if style is None else style

    # Strings are dispatched explicitly. Catching AttributeError around the
    # whole function for this purpose would also hide genuine errors raised
    # by the nested handlers below.
    if isinstance(node, NavigableString):
        process_string(node, document, parent, paragraph, style)
        return paragraph

    tag_name = node.name
    if tag_name in ('ul', 'ol'):
        process_list(node, document)
        return None
    if tag_name == 'math':
        process_math(node, document, paragraph)
        return None
    if tag_name == 'table':
        process_table(document, node)
        return None
    if tag_name in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        # Block-level text starts a fresh paragraph and drops inherited styles.
        paragraph = document.add_paragraph('')
        style = []

    inline_style = node.get('style')
    if inline_style is not None:
        style = style + process_style(inline_style)

    for child in node.children:
        paragraph = process_node(child, document, node, paragraph, style)

    if tag_name == 'img':
        process_image(node, document, paragraph, style)
        return None
    if tag_name == 'br':
        document.add_paragraph('')
        return None
    return paragraph
def process_string(str_node, document, parent, paragraph, style=None):
    """Append a text node to *paragraph* as a formatted run.

    Args:
        str_node: The text to add. Non-string values are ignored.
        document: Used only to create a paragraph when *paragraph* is None.
        parent: Tag containing the text; ``strong`` / ``em`` parents make the
            run bold / italic. May be None.
        paragraph: Paragraph receiving the run, or None to create a new one.
        style: Style keywords as produced by ``process_style``.
    """
    style = [] if style is None else style
    if paragraph is None:
        paragraph = document.add_paragraph('')

    # Only real text can become a run; anything else is skipped explicitly
    # rather than through a swallowed exception.
    if not isinstance(str_node, str):
        return
    if str_node == '\n' or str_node == u'\xa0':
        return

    text = str_node.strip()
    try:
        run = paragraph.add_run(text)
    except ValueError:
        # NOTE(review): lxml is expected to raise ValueError for text that is
        # not XML-compatible (NUL / control characters). Such text is
        # skipped so that one bad string does not abort the export.
        return

    # parent may be None; that must not prevent style-based formatting.
    parent_name = getattr(parent, 'name', None)
    if parent_name == 'strong' or 'bold' in style:
        run.font.bold = True
    if parent_name == 'em' or 'italic' in style:
        run.font.italic = True
    if 'small' in style:
        run.font.size = Pt(8)
    if 'underline' in style:
        run.font.underline = True
    if 'left' in style:
        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
    if 'right' in style:
        paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
    if 'center' in style:
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    if 'justify' in style:
        paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    print(text)
def process_list(node, document):
    """Render a list tag as one list-styled paragraph per ``<li>`` child.

    ``ul`` uses the 'List Bullet' style; any other tag name uses
    'List Number'. Children that are not ``li`` are ignored.
    """
    if node.name == 'ul':
        list_style = 'List Bullet'
    else:
        list_style = 'List Number'

    # Lazy filter: items are still visited one at a time, in document order.
    items = (entry for entry in node.children if entry.name == 'li')
    for item in items:
        item_paragraph = document.add_paragraph('', style=list_style)
        process_list_item(item, document, node, item_paragraph)
def process_list_item(node, document, parent, paragraph):
    """Render the content of a list item into *paragraph*.

    Nested tags are flattened recursively into the same paragraph; ``math``
    children are delegated to ``process_math``.

    Args:
        node: The ``<li>`` tag, one of its descendants, or a text node.
        document: Passed through to the string and math handlers.
        parent: Tag containing *node*; forwarded to ``process_string``.
        paragraph: Paragraph created for this list item.
    """
    # Strings are dispatched explicitly instead of relying on a caught
    # AttributeError, which would also mask real errors raised further down.
    if isinstance(node, NavigableString):
        process_string(node, document, parent, paragraph)
        return

    remove_newlines(node)
    for child in node.children:
        if child.name == 'math':
            process_math(child, document, paragraph)
            continue
        process_list_item(child, document, node, paragraph)
def process_image(img_node, document, paragraph, styles=None):
    """Download the picture referenced by an ``<img>`` tag and insert it.

    Args:
        img_node: The ``<img>`` tag; ``src`` is required, ``width`` /
            ``height`` / ``style`` are optional.
        document: Object exposing ``add_picture``. The table-cell code path
            passes a run here instead of a document.
        paragraph: Paragraph open at this point; removed when it is empty so
            the picture does not follow a blank paragraph. May be None.
        styles: Style keywords as produced by ``process_style``.
    """
    import logging

    styles = [] if styles is None else styles
    if paragraph and paragraph.text == '':
        delete_paragraph(paragraph)

    url = img_node['src']
    image = download_image(url)
    try:
        # Prefer the dimension chosen by should_use_width, but fall back to
        # the other one when the preferred attribute is absent.
        if should_use_width(img_node):
            preferred, other = 'width', 'height'
        else:
            preferred, other = 'height', 'width'
        dimension = preferred if img_node.get(preferred) is not None else other

        size = None
        raw_size = img_node.get(dimension)
        if raw_size is not None:
            try:
                size = convert_image_size(raw_size)
            except (TypeError, ValueError):
                # Non-numeric size such as "50%": insert at native size.
                size = None

        if size is None:
            document.add_picture(image)
        elif dimension == 'width':
            print("IMG: W #" + url)
            document.add_picture(image, width=Mm(size))
        else:
            print("IMG: H #" + url)
            document.add_picture(image, height=Mm(size))

        if should_be_centered(img_node, styles):
            try:
                picture_paragraph = document.paragraphs[-1]
            except AttributeError:
                # A run has no ``paragraphs``; its parent is the paragraph.
                picture_paragraph = document._parent
            picture_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    except UnrecognizedImageError:
        # ``rollbar`` is not imported by this module, so the problem is
        # reported through the standard logging module instead.
        logging.getLogger(__name__).warning(
            'UnrecognizedImageError at html_parser.py (src=%s)', url)
        message = '### Imagem não pode ser incluida, entre em contato com o Lecionas e informe o problema'
        if hasattr(document, 'add_paragraph'):
            document.add_paragraph(message)
        else:
            # Run objects cannot create paragraphs; put the notice in the run.
            document.add_text(message)
def process_math(node, document, paragraph=None):
    """Transform a ``<math>`` node with the MML2OMML stylesheet and append it.

    The stylesheet is read from ``<project>/export/MML2OMML.XSL`` where
    ``<project>`` is Django's ``settings.BASE_DIR`` with a trailing ``/main``
    removed.

    Args:
        node: The ``<math>`` tag; its string form is parsed as XML.
        document: Used only to create a paragraph when *paragraph* is None.
        paragraph: Paragraph whose XML element receives the transformed
            root element.
    """
    import re
    from django.conf import settings

    # Build the path by concatenation. %-formatting fails when BASE_DIR does
    # not end in "/main" (no placeholder to fill) or contains a literal "%".
    # str() also covers a BASE_DIR that is a path object.
    project_path = re.sub(r'/main$', '', str(settings.BASE_DIR))
    mml2omml_stylesheet_path = project_path + "/export/MML2OMML.XSL"

    element_tree = etree.fromstring(str(node))
    transform = etree.XSLT(etree.parse(mml2omml_stylesheet_path))
    new_dom = transform(element_tree)

    if paragraph is None:
        paragraph = document.add_paragraph()
    paragraph._element.append(new_dom.getroot())
def should_be_centered(img_node, styles=None):
    """Tell whether a picture should be horizontally centered.

    Args:
        img_node: Mapping-like image tag; its optional ``style`` value is
            searched for both ``margin-left: auto`` and ``margin-right: auto``.
        styles: Style keywords; the keyword ``'center'`` forces centering.

    Returns:
        True when either condition holds, otherwise False.
    """
    if styles is not None and 'center' in styles:
        return True
    try:
        style_str = img_node['style']
        return ("margin-left: auto" in style_str) and ("margin-right: auto" in style_str)
    except (KeyError, TypeError):
        # No style attribute, or a value that cannot be searched.
        return False
def convert_image_size(orginal_size):
    """Scale an HTML size attribute to a size on the page.

    The value is converted to float, capped at MAX_PICTURE_SIZE, and mapped
    proportionally so that MAX_PICTURE_SIZE corresponds to the page width
    minus both margins. The result is rounded to a whole number.
    """
    capped_size = min(float(orginal_size), MAX_PICTURE_SIZE)
    usable_width = round(A4_PAGE_WIDHT - (DEFAULT_MARGINS * 2), 1)
    return round((capped_size * usable_width) / MAX_PICTURE_SIZE)
def should_use_width(img_node):
    """Decide whether a picture should be sized by width rather than height.

    Each dimension is read independently; a missing or non-numeric value
    counts as 0, so one unusable attribute does not discard the other.
    Values are parsed as floats, matching ``convert_image_size``.

    Returns:
        False when the height is greater than the width, otherwise True.
    """
    def _dimension(attribute):
        try:
            return float(img_node[attribute])
        except (KeyError, TypeError, ValueError):
            return 0

    return not _dimension('height') > _dimension('width')
def process_table(document, table_node):
    """Render an HTML ``<table>`` as a table in *document*.

    Each ``<tr>`` becomes a row and each ``<th>`` / ``<td>`` a cell. A cell
    with ``colspan`` is merged across that many grid columns, and the next
    cell is placed after the merged span. ``rowspan`` is not handled.

    Args:
        document: Object exposing ``add_table``.
        table_node: The ``<table>`` tag.
    """
    def _colspan(html_cell):
        # Missing, empty or malformed colspan counts as a single column.
        try:
            span = int(html_cell.get('colspan') or 1)
        except (TypeError, ValueError):
            span = 1
        return max(span, 1)

    rows = [row.find_all(["th", "td"]) for row in table_node.find_all("tr")]
    if not rows:
        # max() of an empty sequence would raise; nothing to render.
        return

    # Size the grid by spanned columns, not by the number of cell tags, so
    # every merge target exists.
    max_columns = max(sum(_colspan(cell) for cell in cells) for cells in rows)
    if max_columns == 0:
        return

    table = document.add_table(rows=len(rows), cols=max_columns)
    for row_index, html_cells in enumerate(rows):
        column_index = 0
        for html_cell in html_cells:
            span = _colspan(html_cell)
            # Look the cell up on every step so it reflects earlier merges.
            table_cell = table.cell(row_index, column_index)
            process_table_cell(document, html_cell, table_cell, [], True)
            if span > 1:
                table_cell.merge(table.cell(row_index, column_index + span - 1))
            column_index += span
def process_table_cell(document, node, cell, styles=None, skip_newlines=False):
    """Render the content of an HTML table cell into a document table cell.

    Args:
        document: Passed through to the string and math handlers.
        node: The ``<th>`` / ``<td>`` tag or one of its descendants.
        cell: Target cell.
        styles: Style keywords inherited from enclosing tags.
        skip_newlines: When True, whitespace-only strings directly inside
            *node* are ignored.
    """
    # still not considering new paragraphs inside table_cell
    # https://github.com/python-openxml/python-docx/issues/216
    styles = [] if styles is None else styles
    inline_style = node.get('style')
    if inline_style is not None:
        styles = styles + process_style(inline_style)

    paragraph = cell.paragraphs[-1] if cell.paragraphs else cell.add_paragraph()

    for child in node.children:
        if isinstance(child, NavigableString):
            if skip_newlines and not child.strip():
                continue
            process_string(child, document, node, paragraph, styles)
        elif child.name == 'math':
            process_math(child, document, paragraph)
        elif child.name == 'img':
            # The picture gets its own paragraph; text that follows goes into
            # the paragraph added after it, keeping document order.
            picture_paragraph = cell.add_paragraph()
            process_image(child, picture_paragraph.add_run(), None, styles)
            paragraph = cell.add_paragraph()
        else:
            process_table_cell(document, child, cell, styles)
            # The nested call may have appended paragraphs (e.g. around a
            # picture); continue writing into the last one.
            paragraph = cell.paragraphs[-1]
def process_style(style_str=''):
    """Extract the supported style keywords from an inline CSS string.

    Recognised declarations and the keyword each one yields:
        font-size below 11pt            -> 'small'
        font-weight 700/800/900/bold*   -> 'bold'
        font-style oblique/italic       -> 'italic'
        text-decoration underline       -> 'underline'
        text-align <value>              -> 'left' / 'right' / 'center' / 'justify'

    Any amount of whitespace (including none) is accepted after the colon.

    Args:
        style_str: Value of a ``style`` attribute, or None.

    Returns:
        List of keywords in the order listed above; empty when nothing matches.
    """
    import re

    default_size = 11
    styles = []
    if style_str is None:
        return styles

    # Raw strings keep "\d" / "\s" from being treated as string escapes.
    size_match = re.search(r'font-size:\s*(\d+(?:\.\d+)?)pt', style_str)
    if size_match and float(size_match.group(1)) < default_size:
        styles.append('small')

    if re.search(r'font-weight:\s*(700|800|900|bold(er)?)', style_str):
        styles.append('bold')

    if re.search(r'font-style:\s*(oblique|italic)', style_str):
        styles.append('italic')

    if re.search(r'text-decoration:\s*underline', style_str):
        styles.append('underline')

    align_match = re.search(r'text-align:\s*(left|right|center|justify)', style_str)
    if align_match:
        styles.append(align_match.group(1))

    return styles
def delete_paragraph(paragraph):
    """Remove *paragraph* from its document and invalidate the wrapper.

    Args:
        paragraph: Paragraph wrapper whose ``_element`` is the XML element to
            remove. Calling this again on the same wrapper does nothing.
    """
    element = paragraph._element
    if element is None:
        # Already deleted through this wrapper.
        return
    container = element.getparent()
    if container is not None:
        container.remove(element)
    # Clear the references on the wrapper itself (not on the XML element) so
    # later use of the stale paragraph fails instead of editing a detached
    # element.
    paragraph._p = paragraph._element = None
def remove_newlines(node):
    """Remove whitespace-only strings from the direct children of *node*.

    Only direct children are examined; descendants are left untouched.
    """
    # Iterate over a snapshot: extract() removes the element from
    # node.contents, and mutating the list being iterated skips the element
    # that follows each removal.
    for element in list(node.contents):
        if isinstance(element, NavigableString) and element.strip() == '':
            element.extract()
def download_image(url, timeout=30):
    """Fetch *url* and return the response body as an in-memory stream.

    Args:
        url: Address of the image.
        timeout: Seconds passed to ``requests.get`` as its timeout, so that
            an unresponsive server cannot block the export indefinitely.

    Returns:
        ``io.BytesIO`` wrapping the response body. The HTTP status is not
        checked here; the caller handles unreadable image data.
    """
    # The whole body is read right away, so streaming mode is not needed.
    response = requests.get(url, timeout=timeout)
    return io.BytesIO(response.content)
def remove_table_border(table):
    """Set the top, left, bottom and right borders of every cell to 'nil'.

    Args:
        table: Table wrapper whose underlying ``_tbl`` XML element is edited
            in place.
    """
    from docx.oxml.ns import qn
    from docx.oxml.shared import OxmlElement  # Necessary Import

    # https://github.com/python-openxml/python-docx/issues/433
    tbl = table._tbl  # get xml element in table
    for cell in tbl.iter_tcs():
        # get (or create) the tcPr element, in which border styles are defined
        tcPr = cell.get_or_add_tcPr()
        tcBorders = OxmlElement('w:tcBorders')
        for side in ('w:top', 'w:left', 'w:bottom', 'w:right'):
            border = OxmlElement(side)
            border.set(qn('w:val'), 'nil')
            tcBorders.append(border)
        tcPr.append(tcBorders)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment