[IMP] Docx Report Generation: add automatic sanitization for Html fields

- Useful when data has been copied from external programs;
- Handles more tags;
- More robust rendering.
This commit is contained in:
Fabien BOURGEOIS 2025-02-06 15:43:29 +01:00
parent 28f13e2e66
commit ca6e8efe5e
2 changed files with 36 additions and 5 deletions

View File

@ -14,7 +14,7 @@
"author": "RYDLAB, Yaltik", "author": "RYDLAB, Yaltik",
"website": "https://rydlab.ru", "website": "https://rydlab.ru",
"category": "Technical", "category": "Technical",
"version": "16.0.2.1.3", "version": "16.0.2.2.0",
"license": "LGPL-3", "license": "LGPL-3",
"depends": ["base", "web", "custom_report_field", "report_monetary_helpers"], "depends": ["base", "web", "custom_report_field", "report_monetary_helpers"],
"external_dependencies": {"python": ["docxcompose", "docxtpl", "beautifulsoup4"]}, "external_dependencies": {"python": ["docxcompose", "docxtpl", "beautifulsoup4"]},

View File

@ -5,9 +5,11 @@ from io import BytesIO
from functools import partial from functools import partial
from re import findall from re import findall
from json import loads from json import loads
from html import unescape
from logging import getLogger from logging import getLogger
from lxml import etree from lxml import etree
from lxml.html.clean import Cleaner
from docx import Document from docx import Document
from docx.oxml import OxmlElement from docx.oxml import OxmlElement
@ -437,6 +439,20 @@ class IrActionsReport(models.Model):
} }
values["docs"] = docs values["docs"] = docs
def sanitize(html):
    """Sanitize Html coming from Odoo, mostly content pasted from outside.

    Builds an lxml ``Cleaner`` with the options below and returns the
    result of its ``clean_html`` call on *html*.
    """
    cleaner_options = dict(
        # Cleaner switches that are turned on.
        page_structure=True,
        meta=True,
        embedded=True,
        links=True,
        style=True,
        processing_instructions=True,
        scripts=True,
        javascript=True,
        comments=True,
        frames=True,
        forms=True,
        annoying_tags=True,
        remove_unknown_tags=True,
        # Attribute handling: the safe-attribute set is limited to `style`,
        # and inline style cleaning is switched off.
        safe_attrs_only=True,
        inline_style=False,
        safe_attrs=frozenset(['style']),
        # Tags handed to the Cleaner's `remove_tags` option.
        remove_tags=('span', 'div'),
    )
    return Cleaner(**cleaner_options).clean_html(html)
def _html_generate(field, tpl): def _html_generate(field, tpl):
def _create_list(paragraph, list_type): def _create_list(paragraph, list_type):
@ -449,21 +465,31 @@ class IrActionsReport(models.Model):
pPr.append(numPr) #add number properties to paragraph pPr.append(numPr) #add number properties to paragraph
if isinstance(field, fields.Markup): if isinstance(field, fields.Markup):
html_field = str(field).replace('<br>', '').replace('&nbsp;', ' ') html_field = sanitize(str(field))
html_field = unescape(
str(field)
.replace('<br>', '')
.replace('\t', '')
.replace('\n', '')
)
xml_tree = etree.fromstring('<root>%s</root>' % html_field) xml_tree = etree.fromstring('<root>%s</root>' % html_field)
md = tpl.new_subdoc() md = tpl.new_subdoc()
p = md.add_paragraph()
lp = md.add_paragraph()
for child in xml_tree.iter(): for child in xml_tree.iter():
if child.tag in ('p', 'strong', 'em', 's', 'u', 'ul', if child.tag in ('p', 'strong', 'b', 'em', 'i', 's', 'u', 'ul',
'ol', 'li', 'font', 'a'): 'ol', 'li', 'font', 'a'):
parent = child.getparent()
p = lp if parent and parent.tag in ('ul', 'ol') else p
if child.tag == 'p': if child.tag == 'p':
p = md.add_paragraph(child.text) p = md.add_paragraph(child.text)
p.add_run(child.tail) p.add_run(child.tail)
elif child.tag == 'a': elif child.tag == 'a':
p.add_run(child.text).style = 'Hyperlink' p.add_run(child.text).style = 'Hyperlink'
elif child.tag == 'strong': elif child.tag in ('strong', 'b'):
p.add_run(child.text).bold = True p.add_run(child.text).bold = True
p.add_run(child.tail) p.add_run(child.tail)
elif child.tag == 'em': elif child.tag in ('em', 'i'):
p.add_run(child.text).italic = True p.add_run(child.text).italic = True
p.add_run(child.tail) p.add_run(child.tail)
elif child.tag == 'u': elif child.tag == 'u':
@ -493,6 +519,11 @@ class IrActionsReport(models.Model):
else: # Not handled, add text only else: # Not handled, add text only
p.add_run(child.text) p.add_run(child.text)
p.add_run(child.tail) p.add_run(child.tail)
# Delete empty paragraphs
for p in filter(lambda p: not(p.text), md.paragraphs):
el = p._element
el.getparent().remove(el)
el._p = el._element = None
return md return md
return field return field