[IMP]Docx Report Generation : add automatic sanitization for Html field

- Useful when data has been copied from external programs;
- Handles more tags;
- More robust rendering.
This commit is contained in:
Fabien BOURGEOIS 2025-02-06 15:43:29 +01:00
parent 28f13e2e66
commit ca6e8efe5e
2 changed files with 36 additions and 5 deletions

View File

@ -14,7 +14,7 @@
"author": "RYDLAB, Yaltik",
"website": "https://rydlab.ru",
"category": "Technical",
"version": "16.0.2.1.3",
"version": "16.0.2.2.0",
"license": "LGPL-3",
"depends": ["base", "web", "custom_report_field", "report_monetary_helpers"],
"external_dependencies": {"python": ["docxcompose", "docxtpl", "beautifulsoup4"]},

View File

@ -5,9 +5,11 @@ from io import BytesIO
from functools import partial
from re import findall
from json import loads
from html import unescape
from logging import getLogger
from lxml import etree
from lxml.html.clean import Cleaner
from docx import Document
from docx.oxml import OxmlElement
@ -437,6 +439,20 @@ class IrActionsReport(models.Model):
}
values["docs"] = docs
def sanitize(html):
    """Clean an Html field value from Odoo with an lxml ``Cleaner``.

    Mostly useful when the content was pasted from an outside program.

    :param html: markup to clean, handed as-is to ``Cleaner.clean_html``.
    :return: whatever ``Cleaner.clean_html`` returns for ``html``.
    """
    # Every removal category of the Cleaner that is switched on.
    enabled_filters = (
        'page_structure', 'meta', 'embedded', 'links',
        'style', 'processing_instructions',
        'scripts', 'javascript', 'comments', 'frames',
        'forms', 'annoying_tags', 'remove_unknown_tags',
        'safe_attrs_only',
    )
    options = dict.fromkeys(enabled_filters, True)
    # Attribute handling: 'style' is the only whitelisted attribute and
    # inline style removal is explicitly turned off.
    options['inline_style'] = False
    options['safe_attrs'] = frozenset(['style'])
    # Tags listed here for removal by the Cleaner
    # (presumably content is kept and pulled up — confirm in lxml docs).
    options['remove_tags'] = ('span', 'div')
    html_cleaner = Cleaner(**options)
    return html_cleaner.clean_html(html)
def _html_generate(field, tpl):
def _create_list(paragraph, list_type):
@ -449,21 +465,31 @@ class IrActionsReport(models.Model):
pPr.append(numPr) #add number properties to paragraph
if isinstance(field, fields.Markup):
html_field = str(field).replace('<br>', '').replace('&nbsp;', ' ')
html_field = sanitize(str(field))
html_field = unescape(
str(field)
.replace('<br>', '')
.replace('\t', '')
.replace('\n', '')
)
xml_tree = etree.fromstring('<root>%s</root>' % html_field)
md = tpl.new_subdoc()
p = md.add_paragraph()
lp = md.add_paragraph()
for child in xml_tree.iter():
if child.tag in ('p', 'strong', 'em', 's', 'u', 'ul',
if child.tag in ('p', 'strong', 'b', 'em', 'i', 's', 'u', 'ul',
'ol', 'li', 'font', 'a'):
parent = child.getparent()
p = lp if parent and parent.tag in ('ul', 'ol') else p
if child.tag == 'p':
p = md.add_paragraph(child.text)
p.add_run(child.tail)
elif child.tag == 'a':
p.add_run(child.text).style = 'Hyperlink'
elif child.tag == 'strong':
elif child.tag in ('strong', 'b'):
p.add_run(child.text).bold = True
p.add_run(child.tail)
elif child.tag == 'em':
elif child.tag in ('em', 'i'):
p.add_run(child.text).italic = True
p.add_run(child.tail)
elif child.tag == 'u':
@ -493,6 +519,11 @@ class IrActionsReport(models.Model):
else: # Not handled, add text only
p.add_run(child.text)
p.add_run(child.tail)
# Delete empty paragraphs
for p in filter(lambda p: not(p.text), md.paragraphs):
el = p._element
el.getparent().remove(el)
el._p = el._element = None
return md
return field