[IMP] Docx Report Generation: add automatic sanitization for Html fields

- Useful when data has been copied from external programs; - Manages more tags; - More robust rendering.
2025-02-06 15:43:29 +01:00 · 2025-02-06 15:43:29 +01:00 · ca6e8efe5e
commit ca6e8efe5e
parent 28f13e2e66
2 changed files with 36 additions and 5 deletions
--- a/docx_report_generation/manifest.py
+++ b/docx_report_generation/manifest.py
@ -14,7 +14,7 @@
    "author": "RYDLAB, Yaltik",
    "website": "https://rydlab.ru",
    "category": "Technical",
-    "version": "16.0.2.1.3",
+    "version": "16.0.2.2.0",
    "license": "LGPL-3",
    "depends": ["base", "web", "custom_report_field", "report_monetary_helpers"],
    "external_dependencies": {"python": ["docxcompose", "docxtpl", "beautifulsoup4"]},
--- a/docx_report_generation/models/ir_actions_report.py
+++ b/docx_report_generation/models/ir_actions_report.py
@ -5,9 +5,11 @@ from io import BytesIO
 from functools import partial
 from re import findall
 from json import loads
+from html import unescape
 from logging import getLogger

 from lxml import etree
+from lxml.html.clean import Cleaner

 from docx import Document
 from docx.oxml import OxmlElement
@ -437,6 +439,20 @@ class IrActionsReport(models.Model):
        }
        values["docs"] = docs

+        def sanitize(html):
+            """ Sanitize HTML from an Odoo Html field (mostly content pasted from external programs) by running it through the lxml Cleaner configured below and returning the result of clean_html. """
+            cleaner = Cleaner(
+                page_structure=True, meta=True, embedded=True, links=True,
+                style=True, processing_instructions=True,
+                scripts=True, javascript=True, comments=True, frames=True,
+                forms=True, annoying_tags=True, remove_unknown_tags=True,
+                safe_attrs_only=True,
+                inline_style=False,
+                safe_attrs=frozenset(['style']),
+                remove_tags=('span', 'div')
+            )
+            return cleaner.clean_html(html)
+
        def _html_generate(field, tpl):

            def _create_list(paragraph, list_type):
@ -449,21 +465,31 @@ class IrActionsReport(models.Model):
                pPr.append(numPr) #add number properties to paragraph

            if isinstance(field, fields.Markup):
-                html_field = str(field).replace('<br>', '').replace('&nbsp;', ' ')
+                html_field = sanitize(str(field))
+                html_field = unescape(
+                    str(field)
+                    .replace('<br>', '')
+                    .replace('\t', '')
+                    .replace('\n', '')
+                )
                xml_tree = etree.fromstring('<root>%s</root>' % html_field)
                md = tpl.new_subdoc()
+                p = md.add_paragraph()
+                lp = md.add_paragraph()
                for child in xml_tree.iter():
-                    if child.tag in ('p', 'strong', 'em', 's', 'u', 'ul',
+                    if child.tag in ('p', 'strong', 'b', 'em', 'i', 's', 'u', 'ul',
                                     'ol', 'li', 'font', 'a'):
+                        parent = child.getparent()
+                        p = lp if parent and parent.tag in ('ul', 'ol') else p
                        if child.tag == 'p':
                            p = md.add_paragraph(child.text)
                            p.add_run(child.tail)
                        elif child.tag == 'a':
                            p.add_run(child.text).style = 'Hyperlink'
-                        elif child.tag == 'strong':
+                        elif child.tag in ('strong', 'b'):
                            p.add_run(child.text).bold = True
                            p.add_run(child.tail)
-                        elif child.tag == 'em':
+                        elif child.tag in ('em', 'i'):
                            p.add_run(child.text).italic = True
                            p.add_run(child.tail)
                        elif child.tag == 'u':
@ -493,6 +519,11 @@ class IrActionsReport(models.Model):
                    else: # Not handled, add text only
                        p.add_run(child.text)
                        p.add_run(child.tail)
+                # Delete empty paragraphs
+                for p in filter(lambda p: not(p.text), md.paragraphs):
+                    el = p._element
+                    el.getparent().remove(el)
+                    el._p = el._element = None
                return md
            return field