From ca6e8efe5e2fe162b80c44b7f841f67b87246feb Mon Sep 17 00:00:00 2001
From: Fabien BOURGEOIS <fabien@yaltik.com>
Date: Thu, 6 Feb 2025 15:43:29 +0100
Subject: [PATCH] [IMP]Docx Report Generation : add automatic sanitization for
 Html field

- Useful when data has been copied from external programs ;
- Manages more tags ;
- More robust rendering.
---
 docx_report_generation/__manifest__.py        |  2 +-
 .../models/ir_actions_report.py               | 39 +++++++++++++++++--
 2 files changed, 36 insertions(+), 5 deletions(-)
diff --git a/docx_report_generation/__manifest__.py b/docx_report_generation/__manifest__.py
index 8e18134..fcfc9c5 100755
--- a/docx_report_generation/__manifest__.py
+++ b/docx_report_generation/__manifest__.py
@@ -14,7 +14,7 @@
     "author": "RYDLAB, Yaltik",
     "website": "https://rydlab.ru",
     "category": "Technical",
-    "version": "16.0.2.1.3",
+    "version": "16.0.2.2.0",
     "license": "LGPL-3",
     "depends": ["base", "web", "custom_report_field", "report_monetary_helpers"],
     "external_dependencies": {"python": ["docxcompose", "docxtpl", "beautifulsoup4"]},
diff --git a/docx_report_generation/models/ir_actions_report.py b/docx_report_generation/models/ir_actions_report.py
index 495161f..65eb7f6 100644
--- a/docx_report_generation/models/ir_actions_report.py
+++ b/docx_report_generation/models/ir_actions_report.py
@@ -5,9 +5,11 @@ from io import BytesIO
 from functools import partial
 from re import findall
 from json import loads
+from html import unescape
 from logging import getLogger
 
 from lxml import etree
+from lxml.html.clean import Cleaner
 
 from docx import Document
 from docx.oxml import OxmlElement
@@ -437,6 +439,20 @@ class IrActionsReport(models.Model):
         }
         values["docs"] = docs
 
+        def sanitize(html):
+            """ Sanitize Html from Odoo, mostly when pasted from outside """
+            cleaner = Cleaner(
+                page_structure=True, meta=True, embedded=True, links=True,
+                style=True, processing_instructions=True,
+                scripts=True, javascript=True, comments=True, frames=True,
+                forms=True, annoying_tags=True, remove_unknown_tags=True,
+                safe_attrs_only=True,
+                inline_style=False,
+                safe_attrs=frozenset(['style']),
+                remove_tags=('span', 'div')
+            )
+            return cleaner.clean_html(html)
+
         def _html_generate(field, tpl):
 
             def _create_list(paragraph, list_type):
@@ -449,21 +465,31 @@ class IrActionsReport(models.Model):
                 pPr.append(numPr) #add number properties to paragraph
 
             if isinstance(field, fields.Markup):
-                html_field = str(field).replace('<br>', '').replace('&nbsp;', ' ')
+                html_field = sanitize(str(field))
+                html_field = unescape(
+                    str(field)
+                    .replace('<br>', '')
+                    .replace('\t', '')
+                    .replace('\n', '')
+                )
                 xml_tree = etree.fromstring('<root>%s</root>' % html_field)
                 md = tpl.new_subdoc()
+                p = md.add_paragraph()
+                lp = md.add_paragraph()
                 for child in xml_tree.iter():
-                    if child.tag in ('p', 'strong', 'em', 's', 'u', 'ul',
+                    if child.tag in ('p', 'strong', 'b', 'em', 'i', 's', 'u', 'ul',
                                      'ol', 'li', 'font', 'a'):
+                        parent = child.getparent()
+                        p = lp if parent and parent.tag in ('ul', 'ol') else p
                         if child.tag == 'p':
                             p = md.add_paragraph(child.text)
                             p.add_run(child.tail)
                         elif child.tag == 'a':
                             p.add_run(child.text).style = 'Hyperlink'
-                        elif child.tag == 'strong':
+                        elif child.tag in ('strong', 'b'):
                             p.add_run(child.text).bold = True
                             p.add_run(child.tail)
-                        elif child.tag == 'em':
+                        elif child.tag in ('em', 'i'):
                             p.add_run(child.text).italic = True
                             p.add_run(child.tail)
                         elif child.tag == 'u':
@@ -493,6 +519,11 @@ class IrActionsReport(models.Model):
                     else: # Not handled, add text only
                         p.add_run(child.text)
                         p.add_run(child.tail)
+                # Delete empty paragraphs
+                for p in filter(lambda p: not(p.text), md.paragraphs):
+                    el = p._element
+                    el.getparent().remove(el)
+                    el._p = el._element = None
                 return md
             return field