From ca6e8efe5e2fe162b80c44b7f841f67b87246feb Mon Sep 17 00:00:00 2001 From: Fabien BOURGEOIS Date: Thu, 6 Feb 2025 15:43:29 +0100 Subject: [PATCH] [IMP]Docx Report Generation : add automatic sanitization for Html field - Useful when data has been copied from external programs ; - Manages more tags ; - More robust rendering. --- docx_report_generation/__manifest__.py | 2 +- .../models/ir_actions_report.py | 39 +++++++++++++++++-- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/docx_report_generation/__manifest__.py b/docx_report_generation/__manifest__.py index 8e18134..fcfc9c5 100755 --- a/docx_report_generation/__manifest__.py +++ b/docx_report_generation/__manifest__.py @@ -14,7 +14,7 @@ "author": "RYDLAB, Yaltik", "website": "https://rydlab.ru", "category": "Technical", - "version": "16.0.2.1.3", + "version": "16.0.2.2.0", "license": "LGPL-3", "depends": ["base", "web", "custom_report_field", "report_monetary_helpers"], "external_dependencies": {"python": ["docxcompose", "docxtpl", "beautifulsoup4"]}, diff --git a/docx_report_generation/models/ir_actions_report.py b/docx_report_generation/models/ir_actions_report.py index 495161f..65eb7f6 100644 --- a/docx_report_generation/models/ir_actions_report.py +++ b/docx_report_generation/models/ir_actions_report.py @@ -5,9 +5,11 @@ from io import BytesIO from functools import partial from re import findall from json import loads +from html import unescape from logging import getLogger from lxml import etree +from lxml.html.clean import Cleaner from docx import Document from docx.oxml import OxmlElement @@ -437,6 +439,20 @@ class IrActionsReport(models.Model): } values["docs"] = docs + def sanitize(html): + """ Sanitize Html from Odoo, mostly when pasted from outside """ + cleaner = Cleaner( + page_structure=True, meta=True, embedded=True, links=True, + style=True, processing_instructions=True, + scripts=True, javascript=True, comments=True, frames=True, + forms=True, annoying_tags=True, 
remove_unknown_tags=True, + safe_attrs_only=True, + inline_style=False, + safe_attrs=frozenset(['style']), + remove_tags=('span', 'div') + ) + return cleaner.clean_html(html) + def _html_generate(field, tpl): def _create_list(paragraph, list_type): @@ -449,21 +465,31 @@ class IrActionsReport(models.Model): pPr.append(numPr) #add number properties to paragraph if isinstance(field, fields.Markup): - html_field = str(field).replace('
', '').replace(' ', ' ') + html_field = sanitize(str(field)) + html_field = unescape( + str(field) + .replace('
', '') + .replace('\t', '') + .replace('\n', '') + ) xml_tree = etree.fromstring('%s' % html_field) md = tpl.new_subdoc() + p = md.add_paragraph() + lp = md.add_paragraph() for child in xml_tree.iter(): - if child.tag in ('p', 'strong', 'em', 's', 'u', 'ul', + if child.tag in ('p', 'strong', 'b', 'em', 'i', 's', 'u', 'ul', 'ol', 'li', 'font', 'a'): + parent = child.getparent() + p = lp if parent and parent.tag in ('ul', 'ol') else p if child.tag == 'p': p = md.add_paragraph(child.text) p.add_run(child.tail) elif child.tag == 'a': p.add_run(child.text).style = 'Hyperlink' - elif child.tag == 'strong': + elif child.tag in ('strong', 'b'): p.add_run(child.text).bold = True p.add_run(child.tail) - elif child.tag == 'em': + elif child.tag in ('em', 'i'): p.add_run(child.text).italic = True p.add_run(child.tail) elif child.tag == 'u': @@ -493,6 +519,11 @@ class IrActionsReport(models.Model): else: # Not handled, add text only p.add_run(child.text) p.add_run(child.tail) + # Delete empty paragraphs + for p in filter(lambda p: not(p.text), md.paragraphs): + el = p._element + el.getparent().remove(el) + el._p = el._element = None return md return field