From 61f39c4bd0b51be6371fb2973c14054a2772352e Mon Sep 17 00:00:00 2001
From: Alex Marchant <alexjmarchant@gmail.com>
Date: Wed, 3 Apr 2024 15:20:00 -0400
Subject: [PATCH] Strip comment references from document.xml

---
 libmat2/office.py     | 36 ++++++++++++++++++++++++++++++++++++
 tests/test_libmat2.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/libmat2/office.py b/libmat2/office.py
index 6f69e4a..66f462b 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -290,6 +290,39 @@ class MSOfficeParser(ZipParser):
         tree.write(full_path, xml_declaration=True)
         return True
 
+    @staticmethod
+    def __remove_document_comment_meta(full_path: str) -> bool:
+        """ Strip the comment anchors from the XML file at `full_path`.
+
+        Remove every `w:commentRangeStart`, `w:commentRangeEnd` and
+        `w:commentReference` element, rewriting the file in place only
+        if one was found. Return False if the file can't be parsed.
+        """
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError as e:  # pragma: no cover
+            logging.error("Unable to parse %s: %s", full_path, e)
+            return False
+
+        # Collect everything before removing anything, so that the tree
+        # isn't mutated while it is still being searched.
+        elements_del = [
+            element
+            for tag in ('commentRangeStart', 'commentRangeEnd',
+                        'commentReference')
+            for element in tree.iterfind('.//w:' + tag, namespace)
+        ]
+        if not elements_del:
+            return True  # No comment meta tags are present
+
+        # ElementTree nodes don't know their parent, so index them.
+        parent_map = {c: p for p in tree.iter() for c in p}
+        for element in elements_del:
+            parent_map[element].remove(element)
+
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
+        return True
+
     def __remove_content_type_members(self, full_path: str) -> bool:
         """ The method will remove the dangling references
         form the [Content_Types].xml file, since MS office doesn't like them
@@ -396,6 +429,9 @@ class MSOfficeParser(ZipParser):
             # this file contains the revisions
             if self.__remove_revisions(full_path) is False:
                 return False  # pragma: no cover
+            # remove comment references and ranges
+            if self.__remove_document_comment_meta(full_path) is False:
+                return False  # pragma: no cover
         elif full_path.endswith('/docProps/app.xml'):
             # This file must be present and valid,
             # so we're removing as much as we can.
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 32ae543..d199f54 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -873,5 +873,35 @@ class TextDocx(unittest.TestCase):
             # Check if 'word/comments.xml' exists in the zip
             self.assertNotIn('word/comments.xml', zipin.namelist())
 
+        os.remove('./tests/data/comment_clean.docx')
+        os.remove('./tests/data/comment_clean.cleaned.docx')
+
+    def test_comment_references_are_removed(self):
+        """ Cleaning must strip every comment anchor from document.xml. """
+        markers = (
+            b'w:commentRangeStart',
+            b'w:commentRangeEnd',
+            b'w:commentReference',
+        )
+
+        # Make sure the fixture really carries the anchors to strip.
+        with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
+            content = zipin.read('word/document.xml')
+        for marker in markers:
+            with self.subTest(marker=marker):
+                self.assertIn(marker, content)
+
+        shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
+        p = office.MSOfficeParser('./tests/data/comment_clean.docx')
+        self.assertTrue(p.remove_all())
+
+        # The cleaned copy is read back from the `.cleaned.docx` file
+        # sitting next to the input.
+        with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
+            content = zipin.read('word/document.xml')
+        for marker in markers:
+            with self.subTest(marker=marker):
+                self.assertNotIn(marker, content)
+
         os.remove('./tests/data/comment_clean.docx')
         os.remove('./tests/data/comment_clean.cleaned.docx')
\ No newline at end of file
-- 
GitLab