From f931a0eceed3a89ef7c94a8a7b2bbed208bf0295 Mon Sep 17 00:00:00 2001
From: Alex Marchant <alexjmarchant@gmail.com>
Date: Wed, 3 Apr 2024 15:27:48 -0400
Subject: [PATCH] Make utf-8 explicit in all tree.write calls

---
 libmat2/office.py     | 16 ++++++++--------
 tests/test_libmat2.py | 26 ++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/libmat2/office.py b/libmat2/office.py
index 6f69e4a..fa79834 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -38,7 +38,7 @@ def _sort_xml_attributes(full_path: str) -> bool:
     for c in tree.getroot():
         c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
 
-    tree.write(full_path, xml_declaration=True)
+    tree.write(full_path, xml_declaration=True, encoding='utf-8')
     return True
 
 
@@ -220,7 +220,7 @@ class MSOfficeParser(ZipParser):
         for element in elements_to_remove:
             parent_map[element].remove(element)
 
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
         return True
 
     @staticmethod
@@ -250,7 +250,7 @@ class MSOfficeParser(ZipParser):
         for element in elements_to_remove:
             parent_map[element].remove(element)
 
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
         return True
 
     @staticmethod
@@ -287,7 +287,7 @@ class MSOfficeParser(ZipParser):
             parent_map[element].insert(position, children)
             parent_map[element].remove(element)
 
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
         return True
 
     def __remove_content_type_members(self, full_path: str) -> bool:
@@ -320,7 +320,7 @@ class MSOfficeParser(ZipParser):
             if name in removed_fnames:
                 root.remove(item)
 
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
         return True
 
     def _final_checks(self) -> bool:
@@ -355,7 +355,7 @@ class MSOfficeParser(ZipParser):
 
         for item in tree.iterfind('.//p14:creationId', namespace):
             item.set('val', '%s' % random.randint(0, 2**32))
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
         return True
 
     @staticmethod
@@ -371,7 +371,7 @@ class MSOfficeParser(ZipParser):
 
         for item in tree.iterfind('.//p:sldMasterId', namespace):
             item.set('id', '%s' % random.randint(0, 2**32))
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
         return True
 
     def _specific_cleanup(self, full_path: str) -> bool:
@@ -514,7 +514,7 @@ class LibreOfficeParser(ZipParser):
             for changes in text.iterfind('.//text:tracked-changes', namespace):
                 text.remove(changes)
 
-        tree.write(full_path, xml_declaration=True)
+        tree.write(full_path, xml_declaration=True, encoding='utf-8')
         return True
 
     def _specific_cleanup(self, full_path: str) -> bool:
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 32ae543..0435113 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -873,5 +873,31 @@ class TextDocx(unittest.TestCase):
             # Check if 'word/comments.xml' exists in the zip
             self.assertNotIn('word/comments.xml', zipin.namelist())
 
+        os.remove('./tests/data/comment_clean.docx')
+        os.remove('./tests/data/comment_clean.cleaned.docx')
+
+    def test_xml_is_utf8(self):
+        with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
+            c = zipin.open('word/document.xml')
+            content = c.read()
+
+            # ensure encoding is utf-8
+            r = b'encoding=(\'|\")UTF-8(\'|\")'
+            match = re.search(r, content, re.IGNORECASE)
+            self.assertIsNotNone(match)
+
+        shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
+        p = office.MSOfficeParser('./tests/data/comment_clean.docx')
+        self.assertTrue(p.remove_all())
+
+        with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
+            c = zipin.open('word/document.xml')
+            content = c.read()
+
+            # ensure encoding is still utf-8
+            r = b'encoding=(\'|\")UTF-8(\'|\")'
+            match = re.search(r, content, re.IGNORECASE)
+            self.assertIsNotNone(match)
+
         os.remove('./tests/data/comment_clean.docx')
         os.remove('./tests/data/comment_clean.cleaned.docx')
\ No newline at end of file
-- 
GitLab