Remove docx revisions

bee56a57 · Julien (jvoisin) Voisin · 02f7605a · bee56a57 · bee56a57 · bee56a57
Commit bee56a57 authored 6 years ago by Julien (jvoisin) Voisin
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -14,6 +14,24 @@ from . import abstract, parser_factory
 assert Set
 assert Pattern

+def _parse_xml(full_path: str):
+    """ Parse an XML file, with namespace support.
+
+    etree's support for namespaces is a bit rough: the prefixes have to be
+    registered for them to be used when the tree is written back. The file
+    is thus read twice: once to collect and register its namespaces, and
+    once to build the tree.
+
+    :param full_path: path of the XML file to parse
+    :return: a tuple made of the parsed tree, and of a dict mapping the
+        namespace prefixes to their URI
+    """
+    ns = dict()
+    for _, (prefix, uri) in ET.iterparse(full_path, ("start-ns", )):
+        # `ns` followed by digits is reserved by etree for the prefixes
+        # it generates: `register_namespace` refuses it with a ValueError,
+        # so an other nomenclature is used for those.
+        if prefix.startswith('ns') and prefix[2:].isdecimal():
+            prefix = 'mat' + prefix[2:]
+        ns[prefix] = uri
+        ET.register_namespace(prefix, uri)
+
+    return ET.parse(full_path), ns
+
+
 class ArchiveBasedAbstractParser(abstract.AbstractParser):
    # Those are the files that have a format that _isn't_
    # supported by MAT2, but that we want to keep anyway.
@@ -72,7 +90,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
                zin.extract(member=item, path=temp_folder)
                full_path = os.path.join(temp_folder, item.filename)

-                self._specific_cleanup(full_path)
+                if self._specific_cleanup(full_path) is False:
+                    shutil.rmtree(temp_folder)
+                    os.remove(self.output_filename)
+                    print("Something went wrong during deep cleaning of %s" % item.filename)
+                    return False

                if item.filename in self.files_to_keep:
                    # those files aren't supported, but we want to add them anyway
@@ -118,6 +140,45 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
            '^docProps/',
    }))

+    def __remove_revisions(self, full_path:str) -> bool:
+        """ Accept every tracked change of the given XML document.
+
+        Deleted content (`w:del`) is dropped altogether, while inserted
+        content (`w:ins`) is kept, but moved out of its revision wrapper,
+        since the wrapper holds the author and the date of the change.
+
+        The elements to process are collected in lists beforehand, since
+        we don't want to change the tree we're iterating on.
+
+        :param full_path: path of the XML file, rewritten in place
+        :return: `False` if the file can't be parsed, `True` otherwise
+        """
+        try:
+            tree, ns = _parse_xml(full_path)
+        except ET.ParseError:
+            return False
+
+        if 'w' not in ns:  # looking for an unknown prefix raises
+            return True
+
+        # Revisions are either deletions or insertions: there is nothing
+        # to do only if the document has neither of them.
+        has_deletions = tree.find('.//w:del', ns) is not None
+        has_insertions = tree.find('.//w:ins', ns) is not None
+        if not (has_deletions or has_insertions):
+            return True
+
+        parent_map = {c:p for p in tree.iter() for c in p}
+
+        for element in tree.findall('.//w:del', ns):
+            parent_map[element].remove(element)
+
+        # Looked up once the deletions are gone, to skip the insertions
+        # that were deleted afterwards.
+        for element in tree.findall('.//w:ins', ns):
+            parent = parent_map[element]
+            # Index amongst the siblings, so that the content stays in place
+            position = list(parent).index(element)
+            # Removed exactly once, even if it has no or several children
+            parent.remove(element)
+            for offset, child in enumerate(list(element)):
+                parent.insert(position + offset, child)
+                parent_map[child] = parent  # keep nested insertions reachable
+
+        tree.write(full_path, xml_declaration=True)
+
+        return True
+
+    def _specific_cleanup(self, full_path:str) -> bool:
+        """ Remove the revisions of the `word/document.xml` file; any
+        other file is left untouched, and reported as a success.
+
+        :param full_path: path of the extracted file to clean
+        :return: `True` on success, or if there was nothing to do
+        """
+        is_main_document = full_path.endswith('/word/document.xml')
+        if not is_main_document:
+            return True
+        return self.__remove_revisions(full_path)
+
    def get_meta(self) -> Dict[str, str]:
        """
        Yes, I know that parsing xml with regexp ain't pretty,
@@ -168,22 +229,11 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):


    def __remove_revisions(self, full_path:str) -> bool:
-        def parse_map(f):  # etree support for ns is a bit rough
-            ns_map = dict()
-            for event, (k, v) in ET.iterparse(f, ("start-ns", )):
-                if event == "start-ns":
-                    ns_map[k] = v
-            return ns_map
+        tree, ns = _parse_xml(full_path)

-        ns = parse_map(full_path)
        if 'office' not in ns.keys():  # no revisions in the current file
            return True

-        # Register the namespaces
-        for k,v in ns.items():
-            ET.register_namespace(k, v)
-
-        tree = ET.parse(full_path)
        for text in tree.getroot().iterfind('.//office:text', ns):
            for changes in text.iterfind('.//text:tracked-changes', ns):
                text.remove(changes)
@@ -219,4 +269,3 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
                metadata[key] = value
        zipin.close()
        return metadata
-
--- a/tests/data/revision.docx
+++ b/tests/data/revision.docx
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -121,6 +121,7 @@ class TestRemovingThumbnails(unittest.TestCase):
        zipin.close()

        os.remove('./tests/data/clean.cleaned.odt')
+        os.remove('./tests/data/clean.odt')


 class TestRevisionsCleaning(unittest.TestCase):
@@ -142,6 +143,26 @@ class TestRevisionsCleaning(unittest.TestCase):
        os.remove('./tests/data/clean.odt')
        os.remove('./tests/data/clean.cleaned.odt')

+    def test_msoffice(self):
+        """ Cleaning a docx file must get rid of its tracked insertion. """
+        # Opening tag of a revision that is present in the test document
+        revision = b'<w:ins w:id="1" w:author="Unknown Author" w:date="2018-06-28T23:48:00Z">'
+
+        with zipfile.ZipFile('./tests/data/revision.docx') as zipin:
+            # `read` doesn't leave an opened member behind, unlike `open`
+            self.assertIn(revision, zipin.read('word/document.xml'))
+
+        shutil.copy('./tests/data/revision.docx', './tests/data/revision_clean.docx')
+        p = office.MSOfficeParser('./tests/data/revision_clean.docx')
+        self.assertTrue(p.remove_all())
+
+        with zipfile.ZipFile('./tests/data/revision_clean.cleaned.docx') as zipin:
+            self.assertNotIn(revision, zipin.read('word/document.xml'))
+
+        os.remove('./tests/data/revision_clean.docx')
+        os.remove('./tests/data/revision_clean.cleaned.docx')
+

 class TestDeepCleaning(unittest.TestCase):
    def __check_deep_meta(self, p):