From 25230107c4ecdecdbab7f3704e29d5f358252e97 Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Tue, 18 Sep 2018 16:56:58 +0200
Subject: [PATCH] Yay, it's working!

---
 libmat2/office.py | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/libmat2/office.py b/libmat2/office.py
index f987c71..5c2c996 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -1,3 +1,4 @@
+import logging
 import os
 import re
 import zipfile
@@ -14,8 +15,15 @@ assert Pattern
 def _parse_xml(full_path: str):
     """ This function parses XML, with namespace support. """
 
+    cpt = 0
     namespace_map = dict()
     for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
+        # The ns[0-9]+ namespaces are reserved for internal usage, so
+        # we have to use another nomenclature.
+        if re.match('^ns[0-9]+$', key):
+            key = 'mat%d' % cpt
+            cpt += 1
+
         namespace_map[key] = value
         ET.register_namespace(key, value)
 
@@ -23,16 +31,16 @@ def _parse_xml(full_path: str):
 
 
 def _sort_xml_attributes(full_path: str) -> bool:
+    """ Sort xml attributes lexicographically,
+    because it's possible to fingerprint producers (MS Office, Libreoffice, …)
+    since they are all using different orders.
+    """
     tree = ET.parse(full_path)
     root = tree.getroot()
 
     for c in root:
         c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
 
-    print('CLENAING %s' % full_path)
-    xmlstr = ET.tostring(root, encoding="utf-8", method="xml")
-    print(xmlstr.decode("utf-8"))
-
     tree.write(full_path, xml_declaration=True)
     return True
 
@@ -64,7 +72,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
         """
         try:
             tree, namespace = _parse_xml(full_path)
-        except ET.ParseError:
+        except ET.ParseError as e:
+            logging.error("Unable to parse %s: %s", full_path, e)
             return False
 
         # Revisions are either deletions (`w:del`) or
@@ -98,6 +107,9 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
         return True
 
     def _specific_cleanup(self, full_path: str) -> bool:
+        if os.stat(full_path).st_size == 0:  # Don't process empty files
+            return True
+
         if full_path.endswith('/word/document.xml'):
             # this file contains the revisions
             return self.__remove_revisions(full_path)
@@ -154,7 +166,8 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
     def __remove_revisions(full_path: str) -> bool:
         try:
             tree, namespace = _parse_xml(full_path)
-        except ET.ParseError:
+        except ET.ParseError as e:
+            logging.error("Unable to parse %s: %s", full_path, e)
             return False
 
         if 'office' not in namespace.keys():  # no revisions in the current file
@@ -169,11 +182,19 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
         return True
 
     def _specific_cleanup(self, full_path: str) -> bool:
-        if os.path.basename(full_path).endswith('.xml'):
-            _sort_xml_attributes(full_path)
+        if os.stat(full_path).st_size == 0:  # Don't process empty files
+            return True
 
-        if os.path.basename(full_path) == 'content.xml':
-            return self.__remove_revisions(full_path)
+        if os.path.basename(full_path).endswith('.xml'):
+            if os.path.basename(full_path) == 'content.xml':
+                if self.__remove_revisions(full_path) is False:
+                    return False
+
+            try:
+                _sort_xml_attributes(full_path)
+            except ET.ParseError as e:
+                logging.error("Unable to parse %s: %s", full_path, e)
+                return False
         return True
 
     def get_meta(self) -> Dict[str, str]:
-- 
GitLab