From 25230107c4ecdecdbab7f3704e29d5f358252e97 Mon Sep 17 00:00:00 2001 From: jvoisin <julien.voisin@dustri.org> Date: Tue, 18 Sep 2018 16:56:58 +0200 Subject: [PATCH] Yay, it's working! --- libmat2/office.py | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/libmat2/office.py b/libmat2/office.py index f987c71..5c2c996 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -1,3 +1,4 @@ +import logging import os import re import zipfile @@ -14,8 +15,15 @@ assert Pattern def _parse_xml(full_path: str): """ This function parses XML, with namespace support. """ + cpt = 0 namespace_map = dict() for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): + # The ns[0-9]+ namespaces are reserved for internal usage, so + # we have to use another nomenclature. + if re.match('^ns[0-9]+$', key): + key = 'mat%d' % cpt + cpt += 1 + namespace_map[key] = value ET.register_namespace(key, value) @@ -23,16 +31,16 @@ def _parse_xml(full_path: str): def _sort_xml_attributes(full_path: str) -> bool: + """ Sort xml attributes lexicographically, + because it's possible to fingerprint producers (MS Office, Libreoffice, …) + since they are all using different orders. 
+ """ tree = ET.parse(full_path) root = tree.getroot() for c in root: c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc'))) - print('CLENAING %s' % full_path) - xmlstr = ET.tostring(root, encoding="utf-8", method="xml") - print(xmlstr.decode("utf-8")) - tree.write(full_path, xml_declaration=True) return True @@ -64,7 +72,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser): """ try: tree, namespace = _parse_xml(full_path) - except ET.ParseError: + except ET.ParseError as e: + logging.error("Unable to parse %s: %s", full_path, e) return False # Revisions are either deletions (`w:del`) or @@ -98,6 +107,9 @@ class MSOfficeParser(ArchiveBasedAbstractParser): return True def _specific_cleanup(self, full_path: str) -> bool: + if os.stat(full_path).st_size == 0: # Don't process empty files + return True + if full_path.endswith('/word/document.xml'): # this file contains the revisions return self.__remove_revisions(full_path) @@ -154,7 +166,8 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): def __remove_revisions(full_path: str) -> bool: try: tree, namespace = _parse_xml(full_path) - except ET.ParseError: + except ET.ParseError as e: + logging.error("Unable to parse %s: %s", full_path, e) return False if 'office' not in namespace.keys(): # no revisions in the current file @@ -169,11 +182,19 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): return True def _specific_cleanup(self, full_path: str) -> bool: - if os.path.basename(full_path).endswith('.xml'): - _sort_xml_attributes(full_path) + if os.stat(full_path).st_size == 0: # Don't process empty files + return True - if os.path.basename(full_path) == 'content.xml': - return self.__remove_revisions(full_path) + if os.path.basename(full_path).endswith('.xml'): + if os.path.basename(full_path) == 'content.xml': + if self.__remove_revisions(full_path) is False: + return False + + try: + _sort_xml_attributes(full_path) + except ET.ParseError as e: + logging.error("Unable to parse %s: %s", full_path, 
e) + return False return True def get_meta(self) -> Dict[str, str]: -- GitLab