Skip to content
Snippets Groups Projects
Commit fbcf68c2 authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Lexicographical sort on xml attributes for office files

In XML, the order of the attributes shouldn't be meaningful,
however, MS Office sorts attributes for a given XML tag
differently than LibreOffice.
parent 9826de35
No related branches found
No related tags found
No related merge requests found
import logging
import os import os
import re import re
import zipfile import zipfile
...@@ -12,16 +13,38 @@ assert Set ...@@ -12,16 +13,38 @@ assert Set
assert Pattern assert Pattern
def _parse_xml(full_path: str): def _parse_xml(full_path: str):
""" This function parse XML, with namespace support. """ """ This function parses XML, with namespace support. """
cpt = 0
namespace_map = dict() namespace_map = dict()
for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
# The ns[0-9]+ namespaces are reserved for interal usage, so
# we have to use an other nomenclature.
if re.match('^ns[0-9]+$', key):
key = 'mat%d' % cpt
cpt += 1
namespace_map[key] = value namespace_map[key] = value
ET.register_namespace(key, value) ET.register_namespace(key, value)
return ET.parse(full_path), namespace_map return ET.parse(full_path), namespace_map
def _sort_xml_attributes(full_path: str) -> bool:
""" Sort xml attributes lexicographically,
because it's possible to fingerprint producers (MS Office, Libreoffice, …)
since they are all using different orders.
"""
tree = ET.parse(full_path)
root = tree.getroot()
for c in root:
c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
tree.write(full_path, xml_declaration=True)
return True
class MSOfficeParser(ArchiveBasedAbstractParser): class MSOfficeParser(ArchiveBasedAbstractParser):
mimetypes = { mimetypes = {
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
...@@ -49,7 +72,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser): ...@@ -49,7 +72,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
""" """
try: try:
tree, namespace = _parse_xml(full_path) tree, namespace = _parse_xml(full_path)
except ET.ParseError: except ET.ParseError as e:
logging.error("Unable to parse %s: %s", full_path, e)
return False return False
# Revisions are either deletions (`w:del`) or # Revisions are either deletions (`w:del`) or
...@@ -83,6 +107,9 @@ class MSOfficeParser(ArchiveBasedAbstractParser): ...@@ -83,6 +107,9 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
return True return True
def _specific_cleanup(self, full_path: str) -> bool: def _specific_cleanup(self, full_path: str) -> bool:
if os.stat(full_path).st_size == 0: # Don't process empty files
return True
if full_path.endswith('/word/document.xml'): if full_path.endswith('/word/document.xml'):
# this file contains the revisions # this file contains the revisions
return self.__remove_revisions(full_path) return self.__remove_revisions(full_path)
...@@ -139,7 +166,8 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): ...@@ -139,7 +166,8 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
def __remove_revisions(full_path: str) -> bool: def __remove_revisions(full_path: str) -> bool:
try: try:
tree, namespace = _parse_xml(full_path) tree, namespace = _parse_xml(full_path)
except ET.ParseError: except ET.ParseError as e:
logging.error("Unable to parse %s: %s", full_path, e)
return False return False
if 'office' not in namespace.keys(): # no revisions in the current file if 'office' not in namespace.keys(): # no revisions in the current file
...@@ -154,8 +182,19 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): ...@@ -154,8 +182,19 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
return True return True
def _specific_cleanup(self, full_path: str) -> bool: def _specific_cleanup(self, full_path: str) -> bool:
if os.path.basename(full_path) == 'content.xml': if os.stat(full_path).st_size == 0: # Don't process empty files
return self.__remove_revisions(full_path) return True
if os.path.basename(full_path).endswith('.xml'):
if os.path.basename(full_path) == 'content.xml':
if self.__remove_revisions(full_path) is False:
return False
try:
_sort_xml_attributes(full_path)
except ET.ParseError as e:
logging.error("Unable to parse %s: %s", full_path, e)
return False
return True return True
def get_meta(self) -> Dict[str, str]: def get_meta(self) -> Dict[str, str]:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment