diff --git a/libmat2/office.py b/libmat2/office.py
index 50b776ef16dacf5d8d96248c725e4c28ec17229d..f987c713958ddf4b45a190c91658e8f443afa68f 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -12,7 +12,7 @@ assert Set
 assert Pattern
 
 def _parse_xml(full_path: str):
-    """ This function parse XML, with namespace support. """
+    """ This function parses XML, with namespace support. """
 
     namespace_map = dict()
     for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
@@ -22,6 +22,29 @@ def _parse_xml(full_path: str):
     return ET.parse(full_path), namespace_map
 
 
+def _sort_xml_attributes(full_path: str) -> bool:
+    """ Sort the grandchildren of the XML root, in place.
+
+    Despite its name, this function reorders elements, not attributes:
+    the children of each direct child of the root are sorted by tag, then
+    by the value of their `desc` attribute, and the tree is written back
+    to `full_path`.
+
+    Always returns True; parsing and I/O errors propagate as exceptions.
+    """
+    tree = ET.parse(full_path)
+
+    for c in tree.getroot():
+        # `desc` is optional: fall back to '' so that siblings sharing a tag
+        # never compare None with str, which would raise a TypeError.
+        c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc') or ''))
+
+    # NOTE(review): no namespace prefix is registered here, so ElementTree
+    # may rename prefixes on write - confirm whether _parse_xml should be used.
+    tree.write(full_path, xml_declaration=True)
+    return True
+
+
 class MSOfficeParser(ArchiveBasedAbstractParser):
     mimetypes = {
         'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
@@ -154,6 +177,10 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
         return True
 
     def _specific_cleanup(self, full_path: str) -> bool:
+        # Reorder the elements of every .xml file; abort the cleanup on failure.
+        if full_path.endswith('.xml') and not _sort_xml_attributes(full_path):
+            return False
+
         if os.path.basename(full_path) == 'content.xml':
             return self.__remove_revisions(full_path)
         return True