Skip to content
Snippets Groups Projects
Commit 652b8e51 authored by Julien (jvoisin) Voisin
Browse files

Files processed via MAT2 are now accepted without warnings by MS Office

parent c14be47f
Branches
Tags
No related merge requests found
...@@ -36,9 +36,8 @@ def _sort_xml_attributes(full_path: str) -> bool: ...@@ -36,9 +36,8 @@ def _sort_xml_attributes(full_path: str) -> bool:
since they are all using different orders. since they are all using different orders.
""" """
tree = ET.parse(full_path) tree = ET.parse(full_path)
root = tree.getroot()
for c in root: for c in tree.getroot():
c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc'))) c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
tree.write(full_path, xml_declaration=True) tree.write(full_path, xml_declaration=True)
...@@ -59,6 +58,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser): ...@@ -59,6 +58,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
'word/fontTable.xml', 'word/fontTable.xml',
'word/settings.xml', 'word/settings.xml',
'word/styles.xml', 'word/styles.xml',
'docProps/app.xml',
'docProps/core.xml',
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
'word/stylesWithEffects.xml', 'word/stylesWithEffects.xml',
...@@ -66,7 +67,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser): ...@@ -66,7 +67,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
files_to_omit = set(map(re.compile, { # type: ignore files_to_omit = set(map(re.compile, { # type: ignore
'word/webSettings.xml', 'word/webSettings.xml',
'word/theme', 'word/theme',
'^docProps/',
})) }))
@staticmethod @staticmethod
...@@ -95,7 +95,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser): ...@@ -95,7 +95,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
elements_to_remove = list() elements_to_remove = list()
for item in tree.iterfind('.//', namespace): for item in tree.iterfind('.//', namespace):
if '}rsid' in item.tag.strip().lower(): # resi as tag if '}rsid' in item.tag.strip().lower(): # rsid as tag
elements_to_remove.append(item) elements_to_remove.append(item)
continue continue
for key in list(item.attrib.keys()): # rsid as attribute for key in list(item.attrib.keys()): # rsid as attribute
...@@ -106,7 +106,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser): ...@@ -106,7 +106,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
parent_map[element].remove(element) parent_map[element].remove(element)
tree.write(full_path, xml_declaration=True) tree.write(full_path, xml_declaration=True)
return True return True
@staticmethod @staticmethod
...@@ -148,7 +147,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser): ...@@ -148,7 +147,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
parent_map[element].remove(element) parent_map[element].remove(element)
tree.write(full_path, xml_declaration=True) tree.write(full_path, xml_declaration=True)
return True return True
def __remove_content_type_members(self, full_path: str) -> bool: def __remove_content_type_members(self, full_path: str) -> bool:
...@@ -176,28 +174,68 @@ class MSOfficeParser(ArchiveBasedAbstractParser): ...@@ -176,28 +174,68 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
root.remove(item) root.remove(item)
tree.write(full_path, xml_declaration=True) tree.write(full_path, xml_declaration=True)
return True return True
def _specific_cleanup(self, full_path: str) -> bool: def _specific_cleanup(self, full_path: str) -> bool:
# pylint: disable=too-many-return-statements
if os.stat(full_path).st_size == 0: # Don't process empty files if os.stat(full_path).st_size == 0: # Don't process empty files
return True return True
if not full_path.endswith('.xml'):
return True
if full_path.endswith('/[Content_Types].xml'): if full_path.endswith('/[Content_Types].xml'):
# this file contains references to files that we might # this file contains references to files that we might
# remove, and MS Office doesn't like dangling references # remove, and MS Office doesn't like dangling references
if self.__remove_content_type_members(full_path) is False: if self.__remove_content_type_members(full_path) is False:
return False return False
elif full_path.endswith('/word/document.xml'):
if full_path.endswith('/word/document.xml'):
# this file contains the revisions # this file contains the revisions
if self.__remove_revisions(full_path) is False: if self.__remove_revisions(full_path) is False:
return False return False
elif full_path.endswith('/docProps/app.xml'):
# This file must be present and valid,
# so we're removing as much as we can.
with open(full_path, 'wb') as f:
f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
f.write(b'<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">')
f.write(b'</Properties>')
elif full_path.endswith('/docProps/core.xml'):
# This file must be present and valid,
# so we're removing as much as we can.
with open(full_path, 'wb') as f:
f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
f.write(b'<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties">')
f.write(b'</cp:coreProperties>')
if full_path.endswith('.xml'):
if self.__remove_rsid(full_path) is False: if self.__remove_rsid(full_path) is False:
return False return False
try:
_sort_xml_attributes(full_path)
except ET.ParseError as e: # pragma: no cover
logging.error("Unable to parse %s: %s", full_path, e)
return False
# This is awful, I'm sorry.
#
# Microsoft Office isn't happy when the `mc:Ignorable`
# attribute contains namespaces that aren't present in the xml file,
# so instead of trying to remove this specific attribute with etree,
# we're removing it with a regexp.
#
# Since we're the ones producing this file, via the call to
# _sort_xml_attributes, there won't be any "funny tricks".
# Worst case, the tag isn't present, and everything is fine.
#
# see: https://docs.microsoft.com/en-us/dotnet/framework/wpf/advanced/mc-ignorable-attribute
with open(full_path, 'rb') as f:
text = f.read()
out = re.sub(b'mc:Ignorable="[^"]*"', b'', text, 1)
with open(full_path, 'wb') as f:
f.write(out)
return True return True
def get_meta(self) -> Dict[str, str]: def get_meta(self) -> Dict[str, str]:
...@@ -262,7 +300,6 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): ...@@ -262,7 +300,6 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
text.remove(changes) text.remove(changes)
tree.write(full_path, xml_declaration=True) tree.write(full_path, xml_declaration=True)
return True return True
def _specific_cleanup(self, full_path: str) -> bool: def _specific_cleanup(self, full_path: str) -> bool:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment