Skip to content
Snippets Groups Projects
Commit 25230107 authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Yay, it's working!

parent 30e567de
No related branches found
No related tags found
No related merge requests found
import logging
import os import os
import re import re
import zipfile import zipfile
...@@ -14,8 +15,15 @@ assert Pattern ...@@ -14,8 +15,15 @@ assert Pattern
def _parse_xml(full_path: str): def _parse_xml(full_path: str):
""" This function parses XML, with namespace support. """ """ This function parses XML, with namespace support. """
cpt = 0
namespace_map = dict() namespace_map = dict()
for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
# The ns[0-9]+ namespaces are reserved for internal usage, so
# we have to use another nomenclature.
if re.match('^ns[0-9]+$', key):
key = 'mat%d' % cpt
cpt += 1
namespace_map[key] = value namespace_map[key] = value
ET.register_namespace(key, value) ET.register_namespace(key, value)
...@@ -23,16 +31,16 @@ def _parse_xml(full_path: str): ...@@ -23,16 +31,16 @@ def _parse_xml(full_path: str):
def _sort_xml_attributes(full_path: str) -> bool:
    """Reorder, in place, the elements nested inside each child of the root.

    Producers (MS Office, LibreOffice, …) emit their XML in different
    orders, which makes it possible to fingerprint them. To remove that
    signal, the children of every direct child of the root element are
    sorted by their tag and then by the value of their ``desc`` attribute.
    The file at ``full_path`` is then overwritten with the result,
    including an XML declaration.

    :param full_path: path of the XML file to rewrite.
    :return: always ``True``.
    :raises xml.etree.ElementTree.ParseError: if the file is not
        well-formed XML (raised by ``ET.parse``; callers handle it).
    """
    tree = ET.parse(full_path)
    root = tree.getroot()

    def _ordering(child):
        # A missing `desc` attribute is ranked as the empty string: mixing
        # `None` and `str` in the key would raise a TypeError as soon as
        # two siblings share a tag but only one of them carries `desc`.
        return child.tag, child.get('desc', '')

    for section in root:
        # Slice assignment replaces the sub-elements while keeping the
        # section's own tag, text and attributes untouched.
        section[:] = sorted(section, key=_ordering)

    tree.write(full_path, xml_declaration=True)
    return True
...@@ -64,7 +72,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser): ...@@ -64,7 +72,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
""" """
try: try:
tree, namespace = _parse_xml(full_path) tree, namespace = _parse_xml(full_path)
except ET.ParseError: except ET.ParseError as e:
logging.error("Unable to parse %s: %s", full_path, e)
return False return False
# Revisions are either deletions (`w:del`) or # Revisions are either deletions (`w:del`) or
...@@ -98,6 +107,9 @@ class MSOfficeParser(ArchiveBasedAbstractParser): ...@@ -98,6 +107,9 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
return True return True
def _specific_cleanup(self, full_path: str) -> bool: def _specific_cleanup(self, full_path: str) -> bool:
if os.stat(full_path).st_size == 0: # Don't process empty files
return True
if full_path.endswith('/word/document.xml'): if full_path.endswith('/word/document.xml'):
# this file contains the revisions # this file contains the revisions
return self.__remove_revisions(full_path) return self.__remove_revisions(full_path)
...@@ -154,7 +166,8 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): ...@@ -154,7 +166,8 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
def __remove_revisions(full_path: str) -> bool: def __remove_revisions(full_path: str) -> bool:
try: try:
tree, namespace = _parse_xml(full_path) tree, namespace = _parse_xml(full_path)
except ET.ParseError: except ET.ParseError as e:
logging.error("Unable to parse %s: %s", full_path, e)
return False return False
if 'office' not in namespace.keys(): # no revisions in the current file if 'office' not in namespace.keys(): # no revisions in the current file
...@@ -169,11 +182,19 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): ...@@ -169,11 +182,19 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
return True return True
def _specific_cleanup(self, full_path: str) -> bool: def _specific_cleanup(self, full_path: str) -> bool:
if os.path.basename(full_path).endswith('.xml'): if os.stat(full_path).st_size == 0: # Don't process empty files
_sort_xml_attributes(full_path) return True
if os.path.basename(full_path) == 'content.xml': if os.path.basename(full_path).endswith('.xml'):
return self.__remove_revisions(full_path) if os.path.basename(full_path) == 'content.xml':
if self.__remove_revisions(full_path) is False:
return False
try:
_sort_xml_attributes(full_path)
except ET.ParseError as e:
logging.error("Unable to parse %s: %s", full_path, e)
return False
return True return True
def get_meta(self) -> Dict[str, str]: def get_meta(self) -> Dict[str, str]:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment