Skip to content
Snippets Groups Projects
Commit 25230107 authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Yay, it's working!

parent 30e567de
No related branches found
No related tags found
No related merge requests found
import logging
import os
import re
import zipfile
......@@ -14,8 +15,15 @@ assert Pattern
def _parse_xml(full_path: str):
""" This function parses XML, with namespace support. """
cpt = 0
namespace_map = dict()
for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
# The ns[0-9]+ namespaces are reserved for internal usage, so
# we have to use another nomenclature.
if re.match('^ns[0-9]+$', key):
key = 'mat%d' % cpt
cpt += 1
namespace_map[key] = value
ET.register_namespace(key, value)
......@@ -23,16 +31,16 @@ def _parse_xml(full_path: str):
def _sort_xml_attributes(full_path: str) -> bool:
""" Sort xml attributes lexicographically,
because it's possible to fingerprint producers (MS Office, Libreoffice, …)
since they are all using different orders.
"""
tree = ET.parse(full_path)
root = tree.getroot()
for c in root:
c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
print('CLENAING %s' % full_path)
xmlstr = ET.tostring(root, encoding="utf-8", method="xml")
print(xmlstr.decode("utf-8"))
tree.write(full_path, xml_declaration=True)
return True
......@@ -64,7 +72,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
"""
try:
tree, namespace = _parse_xml(full_path)
except ET.ParseError:
except ET.ParseError as e:
logging.error("Unable to parse %s: %s", full_path, e)
return False
# Revisions are either deletions (`w:del`) or
......@@ -98,6 +107,9 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
return True
def _specific_cleanup(self, full_path: str) -> bool:
if os.stat(full_path).st_size == 0: # Don't process empty files
return True
if full_path.endswith('/word/document.xml'):
# this file contains the revisions
return self.__remove_revisions(full_path)
......@@ -154,7 +166,8 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
def __remove_revisions(full_path: str) -> bool:
try:
tree, namespace = _parse_xml(full_path)
except ET.ParseError:
except ET.ParseError as e:
logging.error("Unable to parse %s: %s", full_path, e)
return False
if 'office' not in namespace.keys(): # no revisions in the current file
......@@ -169,11 +182,19 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
return True
def _specific_cleanup(self, full_path: str) -> bool:
if os.path.basename(full_path).endswith('.xml'):
_sort_xml_attributes(full_path)
if os.stat(full_path).st_size == 0: # Don't process empty files
return True
if os.path.basename(full_path) == 'content.xml':
return self.__remove_revisions(full_path)
if os.path.basename(full_path).endswith('.xml'):
if os.path.basename(full_path) == 'content.xml':
if self.__remove_revisions(full_path) is False:
return False
try:
_sort_xml_attributes(full_path)
except ET.ParseError as e:
logging.error("Unable to parse %s: %s", full_path, e)
return False
return True
def get_meta(self) -> Dict[str, str]:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment