From 09e748fa4c1785ba8a21ccac13bb693555d6c57b Mon Sep 17 00:00:00 2001
From: jvoisin <>
Date: Thu, 21 Jun 2018 23:02:41 +0200
Subject: [PATCH] Refactor how office files are handled

- XML files are no longer considered harmless
- Factorization of the `remove_all` method for office files
- Explicit whitelists are used
- Blacklists are used to skip files completely
  - Non-blacklisted files are _still cleaned_
  - Unsupported files still trigger an error
 libmat2/ |   2 +-
 libmat2/   | 130 +++++++++++++++++++++-----------------------
 2 files changed, 63 insertions(+), 69 deletions(-)

diff --git a/libmat2/ b/libmat2/
index 54737a8..2878571 100644
--- a/libmat2/
+++ b/libmat2/
@@ -4,7 +4,7 @@ from . import abstract
 class HarmlessParser(abstract.AbstractParser):
     """ This is the parser for filetypes that do not contain metadata. """
-    mimetypes = {'application/xml', 'text/plain', 'text/xml', 'application/rdf+xml'}
+    mimetypes = {'text/plain', }
     def __init__(self, filename: str) -> None:
diff --git a/libmat2/ b/libmat2/
index 0791b07..fd3cdf4 100644
--- a/libmat2/
+++ b/libmat2/
@@ -4,17 +4,16 @@ import shutil
 import tempfile
 import datetime
 import zipfile
-from typing import Dict, Set
+from typing import Dict, Set, Pattern
 from . import abstract, parser_factory
-assert Set   # make pyflakes happy
 class ArchiveBasedAbstractParser(abstract.AbstractParser):
-    whitelist = set()  # type: Set[str]
+    files_to_keep : Set[str] = set()
+    files_to_omit : Set[Pattern] = set()
     def _clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
-        zipinfo.compress_type = zipfile.ZIP_DEFLATED
         zipinfo.create_system = 3  # Linux
         zipinfo.comment = b''
         zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
@@ -34,33 +33,51 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
             metadata['comment'] = zipinfo.comment  # type: ignore
         if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
-            metadata['date_time'] =str(datetime.datetime(*zipinfo.date_time))
+            metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time))
         return metadata
     def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,
                              zin: zipfile.ZipFile, zout: zipfile.ZipFile) -> bool:
-        output = ''
         zin.extract(member=item, path=temp_folder)
-        if item.filename not in self.whitelist:
-            full_path = os.path.join(temp_folder, item.filename)
-            tmp_parser, mtype = parser_factory.get_parser(full_path)  # type: ignore
-            if not tmp_parser:
-                zout.close()
-                os.remove(self.output_filename)
-                print("%s's format (%s) isn't supported" % (item.filename, mtype))
-                return False
-            tmp_parser.remove_all()
-            output = tmp_parser.output_filename
-        else:
-            output = os.path.join(temp_folder, item.filename)
+        full_path = os.path.join(temp_folder, item.filename)
+        tmp_parser, mtype = parser_factory.get_parser(full_path)  # type: ignore
+        if not tmp_parser:
+            zout.close()
+            os.remove(self.output_filename)
+            print("%s's format (%s) isn't supported" % (item.filename, mtype))
+            return False
+        tmp_parser.remove_all()
         zinfo = zipfile.ZipInfo(item.filename)  # type: ignore
         clean_zinfo = self._clean_zipinfo(zinfo)
-        with open(output, 'rb') as f:
+        with open(tmp_parser.output_filename, 'rb') as f:
         return True
+    def remove_all(self) -> bool:
+        zin = zipfile.ZipFile(self.filename, 'r')
+        zout = zipfile.ZipFile(self.output_filename, 'w')
+        temp_folder = tempfile.mkdtemp()
+        for item in zin.infolist():
+            if item.filename[-1] == '/':  # `is_dir` is added in Python3.6
+                continue  # don't keep empty folders
+            elif item.filename in self.files_to_keep:
+                item = self._clean_zipinfo(item)
+                zout.writestr(item,
+                continue
+            elif any(map(lambda r:, self.files_to_omit)):
+                continue
+            elif not self._clean_internal_file(item, temp_folder, zin, zout):
+                return False
+        shutil.rmtree(temp_folder)
+        zout.close()
+        zin.close()
+        return True
 class MSOfficeParser(ArchiveBasedAbstractParser):
     mimetypes = {
@@ -68,9 +85,20 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
-    files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
+    files_to_keep = {
+            '[Content_Types].xml',
+            '_rels/.rels',
+            'word/_rels/document.xml.rels',
+            'word/document.xml',
+            'word/fontTable.xml',
+            'word/settings.xml',
+            'word/styles.xml',
+    }
+    files_to_omit = set(map(re.compile, {  # type: ignore
+            '^docProps/',
+    }))
-    def get_meta(self):
+    def get_meta(self) -> Dict[str, str]:
         Yes, I know that parsing xml with regexp ain't pretty,
         be my guest and fix it if you want.
@@ -88,38 +116,12 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
                 if not metadata:  # better safe than sorry
                     metadata[item] = 'harmful content'
             for key, value in self._get_zipinfo_meta(item).items():
                 metadata[key] = value
         return metadata
-    def remove_all(self):
-        zin = zipfile.ZipFile(self.filename, 'r')
-        zout = zipfile.ZipFile(self.output_filename, 'w')
-        temp_folder = tempfile.mkdtemp()
-        for item in zin.infolist():
-            if item.filename[-1] == '/':
-                continue  # `is_dir` is added in Python3.6
-            elif item.filename.startswith('docProps/'):
-                continue  # don't keep metadata files
-            if item.filename in self.files_to_keep:
-                item = self._clean_zipinfo(item)
-                zout.writestr(item,
-                continue
-            if self._clean_internal_file(item, temp_folder, zin, zout) is False:
-                return False
-        shutil.rmtree(temp_folder)
-        zout.close()
-        zin.close()
-        return True
 class LibreOfficeParser(ArchiveBasedAbstractParser):
     mimetypes = {
@@ -130,10 +132,20 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
-    whitelist = {'mimetype', 'manifest.rdf'}
+    files_to_keep = {
+            'META-INF/manifest.xml',
+            'content.xml',
+            'manifest.rdf',
+            'mimetype',
+            'settings.xml',
+            'styles.xml',
+    }
+    files_to_omit = set(map(re.compile, {  # type: ignore
+            '^meta\.xml$',
+            '^Configurations2/',
+    }))
-    def get_meta(self):
+    def get_meta(self) -> Dict[str, str]:
         Yes, I know that parsing xml with regexp ain't pretty,
         be my guest and fix it if you want.
@@ -156,21 +168,3 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
         return metadata
-    def remove_all(self):
-        zin = zipfile.ZipFile(self.filename, 'r')
-        zout = zipfile.ZipFile(self.output_filename, 'w')
-        temp_folder = tempfile.mkdtemp()
-        for item in zin.infolist():
-            if item.filename[-1] == '/':
-                continue  # `is_dir` is added in Python3.6
-            elif item.filename == 'meta.xml':
-                continue  # don't keep metadata files
-            if self._clean_internal_file(item, temp_folder, zin, zout) is False:
-                return False
-        shutil.rmtree(temp_folder)
-        zout.close()
-        zin.close()
-        return True