diff --git a/libmat2/archive.py b/libmat2/archive.py
index b29d690166b58d58e4bd2cd8bc8bcd7c3650e477..016142d74db1b5dbc7a3206cb00f5a8c9c4b15e8 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -15,20 +15,21 @@ assert Pattern
class ArchiveBasedAbstractParser(abstract.AbstractParser):
""" Office files (.docx, .odt, …) are zipped files. """
- # Those are the files that have a format that _isn't_
- # supported by MAT2, but that we want to keep anyway.
- files_to_keep = set() # type: Set[Pattern]
+ def __init__(self, filename):
+ super().__init__(filename)
- # Those are the files that we _do not_ want to keep,
- # no matter if they are supported or not.
- files_to_omit = set() # type: Set[Pattern]
+ # Those are the files that have a format that _isn't_
+ # supported by MAT2, but that we want to keep anyway.
+ self.files_to_keep = set() # type: Set[Pattern]
- # what should the parser do if it encounters an unknown file in
- # the archive?
- unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
+ # Those are the files that we _do not_ want to keep,
+ # no matter if they are supported or not.
+ self.files_to_omit = set() # type: Set[Pattern]
+
+ # what should the parser do if it encounters an unknown file in
+ # the archive?
+ self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
- def __init__(self, filename):
- super().__init__(filename)
try: # better fail here than later
zipfile.ZipFile(self.filename)
except zipfile.BadZipFile:
diff --git a/libmat2/office.py b/libmat2/office.py
index 3abf10887def046df9b283c39ee867f9c978ca61..997a2476df55c34f976366ebb18fe611716f7e98 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -67,30 +67,33 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
# See https://0xacab.org/jvoisin/mat2/issues/71
'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml
}
- files_to_keep = set(map(re.compile, { # type: ignore
- r'^\[Content_Types\]\.xml$',
- r'^_rels/\.rels$',
- r'^word/_rels/document\.xml\.rels$',
- r'^word/_rels/footer[0-9]*\.xml\.rels$',
- r'^word/_rels/header[0-9]*\.xml\.rels$',
-
- # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
- r'^word/stylesWithEffects\.xml$',
- }))
- files_to_omit = set(map(re.compile, { # type: ignore
- r'^customXml/',
- r'webSettings\.xml$',
- r'^docProps/custom\.xml$',
- r'^word/printerSettings/',
- r'^word/theme',
-
- # we have a whitelist in self.files_to_keep,
- # so we can trash everything else
- r'^word/_rels/',
- }))
+
def __init__(self, filename):
super().__init__(filename)
+
+ self.files_to_keep = set(map(re.compile, { # type: ignore
+ r'^\[Content_Types\]\.xml$',
+ r'^_rels/\.rels$',
+ r'^word/_rels/document\.xml\.rels$',
+ r'^word/_rels/footer[0-9]*\.xml\.rels$',
+ r'^word/_rels/header[0-9]*\.xml\.rels$',
+
+ # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
+ r'^word/stylesWithEffects\.xml$',
+ }))
+ self.files_to_omit = set(map(re.compile, { # type: ignore
+ r'^customXml/',
+ r'webSettings\.xml$',
+ r'^docProps/custom\.xml$',
+ r'^word/printerSettings/',
+ r'^word/theme',
+
+ # we have a whitelist in self.files_to_keep,
+ # so we can trash everything else
+ r'^word/_rels/',
+ }))
+
if self.__fill_files_to_keep_via_content_types() is False:
raise ValueError
@@ -320,19 +323,24 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
'application/vnd.oasis.opendocument.formula',
'application/vnd.oasis.opendocument.image',
}
- files_to_keep = set(map(re.compile, { # type: ignore
- r'^META-INF/manifest\.xml$',
- r'^content\.xml$',
- r'^manifest\.rdf$',
- r'^mimetype$',
- r'^settings\.xml$',
- r'^styles\.xml$',
- }))
- files_to_omit = set(map(re.compile, { # type: ignore
- r'^meta\.xml$',
- r'^Configurations2/',
- r'^Thumbnails/',
- }))
+
+
+ def __init__(self, filename):
+ super().__init__(filename)
+ # Assigned after super().__init__(): the base constructor sets both attributes to empty sets, so an earlier assignment would be overwritten.
+ self.files_to_keep = set(map(re.compile, { # type: ignore
+ r'^META-INF/manifest\.xml$',
+ r'^content\.xml$',
+ r'^manifest\.rdf$',
+ r'^mimetype$',
+ r'^settings\.xml$',
+ r'^styles\.xml$',
+ }))
+ self.files_to_omit = set(map(re.compile, { # type: ignore
+ r'^meta\.xml$',
+ r'^Configurations2/',
+ r'^Thumbnails/',
+ }))
@staticmethod
def __remove_revisions(full_path: str) -> bool:
diff --git a/mat2 b/mat2
index 0b8ea9853eb3b0487946249ec5fd9f592112ee33..5afd8042b8336c3db5fc76df031b0646a3e4f093 100755
--- a/mat2
+++ b/mat2
@@ -3,10 +3,8 @@
import os
from typing import Tuple
import sys
-import itertools
import mimetypes
import argparse
-import multiprocessing
import logging
try:
@@ -142,13 +140,12 @@ def main():
if unknown_member_policy == UnknownMemberPolicy.KEEP:
logging.warning('Keeping unknown member files may leak metadata in the resulting file!')
- rep_mode = itertools.repeat(args.lightweight is True)
- rep_policy = itertools.repeat(unknown_member_policy)
- l = zip(__get_files_recursively(args.files), rep_mode, rep_policy)
+    success = True  # flips to False as soon as any file fails to be cleaned
+    for f in __get_files_recursively(args.files):
+        if clean_meta([f, args.lightweight, unknown_member_policy]) is False:
+            success = False  # keep going: remaining files are still processed
+    return 0 if success else -1  # main() feeds sys.exit(); a bare bool would exit 1 on success and 0 on failure
- p = multiprocessing.Pool()
- ret = list(p.imap_unordered(clean_meta, list(l)))
- return 0 if all(ret) else -1
if __name__ == '__main__':
sys.exit(main())