diff --git a/libmat2/archive.py b/libmat2/archive.py
index b29d690166b58d58e4bd2cd8bc8bcd7c3650e477..016142d74db1b5dbc7a3206cb00f5a8c9c4b15e8 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -15,20 +15,21 @@ assert Pattern
 
 class ArchiveBasedAbstractParser(abstract.AbstractParser):
     """ Office files (.docx, .odt, …) are zipped files. """
-    # Those are the files that have a format that _isn't_
-    # supported by MAT2, but that we want to keep anyway.
-    files_to_keep = set()  # type: Set[Pattern]
+    def __init__(self, filename):
+        super().__init__(filename)
 
-    # Those are the files that we _do not_ want to keep,
-    # no matter if they are supported or not.
-    files_to_omit = set()  # type: Set[Pattern]
+        # Those are the files that have a format that _isn't_
+        # supported by MAT2, but that we want to keep anyway.
+        self.files_to_keep = set()  # type: Set[Pattern]
 
-    # what should the parser do if it encounters an unknown file in
-    # the archive?
-    unknown_member_policy = UnknownMemberPolicy.ABORT  # type: UnknownMemberPolicy
+        # Those are the files that we _do not_ want to keep,
+        # no matter if they are supported or not.
+        self.files_to_omit = set()  # type: Set[Pattern]
+
+        # what should the parser do if it encounters an unknown file in
+        # the archive?
+        self.unknown_member_policy = UnknownMemberPolicy.ABORT  # type: UnknownMemberPolicy
 
-    def __init__(self, filename):
-        super().__init__(filename)
         try:  # better fail here than later
             zipfile.ZipFile(self.filename)
         except zipfile.BadZipFile:
diff --git a/libmat2/office.py b/libmat2/office.py
index 3abf10887def046df9b283c39ee867f9c978ca61..997a2476df55c34f976366ebb18fe611716f7e98 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -67,30 +67,33 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
         # See https://0xacab.org/jvoisin/mat2/issues/71
         'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml',  # /word/numbering.xml
     }
-    files_to_keep = set(map(re.compile, {  # type: ignore
-        r'^\[Content_Types\]\.xml$',
-        r'^_rels/\.rels$',
-        r'^word/_rels/document\.xml\.rels$',
-        r'^word/_rels/footer[0-9]*\.xml\.rels$',
-        r'^word/_rels/header[0-9]*\.xml\.rels$',
-
-        # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
-        r'^word/stylesWithEffects\.xml$',
-    }))
-    files_to_omit = set(map(re.compile, {  # type: ignore
-        r'^customXml/',
-        r'webSettings\.xml$',
-        r'^docProps/custom\.xml$',
-        r'^word/printerSettings/',
-        r'^word/theme',
-
-        # we have a whitelist in self.files_to_keep,
-        # so we can trash everything else
-        r'^word/_rels/',
-    }))
+
 
     def __init__(self, filename):
         super().__init__(filename)
+
+        self.files_to_keep = set(map(re.compile, {  # type: ignore
+            r'^\[Content_Types\]\.xml$',
+            r'^_rels/\.rels$',
+            r'^word/_rels/document\.xml\.rels$',
+            r'^word/_rels/footer[0-9]*\.xml\.rels$',
+            r'^word/_rels/header[0-9]*\.xml\.rels$',
+
+            # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
+            r'^word/stylesWithEffects\.xml$',
+        }))
+        self.files_to_omit = set(map(re.compile, {  # type: ignore
+            r'^customXml/',
+            r'webSettings\.xml$',
+            r'^docProps/custom\.xml$',
+            r'^word/printerSettings/',
+            r'^word/theme',
+
+            # we have a whitelist in self.files_to_keep,
+            # so we can trash everything else
+            r'^word/_rels/',
+        }))
+
         if self.__fill_files_to_keep_via_content_types() is False:
             raise ValueError
 
@@ -320,19 +323,24 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
         'application/vnd.oasis.opendocument.formula',
         'application/vnd.oasis.opendocument.image',
     }
-    files_to_keep = set(map(re.compile, {  # type: ignore
-        r'^META-INF/manifest\.xml$',
-        r'^content\.xml$',
-        r'^manifest\.rdf$',
-        r'^mimetype$',
-        r'^settings\.xml$',
-        r'^styles\.xml$',
-    }))
-    files_to_omit = set(map(re.compile, {  # type: ignore
-        r'^meta\.xml$',
-        r'^Configurations2/',
-        r'^Thumbnails/',
-    }))
+
+
+    def __init__(self, filename):
+        super().__init__(filename)
+
+        self.files_to_keep = set(map(re.compile, {  # type: ignore
+            r'^META-INF/manifest\.xml$',
+            r'^content\.xml$',
+            r'^manifest\.rdf$',
+            r'^mimetype$',
+            r'^settings\.xml$',
+            r'^styles\.xml$',
+        }))
+        self.files_to_omit = set(map(re.compile, {  # type: ignore
+            r'^meta\.xml$',
+            r'^Configurations2/',
+            r'^Thumbnails/',
+        }))
 
     @staticmethod
     def __remove_revisions(full_path: str) -> bool:
diff --git a/mat2 b/mat2
index 0b8ea9853eb3b0487946249ec5fd9f592112ee33..5afd8042b8336c3db5fc76df031b0646a3e4f093 100755
--- a/mat2
+++ b/mat2
@@ -3,10 +3,8 @@
 import os
 from typing import Tuple
 import sys
-import itertools
 import mimetypes
 import argparse
-import multiprocessing
 import logging
 
 try:
@@ -142,13 +140,13 @@ def main():
         if unknown_member_policy == UnknownMemberPolicy.KEEP:
             logging.warning('Keeping unknown member files may leak metadata in the resulting file!')
 
-        rep_mode = itertools.repeat(args.lightweight is True)
-        rep_policy = itertools.repeat(unknown_member_policy)
-        l = zip(__get_files_recursively(args.files), rep_mode, rep_policy)
+        success = True
+        for f in __get_files_recursively(args.files):
+            if clean_meta([f, args.lightweight, unknown_member_policy]) is False:
+                success = False
+        # sys.exit(True) exits with status 1, so return an explicit exit code.
+        return 0 if success else -1
 
-        p = multiprocessing.Pool()
-        ret = list(p.imap_unordered(clean_meta, list(l)))
-        return 0 if all(ret) else -1
 
 if __name__ == '__main__':
     sys.exit(main())