diff --git a/libmat2/office.py b/libmat2/office.py index b769991924e4e159f58b72eccb1ebf9058c02b28..12c330950fbacbbaecf33606f1bf3bb88b2bab24 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -19,7 +19,7 @@ def _parse_xml(full_path: str) -> Tuple[ET.ElementTree, Dict[str, str]]: namespace_map = dict() for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): # The ns[0-9]+ namespaces are reserved for internal usage, so - # we have to use an other nomenclature. + # we have to use another nomenclature. if re.match('^ns[0-9]+$', key, re.I): # pragma: no cover key = 'mat' + key[2:] @@ -98,7 +98,7 @@ class MSOfficeParser(ZipParser): raise ValueError def __fill_files_to_keep_via_content_types(self) -> bool: - """ There is a suer-handy `[Content_Types].xml` file + """ There is a super-handy `[Content_Types].xml` file in MS Office archives, describing what each other file contains. The self.content_types_to_keep member contains a type allowlist, so we're using it to fill the self.files_to_keep one. diff --git a/mat2 b/mat2 index f180d5cf579a65432e2bcf3e105db095b3945175..4b6f8c4fb5abe2c50609f139a9ecac39a9d5cee1 100755 --- a/mat2 +++ b/mat2 @@ -7,6 +7,7 @@ import mimetypes import argparse import logging import unicodedata +from multiprocessing import Pool, cpu_count try: from libmat2 import parser_factory, UNSUPPORTED_EXTENSIONS @@ -58,7 +59,6 @@ def create_arg_parser() -> argparse.ArgumentParser: 'files (policy should be one of: %s) [Default: abort]' % ', '.join(p.value for p in UnknownMemberPolicy)) - info = parser.add_mutually_exclusive_group() info.add_argument('-s', '--show', action='store_true', help='list harmful metadata detectable by MAT2 without removing them') @@ -123,7 +123,6 @@ def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy) return False - def show_parsers(): print('[+] Supported formats:') formats = set() # Set[str] @@ -155,6 +154,7 @@ def __get_files_recursively(files: List[str]) -> Generator[str, None, None]: def main() -> int: arg_parser = create_arg_parser() args = arg_parser.parse_args() + async_pool = Pool(processes=cpu_count()) if args.verbose: logging.basicConfig(level=logging.INFO) @@ -163,7 +163,7 @@ def main() -> int: if args.list: show_parsers() return 0 - elif args.check_dependencies: + if args.check_dependencies: print("Dependencies required for MAT2 %s:" % __version__) for key, value in sorted(check_dependencies().items()): print('- %s: %s' % (key, 'yes' if value else 'no')) @@ -171,21 +171,24 @@ def main() -> int: arg_parser.print_help() return 0 - elif args.show: + if args.show: + procs = [] for f in __get_files_recursively(args.files): - show_meta(f) - return 0 + procs.append(async_pool.apply_async(show_meta, (f,))) + async_pool.close() + async_pool.join() + return 0 if all(p.get() for p in procs) else -1 else: + procs = [] policy = UnknownMemberPolicy(args.unknown_members) if policy == UnknownMemberPolicy.KEEP: logging.warning('Keeping unknown member files may leak metadata in the resulting file!') - - no_failure = True for f in __get_files_recursively(args.files): - if clean_meta(f, args.lightweight, policy) is False: - no_failure = False - return 0 if no_failure is True else -1 + procs.append(async_pool.apply_async(clean_meta, (f, args.lightweight, policy))) + async_pool.close() + async_pool.join() + return 0 if all(p.get() for p in procs) else -1 if __name__ == '__main__':