Commit 1b356b8c authored by jvoisin

Improve mat2's cli reliability

- Replace some class members by instance members
- Don't parallelize the cleaning process for now (drop the multiprocessing pool)
parent c67bbafb
Pipeline #19313 failed with stages
in 2 minutes and 57 seconds
......@@ -15,20 +15,21 @@ assert Pattern
class ArchiveBasedAbstractParser(abstract.AbstractParser):
""" Office files (.docx, .odt, …) are zipped files. """
# Those are the files that have a format that _isn't_
# supported by MAT2, but that we want to keep anyway.
files_to_keep = set() # type: Set[Pattern]
def __init__(self, filename):
super().__init__(filename)
# Those are the files that we _do not_ want to keep,
# no matter if they are supported or not.
files_to_omit = set() # type: Set[Pattern]
# Those are the files that have a format that _isn't_
# supported by MAT2, but that we want to keep anyway.
self.files_to_keep = set() # type: Set[Pattern]
# what should the parser do if it encounters an unknown file in
# the archive?
unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
# Those are the files that we _do not_ want to keep,
# no matter if they are supported or not.
self.files_to_omit = set() # type: Set[Pattern]
# what should the parser do if it encounters an unknown file in
# the archive?
self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
def __init__(self, filename):
super().__init__(filename)
try: # better fail here than later
zipfile.ZipFile(self.filename)
except zipfile.BadZipFile:
......
......@@ -67,30 +67,33 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
# See https://0xacab.org/jvoisin/mat2/issues/71
'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml
}
files_to_keep = set(map(re.compile, { # type: ignore
r'^\[Content_Types\]\.xml$',
r'^_rels/\.rels$',
r'^word/_rels/document\.xml\.rels$',
r'^word/_rels/footer[0-9]*\.xml\.rels$',
r'^word/_rels/header[0-9]*\.xml\.rels$',
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
r'^word/stylesWithEffects\.xml$',
}))
files_to_omit = set(map(re.compile, { # type: ignore
r'^customXml/',
r'webSettings\.xml$',
r'^docProps/custom\.xml$',
r'^word/printerSettings/',
r'^word/theme',
# we have a whitelist in self.files_to_keep,
# so we can trash everything else
r'^word/_rels/',
}))
def __init__(self, filename):
super().__init__(filename)
self.files_to_keep = set(map(re.compile, { # type: ignore
r'^\[Content_Types\]\.xml$',
r'^_rels/\.rels$',
r'^word/_rels/document\.xml\.rels$',
r'^word/_rels/footer[0-9]*\.xml\.rels$',
r'^word/_rels/header[0-9]*\.xml\.rels$',
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
r'^word/stylesWithEffects\.xml$',
}))
self.files_to_omit = set(map(re.compile, { # type: ignore
r'^customXml/',
r'webSettings\.xml$',
r'^docProps/custom\.xml$',
r'^word/printerSettings/',
r'^word/theme',
# we have a whitelist in self.files_to_keep,
# so we can trash everything else
r'^word/_rels/',
}))
if self.__fill_files_to_keep_via_content_types() is False:
raise ValueError
......@@ -320,19 +323,24 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
'application/vnd.oasis.opendocument.formula',
'application/vnd.oasis.opendocument.image',
}
files_to_keep = set(map(re.compile, { # type: ignore
r'^META-INF/manifest\.xml$',
r'^content\.xml$',
r'^manifest\.rdf$',
r'^mimetype$',
r'^settings\.xml$',
r'^styles\.xml$',
}))
files_to_omit = set(map(re.compile, { # type: ignore
r'^meta\.xml$',
r'^Configurations2/',
r'^Thumbnails/',
}))
def __init__(self, filename):
super().__init__(filename)
self.files_to_keep = set(map(re.compile, { # type: ignore
r'^META-INF/manifest\.xml$',
r'^content\.xml$',
r'^manifest\.rdf$',
r'^mimetype$',
r'^settings\.xml$',
r'^styles\.xml$',
}))
self.files_to_omit = set(map(re.compile, { # type: ignore
r'^meta\.xml$',
r'^Configurations2/',
r'^Thumbnails/',
}))
@staticmethod
def __remove_revisions(full_path: str) -> bool:
......
......@@ -3,10 +3,8 @@
import os
from typing import Tuple
import sys
import itertools
import mimetypes
import argparse
import multiprocessing
import logging
try:
......@@ -142,13 +140,12 @@ def main():
if unknown_member_policy == UnknownMemberPolicy.KEEP:
logging.warning('Keeping unknown member files may leak metadata in the resulting file!')
rep_mode = itertools.repeat(args.lightweight is True)
rep_policy = itertools.repeat(unknown_member_policy)
l = zip(__get_files_recursively(args.files), rep_mode, rep_policy)
success = True
for f in __get_files_recursively(args.files):
if clean_meta([f, args.lightweight, unknown_member_policy]) is False:
success = False
return success
p = multiprocessing.Pool()
ret = list(p.imap_unordered(clean_meta, list(l)))
return 0 if all(ret) else -1
if __name__ == '__main__':
sys.exit(main())
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment