diff --git a/libmat2/abstract.py b/libmat2/abstract.py index 41a720a689d6413ecf962df610e9e22afb4d0dc2..701ab6096c99929ecf9eb33c172a5761ac745827 100644 --- a/libmat2/abstract.py +++ b/libmat2/abstract.py @@ -6,10 +6,16 @@ assert Set # make pyflakes happy class AbstractParser(abc.ABC): + """ This is the base classe of every parser. + It might yeild `ValueError` on instanciation on invalid files. + """ meta_list = set() # type: Set[str] mimetypes = set() # type: Set[str] def __init__(self, filename: str) -> None: + """ + :raises ValueError: Raised upon an invalid file + """ self.filename = filename fname, extension = os.path.splitext(filename) self.output_filename = fname + '.cleaned' + extension @@ -23,5 +29,8 @@ class AbstractParser(abc.ABC): pass # pragma: no cover def remove_all_lightweight(self) -> bool: - """ Remove _SOME_ metadata. """ + """ This method removes _SOME_ metadata. + I might be useful to implement it for fileformats that do + not support non-destructive cleaning. + """ return self.remove_all() diff --git a/libmat2/harmless.py b/libmat2/harmless.py index 336873c0dbb7fb6dce3dbdc67f9b82136a0f6bab..f646099af7406290d4b749e7992e1ba5178b1bbe 100644 --- a/libmat2/harmless.py +++ b/libmat2/harmless.py @@ -4,7 +4,7 @@ from . import abstract class HarmlessParser(abstract.AbstractParser): - """ This is the parser for filetypes that do not contain metadata. """ + """ This is the parser for filetypes that can not contain metadata. """ mimetypes = {'text/plain', 'image/x-ms-bmp'} def get_meta(self) -> Dict[str, str]: diff --git a/libmat2/images.py b/libmat2/images.py index f9171e57664d2c08134374db4aa7eecef590624e..d47536bfa8f5f34152e044fa05db48da2ab4a555 100644 --- a/libmat2/images.py +++ b/libmat2/images.py @@ -19,6 +19,9 @@ from . import abstract assert Set class _ImageParser(abstract.AbstractParser): + """ Since we use `exiftool` to get metadata from + all images fileformat, `get_meta` is implemented in this class, + and all the image-handling ones are inheriting from it.""" meta_whitelist = set() # type: Set[str] @staticmethod @@ -72,7 +75,7 @@ class PNGParser(_ImageParser): class GdkPixbufAbstractParser(_ImageParser): """ GdkPixbuf can handle a lot of surfaces, so we're rending images on it, - this has the side-effect of removing metadata completely. + this has the side-effect of completely removing metadata. """ _type = '' diff --git a/libmat2/office.py b/libmat2/office.py index c6c468802a066766a85275bf27480a08742a20c4..62d03950f7b2d4b82f221b4247fab5d77bac37e7 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -33,6 +33,7 @@ def _parse_xml(full_path: str): class ArchiveBasedAbstractParser(abstract.AbstractParser): + """ Office files (.docx, .odt, …) are zipped files. """ # Those are the files that have a format that _isn't_ # supported by MAT2, but that we want to keep anyway. files_to_keep = set() # type: Set[str] @@ -58,14 +59,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: zipinfo.create_system = 3 # Linux zipinfo.comment = b'' - zipinfo.date_time = (1980, 1, 1, 0, 0, 0) + zipinfo.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be return zipinfo @staticmethod def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]: metadata = {} - if zipinfo.create_system == 3: - #metadata['create_system'] = 'Linux' + if zipinfo.create_system == 3: # this is Linux pass elif zipinfo.create_system == 2: metadata['create_system'] = 'Windows' @@ -145,23 +145,27 @@ class MSOfficeParser(ArchiveBasedAbstractParser): @staticmethod def __remove_revisions(full_path: str) -> bool: - """ In this function, we're changing the XML - document in two times, since we don't want - to change the tree we're iterating on.""" + """ In this function, we're changing the XML document in several + different times, since we don't want to change the tree we're currently + iterating on. + """ try: tree, namespace = _parse_xml(full_path) except ET.ParseError: return False - # No revisions are present + # Revisions are either deletions (`w:del`) or + # insertions (`w:ins`) del_presence = tree.find('.//w:del', namespace) ins_presence = tree.find('.//w:ins', namespace) if del_presence is None and ins_presence is None: - return True + return True # No revisions are present parent_map = {c:p for p in tree.iter() for c in p} - elements = list([element for element in tree.iterfind('.//w:del', namespace)]) + elements = list() + for element in tree.iterfind('.//w:del', namespace): + elements.append(element) for element in elements: parent_map[element].remove(element) @@ -172,7 +176,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser): for children in element.iterfind('./*'): elements.append((element, position, children)) break - for (element, position, children) in elements: parent_map[element].insert(position, children) parent_map[element].remove(element) @@ -183,6 +186,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser): def _specific_cleanup(self, full_path: str) -> bool: if full_path.endswith('/word/document.xml'): + # this file contains the revisions return self.__remove_revisions(full_path) return True diff --git a/libmat2/parser_factory.py b/libmat2/parser_factory.py index bd442b8a445ed87ac0c679b51ffb9f41fe8e006b..f42acfb841c5507a828b8ddc6086c80a16ae7ef7 100644 --- a/libmat2/parser_factory.py +++ b/libmat2/parser_factory.py @@ -13,10 +13,12 @@ T = TypeVar('T', bound='abstract.AbstractParser') def __load_all_parsers(): """ Loads every parser in a dynamic way """ current_dir = os.path.dirname(__file__) - for name in glob.glob(os.path.join(current_dir, '*.py')): - if name.endswith('abstract.py') or name.endswith('__init__.py'): + for fname in glob.glob(os.path.join(current_dir, '*.py')): + if fname.endswith('abstract.py'): continue - basename = os.path.basename(name) + elif fname.endswith('__init__.py'): + continue + basename = os.path.basename(fname) name, _ = os.path.splitext(basename) importlib.import_module('.' + name, package='libmat2') diff --git a/libmat2/pdf.py b/libmat2/pdf.py index 053a7685154634c1c8c99342b0d4c2396f632e61..d3c4698ae5351f4c29167249dd68f6cd4e2b74bd 100644 --- a/libmat2/pdf.py +++ b/libmat2/pdf.py @@ -47,7 +47,7 @@ class PDFParser(abstract.AbstractParser): pages_count = document.get_n_pages() tmp_path = tempfile.mkstemp()[1] - pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) + pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) # resized later anyway pdf_context = cairo.Context(pdf_surface) # context draws on the surface for pagenum in range(pages_count): @@ -101,7 +101,7 @@ class PDFParser(abstract.AbstractParser): pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale) pdf_context.set_source_surface(img, 0, 0) pdf_context.paint() - pdf_context.show_page() + pdf_context.show_page() # draw pdf_context on pdf_surface pdf_surface.finish()