From 5e78c0e036375a1db3da88a1546abe968db33dad Mon Sep 17 00:00:00 2001 From: jvoisin <julien.voisin@dustri.org> Date: Tue, 9 Oct 2018 17:08:57 +0200 Subject: [PATCH] Implement lightweight mode of msoffice files --- libmat2/abstract.py | 10 ++-------- libmat2/office.py | 5 +++-- libmat2/pdf.py | 7 ++++++- mat2 | 3 +-- tests/test_libmat2.py | 25 +++++++++++++++++++++++-- 5 files changed, 35 insertions(+), 15 deletions(-) diff --git a/libmat2/abstract.py b/libmat2/abstract.py index cd72f2c..889c6c8 100644 --- a/libmat2/abstract.py +++ b/libmat2/abstract.py @@ -12,13 +12,14 @@ class AbstractParser(abc.ABC): meta_list = set() # type: Set[str] mimetypes = set() # type: Set[str] - def __init__(self, filename: str) -> None: + def __init__(self, filename: str, lightweight_cleaning: bool=False) -> None: """ :raises ValueError: Raised upon an invalid file """ self.filename = filename fname, extension = os.path.splitext(filename) self.output_filename = fname + '.cleaned' + extension + self.lightweight_cleaning = lightweight_cleaning @abc.abstractmethod def get_meta(self) -> Dict[str, str]: @@ -27,10 +28,3 @@ class AbstractParser(abc.ABC): @abc.abstractmethod def remove_all(self) -> bool: pass # pragma: no cover - - def remove_all_lightweight(self) -> bool: - """ This method removes _SOME_ metadata. - It might be useful to implement it for fileformats that do - not support non-destructive cleaning. 
- """ - return self.remove_all() diff --git a/libmat2/office.py b/libmat2/office.py index 32e7b75..afb5e33 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -267,8 +267,9 @@ class MSOfficeParser(ArchiveBasedAbstractParser): f.write(b'</cp:coreProperties>') - if self.__remove_rsid(full_path) is False: - return False + if self.lightweight_cleaning is False: + if self.__remove_rsid(full_path) is False: + return False try: _sort_xml_attributes(full_path) diff --git a/libmat2/pdf.py b/libmat2/pdf.py index c8769aa..b6b782e 100644 --- a/libmat2/pdf.py +++ b/libmat2/pdf.py @@ -37,7 +37,7 @@ class PDFParser(abstract.AbstractParser): except GLib.GError: # Invalid PDF raise ValueError - def remove_all_lightweight(self): + def __remove_all_ligthweight(self) -> bool: """ Load the document into Poppler, render pages on a new PDFSurface. """ @@ -65,6 +65,11 @@ class PDFParser(abstract.AbstractParser): return True def remove_all(self): + if self.lightweight_cleaning is True: + return self.__remove_all_ligthweight() + return self.__remove_all_complete() + + def __remove_all_complete(self) -> bool: """ Load the document into Poppler, render pages on PNG, and shove those PNG into a new PDF. 
diff --git a/mat2 b/mat2 index 987e439..722b3f7 100755 --- a/mat2 +++ b/mat2 @@ -87,8 +87,7 @@ def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy) print("[-] %s's format (%s) is not supported" % (filename, mtype)) return False p.unknown_member_policy = policy - if is_lightweight: - return p.remove_all_lightweight() + p.lightweight_cleaning = is_lightweight return p.remove_all() diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 957503d..e6f238e 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -182,6 +182,25 @@ class TestRevisionsCleaning(unittest.TestCase): os.remove('./tests/data/revision_clean.cleaned.docx') class TestLightWeightCleaning(unittest.TestCase): + def test_msoffice(self): + shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx') + p = office.MSOfficeParser('./tests/data/clean.docx') + + meta = p.get_meta() + self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin') + + p.lightweight_cleaning = True + ret = p.remove_all() + self.assertTrue(ret) + + # FIXME blocked by https://0xacab.org/jvoisin/mat2/issues/73 + #p = office.MSOfficeParser('./tests/data/clean.cleaned.docx') + #expected_meta = {'creation-date': -1, 'format': 'docx-1.5', 'mod-date': -1} + #self.assertEqual(p.get_meta(), expected_meta) + + os.remove('./tests/data/clean.docx') + os.remove('./tests/data/clean.cleaned.docx') + def test_pdf(self): shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') p = pdf.PDFParser('./tests/data/clean.pdf') @@ -189,7 +208,8 @@ class TestLightWeightCleaning(unittest.TestCase): meta = p.get_meta() self.assertEqual(meta['producer'], 'pdfTeX-1.40.14') - ret = p.remove_all_lightweight() + p.lightweight_cleaning = True + ret = p.remove_all() self.assertTrue(ret) p = pdf.PDFParser('./tests/data/clean.cleaned.pdf') @@ -206,7 +226,8 @@ class TestLightWeightCleaning(unittest.TestCase): meta = p.get_meta() self.assertEqual(meta['Comment'], 'This is a comment, be careful!') - ret = 
p.remove_all_lightweight() + p.lightweight_cleaning = True + ret = p.remove_all() self.assertTrue(ret) p = images.PNGParser('./tests/data/clean.cleaned.png') -- GitLab