diff --git a/libmat2/abstract.py b/libmat2/abstract.py index cd72f2cb8c6028724248972b29df820de793a1f3..5bcaa69e1fe3b445e93a6dbae7f75693cc1b3d04 100644 --- a/libmat2/abstract.py +++ b/libmat2/abstract.py @@ -19,6 +19,7 @@ class AbstractParser(abc.ABC): self.filename = filename fname, extension = os.path.splitext(filename) self.output_filename = fname + '.cleaned' + extension + self.lightweight_cleaning = False @abc.abstractmethod def get_meta(self) -> Dict[str, str]: @@ -27,10 +28,3 @@ class AbstractParser(abc.ABC): @abc.abstractmethod def remove_all(self) -> bool: pass # pragma: no cover - - def remove_all_lightweight(self) -> bool: - """ This method removes _SOME_ metadata. - It might be useful to implement it for fileformats that do - not support non-destructive cleaning. - """ - return self.remove_all() diff --git a/libmat2/pdf.py b/libmat2/pdf.py index c8769aa8848fec9b0f844f4829d4acae5d4d3dd5..140b4f4518d2560d8ae296fcab8c2cc86cdd3848 100644 --- a/libmat2/pdf.py +++ b/libmat2/pdf.py @@ -37,7 +37,12 @@ class PDFParser(abstract.AbstractParser): except GLib.GError: # Invalid PDF raise ValueError - def remove_all_lightweight(self): + def remove_all(self) -> bool: + if self.lightweight_cleaning is True: + return self.__remove_all_lightweight() + return self.__remove_all_thorough() + + def __remove_all_lightweight(self) -> bool: """ Load the document into Poppler, render pages on a new PDFSurface. """ @@ -64,7 +69,7 @@ class PDFParser(abstract.AbstractParser): return True - def remove_all(self): + def __remove_all_thorough(self) -> bool: """ Load the document into Poppler, render pages on PNG, and shove those PNG into a new PDF. diff --git a/mat2 b/mat2 index b4a603328033fe27bea804d438a2ebe9a0369931..ba1f0ac843bc33cc46b7572d930ed54a1a7925f7 100755 --- a/mat2 +++ b/mat2 @@ -94,8 +94,7 @@ def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy) print("[-] %s's format (%s) is not supported" % (filename, mtype)) return False p.unknown_member_policy = policy - if is_lightweight: - return p.remove_all_lightweight() + p.lightweight_cleaning = is_lightweight return p.remove_all() diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 6a2af91fec8b3cb2f0a376510cb5e685e50d6609..665bab05d0b623af8d4c6b38d366704b13137da5 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -190,7 +190,8 @@ class TestLightWeightCleaning(unittest.TestCase): meta = p.get_meta() self.assertEqual(meta['producer'], 'pdfTeX-1.40.14') - ret = p.remove_all_lightweight() + p.lightweight_cleaning = True + ret = p.remove_all() self.assertTrue(ret) p = pdf.PDFParser('./tests/data/clean.cleaned.pdf') @@ -207,7 +208,8 @@ class TestLightWeightCleaning(unittest.TestCase): meta = p.get_meta() self.assertEqual(meta['Comment'], 'This is a comment, be careful!') - ret = p.remove_all_lightweight() + p.lightweight_cleaning = True + ret = p.remove_all() self.assertTrue(ret) p = images.PNGParser('./tests/data/clean.cleaned.png')